<a href="https://colab.research.google.com/github/FarazHeydar/Breast-Cancer-Detection/blob/main/Breast_Cancer_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Breast Cancer Detection


## **Data Loading and Exploratory Data Analysis (EDA)**
In this section, we import the necessary libraries and load the Breast Cancer Wisconsin dataset from Scikit-Learn. We perform the following steps:
1.  Data Loading: Convert the raw data into a Pandas DataFrame.
2.  Basic Inspection: View the first few rows, summary statistics, and data types.
3.  Visualization:
    * Target Distribution: Check if the dataset is balanced between Malignant and Benign cases.
    * Feature Distributions: Histograms to understand the spread of specific features.
    * Correlation Matrix: A heatmap to analyze relationships between the first 10 features.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_breast_cancer

# --- Data Loading & Preprocessing ---

# Fetch the dataset and expose it as two DataFrames:
#   X -> one column per feature (named after data.feature_names)
#   Y -> a single "target" column holding the class labels
print("Loading Breast Cancer Dataset...")
try:
    data = load_breast_cancer()
    X = pd.DataFrame(data=data.data, columns=data.feature_names)
    Y = pd.DataFrame({"target": data.target})
except FileNotFoundError:
    # Nothing below can run without the data, so stop here
    print("Error: Dataset file not found. Please check the path.")
    exit()
else:
    print("Dataset loaded successfully.")

print("Initial Data Exploration:")

# Every inspection step is run on the features first, then on the labels
frames = (X, Y)

# Head of each frame
print("\nFirst 5 Rows:")
for frame in frames:
    print(frame.head())

# Descriptive statistics of each frame
print("\nSummary Statistics:")
for frame in frames:
    print(frame.describe())

# dtypes / non-null counts (DataFrame.info prints by itself)
print("\nColumn Descriptions (Info):")
for frame in frames:
    frame.info()

# Shapes and class balance
print("\nData Summary:")
print(f"X shape: {X.shape}, y shape: {Y.shape}")
print(f"y value counts:\n{Y['target'].value_counts()}")

# Visualization 1: how many samples fall into each target class
print("\nTarget Variable Distribution")
target_fig, target_ax = plt.subplots(figsize=(8, 6))
sns.countplot(x=Y['target'], ax=target_ax)  # 'target' column passed as a Series
target_ax.set_title('Distribution of Target Class')
target_ax.set_xlabel('Class (0: Malignant, 1: Benign)')
target_ax.set_ylabel('Count')
target_ax.set_xticks([0, 1])
target_ax.set_xticklabels(['Malignant (0)', 'Benign (1)'])
target_ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Visualization 2: histograms (with KDE) of four hand-picked features on a 2x2 grid
print("\nSample Feature Distributions")
sample_features = ['mean radius', 'mean texture', 'worst radius', 'worst concavity']
hist_fig, hist_axes = plt.subplots(2, 2, figsize=(8, 6))
for hist_ax, feature in zip(hist_axes.ravel(), sample_features):
    sns.histplot(X[feature], kde=True, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()

# Visualization 3: correlation heatmap restricted to the first 10 columns of X
print("\nFeature Correlation Heatmap")
first_ten_columns = X.columns[:10]
corr_matrix = X[first_ten_columns].corr()
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5, ax=corr_ax)
corr_ax.set_title('Correlation Matrix (Mean of Features)')
plt.show()

## **Model Training, Evaluation, and Comparison**
In this section, we build and evaluate five different machine learning models: Logistic Regression, SVM, Random Forest, Decision Tree, and K-Nearest Neighbors (KNN).

Key Steps:
1.  Data Splitting: 75% for training, 25% for testing.
2.  Scaling: Applying `StandardScaler` (fit on the training split only) to standardize features to zero mean and unit variance (essential for SVM and KNN).
3.  Training: Fitting models on the training set.
4.  Single Split Evaluation: Calculating Accuracy and F1 Score on the test set.
5.  Cross-Validation: Performing 5-Fold Cross-Validation to ensure model stability.
6.  Visualization:
    * Bar Chart: Comparing Test Accuracy vs. Cross-Validation Scores.
    * Confusion Matrices: Heatmaps showing True Positives, True Negatives, etc.

In [None]:
# --- Model Training ---

# Hold out 25% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=2,
)
print(X.shape, X_train.shape, X_test.shape)

# Standardization: statistics are learned from the training rows only and
# then applied to both splits, which become NumPy arrays from here on
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Training and Validation ---
print("\n Models are training... .")

# Test-set metrics collected for plotting:
#   results_for_plot -> list of {"Algorithm", "Accuracy", "F1 Score"} dicts
#   conf_matrices    -> dict mapping the same algorithm label to its confusion matrix
results_for_plot = []
conf_matrices = {}

# One estimator per algorithm; the names are reused by later cells
lr_model = LogisticRegression(max_iter=5000)
svm_model = SVC()
rf_model = RandomForestClassifier(random_state=2)
dt_model = DecisionTreeClassifier(random_state=2)
knn_model = KNeighborsClassifier()

# Labels must stay as-is: the confusion-matrix plot looks them up by these exact keys
labelled_estimators = (
    ("Logistic Regression", lr_model),
    ("SVM", svm_model),
    ("Random Forest", rf_model),
    ("Decision Tree", dt_model),
    ("KNN", knn_model),
)

# Estimators expect a 1-D label vector, Y_train is a single-column DataFrame
train_labels = Y_train.values.ravel()

for label, estimator in labelled_estimators:
    # Fit on the scaled training split, then score once on the scaled test split
    estimator.fit(X_train, train_labels)
    test_pred = estimator.predict(X_test)

    results_for_plot.append({
        "Algorithm": label,
        "Accuracy": accuracy_score(Y_test, test_pred),
        # pos_label=0 -> F1 is reported for class 0 (Malignant)
        "F1 Score": f1_score(Y_test, test_pred, pos_label=0),
    })
    conf_matrices[label] = confusion_matrix(Y_test, test_pred)

print("\n Models are trained... .")

# 5-FOLD CROSS-VALIDATION
print("\n starting 5-Fold cross-validation... .")

models = {
    "Logistic Regression": lr_model,
    "SVM": svm_model,
    "Random Forest": rf_model,
    "Decision Tree": dt_model,
    "K-NN": knn_model
}

# Mean CV accuracy per model, in the iteration order of `models`.
# Computed exactly once and reused for both the summary table and the bar chart.
cv_results = []

for model_name, model in models.items():
    # Scaling lives inside the pipeline so every fold is standardized using
    # only its own training part (raw X / Y are passed in on purpose)
    pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(pipeline, X, Y.values.ravel(), cv=5, scoring='accuracy')
    cv_results.append(scores.mean())

print("\n 5-Fold cross-validation is done.")

print("\n Generating final summary table... .")

results_list = []

for (name, model), cv_mean in zip(models.items(), cv_results):
    # Single-split metrics: the estimators are already fitted on X_train,
    # so this loop only predicts, it never re-trains
    train_acc = accuracy_score(Y_train, model.predict(X_train))

    test_pred = model.predict(X_test)
    test_acc = accuracy_score(Y_test, test_pred)

    # F1 Score for class 0 (Malignant)
    f1 = f1_score(Y_test, test_pred, pos_label=0)

    results_list.append({
        "Algorithm": name,
        "Training Accuracy": train_acc,
        "Testing Accuracy": test_acc,
        "F1 Score (Malignant)": f1,
        "5-Fold CV Mean": cv_mean
    })

# Creating the final table
final_table = pd.DataFrame(results_list)

# Setting display format (4 decimal places); this changes a global pandas option
pd.options.display.float_format = '{:.4f}'.format

print("\n" + "="*80)
print("FINAL PERFORMANCE SUMMARY TABLE")
print("="*80)
print(final_table)
print("="*80)

# --- PLOTTING SECTION ---
print("\n Plotting: \n")

# 1. Grouped bar chart: single-split test accuracy next to the 5-fold CV mean
results_df = pd.DataFrame(results_for_plot)
results_df["5-Fold CV Mean"] = cv_results  # aligned by position with the rows above

fig, ax = plt.subplots(figsize=(10, 6))

# One slot per algorithm; each slot holds two bars of this width, side by side
width = 0.35
x = np.arange(len(results_df["Algorithm"]))
half_width = width / 2

rects1 = ax.bar(
    x - half_width,
    results_df["Accuracy"],
    width,
    label='Test Accuracy (Single Split)',
)
rects2 = ax.bar(
    x + half_width,
    results_df["5-Fold CV Mean"],
    width,
    label='5-Fold CV Mean Accuracy',
)

ax.set_title('Comparison of Single Split Accuracy vs. 5-Fold CV Mean')
ax.set_ylabel('Accuracy')
ax.set_xticks(x)
ax.set_xticklabels(results_df["Algorithm"])
# Only the top of the range is shown so small differences stay visible
ax.set_ylim(0.85, 1.0)
ax.legend()

# Adding labels on top of bars
def autolabel(rects, target_ax=None):
    """Annotate every bar with its height, formatted to four decimals.

    Args:
        rects: Iterable of bar objects exposing ``get_height()``, ``get_x()``
            and ``get_width()`` (for example the result of ``ax.bar``).
        target_ax: Object whose ``annotate`` method receives the labels.
            When omitted, the module-level ``ax`` is used, which keeps
            existing ``autolabel(rects)`` calls working unchanged.
    """
    # The global is only looked up when no explicit axes is supplied
    axes_obj = ax if target_ax is None else target_ax

    for rect in rects:
        height = rect.get_height()
        # Anchor at the top-centre of the bar, then nudge the text upwards
        top_centre = (rect.get_x() + rect.get_width() / 2, height)
        axes_obj.annotate(f'{height:.4f}',
                          xy=top_centre,
                          xytext=(0, 3),  # 3 points vertical offset
                          textcoords="offset points",
                          ha='center', va='bottom', fontsize=9)

# Label the test-accuracy bars first, then the CV bars
for bar_group in (rects1, rects2):
    autolabel(bar_group)

plt.tight_layout()
plt.show()  # Display first plot

print("\n")

# 2. Confusion Matrices Heatmaps
# One panel per model, laid out in a single row; the names are the keys of conf_matrices
model_names = ["Logistic Regression", "SVM", "Random Forest", "Decision Tree", "KNN"]
class_tick_labels = ['Malignant (0)', 'Benign (1)']

fig, axes = plt.subplots(1, 5, figsize=(20, 4))
fig.suptitle('Confusion Matrices for All Models (Test Set)', fontsize=16)

for panel, name in zip(axes, model_names):
    sns.heatmap(
        conf_matrices[name],
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=panel,
        cbar=False,
    )
    panel.set_title(name)
    panel.set_xlabel('Predicted')
    panel.set_ylabel('Actual')
    # Same class labels on both axes
    panel.set_xticklabels(class_tick_labels)
    panel.set_yticklabels(class_tick_labels)

plt.tight_layout()
plt.subplots_adjust(top=0.85)  # Leave room for the figure-level title
plt.show()  # Display second plot

## **Interactive Deployment with Gradio**
This section builds a user-friendly interface to test the trained models in real-time.

Features:
1.  Model Selection: Dropdown menu to choose which algorithm to use for prediction.
2.  Input Fields: Number inputs for all 30 breast cancer features (initialized with default values).
3.  Styling: Custom CSS is applied to support both Light Mode and Dark Mode for a polished look.
4.  Prediction Logic: Takes inputs, reshapes them for the model, and outputs "Malignant" (Red) or "Benign" (Blue).

In [None]:
# --- The Predictive System ---
# Install the necessary dependency to load the GUI
# NOTE(review): `gradio` is already imported in the first code cell, so on a
# fresh runtime this install runs after that import — consider moving it up.
!pip install gradio

# Ensure your models (lr_model, svm_model, etc.) and 'X' are loaded from the previous cells.
# Maps the label shown in the UI dropdown to the fitted estimator used for prediction.
# The estimators were fitted on StandardScaler-transformed features in the training cell.
models = {
    "Logistic Regression": lr_model,
    "SVM": svm_model,
    "Random Forest": rf_model,
    "Decision Tree": dt_model,
    "K-NN": knn_model
}
# Column names of X, in order; one input field is created per entry
feature_names = X.columns.tolist()

# Default input values (sample mean/typical values)
# 30 numbers, matched to `feature_names` by position.
# NOTE(review): the values appear to be on the raw dataset scale and may come
# from a single sample rather than being means — TODO confirm the source.
default_input_data = (13.54, 14.36, 87.46, 566.3, 0.09779, 0.08129, 0.06664, 0.04781,
                      0.1885, 0.05766, 0.2699, 0.7886, 2.058, 23.56, 0.008462, 0.0146,
                      0.02387, 0.01315, 0.0198, 0.0023, 15.11, 19.26, 99.7, 711.2,
                      0.144, 0.1773, 0.239, 0.1288, 0.2977, 0.07259)

def predict_logic(model_choice, *features):
    """
    Predict the diagnosis for the feature values entered in the Gradio form.

    The estimators in ``models`` were fitted on features transformed by the
    module-level ``scaler`` (a fitted ``StandardScaler``), so the raw form
    values are passed through that same scaler before predicting. Feeding
    raw values straight to the models would give meaningless predictions.

    Args:
        model_choice: Key of the estimator to use in the ``models`` dict.
        *features: Raw feature values, in the same order as ``feature_names``.

    Returns:
        An HTML ``<span>`` string: "Malignant" in red when the predicted class
        is 0, "Benign" in blue otherwise, or a short prompt when at least one
        value is missing.

    Raises:
        KeyError: If ``model_choice`` is not a key of ``models``.
    """
    # 0. An empty form field may arrive as None; ask for it instead of
    #    letting the estimator fail on a non-numeric array
    if any(value is None for value in features):
        return ("<span style='color: #ff9800; font-size: 24px;'>"
                "Please provide a value for every feature.</span>")

    # 1. Select the model
    selected_model = models[model_choice]

    # 2. Format inputs: one row, columns named like the training features
    #    (the scaler was fitted on a DataFrame carrying these column names)
    raw_row = pd.DataFrame([features], columns=feature_names, dtype=float)

    # 3. Apply the training-time standardization, then predict
    scaled_row = scaler.transform(raw_row)
    prediction = selected_model.predict(scaled_row)

    # 4. Determine Text & Color based on prediction class
    if prediction[0] == 0:
        result_text = "Malignant"
        color = "#ff0000" # RED
    else:
        result_text = "Benign"
        color = "#2196f3" # BRIGHT BLUE

    # 5. Return HTML Span (Box styling is handled by CSS)
    return f"<span style='color: {color}; font-size: 32px;'>{result_text}</span>"

# --- CSS STYLING ---
# Stylesheet handed to gr.Blocks(css=...). It has three parts:
#   1. light-mode rules, scoped with `body:not(.dark)`
#   2. dark-mode rules, scoped with `.dark`
#   3. layout rules shared by both modes
# The ids used here (#header-box, #header-text, #input-scroll-container,
# #predict-btn, #result-box) are the elem_id values set in the UI layout below.
# NOTE(review): the header is created from a Markdown `#` heading; confirm that
# it renders an element the `#header-text p` selectors actually match.
custom_css = """
/* ============================================================
   1. LIGHT MODE
   ============================================================ */
body:not(.dark), body:not(.dark) .gradio-container {
    background-color: #ffffff !important;
    color: #4a148c !important;
}
body:not(.dark) #input-scroll-container {
    background-color: #fcfcfc !important;
    border: 1px solid #ce93d8 !important;
}
body:not(.dark) input[type="number"] {
    background-color: #ffffff !important;
    border: 1px solid #ce93d8 !important;
    color: #4a148c !important;
}
body:not(.dark) label span {
    color: #6a1b9a !important;
}
body:not(.dark) #header-box {
    background-color: #f3e5f5 !important;
    border: 2px solid #ce93d8 !important;
}
body:not(.dark) #header-text p {
    color: #6a1b9a !important;
}
body:not(.dark) #predict-btn {
    background-color: #ab47bc !important;
    background-image: linear-gradient(to right, #ab47bc, #8e24aa) !important;
}
body:not(.dark) h3 {
    color: #6a1b9a !important;
}
/* Light Mode Result Box styling */
body:not(.dark) #result-box {
    background-color: #fcfcfc;
    border: 2px solid #ce93d8;
    color: #4a148c;
}

/* ============================================================
   2. DARK MODE
   ============================================================ */
.dark, .dark .gradio-container {
    background-color: #0f0b29 !important;
    color: #ff66b2 !important;
}
.dark #input-scroll-container {
    background-color: #1a1a40 !important;
    border: 1px solid #7c43bd !important;
}
.dark input[type="number"] {
    background-color: #0f0b29 !important;
    border: 1px solid #7c43bd !important;
    color: #ffffff !important;
}
.dark label span {
    color: #ff80ab !important;
}
.dark #header-box {
    background-color: #4a148c !important;
    border: 2px solid #7c43bd !important;
}
.dark #header-text p {
    color: #ff80ab !important;
}
.dark #predict-btn {
    background-color: #ec407a !important;
    background-image: linear-gradient(to right, #ec407a, #d81b60) !important;
}
.dark h3 {
    color: #ff80ab !important;
}
/* Dark Mode Result Box styling */
.dark #result-box {
    background-color: #1a1a40;
    border: 2px solid #7c43bd;
    color: #ff80ab;
}

/* ============================================================
   3. UNIVERSAL LAYOUT
   ============================================================ */
#header-box {
    padding: 15px;
    border-radius: 8px;
    text-align: center;
    margin-bottom: 25px;
}
#header-text p {
    font-size: 28px;
    font-weight: 800;
    margin: 0;
}
#input-scroll-container {
    max-height: 450px;
    overflow-y: scroll !important;
    overflow-x: hidden !important;
    padding: 10px;
    padding-top: 0px;
    border-radius: 8px;
    display: block;
}
#input-scroll-container::-webkit-scrollbar { width: 12px; }
#input-scroll-container::-webkit-scrollbar-track { background: transparent; }
#input-scroll-container::-webkit-scrollbar-thumb {
    background-color: #ec407a;
    border-radius: 6px;
    border: 2px solid transparent;
    background-clip: content-box;
}
body:not(.dark) #input-scroll-container::-webkit-scrollbar-thumb {
    background-color: #ab47bc;
}
#predict-btn {
    color: white !important;
    border: none;
    font-weight: bold;
    font-size: 20px;
    padding: 10px;
    border-radius: 8px;
    margin-top: 20px;
}

/* Result Box Base Styling */
#result-box {
    text-align: center;
    margin-top: 20px;
    padding: 20px;
    border-radius: 8px;
    font-weight: bold;
    font-size: 24px;
}

h3 {
    margin-bottom: 30px;
    margin-top: 15px;
    font-weight: bold;
    position: sticky;
    top: 0;
    background-color: inherit;
    z-index: 10;
    padding-bottom: 5px;
}
"""

ribbon_url = "https://i.ebayimg.com/images/g/pxsAAOSw5TZZnup6/s-l1200.jpg"

# --- UI LAYOUT ---
# Dropdown entries are the keys of `models`; the first key is preselected
model_choices = list(models.keys())

with gr.Blocks(css=custom_css) as demo:

    # Title banner
    with gr.Row(elem_id="header-box"):
        gr.Markdown("# Breast Cancer Prediction System", elem_id="header-text")

    with gr.Row():
        # Left: one numeric field per feature, inside the scrollable column
        with gr.Column(scale=2):
            with gr.Column(elem_id="input-scroll-container"):
                gr.Markdown("### Enter the parameters:")
                # Fields are created in feature order, each pre-filled with its default
                input_components = [
                    gr.Number(
                        value=default_value,
                        label=f"{name.replace('_', ' ').title()}",
                    )
                    for name, default_value in zip(feature_names, default_input_data)
                ]

        # Right: decorative ribbon image
        with gr.Column(scale=1):
            gr.HTML(f"""
                <div style="display: flex; justify-content: center; align-items: center; height: 100%; margin-top: 80px;">
                    <img src="{ribbon_url}" style="width: 60%; opacity: 0.9; border-radius: 8px;">
                </div>
            """)

    # Bottom section: model picker, trigger button and result area
    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=model_choices,
            value=model_choices[0],
            label="Select Model"
        )

    predict_btn = gr.Button("Predict Diagnosis", elem_id="predict-btn")

    # HTML component so the result can be shown as a colored span
    result_output = gr.HTML(
        value="<span style='opacity: 0.7;'>Result will appear here</span>",
        elem_id="result-box"
    )

    # The callback receives the chosen model first, then every feature value in order
    click_inputs = [model_dropdown] + input_components
    predict_btn.click(
        fn=predict_logic,
        inputs=click_inputs,
        outputs=result_output
    )

demo.launch(share=True)