# Breast Cancer Classification using Machine Learning
This notebook implements various machine learning models to classify breast cancer data. The workflow includes:
1. Data Loading & Exploration
2. Data Preprocessing
3. Training SVM & Ensemble Models
4. Model Evaluation
5. Visualization of Results using SHAP


## Importing Required Libraries

In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from imblearn.over_sampling import SMOTE
import shap
import math
import dagshub
import mlflow
import mlflow.sklearn
import dvc
import dvc.api
import numpy as np
import os

## Data Loading and Exploration
This function loads the dataset, assigns column names, and provides an overview.
It also checks for missing values and displays class distribution.

In [14]:
def load_and_explore_data(filepath='../Data/wdbc.data'):
    """Load the WDBC breast-cancer data file and print a short exploratory summary.

    The file is read as a header-less CSV and given the 32 column names listed
    below (``ID``, ``Class`` and 30 feature columns). The ``Class`` label is
    encoded numerically: ``'M'`` (malignant) -> 1, ``'B'`` (benign) -> 0.

    Args:
        filepath: Path to the header-less, comma-separated data file.

    Returns:
        pandas.DataFrame: The loaded data with named columns and a numeric
        ``Class`` column.

    Raises:
        ValueError: If ``Class`` contains a non-missing label other than
            ``'M'`` or ``'B'`` (previously such labels silently became NaN).
    """
    print("Loading dataset...")
    columns = [
        "ID", "Class",
        "Radius_Mean", "Texture_Mean", "Perimeter_Mean", "Area_Mean", "Smoothness_Mean", "Compactness_Mean",
        "Concavity_Mean", "ConcavePoints_Mean", "Symmetry_Mean", "FractalDimension_Mean",
        "Radius_SE", "Texture_SE", "Perimeter_SE", "Area_SE", "Smoothness_SE", "Compactness_SE",
        "Concavity_SE", "ConcavePoints_SE", "Symmetry_SE", "FractalDimension_SE",
        "Radius_Worst", "Texture_Worst", "Perimeter_Worst", "Area_Worst", "Smoothness_Worst",
        "Compactness_Worst", "Concavity_Worst", "ConcavePoints_Worst", "Symmetry_Worst", "FractalDimension_Worst"
    ]
    df = pd.read_csv(filepath, header=None, names=columns)

    # Encode the diagnosis label. Series.map() turns every key it does not know
    # into NaN, so unexpected labels are detected explicitly instead of leaking
    # NaN targets into training. Genuinely missing labels are left alone and are
    # still reported by the missing-value check further down.
    raw_labels = df['Class']
    encoded_labels = raw_labels.map({'M': 1, 'B': 0})
    unexpected = raw_labels[encoded_labels.isna() & raw_labels.notna()]
    if not unexpected.empty:
        raise ValueError(
            f"Unexpected class label(s) in {filepath!r}: {sorted(set(map(str, unexpected)))}; "
            "expected only 'M' or 'B'."
        )
    df['Class'] = encoded_labels

    print("\nDataset Overview:")
    print(f"Shape: {df.shape}")
    print("\nClass distribution:")
    print(df['Class'].value_counts())
    # The mean of a 0/1 label is the share of class 1, i.e. malignant samples.
    print(f"Malignant percentage: {df['Class'].mean() * 100:.4f}%")

    print("\nChecking for missing values:")
    print(df.isnull().sum().any())

    print("\nBasic statistics for features:")
    print(df.describe())

    return df

## Data Preprocessing
This function prepares the data for model training:
- Removes the ID column
- Splits data into train-test sets
- Standardizes the features
- Handles class imbalance using SMOTE

In [15]:
def preprocess_data(df):
    """Turn the loaded DataFrame into train/test arrays ready for modelling.

    Steps, in order:
      1. Drop the ``ID`` and ``Class`` columns to obtain the feature matrix.
      2. Make a stratified 80/20 train/test split (``random_state=42``).
      3. Fit a ``StandardScaler`` on the training features only and apply it
         to both splits.
      4. Oversample the scaled training split with SMOTE; the test split is
         left untouched.

    Args:
        df: DataFrame containing ``ID``, ``Class`` and the feature columns.

    Returns:
        tuple: ``(X_train_resampled, X_test_scaled, y_train_resampled, y_test,
        scaler, feature_columns)`` where ``feature_columns`` is the column
        index of the unscaled training features.
    """
    print("\nPreprocessing data...")

    # Everything except the identifier and the label is a feature.
    features = df.drop(columns=['Class', 'ID'])
    target = df['Class']

    # Stratify on the label so both splits keep the original class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
        stratify=target,
    )

    print(f"Training set shape: {X_train.shape}")
    print(f"Testing set shape: {X_test.shape}")

    # The scaler learns its statistics from the training split only, so no
    # information from the test split leaks into the transformation.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Oversample the minority class, on the training data only.
    print("Applying SMOTE to handle class imbalance...")
    X_train_resampled, y_train_resampled = SMOTE(random_state=42).fit_resample(
        X_train_scaled, y_train
    )

    print(f"After SMOTE - Training set shape: {X_train_resampled.shape}")
    print(f"Class distribution after SMOTE: {pd.Series(y_train_resampled).value_counts()}")

    return (
        X_train_resampled,
        X_test_scaled,
        y_train_resampled,
        y_test,
        scaler,
        X_train.columns,
    )


## Training SVM Models
Trains SVM models with three different kernels: Linear, RBF, and Polynomial.

In [16]:
# Initialise DagsHub for the AnasAljaour/AIDE505-FinalProject repository with
# mlflow=True. This must run before any training cell that opens an MLflow run.
# NOTE(review): presumably this points MLflow tracking at the DagsHub-hosted
# server (the run URLs printed by the training cells are on dagshub.com) and
# may prompt for DagsHub credentials on first use - confirm.
dagshub.init(repo_owner='AnasAljaour', repo_name='AIDE505-FinalProject', mlflow=True)

In [30]:
def train_svm_models(X_train, y_train, X_test, y_test):
    """Train three SVM classifiers and track each one as its own MLflow run.

    One ``SVC`` is fitted per kernel (linear, RBF, degree-3 polynomial), each
    with ``probability=True`` and ``random_state=42``. For every model the run
    logs the dataset version, the kernel, the test accuracy, the fitted model,
    and CSV copies of the confusion matrix and the classification report. The
    CSV files are written to ``ConfusionMatrix/`` and ``ClassificationReports/``
    in the current working directory (created if missing).

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features used to compute the logged evaluation.
        y_test: Held-out labels. The confusion-matrix CSV is labelled as a
            2x2 negative/positive table, so binary labels are assumed.

    Returns:
        dict: Display name (e.g. ``'SVM (Linear)'``) -> fitted ``SVC``.
    """
    print("\nTraining SVM models with different kernels...")
    svm_models = {
        'SVM (Linear)': SVC(kernel='linear', probability=True, random_state=42),
        'SVM (RBF)': SVC(kernel='rbf', probability=True, random_state=42),
        'SVM (Polynomial)': SVC(kernel='poly', degree=3, probability=True, random_state=42)
    }

    os.makedirs("ConfusionMatrix", exist_ok=True)
    os.makedirs('ClassificationReports', exist_ok=True)
    # Resolve the DVC URL of the data file at Git revision HEAD once; it is
    # logged with every run as the dataset version.
    dvc_version = dvc.api.get_url("../Data/wdbc.data", rev="HEAD")  # or use a specific revision

    for name, model in svm_models.items():
        with mlflow.start_run(run_name=name):
            print(f"Training {name}...")
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            mlflow.log_param("dataset_version", dvc_version)
            mlflow.log_param("kernel", model.get_params()["kernel"])
            # Accuracy is a result, not an input: log it as a metric so runs can
            # be compared numerically. The param of the same name is kept so
            # anything reading it from earlier runs keeps working.
            mlflow.log_param("accuracy", accuracy)
            mlflow.log_metric("accuracy", accuracy)
            mlflow.sklearn.log_model(model, artifact_path=name)

            # Confusion matrix as a labelled table (rows = actual, columns = predicted).
            cm = confusion_matrix(y_test, y_pred)
            cm_df = pd.DataFrame(cm, index=["Actual Negative (N)", "Actual Positive (P)"],
                                 columns=["Predicted Negative (N)", "Predicted Positive (P)"])
            cm_path = f"./ConfusionMatrix/confusion_matrix_{name}.csv"
            cm_df.to_csv(cm_path, index=True)
            mlflow.log_artifact(cm_path)

            # Per-class precision/recall/F1 as a table, one row per class/average.
            report = classification_report(y_test, y_pred, output_dict=True)
            report_df = pd.DataFrame(report).transpose()
            report_path = f"ClassificationReports/classification_report_{name}.csv"
            report_df.to_csv(report_path, index=True)
            mlflow.log_artifact(report_path)

    return svm_models

## Training Ensemble Models
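Implements Bagging, Random Forest, and Boosting (AdaBoost) using Decision Trees.
The trained SVMs are returned together with these ensembles for evaluation; note that the code below does not yet combine the SVMs in a Voting Classifier.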

In [31]:
def _fit_and_log_ensemble(model, run_name, params, model_artifact, cm_filename, report_filename,
                          X_train, y_train, X_test, y_test):
    """Fit ``model`` inside its own MLflow run and log params, accuracy, model and report CSVs."""
    with mlflow.start_run(run_name=run_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        for key, value in params.items():
            mlflow.log_param(key, value)
        # Accuracy is a result, not an input: log it as a metric so runs can be
        # compared numerically. The param of the same name is kept so anything
        # reading it from earlier runs keeps working.
        mlflow.log_param("accuracy", accuracy)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model, model_artifact)

        # Confusion matrix as a labelled table (rows = actual, columns = predicted).
        cm = confusion_matrix(y_test, y_pred)
        cm_df = pd.DataFrame(cm, index=["Actual Negative (N)", "Actual Positive (P)"],
                             columns=["Predicted Negative (N)", "Predicted Positive (P)"])
        cm_path = f"./ConfusionMatrix/{cm_filename}"
        cm_df.to_csv(cm_path, index=True)
        mlflow.log_artifact(cm_path)

        # Per-class precision/recall/F1 as a table, one row per class/average.
        report = classification_report(y_test, y_pred, output_dict=True)
        report_df = pd.DataFrame(report).transpose()
        report_path = f"ClassificationReports/{report_filename}"
        report_df.to_csv(report_path, index=True)
        mlflow.log_artifact(report_path)

    return model


def train_ensemble_models(X_train, y_train,X_test, y_test, svm_models):
    """Train Bagging, Random Forest and AdaBoost models, each tracked as an MLflow run.

    All three ensembles use 20 estimators and ``random_state=42``. Bagging and
    Random Forest additionally sample 50% of the rows and 80% of the features.
    Each run logs its hyper-parameters, the dataset version, the test accuracy,
    the fitted model, and CSV copies of the confusion matrix and classification
    report. The output directories ``ConfusionMatrix/`` and
    ``ClassificationReports/`` are created here if missing, so this function
    no longer relies on ``train_svm_models`` having been called first.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features used to compute the logged evaluation.
        y_test: Held-out labels (binary labels are assumed by the 2x2
            confusion-matrix table).
        svm_models: Mapping that must contain the fitted models under the keys
            ``'SVM (Linear)'``, ``'SVM (RBF)'`` and ``'SVM (Polynomial)'``.

    Returns:
        dict: Display name -> fitted model, for the three SVMs followed by
        ``'Bagging'``, ``'Boosting'`` and ``'Random Forest'``.

    Raises:
        KeyError: If ``svm_models`` lacks one of the three expected keys.
    """
    print("\nTraining ensemble models...")

    num_models = 20
    sample_size = 0.5
    feature_sample_size = 0.8

    os.makedirs("ConfusionMatrix", exist_ok=True)
    os.makedirs("ClassificationReports", exist_ok=True)
    # Resolve the DVC URL of the data file at Git revision HEAD once; it is
    # logged with every run as the dataset version.
    dvc_version = dvc.api.get_url("../Data/wdbc.data", rev="HEAD")  # or use a specific revision
    # Seeds NumPy's global RNG. Every ensemble below also gets an explicit
    # random_state, so this is kept only to preserve the original behaviour.
    np.random.seed(0)

    sampling_params = {
        "n_estimators": num_models,
        "max_features": feature_sample_size,
        "max_samples": sample_size,
        "dataset_version": dvc_version,
    }

    # Bagging over decision trees.
    bagging_clf = _fit_and_log_ensemble(
        BaggingClassifier(
            estimator=DecisionTreeClassifier(),
            n_estimators=num_models,
            max_features=feature_sample_size,
            max_samples=sample_size,
            random_state=42
        ),
        run_name='Bagging using Decision tree',
        params=sampling_params,
        model_artifact="BaggingClassifierModel",
        cm_filename="confusion_matrix_Bagging.csv",
        report_filename="classification_report_bagging.csv",
        X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
    )

    # Random forest with the same row/feature sampling fractions.
    rf_clf = _fit_and_log_ensemble(
        RandomForestClassifier(
            n_estimators=num_models,
            max_features=feature_sample_size,
            max_samples=sample_size,
            random_state=42
        ),
        run_name='RandomForestClassifier',
        params=sampling_params,
        model_artifact="RandomForestClassifier",
        cm_filename="confusion_matrix_RandomForestClassifier.csv",
        # File name kept as in earlier runs ("rm") so logged artifacts stay comparable.
        report_filename="classification_report_rm.csv",
        X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
    )

    # AdaBoost over decision trees (no row/feature sampling parameters).
    boosting_clf = _fit_and_log_ensemble(
        AdaBoostClassifier(
            estimator=DecisionTreeClassifier(),
            n_estimators=num_models,
            random_state=42
        ),
        run_name='Boosting using Decision tree',
        params={"n_estimators": num_models, "dataset_version": dvc_version},
        model_artifact="BoostingClassifierModel",
        cm_filename="confusion_matrix_Boosting.csv",
        report_filename="classification_report_boosting.csv",
        X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
    )

    all_models = {
        'SVM (Linear)': svm_models['SVM (Linear)'],
        'SVM (RBF)': svm_models['SVM (RBF)'],
        'SVM (Polynomial)': svm_models['SVM (Polynomial)'],
        'Bagging': bagging_clf,
        'Boosting': boosting_clf,
        'Random Forest': rf_clf,
    }

    return all_models

## Model Evaluation
Evaluates models using classification reports and confusion matrices.

In [32]:
def evaluate_models(models, X_test, y_test):
    """Print and collect test-set evaluation results for every model.

    For each model, in mapping order, the classification report and the
    confusion matrix are printed, and the raw predictions are collected.

    Args:
        models: Mapping of display name -> fitted classifier. Each classifier
            must provide ``predict`` and ``predict_proba``.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        dict: Display name -> dict with keys ``'y_pred'`` (predicted labels),
        ``'y_prob'`` (column index 1 of ``predict_proba``) and
        ``'confusion_matrix'``.
    """
    print("\nEvaluating models...")

    def _evaluate_one(label, estimator):
        # Announce the model before predicting, so the header is printed first.
        print(f"\nEvaluating {label}:")

        predictions = estimator.predict(X_test)
        class_one_scores = estimator.predict_proba(X_test)[:, 1]

        print(classification_report(y_test, predictions))

        matrix = confusion_matrix(y_test, predictions)
        print(matrix)

        return {
            'y_pred': predictions,
            'y_prob': class_one_scores,
            'confusion_matrix': matrix,
        }

    return {label: _evaluate_one(label, estimator) for label, estimator in models.items()}

## Running the Workflow
Now we execute all the steps in sequence.

In [35]:
# End-to-end run: load -> preprocess -> train SVMs -> train ensembles -> evaluate.
# Load the dataset from the default path and print the exploratory summary.
df = load_and_explore_data()
# X_train / y_train are the scaled, SMOTE-resampled training data; X_test is
# scaled only. `scaler` and `feature_names` are not used in this cell.
# NOTE(review): presumably kept for a later SHAP/visualisation step - confirm.
X_train, X_test, y_train, y_test, scaler, feature_names = preprocess_data(df)
# Each training function opens one MLflow run per model and logs its artifacts.
svm_models = train_svm_models(X_train, y_train, X_test, y_test)
# Returns the three SVMs together with the three ensemble models.
all_models = train_ensemble_models(X_train, y_train,X_test, y_test, svm_models)
# Prints a classification report and confusion matrix per model on the test split.
evaluation_results = evaluate_models(all_models,  X_test, y_test)
print("\nExecution completed! Check the output directory for evaluation metrics and explanations.")

Loading dataset...

Dataset Overview:
Shape: (569, 32)

Class distribution:
Class
0    357
1    212
Name: count, dtype: int64
Fraud percentage: 37.2583%

Checking for missing values:
False

Basic statistics for anonymized features:
                 ID       Class  Radius_Mean  Texture_Mean  Perimeter_Mean  \
count  5.690000e+02  569.000000   569.000000    569.000000      569.000000   
mean   3.037183e+07    0.372583    14.127292     19.289649       91.969033   
std    1.250206e+08    0.483918     3.524049      4.301036       24.298981   
min    8.670000e+03    0.000000     6.981000      9.710000       43.790000   
25%    8.692180e+05    0.000000    11.700000     16.170000       75.170000   
50%    9.060240e+05    0.000000    13.370000     18.840000       86.240000   
75%    8.813129e+06    1.000000    15.780000     21.800000      104.100000   
max    9.113205e+08    1.000000    28.110000     39.280000      188.500000   

         Area_Mean  Smoothness_Mean  Compactness_Mean  Concavity_



🏃 View run SVM (Linear) at: https://dagshub.com/AnasAljaour/AIDE505-FinalProject.mlflow/#/experiments/0/runs/e695ea0aa4344469907ac6085ce382f7
🧪 View experiment at: https://dagshub.com/AnasAljaour/AIDE505-FinalProject.mlflow/#/experiments/0
Training SVM (RBF)...




🏃 View run SVM (RBF) at: https://dagshub.com/AnasAljaour/AIDE505-FinalProject.mlflow/#/experiments/0/runs/c70cda5c4ebf45ba9527e32e79d04e68
🧪 View experiment at: https://dagshub.com/AnasAljaour/AIDE505-FinalProject.mlflow/#/experiments/0
Training SVM (Polynomial)...




🏃 View run SVM (Polynomial) at: https://dagshub.com/AnasAljaour/AIDE505-FinalProject.mlflow/#/experiments/0/runs/1d7c7bd1831642c695a66f9189b75f26
🧪 View experiment at: https://dagshub.com/AnasAljaour/AIDE505-FinalProject.mlflow/#/experiments/0

Training ensemble models...




🏃 View run Bagging using Decision tree at: https://dagshub.com/AnasAljaour/AIDE505-FinalProject.mlflow/#/experiments/0/runs/908224f66d8c47a7a8c5826c0e2b0b51
🧪 View experiment at: https://dagshub.com/AnasAljaour/AIDE505-FinalProject.mlflow/#/experiments/0




🏃 View run RandomForestClassifier at: https://dagshub.com/AnasAljaour/AIDE505-FinalProject.mlflow/#/experiments/0/runs/76119866719d4b18a5666dbeb7e57e00
🧪 View experiment at: https://dagshub.com/AnasAljaour/AIDE505-FinalProject.mlflow/#/experiments/0




🏃 View run Boosting using Decision tree at: https://dagshub.com/AnasAljaour/AIDE505-FinalProject.mlflow/#/experiments/0/runs/b8e7639c4925422b879dcd38813082dc
🧪 View experiment at: https://dagshub.com/AnasAljaour/AIDE505-FinalProject.mlflow/#/experiments/0

Evaluating models...

Evaluating SVM (Linear):
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        72
           1       0.98      0.95      0.96        42

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

[[71  1]
 [ 2 40]]

Evaluating SVM (RBF):
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        72
           1       0.98      0.95      0.96        42

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

