# Breast Cancer Classification - Model Training Notebook

**Models Implemented:**
1. Logistic Regression
2. Decision Tree Classifier
3. K-Nearest Neighbors (KNN)
4. Naive Bayes
5. Random Forest
6. XGBoost

**Metrics Evaluated:**
- Accuracy
- AUC Score
- Precision
- Recall
- F1-Score
- Matthews Correlation Coefficient (MCC)

## 1. Required Libraries

In [None]:
import numpy as np
import pandas as pd
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score, 
                             recall_score, f1_score, matthews_corrcoef,
                             confusion_matrix, classification_report)

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb


## 2. Load and Prepare Data

In [None]:
def load_data(filepath='breast-cancer.csv'):
    """Read the breast cancer CSV and return features plus an encoded target.

    Args:
        filepath: Path of the CSV file to read. It must contain a
            ``diagnosis`` column; an ``id`` column is discarded when present.

    Returns:
        A ``(X, y)`` tuple where ``X`` is a DataFrame holding every remaining
        column and ``y`` is the array returned by
        ``LabelEncoder.fit_transform`` for the ``diagnosis`` column.
    """
    frame = pd.read_csv(filepath)

    # Discard the identifier column when the file has one.
    if 'id' in frame.columns:
        frame = frame.drop(columns='id')

    features = frame.drop(columns='diagnosis')

    # LabelEncoder numbers labels in sorted order, so 'B' -> 0 and 'M' -> 1.
    encoder = LabelEncoder()
    target = encoder.fit_transform(frame['diagnosis'])

    print(f"Target classes: {encoder.classes_}")

    return features, target

# Load the dataset and report its dimensions
X, y = load_data()
n_samples, n_features = X.shape
print("\n Dataset loaded successfully")
print(f"  - Features shape: {X.shape}")
print(f"  - Number of samples: {n_samples}")
print(f"  - Number of features: {n_features}")

# Report how many samples fall in each encoded class
y_series = pd.Series(y)
print("\nClass distribution:")
for class_label, class_code in (('Benign', 0), ('Malignant', 1)):
    class_count = (y == class_code).sum()
    print(f"  - {class_label} ({class_code}): {class_count} samples ({class_count/len(y)*100:.1f}%)")

Target classes: ['B' 'M']

 Dataset loaded successfully
  - Features shape: (569, 30)
  - Number of samples: 569
  - Number of features: 30

Class distribution:
  - Benign (0): 357 samples (62.7%)
  - Malignant (1): 212 samples (37.3%)


### Split and Scale Data

In [None]:
def split_data(X, y, test_size=0.2, random_state=42):
    """Make a stratified train/test split and standardize the features.

    Args:
        X: Feature matrix.
        y: Target labels; also used as the stratification key.
        test_size: Fraction of samples held out for testing.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(X_train_scaled, X_test_scaled, y_train, y_test, scaler)`` where
        ``scaler`` is the ``StandardScaler`` fitted on the training split.
    """
    parts = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    train_features, test_features, train_labels, test_labels = parts

    # Fit on the training split only; the test split is transformed with the
    # statistics learned from the training data.
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_features)
    scaled_test = scaler.transform(test_features)

    return scaled_train, scaled_test, train_labels, test_labels, scaler

# Run the split and report the resulting shapes
print("Splitting data into training and testing sets (80-20 split)")
X_train, X_test, y_train, y_test, scaler = split_data(X, y)

print("\n Data split done successfully")
for split_label, split_features in (("Training", X_train), ("Test", X_test)):
    print(f"  - {split_label} set shape: {split_features.shape}")
print("  - Features scaled using StandardScaler")

Splitting data into training and testing sets (80-20 split)

 Data split done successfully
  - Training set shape: (455, 30)
  - Test set shape: (114, 30)
  - Features scaled using StandardScaler


## 3. Define Models Configuration

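Define the metrics helper and initialize all 6 classification models. Hyperparameters are fixed, hand-picked values (with `random_state=42` where the estimator accepts a seed); no hyperparameter search is performed in this notebook.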

In [None]:
def calculate_metrics(y_true, y_pred, y_pred_proba):
    """Score one set of predictions with every metric tracked by the notebook.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels.
        y_pred_proba: Scores passed to ``roc_auc_score`` (the caller supplies
            the positive-class probability).

    Returns:
        Dict with the keys ``Accuracy``, ``AUC``, ``Precision``, ``Recall``,
        ``F1`` and ``MCC``, in that order.
    """
    scores = {'Accuracy': accuracy_score(y_true, y_pred)}

    # AUC is the only metric computed from scores rather than hard labels.
    scores['AUC'] = roc_auc_score(y_true, y_pred_proba)

    label_based_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
        ('MCC', matthews_corrcoef),
    )
    for metric_name, scorer in label_based_scorers:
        scores[metric_name] = scorer(y_true, y_pred)

    return scores

# Build the model registry; insertion order is the order used for training
# and reporting.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Decision Tree'] = DecisionTreeClassifier(random_state=42)
models['KNN'] = KNeighborsClassifier(n_neighbors=5)
models['Naive Bayes'] = GaussianNB()
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
# NOTE(review): use_label_encoder is reported as deprecated in recent XGBoost
# releases - confirm against the installed version before removing it.
models['XGBoost'] = xgb.XGBClassifier(
    n_estimators=100,
    random_state=42,
    use_label_encoder=False,
    verbosity=0,
)

print("Models initialized:")
print("\n".join(f"  - {registered_name}" for registered_name in models))

Models initialized:
  - Logistic Regression
  - Decision Tree
  - KNN
  - Naive Bayes
  - Random Forest
  - XGBoost


## 4. Train the Models

Train all 6 classification models on the training data and evaluate each one on the held-out test set

In [None]:
import time

print("=" * 70)
print("TRAINING ALL MODELS")
print("=" * 70)

# results[model_name] -> fitted model, metrics, confusion matrix,
# classification report and raw test-set predictions.
results = {}
# training_times[model_name] -> seconds spent inside model.fit().
training_times = {}

# Display label for each key returned by calculate_metrics.
metric_labels = {
    'Accuracy': 'Accuracy',
    'AUC': 'AUC Score',
    'Precision': 'Precision',
    'Recall': 'Recall',
    'F1': 'F1-Score',
    'MCC': 'MCC Score',
}

for model_name, model in models.items():
    print(f"\n ----- {model_name} -----")

    # Time only the fit call so that "Training time" is not inflated by
    # prediction and metric computation. perf_counter is a monotonic,
    # high-resolution clock intended for measuring short durations.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time
    training_times[model_name] = training_time

    # Predictions on test set; column 1 of predict_proba is the probability
    # of the class encoded as 1.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    metrics = calculate_metrics(y_test, y_pred, y_pred_proba)

    # Confusion matrix and classification report
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred, output_dict=True)

    results[model_name] = {
        'model': model,
        'metrics': metrics,
        'confusion_matrix': cm,
        'classification_report': cr,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

    # Report every metric for this model before the timing line.
    for metric_key, metric_value in metrics.items():
        print(f"    {metric_labels.get(metric_key, metric_key)}: {metric_value:.4f}")
    print(f"    Training time: {training_time:.4f}s")

print("\n" + "=" * 70)
print("✓ ALL MODELS TRAINED SUCCESSFULLY!")
print("=" * 70)

TRAINING ALL MODELS

 ----- Logistic Regression -----
    Accuracy: 0.9649
    AUC Score: 0.9960
    Precision: 0.9750
    Recall: 0.9286
    F1-Score: 0.9512
    MCC Score: 0.9245
    Training time: 0.0251s

 ----- Decision Tree -----
    Accuracy: 0.9298
    AUC Score: 0.9246
    Precision: 0.9048
    Recall: 0.9048
    F1-Score: 0.9048
    MCC Score: 0.8492
    Training time: 0.0203s

 ----- KNN -----
    Accuracy: 0.9561
    AUC Score: 0.9823
    Precision: 0.9744
    Recall: 0.9048
    F1-Score: 0.9383
    MCC Score: 0.9058
    Training time: 0.0275s

 ----- Naive Bayes -----
    Accuracy: 0.9211
    AUC Score: 0.9891
    Precision: 0.9231
    Recall: 0.8571
    F1-Score: 0.8889
    MCC Score: 0.8292
    Training time: 0.0154s

 ----- Random Forest -----
    Accuracy: 0.9737
    AUC Score: 0.9929
    Precision: 1.0000
    Recall: 0.9286
    F1-Score: 0.9630
    MCC Score: 0.9442
    Training time: 0.2099s

 ----- XGBoost -----
    Accuracy: 0.9737
    AUC Score: 0.9940
    Precision: 1.0000
    Recall: 0.9286
    F1-Score: 0.9630
    MCC Score: 0.9442

## 5. Evaluate Model Performance

Display comprehensive evaluation metrics for all models

In [None]:
# Create results dataframe: one row per model, one column per metric.
results_list = [
    {'Model': model_name, **result['metrics']}
    for model_name, result in results.items()
]
results_df = pd.DataFrame(results_list)

# Show the comparison table; without this the cell computes the metrics
# table but never displays it.
print("\n" + "=" * 70)
print("MODEL PERFORMANCE COMPARISON")
print("=" * 70)
print(results_df.to_string(index=False))

# Highlight the best model per headline metric. idxmax returns the first
# maximum, so ties are resolved in favour of the model trained earlier.
if not results_df.empty:
    print("\n TOP PERFORMERS")
    headline_metrics = (
        ('Accuracy', 'Accuracy'),
        ('AUC', 'AUC Score'),
        ('F1', 'F1-Score'),
    )
    for rank, (column, title) in enumerate(headline_metrics, start=1):
        best_row = results_df[column].idxmax()
        print(f"\n{rank}. Highest {title}: {results_df.at[best_row, 'Model']}")
        print(f"   {column}: {results_df.at[best_row, column]:.4f}")

print("\n" + "=" * 70)
print("✓ MODELS EVALUATION COMPLETED!")
print("=" * 70)


MODEL PERFORMANCE COMPARISON
              Model  Accuracy      AUC  Precision   Recall       F1      MCC
Logistic Regression  0.964912 0.996032   0.975000 0.928571 0.951220 0.924518
      Decision Tree  0.929825 0.924603   0.904762 0.904762 0.904762 0.849206
                KNN  0.956140 0.982308   0.974359 0.904762 0.938272 0.905824
        Naive Bayes  0.921053 0.989087   0.923077 0.857143 0.888889 0.829162
      Random Forest  0.973684 0.992890   1.000000 0.928571 0.962963 0.944155
            XGBoost  0.973684 0.994048   1.000000 0.928571 0.962963 0.944155

 TOP PERFORMERS

1️⃣ Highest Accuracy: Random Forest
   Accuracy: 0.9737

2️⃣ Highest AUC Score: Logistic Regression
   AUC: 0.9960

3️⃣ Highest F1-Score: Random Forest
   F1: 0.9630


## 6. Save Trained Models

Serialize and save all trained models to disk for deployment

In [None]:
def save_models(results, model_dir='model'):
    """Pickle every trained model in ``results`` into ``model_dir``.

    Each model is written to ``<model_dir>/<name>_model.pkl`` where ``<name>``
    is the model name with spaces replaced by underscores.

    Args:
        results: Mapping of model name to a dict that stores the fitted
            estimator under the ``'model'`` key.
        model_dir: Target directory. It is created, including any missing
            parent directories, when it does not exist yet.

    Returns:
        None. One confirmation line is printed per saved model.
    """
    # exist_ok=True makes directory creation a single call with no
    # check-then-create race and no failure when it already exists.
    os.makedirs(model_dir, exist_ok=True)

    for model_name, result in results.items():
        file_name = f"{model_name.replace(' ', '_')}_model.pkl"
        model_path = os.path.join(model_dir, file_name)
        with open(model_path, 'wb') as f:
            pickle.dump(result['model'], f)
        print(f" Saved {model_name} to {model_path}")

# Persist every trained model under the default 'model' directory.
# NOTE(review): the StandardScaler returned by split_data is not saved here;
# confirm whether the deployed app needs it to transform unscaled inputs.
print("\n" + "=" * 70, "SAVING TRAINED MODELS", "=" * 70, sep="\n")
save_models(results)
print("\n - All models saved successfully!")


SAVING TRAINED MODELS
 Saved Logistic Regression to model\Logistic_Regression_model.pkl
 Saved Decision Tree to model\Decision_Tree_model.pkl
 Saved KNN to model\KNN_model.pkl
 Saved Naive Bayes to model\Naive_Bayes_model.pkl
 Saved Random Forest to model\Random_Forest_model.pkl
 Saved XGBoost to model\XGBoost_model.pkl

 - All models saved successfully!


## 7. Export Results

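Save the evaluation metrics and the test split to CSV files. The exported test features are the standardized values produced by `StandardScaler`, and the `diagnosis` column holds the encoded labels (0 = benign, 1 = malignant).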

In [None]:
# Save results to CSV
csv_file = 'model_results.csv'
results_df.to_csv(csv_file, index=False)
print(f"\n Results saved to {csv_file}")

# Save test data for Streamlit app.
# X_test is the scaled array returned by split_data, so column names are
# taken from the feature DataFrame X instead of re-reading the source CSV
# from a hard-coded path.
test_data_path = 'test_data.csv'
feature_names = list(X.columns)
X_test_df = pd.DataFrame(X_test, columns=feature_names)
# y_test holds the encoded labels, so 'diagnosis' is written as 0/1.
X_test_df['diagnosis'] = y_test
X_test_df.to_csv(test_data_path, index=False)
print(f" Test data saved to {test_data_path}")

print("\n" + "=" * 70)
print("✅ TRAINING PIPELINE COMPLETED SUCCESSFULLY!")
print("=" * 70)
print("\nGenerated Files:")
# Reuse the path variables so the summary cannot drift from what was written.
print(f"  - {csv_file} - Model evaluation metrics")
print(f"  - {test_data_path} - Test dataset for Streamlit app")
print("  - model/ directory - Saved trained models")
print("\n Ready for deployment!")


 Results saved to model_results.csv
 Test data saved to test_data.csv

✅ TRAINING PIPELINE COMPLETED SUCCESSFULLY!

Generated Files:
  - model_results.csv - Model evaluation metrics
  - test_data.csv - Test dataset for Streamlit app
  - model/ directory - Saved trained models

 Ready for deployment!
