# Model Training and Tuning

This notebook covers the training and tuning of various machine learning models for the Wisconsin Breast Cancer dataset.

In [33]:
# Import necessary libraries and functions
import pandas as pd
import numpy as np
import sys
import os

# Anchor on the working directory: a notebook kernel normally has no `__file__`,
# so the directory the kernel runs in is used as the notebook's location.
current_dir = os.getcwd()

# Construct the path to the scripts directory (one level up from current_dir)
utils_dir = os.path.join(current_dir, '..', 'scripts')

# Add the scripts directory to the import path only once, so re-running this
# cell does not accumulate duplicate sys.path entries
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

#Load functions from the script train.py
from train import load_features_data, tune_model, train_model, evaluate_model, save_model


## Load Preprocessed Data

In [34]:
from imblearn.over_sampling import SMOTE

# Unpack the five values returned by load_features_data for the saved .npz archive
X_train, X_test, y_train, y_test, selected_features = load_features_data('../data/selected_features_data.npz')

# Oversample the training split only, using SMOTE (Synthetic Minority
# Over-sampling Technique); X_test / y_test are left untouched.
# NOTE(review): the balanced data is later handed to tune_model. If that helper
# cross-validates, synthetic points may leak across folds -- confirm in scripts/train.py.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Report the class counts of each label array
for caption, labels in (
    ("y_train distribution before SMOTE:", y_train),
    ("y_train distribution after SMOTE:", y_train_balanced),
    ("y_test distribution:", y_test),
):
    print(caption, np.bincount(labels))

y_train distribution before SMOTE: [286 169]
y_train distribution after SMOTE: [286 286]
y_test distribution: [71 43]


## Prepare models and hyperparameter grids for cross-validated grid search

In [35]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cross_decomposition import PLSRegression
from sklearn.neural_network import MLPClassifier

# Upper bound for the PLS n_components grid: the smaller of X_train's
# first two dimensions
n_components_upper_bound = min(X_train.shape[0], X_train.shape[1])

# Estimators to tune, keyed by display name
models = {
    'Random Forest': RandomForestClassifier(random_state=42, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'PLS Regression': PLSRegression(),
    'Neural Network': MLPClassifier(
        hidden_layer_sizes=(50,),
        activation='relu',
        solver='adam',
        random_state=42,
    ),
}

# Grid-search candidates, keyed by the same names used in `models`
param_grids = {
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'class_weight': ['balanced', {0: 1, 1: 3}, {0: 1, 1: 5}],
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
    },
    'PLS Regression': {
        # Every component count from 2 up to and including the upper bound
        'n_components': list(range(2, n_components_upper_bound + 1)),
    },
    'Neural Network': {
        'hidden_layer_sizes': [(10,), (20,), (50,), (10, 10), (20, 20)],
        'max_iter': [500, 1000],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate_init': [0.001, 0.01, 0.1],
    },
}


## Tune, train and evaluate models

In [None]:
# Make sure the output directory exists before any model is written to it
os.makedirs('../models', exist_ok=True)

# Metrics returned by evaluate_model, keyed by model name
results = {}
for name, model in models.items():
    print(f"Tuning {name}...")
    tuned_model = tune_model(model, param_grids[name], X_train_balanced, y_train_balanced)
    print(f"Training {name}...")
    trained_model = train_model(tuned_model, X_train_balanced, y_train_balanced)
    print(f"Evaluating {name}...")
    # Only the PLS entry is flagged as a regression model for evaluate_model
    is_regression = name == 'PLS Regression'
    # Evaluation uses the original (non-resampled) test split
    metrics = evaluate_model(trained_model, X_test, y_test, name, is_regression=is_regression)
    results[name] = metrics
    # File name is derived from the model name, e.g. 'random_forest_model.pkl'
    save_model(trained_model, f'../models/{name.lower().replace(" ", "_")}_model.pkl')

# Reported once, after every model has been processed
print("Model training, tuning, and evaluation completed.")

Tuning Random Forest...


## Save Evaluation Metrics

In [None]:
# Persist the collected metrics so they can be inspected outside the notebook
os.makedirs('../results', exist_ok=True)  # no-op when the directory already exists

# Transpose so that each key of `results` (a model name) becomes a row
results_df = pd.DataFrame(results).transpose()
results_df.to_csv('../results/evaluation_metrics.csv', index=True)

# Display the table as the cell output
results_df

## Summary

In this notebook, we tuned, trained, and evaluated several machine learning models. We also saved the trained models and the evaluation results for further analysis.