# Libraries

In [127]:
# Basic libraries
import numpy as np
import pandas as pd

# Feature Selection
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

# Scalers
from sklearn.preprocessing import StandardScaler

# Performance Metrics
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score

# Classifiers
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier

# NN
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

# Functions

In [128]:
class Dataset:
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype = torch.float32)
        self.labels = torch.tensor(labels, dtype = torch.float32)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [129]:
class SimpleNN(nn.Module):
    def __init__(self, input_size, output_size, layer_sizes, activation_funcs):
        super(SimpleNN, self).__init__()

        self.layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList()
        self.dropouts = nn.ModuleList()
        self.activation_funcs = activation_funcs

        self.batch_norm1 = nn.BatchNorm1d(input_size)
        # input layer
        self.layers.append(nn.utils.weight_norm(nn.Linear(input_size, layer_sizes[0])))
        self.batch_norms.append(nn.BatchNorm1d(layer_sizes[0]))
        self.dropouts.append(nn.Dropout(0.2))

        # Hidden layers
        for i in range(1, len(layer_sizes)):
            self.layers.append(nn.utils.weight_norm(nn.Linear(layer_sizes[i-1], layer_sizes[i])))
            if i < len(layer_sizes) - 1:
                self.batch_norms.append(nn.BatchNorm1d(layer_sizes[i]))
            self.dropouts.append(nn.Dropout(0.5))

        # Output layer
        self.output = nn.utils.weight_norm(nn.Linear(layer_sizes[-1], output_size))


    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropouts[0](x)
        x = self.layers[0](x)
        x = self.activation_funcs[0](x)

        for i in range(1, len(self.layers)):
            if i < len(self.batch_norms):
                x = self.batch_norms[i](x)
            x = self.dropouts[i](x)
            x = self.layers[i](x)
            if i < len(self.activation_funcs):
                x = self.activation_funcs[i](x)

        x = self.output(x)
        return x

In [130]:
def grid_search(configs, dataset):

    best_config = None
    best_score = None
    
    scores = []
    layers = []
    activations = []

    for config in configs:
        best_score = 0
        layer_size = config['layer_sizes']
        activation = config['activations']

        model = SimpleNN(
            input_size = INPUT_SIZE, output_size = OUTPUT_SIZE,
            layer_sizes = config['layer_sizes'],  activation_funcs = config['activations']
        )

        print(f"Layer sizes: {layer_size}, Activations: {[type(act).__name__ for act in activation]}")

        score, loss, all_f1_scores = train_and_evaluate_model(model, dataset)

        if score > best_score:
            best_score = score
            best_config = config
            
        scores.append(all_f1_scores)
        layers.append(layer_size)
        activations.append([type(act).__name__ for act in activation])

    print(f"Best F1 Score: {best_score:.2f}, Layer sizes: {layer_size}, Activations: {[type(act).__name__ for act in activation]}")
    
    
    
    
    return best_config, best_score, scores, layers, activations

In [131]:
def local_search(base_config, dataset, iterations=10):
    def get_neighbors(configuration):
        neighbors = []

        # Example: Add a layer
        neighbors.append({
            'layer_sizes': configuration['layer_sizes'] + [64],
            'activations': configuration['activations'] + [nn.ReLU()]
        })

        # Example: Remove a layer
        if len(configuration['layer_sizes']) > 1:
            neighbors.append({
                'layer_sizes': configuration['layer_sizes'][:-1],
                'activations': configuration['activations'][:-1]
            })
        return neighbors

    def get_model(config):
        layer_size = config['layer_sizes']
        activation = config['activations']
        model = SimpleNN(INPUT_SIZE, OUTPUT_SIZE, layer_size, activation)

        return model

    current_configuration = base_config
    model = get_model(current_configuration)
    current_score, current_loss, _ = train_and_evaluate_model(model, dataset)

    while True:
        neighbors = get_neighbors(current_configuration)
        any_improvement = False

        for neighbor_config in neighbors:
            neighbor_model = get_model(neighbor_config)
            score, loss, _ = train_and_evaluate_model(neighbor_model, dataset)

            if score > current_score:
                current_configuration = neighbor_config
                current_score = score
                any_improvement = True
                break  # Move to the first improving neighbor

        if not any_improvement:
            break  # No improvement found, stop the search

    return current_configuration, current_score


In [132]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader):
    model.train()
    final_loss = 0

    for data, target in dataloader:
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target.long())
        loss.backward()
        optimizer.step()
        scheduler.step()

        final_loss += loss.item()
    final_loss /= len(dataloader)

    return final_loss

In [133]:
def valid_fn(model, loss_fn, dataloader):
    model.eval()
    final_loss = 0
    final_f1_score = 0

    for data, target in dataloader:
        outputs = model(data)
        loss = loss_fn(outputs, target.long())
        final_loss += loss.item()

        preds = outputs.argmax(dim=1).detach().cpu().numpy()
        final_f1_score += (f1_score(target.cpu().numpy(), preds.round(), average='weighted'))

    final_f1_score /= len(dataloader)
    final_loss /= len(dataloader)

    return final_loss, final_f1_score

In [134]:
# Function to create and train a network
def train_and_evaluate_model(model, dataset, n_splits = 5):


    labels = [label for _, label in dataset]
    kfold = StratifiedKFold(n_splits = n_splits, shuffle=True)
    
    all_f1_scores = []

    for fold, (train_ids, valid_ids) in enumerate(kfold.split(range(len(dataset)), labels)):

        # Splitting the dataset
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
        valid_subsampler = torch.utils.data.SubsetRandomSampler(valid_ids)

        # Creating data loaders
        train_loader = DataLoader(dataset, batch_size = BATCH_SIZE, sampler = train_subsampler)
        valid_loader = DataLoader(dataset, batch_size = BATCH_SIZE, sampler = valid_subsampler)

        # initialization
        model.to(DEVICE)

        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                  max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(train_loader))
        loss_fn = nn.CrossEntropyLoss()
        early_stopping_steps = EARLY_STOPPING_STEPS
        early_step = 0


        # Training loop
        best_loss = np.inf
        for epoch in range(EPOCHS):

            train_loss = train_fn(model, optimizer,scheduler, loss_fn, train_loader)
            valid_loss, score = valid_fn(model, loss_fn, valid_loader)

            if valid_loss < best_loss:
                best_loss = valid_loss
                f1_score = score

                torch.save(model.state_dict(), f"fold{fold}_.pth")

            elif(EARLY_STOP == True):
                early_step += 1

            if (early_step >= early_stopping_steps):
                break


        print(f"Completed training fold {fold}, and best f1 score is {f1_score:.2f}")
        all_f1_scores.append(f1_score)

    return f1_score, best_loss, all_f1_scores 

In [135]:
def load_best_model_and_predict(model, dataset, fold, model_path):
    # Load the best model for a specific fold
    model.load_state_dict(torch.load(model_path))
    model.to(DEVICE)
    model.eval()

    # Create DataLoader for the test set
    test_loader = DataLoader(dataset, batch_size = BATCH_SIZE)

    all_predictions = []
    with torch.no_grad():
        for inputs, _ in test_loader:
            inputs = inputs.to(DEVICE)

            # Forward pass
            outputs = model(inputs)

            # Convert the model output to probabilities using softmax
            probabilities = torch.nn.functional.softmax(outputs, dim = 1)

            # Get the predicted class for each sample
            _, predictions = torch.max(probabilities, 1)

            # Convert predictions to a NumPy array and append to the list
            all_predictions.extend(predictions.cpu().numpy())

    return all_predictions

In [136]:
def calculate_f1_scores(all_fold_predictions, dataset):
    f1_scores = []

    for fold, predictions in enumerate(all_fold_predictions):
        f1 = f1_score(dataset.labels, predictions, average = 'weighted')
        f1_scores.append(f1)

    return f1_scores

# Train

In [137]:
# Load the dataset
processed_df = pd.read_csv('breast-cancer-diagnostic.shuf.lrn.csv')

# Split the dataset into features and labels
feature_names = list(filter(lambda x: x not in ['ID', 'class'], processed_df.columns))

X = processed_df[feature_names].values
y = processed_df['class'].values

# Create a Dataset object
dataset = Dataset(X, y)

In [138]:
# HyperParameters

INPUT_SIZE = len(processed_df[feature_names].columns)
OUTPUT_SIZE = len(processed_df['class'].unique())

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = 'cpu'
EPOCHS = 25
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False

print(DEVICE)

cpu


## Simple NN

In [139]:
# Define the pipeline
def pipeline(dataset, configs):
    
    # Perform grid search
    best_configuration, best_score, scores, layers, activations = grid_search(configs, dataset)
    
    print('\n')
    print(f"Best Configuration for the model: {best_configuration}")
    print(f"Best F1-Score for the model: {round(100 * best_score, 2)}%")


    ## Creating Dataframe with results 
    grid_results_df = pd.DataFrame(columns = ['Layers', 'Activations'] + [f'Fold{i+1}' for i in range(5)])
    grid_results_df['Layers'] = layers
    grid_results_df['Activations'] = activations


    for i in range(0, 5):
        vals = [lst[i] for lst in scores]
        grid_results_df[f'Fold{i+1}'] = vals


    # Perform local search
    best_configuration = {'layer_sizes': [64, 64], 'activations': [nn.ReLU(), nn.ReLU()]}
    final_config, final_score = local_search(best_configuration, dataset)
    
    print(f"Final Configuration for the model: {final_config}")
    print(f"Final F1-Score for the model: {round(100 * final_score, 2)}%")

    # Train and evaluate the final model on the test set for all folds
    all_fold_predictions = []

    for fold in range(0, 5):  # Assuming 5 folds
        model_path = f"fold{fold}_.pth"
        print(model_path)
        
        model = SimpleNN(
            input_size = INPUT_SIZE, output_size=OUTPUT_SIZE,
            layer_sizes = final_config['layer_sizes'], activation_funcs = final_config['activations']
        )

        # Train the model
        train_and_evaluate_model(model, dataset)

        # Load the best model and make predictions
        predictions = load_best_model_and_predict(model, dataset, fold, model_path)

        # Append predictions to the list
        all_fold_predictions.append(predictions)
        
        
    # Calculate F1 scores for each fold
    f1_scores = calculate_f1_scores(all_fold_predictions, dataset)

    # Put F1 scores into a dataframe
    f1_scores_df = pd.DataFrame({'Fold': [f'Fold{i+1}' for i in range(5)], 'F1 Score': f1_scores})

    return grid_results_df, all_fold_predictions, f1_scores_df

In [140]:
configs = [
                {'layer_sizes': [64], 'activations': [nn.ReLU()]},
                {'layer_sizes': [64, 64], 'activations': [nn.ReLU(), nn.ReLU()]},
                {'layer_sizes': [128, 64], 'activations': [nn.Tanh(), nn.ReLU()]},
                {'layer_sizes': [128, 64], 'activations': [nn.Tanh(), nn.Tanh()]},
                {'layer_sizes': [64, 64, 32], 'activations': [nn.ReLU(), nn.Tanh(), nn.ReLU()]},
                {'layer_sizes': [128, 128, 64], 'activations': [nn.ReLU(), nn.ReLU(), nn.ReLU()]},
                {'layer_sizes': [128, 128, 64], 'activations': [nn.Tanh(), nn.ReLU(), nn.ReLU()]}]

grid_results_df, pipeline_results, f1_scores_df = pipeline(dataset, configs)
grid_results_df

Layer sizes: [64], Activations: ['ReLU']


Completed training fold 0, and best f1 score is 0.95
Completed training fold 1, and best f1 score is 0.98
Completed training fold 2, and best f1 score is 1.00
Completed training fold 3, and best f1 score is 1.00
Completed training fold 4, and best f1 score is 1.00
Layer sizes: [64, 64], Activations: ['ReLU', 'ReLU']
Completed training fold 0, and best f1 score is 0.96
Completed training fold 1, and best f1 score is 1.00
Completed training fold 2, and best f1 score is 1.00
Completed training fold 3, and best f1 score is 1.00
Completed training fold 4, and best f1 score is 0.98
Layer sizes: [128, 64], Activations: ['Tanh', 'ReLU']
Completed training fold 0, and best f1 score is 0.95
Completed training fold 1, and best f1 score is 0.95
Completed training fold 2, and best f1 score is 1.00
Completed training fold 3, and best f1 score is 1.00
Completed training fold 4, and best f1 score is 1.00
Layer sizes: [128, 64], Activations: ['Tanh', 'Tanh']
Completed training fold 0, and best f1 score

Unnamed: 0,Layers,Activations,Fold1,Fold2,Fold3,Fold4,Fold5
0,[64],[ReLU],0.94761,0.982362,1.0,1.0,1.0
1,"[64, 64]","[ReLU, ReLU]",0.964912,1.0,1.0,1.0,0.982376
2,"[128, 64]","[Tanh, ReLU]",0.94761,0.946397,1.0,1.0,1.0
3,"[128, 64]","[Tanh, Tanh]",0.982362,1.0,0.964912,1.0,1.0
4,"[64, 64, 32]","[ReLU, Tanh, ReLU]",0.982362,0.982537,0.964564,1.0,1.0
5,"[128, 128, 64]","[ReLU, ReLU, ReLU]",0.965208,0.964912,1.0,1.0,1.0
6,"[128, 128, 64]","[Tanh, ReLU, ReLU]",0.982362,0.982362,1.0,1.0,0.982376


In [141]:
f1_scores_df

Unnamed: 0,Fold,F1 Score
0,Fold1,0.964912
1,Fold2,0.978947
2,Fold3,0.992969
3,Fold4,0.992969
4,Fold5,0.989464


## MLP

In [142]:
# Define the MLPClassifier
mlp = MLPClassifier(max_iter=100)

# Define the parameter grid for grid search
param_grid = {
    'classifier__hidden_layer_sizes': [(50,50), (100,), (100,50,25)],
    'classifier__activation': ['relu', 'tanh', 'logistic'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
}

# Create a pipeline with scaling if needed
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), feature_names),  # Scale numeric features if needed
    ])

pipeline = Pipeline([
    ('classifier', mlp)
])

# Create a StratifiedKFold cross-validator
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform grid search with cross-validation and compute F1 score for each fold
f1_scorer = make_scorer(f1_score, average='binary')  # 'binary' for binary classification
grid_search = GridSearchCV(pipeline, param_grid, scoring=f1_scorer, cv=cv, verbose=1, n_jobs=-1)
grid_search.fit(X, y)

# Print the best parameters and the corresponding F1 score
print("Best Parameters: ", grid_search.best_params_)
print("Best F1 Score: ", grid_search.best_score_)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Compute F1 score for each fold using cross_val_score
f1_scores = cross_val_score(best_model, X, y, cv=cv, scoring=f1_scorer)

# Create a DataFrame to store F1 scores for each fold
f1_df = pd.DataFrame({'Fold': range(1, len(f1_scores)+1), 'F1 Score': f1_scores})

# Display the DataFrame
f1_df

Fitting 5 folds for each of 27 candidates, totalling 135 fits


Best Parameters:  {'classifier__activation': 'relu', 'classifier__alpha': 0.001, 'classifier__hidden_layer_sizes': (100, 50, 25)}
Best F1 Score:  0.8961518496629104


Unnamed: 0,Fold,F1 Score
0,1,0.745098
1,2,0.9
2,3,0.871795
3,4,0.826087
4,5,0.846154


## LightGBM

In [143]:
# Define the LGBMClassifier
lgbm = LGBMClassifier()

# Define the parameter grid for grid search
param_grid = {
    'classifier__learning_rate': [0.001, 0.01, 0.1],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 5, 7],
}

# Create a pipeline with scaling if needed
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), feature_names),  # Scale numeric features if needed
    ])

pipeline = Pipeline([
    ('classifier', lgbm)
])

# Create a StratifiedKFold cross-validator
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform grid search with cross-validation and compute F1 score for each fold
f1_scorer = make_scorer(f1_score, average='binary')  # 'binary' for binary classification
grid_search = GridSearchCV(pipeline, param_grid, scoring=f1_scorer, cv=cv, verbose=1, n_jobs=-1)
grid_search.fit(X, y)

# Print the best parameters and the corresponding F1 score
print("Best Parameters: ", grid_search.best_params_)
print("Best F1 Score: ", grid_search.best_score_)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Compute F1 score for each fold using cross_val_score
f1_scores = cross_val_score(best_model, X, y, cv=cv, scoring=f1_scorer)

# Create a DataFrame to store F1 scores for each fold
f1_df = pd.DataFrame({'Fold': range(1, len(f1_scores)+1), 'F1 Score': f1_scores})

# Display the DataFrame
f1_df

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[LightGBM] [Info] Number of positive: 108, number of negative: 177
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002664 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2849
[LightGBM] [Info] Number of data points in the train set: 285, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.378947 -> initscore=-0.494019
[LightGBM] [Info] Start training from score -0.494019
Best Parameters:  {'classifier__learning_rate': 0.1, 'classifier__max_depth': 7, 'classifier__n_estimators': 200}
Best F1 Score:  0.9363084842356834
[LightGBM] [Info] Number of positive: 87, number of negative: 141
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000288 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2274
[LightGBM] [Info] Number of data points

Unnamed: 0,Fold,F1 Score
0,1,1.0
1,2,0.883721
2,3,0.930233
3,4,0.913043
4,5,0.954545
