# Training a classifier on GLCM features

In [1]:
import os
import re
import numpy as np
import nibabel as nib
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from skimage.feature import graycomatrix, graycoprops
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedGroupKFold
from pathlib import Path
import pandas as pd
from scipy.stats import ttest_ind
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, make_scorer
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, make_scorer

In [2]:
pd.set_option('display.max_columns', None)

## Loading the data

Since we failed to implement the GLCM extractor in the previous Milestone we will load the data from the file uploaded to the website.

In [3]:
data = np.load("slice_glcm1d.npz", allow_pickle=True)

In [4]:
data.files

['slice_features', 'slice_meta', 'features_rankin_idx']

## Data exploration

In [5]:
df_features = pd.DataFrame(data['slice_features'])

In [6]:
df_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,3912.728918,10969810.0,38704.669443,2122.761852,236.130245,0.799733,8.95841,1.280547,153.858318,0.668218,0.65588,0.986222,0.941925,-0.382916,0.87457,0.00052,58.659495,0.149956,3.108626,0.818609,0.276677,117.318989,2.763986,589.723024
1,7511.568847,31389240.0,-19143.340098,4128.827862,188.601517,0.912418,8.283005,1.331034,117.985636,0.640403,0.624592,0.988914,0.944994,-0.483568,0.95391,0.000812,80.784854,0.085426,3.834779,0.918409,0.139321,161.569708,3.40702,1079.357345
2,8858.349761,34936070.0,-197109.877954,3355.497691,223.661489,0.874257,8.822815,1.381786,143.925152,0.639497,0.624148,0.987052,0.942295,-0.431027,0.919476,0.000775,89.861605,0.186931,3.461811,0.892283,0.405126,179.72321,3.044884,894.789795
3,9830.155057,16017920.0,-118166.64812,1794.384095,208.654995,0.789844,7.819523,1.29723,146.325976,0.689305,0.676263,0.988036,0.949242,-0.344585,0.826478,0.000654,97.126247,0.315186,2.823842,0.817015,0.548032,194.252495,2.435366,500.759772
4,9932.448352,22108790.0,-151369.529909,1948.802779,209.900402,0.803567,7.611905,1.27718,150.267237,0.705191,0.693021,0.987998,0.950864,-0.350157,0.8243,0.000604,97.455314,0.344902,2.748613,0.826901,0.57719,194.910628,2.368467,539.675795


In [7]:
df_features.shape

(7414, 24)

In [8]:
for element in data['slice_meta']:
    print(element)

['LIDC-IDRI-0001_GT1_1' 1 1 'Malignant']
['LIDC-IDRI-0001_GT1_1' 1 1 'Malignant']
['LIDC-IDRI-0001_GT1_1' 1 1 'Malignant']
['LIDC-IDRI-0001_GT1_1' 1 1 'Malignant']
['LIDC-IDRI-0001_GT1_1' 1 1 'Malignant']
['LIDC-IDRI-0001_GT1_1' 1 1 'Malignant']
['LIDC-IDRI-0001_GT1_1' 1 1 'Malignant']
['LIDC-IDRI-0001_GT1_1' 1 1 'Malignant']
['LIDC-IDRI-0001_GT1_1' 1 1 'NoNod']
['LIDC-IDRI-0003_GT1_2' 3 2 'Malignant']
['LIDC-IDRI-0003_GT1_2' 3 2 'Malignant']
['LIDC-IDRI-0003_GT1_2' 3 2 'Malignant']
['LIDC-IDRI-0003_GT1_2' 3 2 'Malignant']
['LIDC-IDRI-0003_GT1_2' 3 2 'Malignant']
['LIDC-IDRI-0003_GT1_2' 3 2 'Malignant']
['LIDC-IDRI-0003_GT1_2' 3 2 'Malignant']
['LIDC-IDRI-0003_GT1_2' 3 2 'Malignant']
['LIDC-IDRI-0003_GT1_2' 3 2 'Malignant']
['LIDC-IDRI-0003_GT1_2' 3 2 'Malignant']
['LIDC-IDRI-0003_GT1_3' 3 3 'NoNod']
['LIDC-IDRI-0003_GT1_3' 3 3 'NoNod']
['LIDC-IDRI-0003_GT1_3' 3 3 'NoNod']
['LIDC-IDRI-0003_GT1_3' 3 3 'NoNod']
['LIDC-IDRI-0003_GT1_4' 3 4 'NoNod']
['LIDC-IDRI-0003_GT1_4' 3 4 'NoNod']
['L

In [9]:
df_meta = pd.DataFrame(data['slice_meta'], columns=['filename', 'patient_id', 'nodule_id', 'diagnosis'])


In [10]:
df_meta.head(10)

Unnamed: 0,filename,patient_id,nodule_id,diagnosis
0,LIDC-IDRI-0001_GT1_1,1,1,Malignant
1,LIDC-IDRI-0001_GT1_1,1,1,Malignant
2,LIDC-IDRI-0001_GT1_1,1,1,Malignant
3,LIDC-IDRI-0001_GT1_1,1,1,Malignant
4,LIDC-IDRI-0001_GT1_1,1,1,Malignant
5,LIDC-IDRI-0001_GT1_1,1,1,Malignant
6,LIDC-IDRI-0001_GT1_1,1,1,Malignant
7,LIDC-IDRI-0001_GT1_1,1,1,Malignant
8,LIDC-IDRI-0001_GT1_1,1,1,NoNod
9,LIDC-IDRI-0003_GT1_2,3,2,Malignant


In [11]:
df_meta['diagnosis'].unique()

array(['Malignant', 'NoNod', 'Benign'], dtype=object)

In [12]:
labels = df_meta['diagnosis']
labels.head()

0    Malignant
1    Malignant
2    Malignant
3    Malignant
4    Malignant
Name: diagnosis, dtype: object

In [13]:
df_meta.shape

(7414, 4)

In [14]:
df_feature_ranking = pd.DataFrame(data['features_rankin_idx'])

In [15]:
df_feature_ranking.head(10)

Unnamed: 0,0
0,15
1,16
2,21
3,0
4,18
5,2
6,22
7,10
8,20
9,7


In [16]:
df_feature_ranking.shape

(24, 1)

## Getting rid of NoNods

In [17]:
df_binary = df_features[labels != 'NoNod']

In [18]:
df_binary.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,3912.728918,10969810.0,38704.669443,2122.761852,236.130245,0.799733,8.95841,1.280547,153.858318,0.668218,0.65588,0.986222,0.941925,-0.382916,0.87457,0.00052,58.659495,0.149956,3.108626,0.818609,0.276677,117.318989,2.763986,589.723024
1,7511.568847,31389240.0,-19143.340098,4128.827862,188.601517,0.912418,8.283005,1.331034,117.985636,0.640403,0.624592,0.988914,0.944994,-0.483568,0.95391,0.000812,80.784854,0.085426,3.834779,0.918409,0.139321,161.569708,3.40702,1079.357345
2,8858.349761,34936070.0,-197109.877954,3355.497691,223.661489,0.874257,8.822815,1.381786,143.925152,0.639497,0.624148,0.987052,0.942295,-0.431027,0.919476,0.000775,89.861605,0.186931,3.461811,0.892283,0.405126,179.72321,3.044884,894.789795
3,9830.155057,16017920.0,-118166.64812,1794.384095,208.654995,0.789844,7.819523,1.29723,146.325976,0.689305,0.676263,0.988036,0.949242,-0.344585,0.826478,0.000654,97.126247,0.315186,2.823842,0.817015,0.548032,194.252495,2.435366,500.759772
4,9932.448352,22108790.0,-151369.529909,1948.802779,209.900402,0.803567,7.611905,1.27718,150.267237,0.705191,0.693021,0.987998,0.950864,-0.350157,0.8243,0.000604,97.455314,0.344902,2.748613,0.826901,0.57719,194.910628,2.368467,539.675795


In [19]:
df_binary.shape

(2383, 24)

In [20]:
filt_nonod = df_meta[df_meta['diagnosis'] != 'NoNod']

In [21]:
filt_nonod.shape

(2383, 4)

In [22]:
labels = filt_nonod['diagnosis']

In [23]:
labels.unique()

array(['Malignant', 'Benign'], dtype=object)

## Performing the t-test for feature importance

Is this correct? What if we have feature correlations? 

In [24]:
# 'Malignant', 'NoNod', 'Benign'
class_0 = df_binary[labels == 'Benign']  # Subset where class is 0
class_1 = df_binary[labels == 'Malignant']  # Subset where class is 1

# Perform t-test for each feature
p_values = {col: ttest_ind(class_0[col], class_1[col], equal_var=True).pvalue for col in df_features.columns}

# Convert results to DataFrame
feature_importance = pd.DataFrame.from_dict(p_values, orient='index', columns=['p_value'])

# Sort by significance
feature_importance = feature_importance.sort_values(by='p_value')

print(feature_importance.head(24))  # Features with the smallest p-values

         p_value
15  1.041978e-35
16  6.250837e-08
21  6.250837e-08
0   5.315320e-07
18  1.193866e-05
2   1.818947e-05
22  2.126097e-05
10  2.684689e-04
20  3.112284e-04
7   3.398321e-04
9   7.353698e-04
8   8.400409e-03
17  1.363838e-02
5   5.373388e-02
11  5.518758e-02
4   5.645414e-02
23  7.537756e-02
6   8.396037e-02
12  1.022167e-01
3   1.220338e-01
19  2.335962e-01
1   6.855817e-01
14  7.524653e-01
13  8.305820e-01


#### Check if my t_tests are the same as the ones I loaded

In [25]:
all(df_feature_ranking[0] == feature_importance.index)

True

### Filter out p values > 0.05

In [26]:
important_features = feature_importance[feature_importance < 0.05]

In [27]:
important_features.shape

(24, 1)

In [28]:
important_features.head(24)

Unnamed: 0,p_value
15,1.041978e-35
16,6.250837e-08
21,6.250837e-08
0,5.31532e-07
18,1.193866e-05
2,1.818947e-05
22,2.126097e-05
10,0.0002684689
20,0.0003112284
7,0.0003398321


In [29]:
important_features = important_features.dropna()
important_features.shape

(13, 1)

In [30]:
df_binary.columns

RangeIndex(start=0, stop=24, step=1)

In [31]:
df_binary_imp = df_binary[important_features.index]

In [32]:
df_binary_imp.columns

Index([15, 16, 21, 0, 18, 2, 22, 10, 20, 7, 9, 8, 17], dtype='int64')

## Training SVM on Stratified k-fold

In [33]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  # default parameters
metrics = []

for fold, (train_idx, test_idx) in enumerate(skf.split(df_binary_imp, labels)):
    print(f"\n--- Fold {fold + 1} ---")
    X_train, X_test = df_binary_imp.iloc[train_idx], df_binary_imp.iloc[test_idx]
    y_train, y_test = labels.iloc[train_idx], labels.iloc[test_idx]

    # Feature Scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    clf = SVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # report = classification_report(y_test, y_pred, digits=4, output_dict=True)
    # metrics.append(report)
    # 
    report = classification_report(y_test, y_pred, output_dict=True)
    print(report)

    # Flatten the dictionary into a DataFrame row
    report_df = pd.DataFrame(report).transpose()
    # report_df['Fold'] = fold  # Add fold number

    metrics.append(report_df)

# Combine all fold reports into one DataFrame
df_metrics = pd.concat(metrics, axis=0).reset_index()

# Compute average metrics across folds (excluding support count)
avg_metrics = df_metrics.groupby('index').mean(numeric_only=True)

print("____________________________________________")
print("average metrics: ")
print(avg_metrics)  # Displays properly formatted averaged classification report


--- Fold 1 ---
{'Benign': {'precision': 0.6666666666666666, 'recall': 0.20408163265306123, 'f1-score': 0.3125, 'support': 49.0}, 'Malignant': {'precision': 0.9155844155844156, 'recall': 0.9883177570093458, 'f1-score': 0.950561797752809, 'support': 428.0}, 'accuracy': 0.9077568134171907, 'macro avg': {'precision': 0.7911255411255411, 'recall': 0.5961996948312035, 'f1-score': 0.6315308988764046, 'support': 477.0}, 'weighted avg': {'precision': 0.8900142485048146, 'recall': 0.9077568134171907, 'f1-score': 0.8850166654888936, 'support': 477.0}}

--- Fold 2 ---
{'Benign': {'precision': 0.5555555555555556, 'recall': 0.10204081632653061, 'f1-score': 0.1724137931034483, 'support': 49.0}, 'Malignant': {'precision': 0.905982905982906, 'recall': 0.9906542056074766, 'f1-score': 0.9464285714285714, 'support': 428.0}, 'accuracy': 0.89937106918239, 'macro avg': {'precision': 0.7307692307692308, 'recall': 0.5463475109670036, 'f1-score': 0.5594211822660098, 'support': 477.0}, 'weighted avg': {'precisi

{'Benign': {'precision': 0.7142857142857143, 'recall': 0.20408163265306123, 'f1-score': 0.31746031746031744, 'support': 49.0}, 'Malignant': {'precision': 0.9157667386609071, 'recall': 0.9906542056074766, 'f1-score': 0.9517396184062851, 'support': 428.0}, 'accuracy': 0.909853249475891, 'macro avg': {'precision': 0.8150262264733107, 'recall': 0.597367919130269, 'f1-score': 0.6345999679333012, 'support': 477.0}, 'weighted avg': {'precision': 0.8950695265133506, 'recall': 0.909853249475891, 'f1-score': 0.8865830445145609, 'support': 477.0}}

--- Fold 4 ---
{'Benign': {'precision': 0.7777777777777778, 'recall': 0.14285714285714285, 'f1-score': 0.2413793103448276, 'support': 49.0}, 'Malignant': {'precision': 0.9100642398286938, 'recall': 0.9953161592505855, 'f1-score': 0.9507829977628636, 'support': 427.0}, 'accuracy': 0.907563025210084, 'macro avg': {'precision': 0.8439210088032358, 'recall': 0.5690866510538641, 'f1-score': 0.5960811540538455, 'support': 476.0}, 'weighted avg': {'precision'

In [34]:
(0.6666666666666666 + 0.5555555555555556 + 0.7142857142857143 + 0.7777777777777778 + 0.8461538461538461) / 5

0.7120879120879121

average calculation is correct

In my opinion the model performs well, the most important factor from a patient's perspective, ,Did I find the tumors that are actually Malignant"? (Recall) is almost 100%. However, we can improve on the other metrics to reduce the chance of unnecessary bad news for the patients and eliminate / reduce the cost of further diagnoses.

In [35]:
filt_nonod.head()

Unnamed: 0,filename,patient_id,nodule_id,diagnosis
0,LIDC-IDRI-0001_GT1_1,1,1,Malignant
1,LIDC-IDRI-0001_GT1_1,1,1,Malignant
2,LIDC-IDRI-0001_GT1_1,1,1,Malignant
3,LIDC-IDRI-0001_GT1_1,1,1,Malignant
4,LIDC-IDRI-0001_GT1_1,1,1,Malignant


In [36]:
filt_nonod.shape

(2383, 4)

In [37]:
nodules = filt_nonod['nodule_id']

## Training SVM on Stratified Group K-Fold

In [38]:
cv = StratifiedGroupKFold(n_splits=5)

for fold, (train_idx, test_idx) in enumerate(cv.split(df_binary_imp, labels, nodules)):
    print(f"\n--- Fold {fold + 1} ---")
    X_train, X_test = df_binary_imp.iloc[train_idx], df_binary_imp.iloc[test_idx]
    y_train, y_test = labels.iloc[train_idx], labels.iloc[test_idx]

    # Feature Scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    clf = SVC()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # report = classification_report(y_test, y_pred, digits=4, output_dict=True)
    # metrics.append(report)
    # 
    report = classification_report(y_test, y_pred, output_dict=True)
    print(report)

    # Flatten the dictionary into a DataFrame row
    report_df = pd.DataFrame(report).transpose()
    # report_df['Fold'] = fold  # Add fold number

    metrics.append(report_df)

# Combine all fold reports into one DataFrame
df_metrics = pd.concat(metrics, axis=0).reset_index()

# Compute average metrics across folds (excluding support count)
avg_metrics = df_metrics.groupby('index').mean(numeric_only=True)

print("____________________________________________")
print("average metrics: ")
print(avg_metrics)  # Displays properly formatted averaged classification report


--- Fold 1 ---
{'Benign': {'precision': 0.5714285714285714, 'recall': 0.23931623931623933, 'f1-score': 0.3373493975903614, 'support': 117.0}, 'Malignant': {'precision': 0.9135922330097087, 'recall': 0.9781704781704782, 'f1-score': 0.9447791164658634, 'support': 962.0}, 'accuracy': 0.8980537534754403, 'macro avg': {'precision': 0.74251040221914, 'recall': 0.6087433587433587, 'f1-score': 0.6410642570281124, 'support': 1079.0}, 'weighted avg': {'precision': 0.8764901492238023, 'recall': 0.8980537534754403, 'f1-score': 0.8789132433347848, 'support': 1079.0}}

--- Fold 2 ---


{'Benign': {'precision': 0.75, 'recall': 0.16216216216216217, 'f1-score': 0.26666666666666666, 'support': 37.0}, 'Malignant': {'precision': 0.9460869565217391, 'recall': 0.9963369963369964, 'f1-score': 0.9705619982158786, 'support': 546.0}, 'accuracy': 0.9433962264150944, 'macro avg': {'precision': 0.8480434782608696, 'recall': 0.5792495792495793, 'f1-score': 0.6186143324412726, 'support': 583.0}, 'weighted avg': {'precision': 0.933642329778507, 'recall': 0.9433962264150944, 'f1-score': 0.9258893956990333, 'support': 583.0}}

--- Fold 3 ---
{'Benign': {'precision': 1.0, 'recall': 0.09803921568627451, 'f1-score': 0.17857142857142858, 'support': 51.0}, 'Malignant': {'precision': 0.8553459119496856, 'recall': 1.0, 'f1-score': 0.9220338983050848, 'support': 272.0}, 'accuracy': 0.8575851393188855, 'macro avg': {'precision': 0.9276729559748428, 'recall': 0.5490196078431373, 'f1-score': 0.5503026634382566, 'support': 323.0}, 'weighted avg': {'precision': 0.8781860311155247, 'recall': 0.857585

### Conclusion:

This method is useful when you have grouped data, like medical records where multiple samples belong to the same patient. It prevents data leakage by ensuring that all samples from a single group are either in the training or test set, but never both.

Our results are slightly worse, possibly because of the small ammout of data leakage prevented

## Hyperparameter tuning

1. Purpose of cv_outer (Outer Loop)
What it does: Splits the data into training + validation (inner loop) and test (holdout) sets.

Why it's needed:

To evaluate the generalization performance of the model trained with the best hyperparameters.

Each fold in the outer loop gives an independent test set to avoid optimistic bias in performance metrics.

2. Purpose of cv_inner (Inner Loop)
What it does: Further splits the training set (from cv_outer) into smaller training/validation sets for hyperparameter tuning (e.g., Grid Search/Random Search/Optuna).

Why it's needed:

To select the best hyperparameters without peeking at the outer test set (prevents data leakage).

Ensures the model is tuned fairly within each outer fold.

### Grid search

In [39]:
xxx to stop from recalculating the upcoming grid search

SyntaxError: invalid syntax (1505234791.py, line 1)

In [None]:
cv_outer = StratifiedGroupKFold(n_splits=5)
cv_inner = StratifiedGroupKFold(n_splits=5)

In [None]:
# Define parameter grid
param_grid = {
    'svc__C': [0.01, 0.1, 1, 10, 100],
    'svc__kernel': ['linear', 'rbf'],
    'svc__gamma': ['scale', 'auto']
}

# param_grid = {
#     'svc__C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization (default=1.0)
#     'svc__kernel': ['linear', 'rbf'],  # Kernel type (default='rbf')
#     'svc__shrinking': [True, False],  # Use shrinking heuristic (default=True)
#     'svc__probability': [True, False],  # Enable probability estimates (default=False)
#     'svc__tol': [1e-3, 1e-4, 1e-5],  # Tolerance for stopping (default=1e-3)
#     'svc__class_weight': [None, 'balanced']  # Handle imbalanced classes (default=None)
# }

metrics = []
cv_outer = StratifiedGroupKFold(n_splits=5)
cv_inner = StratifiedGroupKFold(n_splits=5)

# for fold, (train_idx, test_idx) in enumerate(cv_outer.split(df_binary_imp, labels, nodules)):
#     print(f"\n--- Fold {fold + 1} ---")
#     X_train, X_test = df_binary_imp.iloc[train_idx], df_binary_imp.iloc[test_idx]
#     y_train, y_test = labels.iloc[train_idx], labels.iloc[test_idx]
#     groups_train = nodules.iloc[train_idx]

#     # Pipeline with scaler and SVC
#     pipeline = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
    
#     # Grid Search
#     clf = GridSearchCV(pipeline, param_grid, cv=cv_inner, scoring='accuracy', n_jobs=-1)
#     clf.fit(X_train, y_train, groups=groups_train)
    
#     # Evaluate
#     y_pred = clf.predict(X_test)
#     report = classification_report(y_test, y_pred, output_dict=True)
#     report_df = pd.DataFrame(report).transpose()
#     metrics.append(report_df)

pipeline = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])

# Initialize grid search with cv_inner
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv_inner,  # Inner CV for hyperparameter tuning
    scoring='accuracy',
    n_jobs=-1,
    refit=True, # Refit the best model on the full dataset
    verbose = 1  # get some intermediate info about the training
)

# Fit on ALL data (handles train/test splits internally)
grid_search.fit(df_binary_imp, labels, groups=nodules)

# Best model and params
print("Best params:", grid_search.best_params_)
best_model = grid_search.best_estimator_



Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best params: {'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'}


In [None]:

scoring = {
    'accuracy': 'accuracy',
    'precision_malignant': make_scorer(precision_score, pos_label='Malignant'),
    'recall_malignant': make_scorer(recall_score, pos_label='Malignant'),
    'f1_malignant': make_scorer(f1_score, pos_label='Malignant'),
    'precision_benign': make_scorer(precision_score, pos_label='Benign'),
    'recall_benign': make_scorer(recall_score, pos_label='Benign'),
    'f1_benign': make_scorer(f1_score, pos_label='Benign')
}

# Run cross-validation
results = cross_validate(
    best_model,
    df_binary_imp,
    labels,
    cv=cv_outer,
    groups=nodules,
    scoring=scoring,
    n_jobs=-1
)

# Print full results
print("\n=== Detailed Metrics Per Fold ===")
for fold in range(5):
    print(f"\nFold {fold + 1}:")
    print(f"  Accuracy: {results['test_accuracy'][fold]:.4f}")
    print("  Malignant:")
    print(f"    Precision: {results['test_precision_malignant'][fold]:.4f}")
    print(f"    Recall: {results['test_recall_malignant'][fold]:.4f}")
    print(f"    F1: {results['test_f1_malignant'][fold]:.4f}")
    print("  Benign:")
    print(f"    Precision: {results['test_precision_benign'][fold]:.4f}")
    print(f"    Recall: {results['test_recall_benign'][fold]:.4f}")
    print(f"    F1: {results['test_f1_benign'][fold]:.4f}")

# Print averages
print("\n=== Average Metrics ===")
print(f"Accuracy: {np.mean(results['test_accuracy']):.4f} (±{np.std(results['test_accuracy']):.4f})")
print("\nMalignant:")
print(f"  Precision: {np.mean(results['test_precision_malignant']):.4f} (±{np.std(results['test_precision_malignant']):.4f})")
print(f"  Recall: {np.mean(results['test_recall_malignant']):.4f} (±{np.std(results['test_recall_malignant']):.4f})")
print(f"  F1: {np.mean(results['test_f1_malignant']):.4f} (±{np.std(results['test_f1_malignant']):.4f})")
print("\nBenign:")
print(f"  Precision: {np.mean(results['test_precision_benign']):.4f} (±{np.std(results['test_precision_benign']):.4f})")
print(f"  Recall: {np.mean(results['test_recall_benign']):.4f} (±{np.std(results['test_recall_benign']):.4f})")
print(f"  F1: {np.mean(results['test_f1_benign']):.4f} (±{np.std(results['test_f1_benign']):.4f})")


=== Detailed Metrics Per Fold ===

Fold 1:
  Accuracy: 0.8981
  Malignant:
    Precision: 0.9136
    Recall: 0.9782
    F1: 0.9448
  Benign:
    Precision: 0.5714
    Recall: 0.2393
    F1: 0.3373

Fold 2:
  Accuracy: 0.9434
  Malignant:
    Precision: 0.9461
    Recall: 0.9963
    F1: 0.9706
  Benign:
    Precision: 0.7500
    Recall: 0.1622
    F1: 0.2667

Fold 3:
  Accuracy: 0.8576
  Malignant:
    Precision: 0.8553
    Recall: 1.0000
    F1: 0.9220
  Benign:
    Precision: 1.0000
    Recall: 0.0980
    F1: 0.1786

Fold 4:
  Accuracy: 0.9950
  Malignant:
    Precision: 0.9949
    Recall: 1.0000
    F1: 0.9975
  Benign:
    Precision: 1.0000
    Recall: 0.7500
    F1: 0.8571

Fold 5:
  Accuracy: 0.8325
  Malignant:
    Precision: 0.8299
    Recall: 1.0000
    F1: 0.9070
  Benign:
    Precision: 1.0000
    Recall: 0.0833
    F1: 0.1538

=== Average Metrics ===
Accuracy: 0.9053 (±0.0585)

Malignant:
  Precision: 0.9080 (±0.0599)
  Recall: 0.9949 (±0.0085)
  F1: 0.9484 (±0.0326)

Benig

In [None]:
# Aggregate results
df_metrics = pd.concat(metrics, axis=0).reset_index()
avg_metrics = df_metrics.groupby('index').mean(numeric_only=True)
print("\nAverage Metrics (Grid Search):")
print(avg_metrics)


Average Metrics (Grid Search):
              precision    recall  f1-score     support
index                                                  
Benign         0.922727  0.198645  0.296898   49.000000
Malignant      0.905923  0.996466  0.948050  427.600000
accuracy       0.904527  0.904527  0.904527    0.904527
macro avg      0.914325  0.597555  0.622474  476.600000
weighted avg   0.910999  0.904527  0.872081  476.600000


My accuracy metrics just got worse

### Random search

In [None]:
# Define parameter distributions
param_dist = {
    'svc__C': loguniform(1e-3, 1e3),
    'svc__kernel': ['linear', 'rbf'],
    'svc__gamma': ['scale', 'auto'] + list(np.logspace(-3, 3, 5)),
    'svc__shrinking': [True, False],  # Use shrinking heuristic (default=True)
    'svc__probability': [True, False],  # Enable probability estimates (default=False)
    'svc__tol': [1e-3, 1e-4, 1e-5],  # Tolerance for stopping (default=1e-3)
    'svc__class_weight': [None, 'balanced']  # Handle imbalanced classes (default=None)
}

metrics = []
cv = StratifiedGroupKFold(n_splits=5)
pipeline = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])

# Initialize RandomizedSearchCV with OUTER CV
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,  # Number of random combinations to try
    cv=StratifiedGroupKFold(n_splits=5),  # Inner CV for hyperparameter tuning
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    refit=True  # Refit the best model on the full dataset
)

# Fit on ALL data (handles train/test splits internally)
random_search.fit(df_binary_imp, labels, groups=nodules)

# Best model and params
print("Best params:", random_search.best_params_)
best_model = random_search.best_estimator_

Best params: {'svc__C': np.float64(51.41096648805744), 'svc__class_weight': None, 'svc__gamma': np.float64(0.03162277660168379), 'svc__kernel': 'linear', 'svc__probability': False, 'svc__shrinking': False, 'svc__tol': 0.001}


Save the model for not having to retrain in case of crash

In [None]:
from joblib import dump
from joblib import load

# Save the model to a file
dump(best_model, 'best_model.joblib')

# Later, load it back

bm = load('best_model.joblib')

In [None]:
# Initialize RandomizedSearchCV with OUTER CV. We could not go with the previous way as the grid search
# because we want to avoid changing the model randomly between each fold


scoring = {
    'accuracy': 'accuracy',
    'precision_malignant': make_scorer(precision_score, pos_label='Malignant'),
    'recall_malignant': make_scorer(recall_score, pos_label='Malignant'),
    'f1_malignant': make_scorer(f1_score, pos_label='Malignant'),
    'precision_benign': make_scorer(precision_score, pos_label='Benign'),
    'recall_benign': make_scorer(recall_score, pos_label='Benign'),
    'f1_benign': make_scorer(f1_score, pos_label='Benign')
}

# Run cross-validation
results = cross_validate(
    best_model,
    df_binary_imp,
    labels,
    cv=StratifiedGroupKFold(n_splits=5),
    groups=nodules,
    scoring=scoring,
    n_jobs=-1
)

# Print full results
print("\n=== Detailed Metrics Per Fold ===")
for fold in range(5):
    print(f"\nFold {fold + 1}:")
    print(f"  Accuracy: {results['test_accuracy'][fold]:.4f}")
    print("  Malignant:")
    print(f"    Precision: {results['test_precision_malignant'][fold]:.4f}")
    print(f"    Recall: {results['test_recall_malignant'][fold]:.4f}")
    print(f"    F1: {results['test_f1_malignant'][fold]:.4f}")
    print("  Benign:")
    print(f"    Precision: {results['test_precision_benign'][fold]:.4f}")
    print(f"    Recall: {results['test_recall_benign'][fold]:.4f}")
    print(f"    F1: {results['test_f1_benign'][fold]:.4f}")

# Print averages
print("\n=== Average Metrics ===")
print(f"Accuracy: {np.mean(results['test_accuracy']):.4f} (±{np.std(results['test_accuracy']):.4f})")
print("\nMalignant:")
print(f"  Precision: {np.mean(results['test_precision_malignant']):.4f} (±{np.std(results['test_precision_malignant']):.4f})")
print(f"  Recall: {np.mean(results['test_recall_malignant']):.4f} (±{np.std(results['test_recall_malignant']):.4f})")
print(f"  F1: {np.mean(results['test_f1_malignant']):.4f} (±{np.std(results['test_f1_malignant']):.4f})")
print("\nBenign:")
print(f"  Precision: {np.mean(results['test_precision_benign']):.4f} (±{np.std(results['test_precision_benign']):.4f})")
print(f"  Recall: {np.mean(results['test_recall_benign']):.4f} (±{np.std(results['test_recall_benign']):.4f})")
print(f"  F1: {np.mean(results['test_f1_benign']):.4f} (±{np.std(results['test_f1_benign']):.4f})")


=== Detailed Metrics Per Fold ===

Fold 1:
  Accuracy: 0.9008
  Malignant:
    Precision: 0.9130
    Recall: 0.9823
    F1: 0.9464
  Benign:
    Precision: 0.6136
    Recall: 0.2308
    F1: 0.3354

Fold 2:
  Accuracy: 0.9417
  Malignant:
    Precision: 0.9414
    Recall: 1.0000
    F1: 0.9698
  Benign:
    Precision: 1.0000
    Recall: 0.0811
    F1: 0.1500

Fold 3:
  Accuracy: 0.8576
  Malignant:
    Precision: 0.8553
    Recall: 1.0000
    F1: 0.9220
  Benign:
    Precision: 1.0000
    Recall: 0.0980
    F1: 0.1786

Fold 4:
  Accuracy: 0.9900
  Malignant:
    Precision: 0.9899
    Recall: 1.0000
    F1: 0.9949
  Benign:
    Precision: 1.0000
    Recall: 0.5000
    F1: 0.6667

Fold 5:
  Accuracy: 0.8274
  Malignant:
    Precision: 0.8256
    Recall: 1.0000
    F1: 0.9045
  Benign:
    Precision: 1.0000
    Recall: 0.0556
    F1: 0.1053

=== Average Metrics ===
Accuracy: 0.9035 (±0.0581)

Malignant:
  Precision: 0.9051 (±0.0589)
  Recall: 0.9965 (±0.0071)
  F1: 0.9475 (±0.0324)

Benig

The accuracies are again a little bit worse...

### Optuna

In [None]:
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import accuracy_score

def objective(trial, X_train, y_train, groups_train):
    # Hyperparameters to optimize
    C = trial.suggest_float("C", 1e-3, 1e3, log=True)
    kernel = trial.suggest_categorical("kernel", ["linear", "rbf"])
    
    # Conditional parameters
    params = {
        "C": C,
        "kernel": kernel,
        "shrinking": trial.suggest_categorical("shrinking", [True, False]),
        "tol": trial.suggest_float("tol", 1e-5, 1e-1, log=True),
        "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"])
    }

    # Model training and validation
    model = SVC(**params, random_state=42)
    scores = cross_val_score(
        model, X_train, y_train, cv=cv_inner, groups=groups_train, scoring="accuracy"
    )
    return scores.mean()

# Initialize components
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_binary_imp)
cv_inner = StratifiedGroupKFold(n_splits=3)  # Define your inner CV

# Optuna study
study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
study.optimize(lambda trial: objective(trial, X_scaled, labels, nodules), n_trials=50)

# Get best model
best_params = study.best_params
best_model = SVC(**best_params, random_state=42)


[I 2025-05-22 15:09:31,358] A new study created in memory with name: no-name-fabbb7a1-d3e4-452e-b41b-23d853a04f4a
[I 2025-05-22 15:09:31,512] Trial 0 finished with value: 0.9048218322635139 and parameters: {'C': 0.1767016940294795, 'kernel': 'linear', 'shrinking': True, 'tol': 1.7073967431528103e-05, 'class_weight': None}. Best is trial 0 with value: 0.9048218322635139.
[I 2025-05-22 15:09:31,855] Trial 1 finished with value: 0.8438383012594753 and parameters: {'C': 17.71884735480682, 'kernel': 'poly', 'shrinking': True, 'tol': 5.415244119402538e-05, 'class_weight': 'balanced', 'gamma_poly': 0.005342937261279773, 'degree': 3, 'coef0': 0.22370578944475894}. Best is trial 0 with value: 0.9048218322635139.
[I 2025-05-22 15:09:32,107] Trial 2 finished with value: 0.8984666286383796 and parameters: {'C': 0.006870101665590026, 'kernel': 'rbf', 'shrinking': True, 'tol': 0.0011400863701127321, 'class_weight': None, 'gamma_rbf': 0.02692646910086179}. Best is trial 0 with value: 0.90482183226351

KeyboardInterrupt: 