In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV,StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from utils import *
from sklearn.metrics import balanced_accuracy_score,f1_score,precision_score,recall_score
from scipy.stats import uniform, randint
# Single seed reused by the train/test split, the CV splitter, the estimators
# and the randomized search so that reruns draw the same splits and candidates.
random_state = 42

In [2]:
# Load the pickled gene-expression matrix and its label table.
gene_exp_df = read_dataframe_from_pickle("data/processed_data/gene_exp_data.pkl")
label_df = read_dataframe_from_pickle("data/processed_data/label_data.pkl")
print(
    "--" * 80,
    f"Entries in Gene Expression Dataframe : {len(gene_exp_df)}",
    f"Entries in Label Dataframe : {len(label_df)}",
    sep="\n",
)

# Filter labels with the project helper (threshold=150), pull the matching
# expression rows, then encode the remaining labels.
labels_with_high_freq_df = remove_low_frequency_labels(label_df, threshold=150)
extracted_data, extracted_label = collect_relevant_data(
    gene_exp_df_bkp=gene_exp_df,
    label_df_bkp=labels_with_high_freq_df,
)
encoded_labels, label_encoder = encode_labels(extracted_label)
print(
    "--" * 80,
    f"Entries in Extracted Gene Expression Dataframe : {len(extracted_data)}",
    f"Entries in Extracted Label Dataframe : {len(encoded_labels)}",
    sep="\n",
)

DataFrame successfully loaded from data/processed_data/gene_exp_data.pkl
DataFrame successfully loaded from data/processed_data/label_data.pkl
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Entries in Gene Expression Dataframe : 5268
Entries in Label Dataframe : 5268
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Entries in Extracted Gene Expression Dataframe : 4392
Entries in Extracted Label Dataframe : 4392


In [3]:
# Number of distinct encoded target classes.
n_classes = np.unique(encoded_labels).size

In [4]:

# Hold out 20% of the samples as a test split. Stratifying on the encoded
# labels keeps the class proportions the same in both partitions, which
# matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    extracted_data,
    encoded_labels,
    random_state=random_state,
    stratify=encoded_labels,
    test_size=0.2,
)

In [5]:
# Shared cross-validation splitter for every hyper-parameter search:
# two stratified folds, shuffled with the notebook-wide seed.
cv = StratifiedKFold(
    n_splits=2,
    shuffle=True,
    random_state=random_state,
)

In [6]:
def build_pipeline(model):
    """Wrap ``model`` in a two-step pipeline: standard scaling, then the classifier.

    The steps are named ``'scaler'`` and ``'clf'``, so hyper-parameters of the
    wrapped estimator are addressed with the ``clf__`` prefix.

    Parameters
    ----------
    model : estimator
        Classifier placed as the final ``'clf'`` step.

    Returns
    -------
    Pipeline
        Unfitted pipeline ``StandardScaler -> model``.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('clf', model),
    ]
    return Pipeline(steps=steps)

In [7]:
# Candidate estimators keyed by a short name; the same keys index `param_grids`.
#
# The LightGBM entry deliberately carries no `eval_set` / `early_stopping_rounds`:
#   * `eval_set` is an argument of `fit()`, not of the constructor, so passing it
#     here only forwards it to the booster as an unrecognised parameter;
#   * pointing it at (X_test, y_test) would let the held-out test split steer
#     training, which makes the test metrics optimistic;
#   * early stopping configured on the estimator needs a validation set at fit
#     time, and the cross-validated search does not supply one.
# The number of boosting rounds is chosen by the search via `clf__n_estimators`.
#
# NOTE(review): `multi_class` is reported as deprecated in recent scikit-learn
# releases — confirm against the installed version before removing it.
models = {
    'logreg': LogisticRegression(
        multi_class='multinomial',
        max_iter=5000,
        random_state=random_state,
    ),
    'svm': SVC(probability=True, random_state=random_state),
    'rf': RandomForestClassifier(n_estimators=100, random_state=random_state),
    'lgbm': LGBMClassifier(
        objective='multiclass',
        num_class=n_classes,
        random_state=random_state,
    ),
}

In [8]:
# Search spaces for RandomizedSearchCV, keyed like `models`.
# Every hyper-parameter carries the `clf__` prefix because the estimator is the
# 'clf' step of the pipeline. scipy's `uniform(loc, scale)` samples from
# [loc, loc + scale]; `randint(low, high)` samples integers in [low, high).
param_grids = {
    'logreg': {
        'clf__C': uniform(0.01, 10),
        'clf__penalty': ['l2'],
        'clf__solver': ['lbfgs'],
        'clf__class_weight': ['balanced']
    },
    'svm': {
        'clf__C': uniform(0.01, 10),
        'clf__gamma': ['scale', 'auto'],
        'clf__kernel': ['rbf'],
        'clf__class_weight': ['balanced']
    },
    'lgbm': {
        'clf__n_estimators': randint(100, 300),
        'clf__max_depth': randint(3, 10),
        'clf__learning_rate': uniform(0.01, 0.3),
        'clf__class_weight': ['balanced'],
        'clf__subsample': uniform(0.7, 0.3),
        # LightGBM only applies row subsampling (bagging) when the bagging
        # frequency is non-zero; without this entry the sampled
        # `clf__subsample` values would have no effect on the fitted model.
        'clf__subsample_freq': [1],
        'clf__colsample_bytree': uniform(0.7, 0.3),
    },
    'rf': {
        'clf__n_estimators': randint(100, 500),
        'clf__max_depth': randint(5, 30),
        'clf__min_samples_split': randint(2, 20),
        'clf__min_samples_leaf': randint(1, 10),
        'clf__class_weight': ['balanced', 'balanced_subsample']
    },
}


In [9]:
# Per-model outcome of the hyper-parameter search, filled in by the tuning loop.
results = dict()

In [None]:
# Tune every candidate with a randomized search on the training split, then
# score the refitted best pipeline once on the held-out test split.
for name, model in models.items():
    try:
        print("=="*40)
        print(f"\n🔎 Tuning {name.upper()}")
        pipe = build_pipeline(model)

        # No estimator-specific fit parameters are forwarded. In particular the
        # test split is never handed to LightGBM as an early-stopping eval_set:
        # that would leak test data into model selection, and the eval set
        # would bypass the pipeline's scaler. Boosting length is tuned through
        # `clf__n_estimators` instead.
        search = RandomizedSearchCV(
            pipe,
            param_distributions=param_grids[name],
            n_iter=20,
            scoring='f1_macro',
            n_jobs=-1,
            cv=cv,
            verbose=1,
            random_state=random_state,
            return_train_score=True,
        )

        search.fit(X_train, y_train)
        print("--"*35)
        print(f"\n✅ Best params for {name.upper()}:")
        print(search.best_params_)
        print(f"\n✅ Best score for {name.upper()}:")
        print(search.best_score_)

        # Best pipeline and the mean training-fold score of that same candidate.
        best_model = search.best_estimator_
        best_index = search.best_index_
        train_score = search.cv_results_['mean_train_score'][best_index]

        # Single evaluation on the held-out test split.
        y_pred = best_model.predict(X_test)

        results[name] = {
            'best_params': search.best_params_,
            # NOTE: despite the key name, this is search.best_score_, i.e. the
            # score from the search's cross-validation (macro F1), not a
            # score measured on X_test.
            "best_test_score": search.best_score_,
            'best_train_score': train_score,
            'test_balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
            'test_f1_weighted': f1_score(y_test, y_pred, average='weighted'),
            "precision_weighted": precision_score(y_test, y_pred, average='weighted'),
            "recall_weighted": recall_score(y_test, y_pred, average='weighted'),
            'best_estimator': best_model,
        }
        print(f"Successfully Completed all cross validations for model : {name.upper()}")
    except Exception as e:
        # Best effort: report the failure and move on to the next model so one
        # broken candidate does not abort the whole comparison.
        print(f"Encountered error at {name.upper()} :: {type(e).__name__}: {e}")


In [None]:
# Create a summary dataframe for easy comparison: one row per tuned model
# (the index holds the model name), one column per metric, rounded to 5 places.
# 'model_test_score' / 'model_train_score' come from the search's
# cross-validation; the remaining columns are measured on the held-out test split.
summary = pd.DataFrame({
    model_name: {
        'model_test_score': round(results[model_name]["best_test_score"],5),
        'model_train_score': round(results[model_name]["best_train_score"],5),
        'test_f1_Score(weighted)': round(results[model_name]['test_f1_weighted'],5),
        'test_precision_Score(weighted)': round(results[model_name]['precision_weighted'],5),
        'test_recall_Score(weighted)': round(results[model_name]['recall_weighted'],5),
        'test_balanced_accuracy_score': round(results[model_name]['test_balanced_accuracy'],5),
    } for model_name in results
}).T
# Keep the index when exporting: it is the only place the model names live, so
# writing with index=False would produce rows that cannot be told apart.
summary.to_csv("results/Summary_Metrics_raw.csv", index=True, index_label="model")



📋 Model Performance Summary:
        model_test_score  model_train_score  test_f1_Score(weighted)  \
logreg           0.88564                1.0                  0.92818   

        test_precision_Score(weighted)  test_recall_Score(weighted)  \
logreg                         0.93311                      0.92833   

        test_balanced_accuracy_score  
logreg                       0.89742  


## Model Performance Summary

In [None]:
# Show the models ranked from best to worst cross-validated score.
summary.sort_values(by='model_test_score', ascending=False)

Unnamed: 0,model_test_score,model_train_score,test_f1_Score(weighted),test_precision_Score(weighted),test_recall_Score(weighted),test_balanced_accuracy_score
logreg,0.88564,1.0,0.92818,0.93311,0.92833,0.89742
