## Observations till now : 
### Using PCA to reduce dimensionality :
60 principal components give us 87% balanced accuracy with an SVM classifier. The accuracy does not improve beyond that point even if we increase the number of PCs.
### Using KPCA to reduce dimensionality :
KPCA also shows 60 principal components to be optimal. Cross-validation further shows that the linear kernel performs best in KPCA, which is analogous to plain PCA, so there is definitely some linear structure among the features.
## Goal :
Compare the accuracy of different ML models using plain PCA with 60 PCs.

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV,StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# NOTE(review): the star import is kept at its original position so that any name it
# exports is still overridden by the explicit imports that follow it.
from utils import *
from sklearn.metrics import balanced_accuracy_score,f1_score,confusion_matrix
from scipy.stats import uniform, randint

In [59]:
# Load the pre-processed expression matrix and its labels from disk.
# NOTE(review): these are pickle files -- only load them from a trusted source.
divider = "--" * 80
gene_exp_df = read_dataframe_from_pickle("data/processed_data/gene_exp_data.pkl")
label_df = read_dataframe_from_pickle("data/processed_data/label_data.pkl")

print(divider)
print(f"Entries in Gene Expression Dataframe : {len(gene_exp_df)}")
print(f"Entries in Label Dataframe : {len(label_df)}")

# Presumably drops labels with fewer than 150 samples -- confirm against utils.
labels_with_high_freq_df = remove_low_frequency_labels(label_df, threshold=150)

# Keep only the expression rows that still have a label, then integer-encode the labels.
extracted_data, extracted_label = collect_relevant_data(
    gene_exp_df_bkp=gene_exp_df,
    label_df_bkp=labels_with_high_freq_df,
)
encoded_labels, label_encoder = encode_labels(extracted_label)

print(divider)
print(f"Entries in Extracted Gene Expression Dataframe : {len(extracted_data)}")
print(f"Entries in Extracted Label Dataframe : {len(encoded_labels)}")

DataFrame successfully loaded from data/processed_data/gene_exp_data.pkl
DataFrame successfully loaded from data/processed_data/label_data.pkl
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Entries in Gene Expression Dataframe : 5268
Entries in Label Dataframe : 5268
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Entries in Extracted Gene Expression Dataframe : 4392
Entries in Extracted Label Dataframe : 4392


In [60]:
# Single seed shared by the split, the CV folds, PCA and the seeded estimators.
random_state = 42
# Number of distinct encoded target classes (passed to the boosted-tree models).
n_classes = np.unique(encoded_labels).size

In [61]:
# Hold out 20% of the samples as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    extracted_data,
    encoded_labels,
    random_state=random_state,
    stratify=encoded_labels,  # preserve class proportions in both splits (imbalanced data)
    test_size=0.2,
)

In [63]:
# Stratified, shuffled folds for the hyper-parameter search.
# NOTE(review): only 2 folds, so each candidate is fitted on half of the training data --
# confirm this is a deliberate speed/variance trade-off.
cv = StratifiedKFold(
    shuffle=True,
    n_splits=2,
    random_state=random_state,
)

In [64]:
def build_pipeline(model, n_components=60):
    """Wrap a classifier in a scale -> PCA -> classify pipeline.

    Parameters
    ----------
    model : estimator
        Classifier placed in the final ``'clf'`` step; its hyper-parameters are
        addressable as ``clf__<param>``.
    n_components : int, default=60
        Number of principal components kept by the ``'pca'`` step. The default
        matches the value the notebook's earlier observations settled on.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'scaler'``, ``'pca'`` and ``'clf'``.
    """
    return Pipeline([
        ('scaler', StandardScaler()),
        # Seeded with the notebook-level `random_state` so the projection is repeatable.
        ('pca', PCA(n_components=n_components, random_state=random_state)),
        ('clf', model)
    ])

In [67]:
# Candidate classifiers, keyed by the short names that `param_grids` also uses.
# Each one becomes the 'clf' step of the scale -> PCA -> classify pipeline.
models = {
    # `multi_class` is intentionally not passed: it is deprecated in recent scikit-learn
    # releases, and the lbfgs solver already fits a multinomial model for multiclass targets.
    'logreg': LogisticRegression(max_iter=5000),
    # probability=True adds an internal, randomized cross-validated calibration step,
    # so it is seeded to keep predict_proba repeatable.
    'svm': SVC(probability=True, random_state=random_state),
    'xgb': XGBClassifier(
        objective='multi:softprob',
        num_class=n_classes,
        eval_metric='mlogloss',
        verbosity=0,
        random_state=random_state,
    ),
    # No `eval_set` / `early_stopping_rounds` here: `eval_set` is a fit-time argument, not a
    # constructor parameter, and the raw test split is neither scaled nor PCA-projected,
    # so it could not be evaluated by a model trained inside the pipeline anyway.
    # It would also leak the held-out test data into training.
    'lgbm': LGBMClassifier(
        objective='multiclass',
        num_class=n_classes,
        random_state=random_state,
    ),
    'rf': RandomForestClassifier(n_estimators=100, random_state=random_state),

    'mlp': MLPClassifier(hidden_layer_sizes=(100,),
                         max_iter=300,
                         validation_fraction=0.2,  # only used when early_stopping is enabled
                         random_state=random_state)
}

In [68]:
# Search spaces for RandomizedSearchCV, keyed like `models`.
# Parameter names are prefixed with `clf__` because each model is the 'clf' pipeline step.
# scipy conventions: uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
param_grids = {
    'logreg': {
        'clf__C': uniform(0.01, 10),
        'clf__penalty': ['l2'],
        'clf__solver': ['lbfgs'],
        'clf__class_weight': ['balanced']
    },
    'svm': {
        'clf__C': uniform(0.01, 10),
        'clf__gamma': ['scale', 'auto'],
        'clf__kernel': ['rbf'],
        'clf__class_weight': ['balanced']
    },
    'xgb': {
        'clf__n_estimators': randint(100, 300),
        'clf__max_depth': randint(3, 10),
        'clf__learning_rate': uniform(0.01, 0.3),
    },
    'lgbm': {
        'clf__n_estimators': randint(100, 300),
        'clf__max_depth': randint(3, 10),
        'clf__learning_rate': uniform(0.01, 0.3),
        'clf__class_weight': ['balanced'],
        'clf__subsample': uniform(0.7, 0.3),          # i.e. [0.7, 1.0]
        # LightGBM only applies row subsampling when subsample_freq > 0; without this
        # entry the sampled `subsample` values above have no effect.
        'clf__subsample_freq': [1],
        'clf__colsample_bytree': uniform(0.7, 0.3),   # i.e. [0.7, 1.0]
    },
    'rf': {
        'clf__n_estimators': randint(100, 500),
        'clf__max_depth': randint(5, 30),
        'clf__min_samples_split': randint(2, 20),
        'clf__min_samples_leaf': randint(1, 10),
        'clf__class_weight': ['balanced', 'balanced_subsample']
    },
    'mlp': {
        'clf__hidden_layer_sizes': [(100,), (100, 50), (200, 100, 50)],
        'clf__activation': ['relu', 'tanh'],
        'clf__alpha': uniform(0.0001, 0.01),
        'clf__learning_rate': ['constant', 'adaptive'],
        'clf__learning_rate_init': uniform(0.001, 0.01),
        'clf__early_stopping': [True],
    }
}


In [69]:
# Per-model tuning outcomes, filled in by the search loop (model name -> metrics dict).
results = dict()

In [None]:
# Optional allow-list for re-running a subset of models, e.g. {"xgb"}.
# None (the default) tunes every entry in `models`.
models_to_tune = None

# Tune each model with a randomized search on the training split, then score the
# refitted best pipeline once on the held-out test split.
for name, model in models.items():
    if models_to_tune is not None and name not in models_to_tune:
        continue
    try:
        print("=="*40)
        print(f"\n🔎 Tuning {name.upper()}")
        pipe = build_pipeline(model)
        # No early-stopping fit params are forwarded to the boosted models:
        #  * an eval_set built from X_test would be raw features, while the classifier is
        #    trained on the scaler + PCA output, so the feature spaces would not match;
        #  * using the test split for early stopping leaks it into model selection;
        #  * recent XGBoost releases no longer accept `early_stopping_rounds` in fit().
        # The number of trees is already part of the random search.
        search = RandomizedSearchCV(
            pipe,
            param_distributions=param_grids[name],
            n_iter=25,
            scoring='f1_macro',
            n_jobs=6,
            cv=cv,
            verbose=1,
            random_state=random_state,
            return_train_score=True
        )

        search.fit(X_train, y_train)
        print("--"*35)
        print(f"\n✅ Best params for {name.upper()}:")
        print(search.best_params_)
        print(f"\n✅ Best score for {name.upper()}:")
        print(search.best_score_)

        # Best pipeline, refitted on the full training split by the search
        best_model = search.best_estimator_

        # Predictions on the held-out test split
        y_pred = best_model.predict(X_test)
        y_proba = best_model.predict_proba(X_test)

        # Evaluation metrics
        class_report = classification_report(y_test, y_pred, digits=4, output_dict=True)

        results[name] = {
            'best_params': search.best_params_,
            'best_score': search.best_score_,  # mean cross-validated macro F1
            'test_accuracy': balanced_accuracy_score(y_test, y_pred),  # balanced accuracy
            'test_f1_macro': f1_score(y_test, y_pred, average='macro'),
            'test_f1_weighted': f1_score(y_test, y_pred, average='weighted'),
            'class_report': class_report,
            'confusion_matrix': confusion_matrix(y_test, y_pred),
            'best_estimator': best_model,
            'y_proba': y_proba
        }
        print(f"Successfully Completed all cross validations for model : {name.upper()}")
    except Exception as e:
        # Best-effort: report the failure and move on so one model cannot abort the sweep.
        print(f"Encountered error at {name.upper()} :: {e}")
        continue
# Create a summary dataframe for easy comparison: one row per model, one column per metric.
summary_fields = {
    'CV F1 Macro': 'best_score',
    'Test F1 Macro': 'test_f1_macro',
    'Test F1 Weighted': 'test_f1_weighted',
    'Test Accuracy': 'test_accuracy',
}
# Passing `columns` keeps the frame well-formed (and sortable) even when every model failed
# and `results` is empty.
summary = pd.DataFrame.from_dict(
    {
        model_name: {label: scores[field] for label, field in summary_fields.items()}
        for model_name, scores in results.items()
    },
    orient='index',
    columns=list(summary_fields),
)
# The model names live in the index, so it must be written out; otherwise the rows in the
# CSV cannot be attributed to a model.
summary.to_csv("Summary_Metrics.csv", index_label="model")
print("\n📋 Model Performance Summary:")
print(summary.sort_values('Test F1 Macro', ascending=False))

In [None]:
# Print the scalar metrics of every tuned model; bulky entries (arrays, fitted
# estimators, nested reports) are left out to keep the output readable.
bulky_fields = {"confusion_matrix", "y_proba", "best_estimator", "class_report"}
for model_name, perf in results.items():
    print(model_name)
    for field, val in perf.items():
        if field not in bulky_fields:
            print(f"{field} :: {val}")

logreg
best_params :: {'clf__C': np.float64(0.21584494295802448), 'clf__class_weight': 'balanced', 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}
best_score :: 0.8431889547360782
test_accuracy :: 0.8863999955338829
test_f1_macro :: 0.8827624368906615
test_f1_weighted :: 0.9123178801532476
svm
best_params :: {'clf__C': np.float64(3.0561376917337064), 'clf__class_weight': 'balanced', 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}
best_score :: 0.869812051474097
test_accuracy :: 0.8832150594347234
test_f1_macro :: 0.8790503177479092
test_f1_weighted :: 0.9133025643218163
rf
best_params :: {'clf__class_weight': 'balanced_subsample', 'clf__max_depth': 29, 'clf__min_samples_leaf': 3, 'clf__min_samples_split': 6, 'clf__n_estimators': 406}
best_score :: 0.867516826619347
test_accuracy :: 0.8785418199152417
test_f1_macro :: 0.8758963165943556
test_f1_weighted :: 0.9068325800959735
mlp
best_params :: {'clf__activation': 'tanh', 'clf__alpha': np.float64(0.002096737821583597), 'clf__early_stopping': 

In [3]:
# Metrics transcribed from the tuning run's printed output, one list entry per model
# (same order as "model"). Used for the exported summary table and the radar chart.
t = {
    "model" : ["logreg","svm","rf","mlp"],
    # Mean cross-validated macro F1 of the best candidate
    "best_score" : [    0.8431889547360782,
                        0.869812051474097,
                        0.867516826619347,
                        0.8634794004906915],
    # Balanced accuracy on the held-out test split
    "test_accuracy":[   0.8863999955338829,
                        0.8832150594347234,
                        0.8785418199152417,
                        0.8896447171988987,],
    "test_f1_macro": [  0.8827624368906615,
                        0.8790503177479092,
                        0.8758963165943556,
                        0.8908134580211068,],
    "test_f1_weighted":[0.9123178801532476,
                        0.9133025643218163,
                        0.9068325800959735,
                        0.9176375648897099,]
}
df = pd.DataFrame(t)
summary_csv = Path("results/Summary_Metrics.csv")
# to_csv does not create missing directories, so make sure the target folder exists.
summary_csv.parent.mkdir(parents=True, exist_ok=True)
# The RangeIndex carries no information (the model name is a regular column), so skip it.
df.to_csv(summary_csv, index=False)

In [5]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Radar chart: one closed polygon per model across the four summary metrics.
metrics = ["Best Score", "Test Accuracy", "F1 Macro", "F1 Weighted"]
# Keys of `t`, in the same order as the axis labels above.
metric_keys = ["best_score", "test_accuracy", "test_f1_macro", "test_f1_weighted"]
# Repeat the first axis at the end so each polygon closes on itself.
closed_theta = metrics + metrics[:1]

fig = go.Figure()

for idx, model_label in enumerate(t["model"]):
    ring = [t[key][idx] for key in metric_keys]
    ring.append(ring[0])
    trace = go.Scatterpolar(
        r=ring,
        theta=closed_theta,
        fill='toself',
        name=model_label,
    )
    fig.add_trace(trace)

radial_axis = dict(visible=True, range=[0.8, 1.0])
fig.update_layout(
    title="Model Comparison - Radar Chart",
    polar=dict(radialaxis=radial_axis),
    showlegend=True,
)

fig.show()