In [86]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.svm import SVC
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split,RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score,f1_score,precision_score,recall_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import gaussian_kde
from utils import *
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from scipy.stats import randint, uniform

import networkx as nx


In [5]:
# Load the gene-expression matrix and the label table (helpers come from `utils`).
# NOTE(review): presumably backed by pickle, which can execute code on load -
# only read files from a trusted source.
gene_exp_df = read_dataframe_from_pickle("data/processed_data/gene_exp_data.pkl")
label_df = read_dataframe_from_pickle("data/processed_data/label_data.pkl")
print("--" * 80)
print(f"Entries in Gene Expression Dataframe : {len(gene_exp_df)}")
print(f"Entries in Label Dataframe : {len(label_df)}")
# Presumably keeps only labels that occur at least `threshold` times - confirm in utils.
labels_with_high_freq_df = remove_low_frequency_labels(label_df,threshold=150)
# Presumably restricts the expression data to the samples of the retained labels - confirm in utils.
extracted_data,extracted_label = collect_relevant_data(gene_exp_df_bkp=gene_exp_df,label_df_bkp=labels_with_high_freq_df)
# `encoded_labels` is the target used by every model below; the encoder object is returned alongside it.
encoded_labels,label_encoder = encode_labels(extracted_label)
print("--" * 80)
print(f"Entries in Extracted Gene Expression Dataframe : {len(extracted_data)}")
print(f"Entries in Extracted Label Dataframe : {len(encoded_labels)}")

DataFrame successfully loaded from data/processed_data/gene_exp_data.pkl
DataFrame successfully loaded from data/processed_data/label_data.pkl
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Entries in Gene Expression Dataframe : 5268
Entries in Label Dataframe : 5268
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Entries in Extracted Gene Expression Dataframe : 4392
Entries in Extracted Label Dataframe : 4392


In [168]:
def compute_variance_inflation_threshold(X_scaled,vif_thresholds):
    """Iteratively prune high-VIF features, snapshotting the frame per threshold.

    On every iteration the VIF of each remaining column is computed with
    ``variance_inflation_factor`` on the raw values of the frame.  The first
    time the maximum VIF is at or below a requested threshold, a copy of the
    current frame is cached for that threshold.  Features whose VIF exceeds
    the *smallest* requested threshold are then dropped in batches (worst
    first, about 1/40 of the remaining columns per iteration, at least one)
    until none remain.

    NOTE(review): no intercept column is added before computing VIF - confirm
    this is intended for the data passed in.

    Parameters
    ----------
    X_scaled : pandas.DataFrame
        Feature matrix to prune.  It is copied, never modified.
    vif_thresholds : sequence of numbers
        VIF limits to snapshot, in any order.

    Returns
    -------
    dict
        Maps each threshold to the DataFrame cached for it.  Empty when
        ``vif_thresholds`` is empty.
    """
    thresholds = list(vif_thresholds)
    if not thresholds:
        # Nothing to snapshot; avoids indexing into an empty sequence.
        return {}

    # Pruning must continue down to the strictest limit.  Using the first
    # element only works when the caller happens to pass it first.
    base_threshold = min(thresholds)

    vif_xtemp = {}
    X_temp = X_scaled.copy()
    iteration = 1

    # With fewer than two columns there is nothing to regress a feature
    # against, so pruning stops there.
    while X_temp.shape[1] >= 2:
        print(f"Iteration {iteration}")
        values = X_temp.values  # extracted once per iteration, not once per column
        vif_df = pd.DataFrame({
            "Feature": X_temp.columns,
            "VIF": [variance_inflation_factor(values, idx) for idx in range(values.shape[1])],
        })

        max_vif = vif_df["VIF"].max()

        for thresh in thresholds:
            if thresh not in vif_xtemp and max_vif <= thresh:
                vif_xtemp[thresh] = X_temp.copy()
                print(f"✅ Cached X_temp at threshold {thresh} with {X_temp.shape[1]} features")

        high_vif = vif_df[vif_df["VIF"] > base_threshold]

        if high_vif.empty:
            break

        # Drop a batch instead of a single feature to limit the number of
        # (expensive) VIF recomputations.
        total_features = X_temp.shape[1]
        drop_k = max(1, total_features // 40)
        drop_count = min(drop_k, high_vif.shape[0])

        drop_features = high_vif.sort_values("VIF", ascending=False).head(drop_count)["Feature"].tolist()
        X_temp = X_temp.drop(columns=drop_features)

        iteration += 1

    # Fallback for thresholds that were never satisfied inside the loop
    # (e.g. NaN VIFs, or the frame shrank below two columns).
    for thresh in thresholds:
        if thresh not in vif_xtemp:
            vif_xtemp[thresh] = X_temp.copy()
    return vif_xtemp


In [6]:
# Feature selection, stage 1: min-max scale, drop low-variance genes, then prune by VIF.
X = extracted_data.copy()
y = encoded_labels
selection_threshold = 0.014
# Scaling to [0, 1] first makes a single variance cut-off comparable across genes.
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
del X
var_thresh = VarianceThreshold(threshold=selection_threshold)
X_var_filtered = var_thresh.fit_transform(X_scaled)
# `fit_transform` returns a bare array; recover the names of the surviving columns.
kept_columns = X_scaled.columns[var_thresh.get_support()]
X_var_selected = pd.DataFrame(X_var_filtered, columns=kept_columns)
print(f"\n🔍 Evaluating Selection Threshold : {selection_threshold} | {X_var_selected.shape[1]} features after VarianceThreshold.")
print("**" * 40)
print("Proceeding to VIF Step")
# The helper's parameter is named `X_scaled`; the previous `X=` keyword raised
# TypeError on a fresh kernel.
vif_xtemp = compute_variance_inflation_threshold(X_scaled=X_var_selected,vif_thresholds=[20])
del X_var_selected


🔍 Evaluating Selection Threshold : 0.014 | 471 features after VarianceThreshold.
********************************************************************************
Proceeding to VIF Step
Starting Iteration 1
Max VIF: 110.70 | Features left: 460
Starting Iteration 2
Max VIF: 51.87 | Features left: 449
Starting Iteration 3
Max VIF: 41.37 | Features left: 438
Starting Iteration 4
Max VIF: 37.64 | Features left: 428
Starting Iteration 5
Max VIF: 34.15 | Features left: 418
Starting Iteration 6
Max VIF: 32.18 | Features left: 408
Starting Iteration 7
Max VIF: 29.93 | Features left: 398
Starting Iteration 8
Max VIF: 27.78 | Features left: 389
Starting Iteration 9
Max VIF: 27.04 | Features left: 380
Starting Iteration 10
Max VIF: 26.02 | Features left: 371
Starting Iteration 11
Max VIF: 24.67 | Features left: 362
Starting Iteration 12
Max VIF: 23.55 | Features left: 353
Starting Iteration 13
Max VIF: 22.36 | Features left: 345
Starting Iteration 14
Max VIF: 21.34 | Features left: 337
Starting I

In [27]:
# Frame cached for VIF threshold 20; `.get` yields None if that key was never cached.
X_vif_selection_features_df = vif_xtemp.get(20)

In [33]:
# Feature extraction: project the full expression matrix onto 60 principal components.
X = extracted_data.copy()
n_components=60
# Standardise first so every feature contributes on the same scale to the PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=n_components, random_state=42)
X_pca_extracted_features = pca.fit_transform(X_scaled)
# No column names are supplied, so the frame gets default integer column labels.
X_pca_extracted_features_df = pd.DataFrame(X_pca_extracted_features,)
# NOTE(review): the scaler and PCA are fit on all samples, before any train/test
# split - confirm this leakage is acceptable for the later evaluation.
# Free the large intermediates.
del X,X_scaled

In [30]:
# Sanity check: both feature sets should be DataFrames before they are concatenated.
print(type(X_vif_selection_features_df),type(X_pca_extracted_features_df))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [34]:
def _positional_names(frame, prefix):
    """Map every column label of `frame` to '<prefix>_<1-based position>'."""
    return {
        label: f'{prefix}_{position}'
        for position, label in enumerate(frame.columns, start=1)
    }


# Give both feature sets prefixed, position-based names so their origin stays
# visible after they are concatenated.
X_vif_selection_features_df = X_vif_selection_features_df.rename(
    columns=_positional_names(X_vif_selection_features_df, 'vif')
)
X_pca_extracted_features_df = X_pca_extracted_features_df.rename(
    columns=_positional_names(X_pca_extracted_features_df, 'pca')
)

In [43]:
def check_collinearity(X1, X2, threshold=0.8):
    """Return the feature pairs whose absolute Pearson correlation exceeds `threshold`.

    `X1` and `X2` are concatenated column-wise and every unordered pair of
    columns is inspected once (upper triangle, diagonal excluded).

    Parameters
    ----------
    X1, X2 : pandas.DataFrame
        Feature frames to compare.
    threshold : float, default 0.8
        A pair is reported when ``|corr| > threshold``.

    Returns
    -------
    list of tuple
        ``(earlier_column, later_column)`` label pairs, ordered by the later
        column and then by the earlier one.  Pairs with an undefined (NaN)
        correlation are never reported.
    """
    combined = pd.concat([X1, X2], axis=1)
    corr_matrix = combined.corr().abs()

    # Positional array access instead of per-cell `.loc` lookups: much cheaper
    # for a few hundred features, and it still works when X1 and X2 share a
    # column label (a label-based scalar lookup is ambiguous in that case).
    labels = corr_matrix.columns.tolist()
    values = corr_matrix.to_numpy()

    return [
        (labels[row_pos], labels[col_pos])
        for col_pos in range(len(labels))
        for row_pos in range(col_pos)  # strictly above the diagonal
        if values[row_pos, col_pos] > threshold  # NaN compares False
    ]

In [45]:
def check_collinearity_pools(X1, X2, threshold=0.8):
    """Group features into pools of mutually (transitively) correlated columns.

    `X1` and `X2` are concatenated column-wise.  Every pair of columns whose
    absolute Pearson correlation exceeds `threshold` becomes an edge of an
    undirected graph, and each connected component of that graph is one pool.
    Columns that are not part of any such pair appear in no pool.

    Parameters
    ----------
    X1, X2 : pandas.DataFrame
        Feature frames to compare.
    threshold : float, default 0.8
        A pair is linked when ``|corr| > threshold``.

    Returns
    -------
    list of set
        One set of column labels per connected component.
    """
    combined = pd.concat([X1, X2], axis=1)
    corr_matrix = combined.corr().abs()

    # Positional array access instead of per-cell `.loc` lookups: much cheaper
    # for a few hundred features, and it does not depend on labels being unique.
    labels = corr_matrix.columns.tolist()
    values = corr_matrix.to_numpy()

    # Create graph
    G = nx.Graph()

    # Add edges for highly correlated pairs (upper triangle only, so each
    # unordered pair is visited once; NaN correlations compare False).
    for col_pos in range(len(labels)):
        for row_pos in range(col_pos):
            if values[row_pos, col_pos] > threshold:
                G.add_edge(labels[row_pos], labels[col_pos])

    # Find connected components (pools)
    pools = list(nx.connected_components(G))

    return pools

In [None]:
# Pool features whose pairwise |correlation| exceeds 0.8 and report each pool's size.
colinear_groups = check_collinearity_pools(
    X_vif_selection_features_df,
    X_pca_extracted_features_df,
    threshold=0.8,
)
for group_number, members in enumerate(colinear_groups, start=1):
    print(f"Size of group {group_number} : {len(members)}")

In [63]:
# Join both feature sets side by side, then keep one representative per pool.
combined_df = pd.concat([X_vif_selection_features_df, X_pca_extracted_features_df], axis=1)

# Within a pool the alphabetically first feature survives; sorting makes the
# choice deterministic because pools are unordered sets.
features_to_drop = [
    redundant_feature
    for pool in colinear_groups
    for redundant_feature in sorted(pool)[1:]
]
final_combined_df = combined_df.drop(columns=features_to_drop)


In [78]:
# Accumulator for one metrics dict per (collinearity threshold, run).
results = []
# Absolute-correlation cut-offs to sweep when pooling collinear features.
threshold_list = [0.6,0.65,0.7,0.75,0.8,0.85]

In [79]:
# Sweep the collinearity threshold: for each value drop all but one feature per
# correlated pool, then train and evaluate an RBF-SVC on three different splits.
results = []  # start fresh so re-running this cell never appends duplicate rows
combined_df = pd.concat([X_vif_selection_features_df, X_pca_extracted_features_df], axis=1)
y = encoded_labels
for i in threshold_list:
    print("=="*50)
    print("Strting for threshold : ",i)
    colinear_groups = check_collinearity_pools(X_vif_selection_features_df,X_pca_extracted_features_df,threshold=i)
    features_to_drop = []
    for pool in colinear_groups:
        # Sort for a deterministic choice; the first feature is kept, the rest dropped.
        features_to_drop.extend(sorted(pool)[1:])
    print(f"Dropping {len(features_to_drop)*100/len(combined_df.columns) : 0.2f}% Features")
    final_combined_df = combined_df.drop(columns=features_to_drop)
    for run in range(3):
        print("--"*50)
        print("Starting Run",run+1)
        # Train on the *reduced* frame.  Previously the unreduced `combined_df`
        # was used here, so the threshold had no effect on the scores.
        X_train, X_test, y_train, y_test = train_test_split(final_combined_df, y, test_size=0.2, random_state=(run+1)*10+2)
        print(f"Model Training :: X_train size : {len(X_train)} | y_train size : {len(y_train)}")
        print(f"Model Training :: X_test size : {len(X_test)} | y_test size : {len(y_test)}")
        # Apply the scaler that used to be created but never used.  It is fit
        # on the training split only so no test statistics leak into training.
        model_train_scaler = StandardScaler()
        X_train_scaled = model_train_scaler.fit_transform(X_train)
        X_test_scaled = model_train_scaler.transform(X_test)
        model = SVC(kernel='rbf', C=3, class_weight='balanced', gamma='scale')
        model.fit(X_train_scaled,y_train)
        # Predictions
        y_pred = model.predict(X_test_scaled)
        y_pred_train = model.predict(X_train_scaled)

        result = {
            "run":run+1,
            "colinearity_threshold":i,
            'model_score':round(model.score(X_test_scaled,y_test),4),
            'train_f1_macro':round(f1_score(y_train, y_pred_train, average='macro'),4),
            'test_balanced_accuracy': round(balanced_accuracy_score(y_test, y_pred),4),
            'test_f1_macro': round(f1_score(y_test, y_pred, average='macro'),4),
            'test_f1_weighted': round(f1_score(y_test, y_pred, average='weighted'),4),
            "precision_weighted" : round(precision_score(y_test, y_pred, average='weighted'),4),
            "recall_weighted" : round(recall_score(y_test, y_pred, average='weighted'),4),
        }
        print(f"Threshold : {i} | Run {run+1} | RESULTS STORED SUCCESSFULLY")
        results.append(result)
results_df = pd.DataFrame(results)
results_df.to_csv("results/Combined_Added_Featureset.csv",index=False)

Strting for threshold :  0.6
Dropping  40.66% Features
----------------------------------------------------------------------------------------------------
Starting Run 1
Model Training :: X_train size : 3513 | y_train size : 3513
Model Training :: X_test size : 879 | y_test size : 879
Threshold : 0.6 | Run 1 | RESULTS STORED SUCCESSFULLY
----------------------------------------------------------------------------------------------------
Starting Run 2
Model Training :: X_train size : 3513 | y_train size : 3513
Model Training :: X_test size : 879 | y_test size : 879
Threshold : 0.6 | Run 2 | RESULTS STORED SUCCESSFULLY
----------------------------------------------------------------------------------------------------
Starting Run 3
Model Training :: X_train size : 3513 | y_train size : 3513
Model Training :: X_test size : 879 | y_test size : 879
Threshold : 0.6 | Run 3 | RESULTS STORED SUCCESSFULLY
Strting for threshold :  0.65
Dropping  30.43% Features
-------------------------------

## Variation : 2
Applying PCA over VIF Selected Features

In [169]:
# Accumulator for the Variation 2 grid: one metrics dict per evaluated configuration.
results = []

In [170]:
# Variation 2 grid: variance filter -> VIF pruning -> standardise -> PCA -> RBF-SVC.
# Every (selection threshold, VIF threshold, PCA size) combination is evaluated
# on three different train/test splits.
results = []  # start fresh so re-running this cell never appends duplicate rows
X = extracted_data.copy()
y = encoded_labels
# Apply MinMax scaling
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
del X
vif_thresholds = [15,20,25,30]
selection_thresholds = [0.014,0.016,0.018]
pca_n_component_list = list(range(40,101,10))
# Smaller grid for a quick smoke run:
# vif_thresholds = [100]
# selection_thresholds = [0.022]
# pca_n_component_list = [30]
for selection_threshold in selection_thresholds:
    print("\n\n")
    print("##"*40)
    var_thresh = VarianceThreshold(threshold=selection_threshold)
    X_var_filtered = var_thresh.fit_transform(X_scaled)
    kept_columns = X_scaled.columns[var_thresh.get_support()]
    X_var_selected = pd.DataFrame(X_var_filtered, columns=kept_columns)
    print(f"Step : Variance Based Selection | Selection Threshold : {selection_threshold}")
    # One VIF pass yields a cached frame for every requested threshold.
    vif_xtemp = compute_variance_inflation_threshold(X_scaled=X_var_selected,vif_thresholds=vif_thresholds)
    del X_var_selected
    print("**" * 40)
    for run in range(3):
        for vif_thresh, X_temp in vif_xtemp.items():
            # Step 3: Train/Test and Evaluate
            print(f"Step : Variance Inflation Threshold Based Selection | VIF Threshold : {vif_thresh}")
            X_train, X_test, y_train, y_test = train_test_split(X_temp, y, test_size=0.2, random_state=(run*10)+2)
            # Separate name so the MinMax `scaler` above is not silently rebound.
            # Fit on the training split only to avoid leaking test statistics.
            standard_scaler = StandardScaler()
            standard_scaler.fit(X_train)
            X_train_scaled = standard_scaler.transform(X_train)
            X_test_scaled = standard_scaler.transform(X_test)
            # PCA cannot extract more components than min(n_samples, n_features).
            max_components = min(len(X_train), len(X_train.columns))
            for pc_count in pca_n_component_list:
                print("--"*40)
                print(f"Step : PCA Based Extraction | Run : {run+1} | PCA n_components : {pc_count}")
                if pc_count > max_components:
                    print(f"Skipping PC count {pc_count} | Low Feature Count | {max_components}")
                    continue
                # Seeded so a re-run reproduces the same components even when
                # scikit-learn selects a randomized SVD solver.
                pca_transformer = PCA(n_components=pc_count, random_state=(run*10)+3)
                pca_transformer.fit(X_train_scaled)

                X_train_scaled_pca = pca_transformer.transform(X_train_scaled)
                X_test_scaled_pca = pca_transformer.transform(X_test_scaled)

                model = SVC(kernel='rbf', C=3, class_weight='balanced', gamma='scale',random_state=(run*10)+3)
                model.fit(X_train_scaled_pca,y_train)
                y_pred = model.predict(X_test_scaled_pca)
                y_pred_train = model.predict(X_train_scaled_pca)

                # Output results
                results.append({
                    "experiment_number":run+1,
                    "Selection Threshold": selection_threshold,
                    "VIFThreshold": vif_thresh,
                    "PCA_n_Components":pc_count,
                    'train_f1_macro':f1_score(y_train, y_pred_train, average='macro'),
                    'test_f1_macro':f1_score(y_test, y_pred, average='macro'),
                    'test_balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
                    'test_f1_weighted': f1_score(y_test, y_pred, average='weighted'),
                    "precision_weighted" : precision_score(y_test, y_pred, average='weighted'),
                    "recall_weighted" : recall_score(y_test, y_pred, average='weighted'),
                })
                print("=="*40)
                print("\nLast Recorded Config")
                print(f"Run {run+1} | Selection Treshold : {selection_threshold} | VIF Threshold : {vif_thresh} | PCA n_Component : {pc_count}")





################################################################################
Step : Variance Based Selection | Selection Threshold : 0.014
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
✅ Cached X_temp at threshold 30 with 408 features
Iteration 8
Iteration 9
Iteration 10
Iteration 11
✅ Cached X_temp at threshold 25 with 371 features
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
✅ Cached X_temp at threshold 20 with 329 features
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
✅ Cached X_temp at threshold 15 with 269 features
********************************************************************************
Step : Variance Inflation Threshold Based Selection | VIF Threshold : 30
--------------------------------------------------------------------------------
Step : PCA Based Extraction | Run : 1 | PCA n_components : 40

Last Recorded Config
Run 1 | Selection T

In [171]:
# NOTE(review): this always raises ZeroDivisionError - presumably a deliberate
# stop so "Run All" halts before the export cells below. Confirm, then remove
# it or replace it with an explicit guard.
1/0

ZeroDivisionError: division by zero

In [173]:
# Persist the raw per-configuration metrics collected in `results`.
df= pd.DataFrame(results)
# Written with the default index, so the CSV gains an unnamed leading index column.
df.to_csv("results/Combination_pca_over_vif_multi_run.csv")

In [174]:
# Reload the saved metrics from disk for aggregation and plotting.
chart_df = pd.read_csv("results/Combination_pca_over_vif_multi_run.csv")

In [175]:
# Remove the index column that the CSV round trip added. Reassigning instead of
# `inplace=True`, together with `errors="ignore"`, keeps this cell re-runnable:
# a second execution used to raise KeyError because the column was already gone.
chart_df = chart_df.drop(columns="Unnamed: 0", errors="ignore")

In [176]:
# Average every metric over the repeated runs of each configuration.
_config_keys = ["Selection Threshold", 'VIFThreshold', 'PCA_n_Components']
_metric_columns = [
    'train_f1_macro',
    'test_f1_macro',
    'test_balanced_accuracy',
    'test_f1_weighted',
    'precision_weighted',
    'recall_weighted',
]
t = chart_df.groupby(_config_keys)[_metric_columns].mean()

In [177]:
# Express every averaged metric as a percentage rounded to two decimals.
# NOTE: this rescales `t` in place, so running the cell twice multiplies by 100 again.
for _metric in (
    "precision_weighted",
    "recall_weighted",
    "train_f1_macro",
    "test_f1_macro",
    "test_balanced_accuracy",
    "test_f1_weighted",
):
    t[_metric] = np.round(t[_metric] * 100, 2)

In [178]:
# Turn the group keys back into ordinary columns so they can be plotted and exported.
table = t.reset_index()
table.to_csv("results/Presentable_output/pca_over_vif/results.csv",index=False)

In [180]:
# Inspect the column names available for plotting.
table.columns

Index(['Selection Threshold', 'VIFThreshold', 'PCA_n_Components',
       'train_f1_macro', 'test_f1_macro', 'test_balanced_accuracy',
       'test_f1_weighted', 'precision_weighted', 'recall_weighted'],
      dtype='object')

In [179]:
# Best averaged macro-F1 (in percent) over all evaluated configurations.
table["test_f1_macro"].max()

np.float64(89.3)

In [181]:
import plotly.express as px

# Overview of the whole grid: x = PCA size, colour = VIF threshold,
# bubble size = variance-selection threshold.
# The title now matches these encodings; it previously named the selection
# threshold as the x-axis and the PCA size as the bubble size.
fig = px.scatter(table,
                 x='PCA_n_Components',
                 y='test_f1_macro',
                 size='Selection Threshold',
                 color='VIFThreshold',
                 title="PCA Components vs Test F1 Macro (Bubble Size = Selection Threshold, Color = VIF Threshold)")
fig.show()


In [182]:
# One figure per PCA size: macro-F1 against the VIF threshold, one line per
# variance-selection threshold.
pca_component_values = list(range(40,101,10))
for chosen_pca in pca_component_values:
    # Keep only the rows for this PCA size
    df_filtered = table[table['PCA_n_Components'] == chosen_pca]
    # The x-axis is the VIF threshold; the title previously named the
    # selection threshold, which is the colour grouping.
    fig = px.line(df_filtered,
                x='VIFThreshold',
                y='test_f1_macro',
                color='Selection Threshold',
                markers=True,
                title=f"Test F1 Macro vs VIF Threshold for PCA Components = {chosen_pca}")

    fig.show()


In [None]:
# Inspect the column names available for plotting.
table.columns

Index(['Selection Threshold', 'VIFThreshold', 'PCA_n_Components',
       'train_f1_macro', 'test_f1_macro', 'test_balanced_accuracy',
       'test_f1_weighted', 'precision_weighted', 'recall_weighted'],
      dtype='object')

In [None]:
# One figure per variance-selection threshold: macro-F1 against the PCA size,
# one line per VIF threshold.
selection_threshold_values = [0.014,0.016]  # example values
for chosen_threshold in selection_threshold_values:
    # Compare with a tolerance: the thresholds are floats that went through a
    # CSV round trip, so exact equality is fragile.
    df_filtered = table[np.isclose(table['Selection Threshold'], chosen_threshold)]
    # The title previously said "vs Selection Threshold for PCA Components",
    # copied from the per-PCA plots; here the roles are swapped.
    fig = px.line(df_filtered,
                x='PCA_n_Components',
                y='test_f1_macro',
                color='VIFThreshold',
                markers=True,
                title=f"Test F1 Macro vs PCA Components for Selection Threshold = {chosen_threshold}")

    fig.show()
