In [1]:
from pathlib import Path

from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.svm import SVC
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split,RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score,f1_score,precision_score,recall_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import gaussian_kde
from utils import *
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from scipy.stats import randint, uniform
import networkx as nx


In [2]:
# Load the pickled expression matrix and label table.
# (The helper functions used in this cell are not defined in the notebook;
# presumably they are exported by `from utils import *` -- confirm.)
gene_exp_df = read_dataframe_from_pickle("data/processed_data/gene_exp_data.pkl")
label_df = read_dataframe_from_pickle("data/processed_data/label_data.pkl")
print(
    "--" * 80,
    f"Entries in Gene Expression Dataframe : {len(gene_exp_df)}",
    f"Entries in Label Dataframe : {len(label_df)}",
    sep="\n",
)

# Filter labels with threshold=150, pull the rows that match the surviving
# labels, then encode the labels. The encoder object is kept alongside the
# encoded labels for later use.
labels_with_high_freq_df = remove_low_frequency_labels(label_df, threshold=150)
extracted_data, extracted_label = collect_relevant_data(
    gene_exp_df_bkp=gene_exp_df,
    label_df_bkp=labels_with_high_freq_df,
)
encoded_labels, label_encoder = encode_labels(extracted_label)
print(
    "--" * 80,
    f"Entries in Extracted Gene Expression Dataframe : {len(extracted_data)}",
    f"Entries in Extracted Label Dataframe : {len(encoded_labels)}",
    sep="\n",
)

DataFrame successfully loaded from data/processed_data/gene_exp_data.pkl
DataFrame successfully loaded from data/processed_data/label_data.pkl
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Entries in Gene Expression Dataframe : 5268
Entries in Label Dataframe : 5268
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Entries in Extracted Gene Expression Dataframe : 4392
Entries in Extracted Label Dataframe : 4392


In [3]:
def compute_variance_inflation_threshold(X_scaled,vif_thresholds):
    """Iteratively prune high-VIF columns and snapshot the frame per threshold.

    On every iteration the variance inflation factor (statsmodels'
    ``variance_inflation_factor``) is computed for each remaining column.
    The first time the largest VIF is at or below one of the requested
    thresholds, a copy of the current frame is stored for that threshold.
    Pruning continues until no column exceeds the *smallest* requested
    threshold, so every threshold is honoured regardless of the order in
    which they are listed.

    To limit the number of (expensive) VIF passes, each iteration drops a
    batch of the worst offenders: ``max(1, n_columns // 40)`` columns, capped
    at the number of columns that are actually above the smallest threshold.

    Parameters
    ----------
    X_scaled : pandas.DataFrame
        Feature matrix to prune. It is copied, never modified. No scaling is
        performed here; the caller is responsible for any preprocessing.
    vif_thresholds : sequence of numbers
        VIF cut-offs for which a snapshot is wanted. Must be non-empty.

    Returns
    -------
    dict
        Maps each threshold to a ``DataFrame`` holding the columns that were
        left when the maximum VIF first dropped to that threshold or below.
        Thresholds never reached inside the loop (e.g. when every VIF is NaN)
        receive the final pruned frame.

    Raises
    ------
    ValueError
        If ``vif_thresholds`` is empty.

    Notes
    -----
    Progress is reported with ``print`` (one line per iteration plus one line
    per cached snapshot).
    """
    if len(vif_thresholds) == 0:
        raise ValueError("vif_thresholds must contain at least one threshold")

    # Prune down to the strictest cut-off; looser ones are cached on the way.
    base_threshold = min(vif_thresholds)

    vif_xtemp = {}
    X_temp = X_scaled.copy()
    iteration = 1

    while True:
        print(f"Iteration {iteration}")

        # Materialise the array once per pass instead of once per column.
        design_matrix = X_temp.values
        vif_df = pd.DataFrame()
        vif_df["Feature"] = X_temp.columns
        vif_df["VIF"] = [
            variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(design_matrix.shape[1])
        ]

        max_vif = vif_df["VIF"].max()

        # Snapshot the frame the first time it satisfies each threshold.
        for thresh in vif_thresholds:
            if thresh not in vif_xtemp and max_vif <= thresh:
                vif_xtemp[thresh] = X_temp.copy()
                print(f"✅ Cached X_temp at threshold {thresh} with {X_temp.shape[1]} features")

        high_vif = vif_df[vif_df["VIF"] > base_threshold]

        if high_vif.empty:
            break

        # Drop roughly 2.5% of the remaining columns per pass (at least one),
        # but never more than the number of columns that are over the limit.
        total_features = X_temp.shape[1]
        drop_k = max(1, total_features // 40)
        drop_count = min(drop_k, high_vif.shape[0])

        drop_features = high_vif.sort_values("VIF", ascending=False).head(drop_count)["Feature"].tolist()
        X_temp = X_temp.drop(columns=drop_features)

        iteration += 1

    # A NaN maximum never satisfies `max_vif <= thresh`, so fill any threshold
    # that was not cached in the loop with the final frame.
    for thresh in vif_thresholds:
        if thresh not in vif_xtemp:
            vif_xtemp[thresh] = X_temp.copy()
    return vif_xtemp


In [7]:
def check_collinearity_pools(X1, X2, threshold):
    """Group the columns of two frames into pools of mutually correlated features.

    The two frames are concatenated column-wise, the absolute pairwise
    correlation matrix is computed, and every pair of distinct columns whose
    absolute correlation is strictly greater than ``threshold`` is linked by
    an edge. Each connected component of the resulting graph is one pool.

    Parameters
    ----------
    X1, X2 : pandas.DataFrame
        Feature blocks to compare. They are joined with ``pd.concat(axis=1)``,
        so rows are aligned on the index. Column labels are used as graph
        nodes and should be unique across both frames.
    threshold : float
        Absolute-correlation cut-off (exclusive) above which two columns are
        considered collinear.

    Returns
    -------
    list of set
        One set of column labels per connected component. Columns that are
        not correlated above the threshold with any other column do not
        appear in any pool.
    """
    combined = pd.concat([X1, X2], axis=1)
    corr_matrix = combined.corr().abs()
    labels = list(corr_matrix.columns)

    # Strict upper triangle only: each unordered pair is considered once and
    # the diagonal is ignored. NaN correlations compare False and therefore
    # never produce an edge.
    strong_pairs = np.triu(corr_matrix.to_numpy() > threshold, k=1)

    G = nx.Graph()

    # nonzero() on the transpose walks the matrix column by column (rows
    # innermost), i.e. the same order as a nested "for column: for row:" scan.
    # Keeping that insertion order keeps the order of the returned components
    # stable.
    col_positions, row_positions = np.nonzero(strong_pairs.T)
    G.add_edges_from(
        (labels[row_pos], labels[col_pos])
        for row_pos, col_pos in zip(row_positions, col_positions)
    )

    # Connected components of the correlation graph are the collinear pools.
    return list(nx.connected_components(G))

In [8]:
# Candidate PCA dimensionalities: 40, 50, ..., 100.
pca_n_component_list = [*range(40, 101, 10)]

# Standardise a private copy of the extracted data; the copy is discarded as
# soon as the scaled array exists.
X = extracted_data.copy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
del X

# One independently fitted PCA projection per candidate size, keyed by the
# number of components. Each value is a DataFrame with default integer column
# labels.
# NOTE(review): `random_state` is not defined in the visible cells --
# presumably it is exported by `from utils import *`; confirm.
# NOTE(review): the scaler and PCA are fitted on all samples here, before the
# train/test split performed in the model-training cell; confirm this is
# acceptable for the reported test metrics.
pca_features_df = {
    n_c: pd.DataFrame(
        PCA(n_components=n_c, random_state=random_state).fit_transform(X_scaled)
    )
    for n_c in pca_n_component_list
}

In [13]:
# Accumulator for the experiment sweep: one metrics dict is appended per trained model.
results = list()

In [14]:
# ---------------------------------------------------------------------------
# Sweep: variance filter -> VIF pruning -> (+ PCA block) -> collinearity
# pruning -> RBF-SVM, repeated over three train/test splits per grid point.
#
# NOTE(review): `results` is created in a separate cell, so re-running this
# cell without re-running that one appends duplicate rows.
# NOTE(review): scaling, variance filtering, VIF pruning, PCA and the
# collinearity filter are all fitted on the full data set before the
# train/test split below; confirm this is acceptable for the test metrics.
# ---------------------------------------------------------------------------

# Search grid. For a quick smoke test, shrink each list to a single value,
# e.g. [20], [0.018] and [0.8] respectively.
vif_thresholds = [15,20,25,30]
selection_thresholds = [0.014,0.016,0.018]
threshold_list = [0.75,0.8,0.85]

y = encoded_labels

# Min-max scale every column and keep the original column labels.
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(extracted_data), columns=extracted_data.columns)

for selection_threshold in selection_thresholds:
    print("=="*40)
    # Step 1: keep columns whose variance exceeds the selection threshold.
    var_thresh = VarianceThreshold(threshold=selection_threshold)
    X_var_filtered = var_thresh.fit_transform(X_scaled)
    kept_columns = X_scaled.columns[var_thresh.get_support()]
    X_var_selected = pd.DataFrame(X_var_filtered, columns=kept_columns)
    print(f"Step : Variance Based Selection | Selection Threshold : {selection_threshold}")

    # Step 2: VIF pruning; one pruned frame is returned per VIF threshold.
    vif_xtemp = compute_variance_inflation_threshold(X_scaled=X_var_selected,vif_thresholds=vif_thresholds)
    del X_var_selected

    for vif_thresh, X_vif_selection_features_df in vif_xtemp.items():
        # Positional names (vif_1, vif_2, ...) guarantee no clash with the PCA block.
        X_vif_selection_features_df = X_vif_selection_features_df.rename(
            columns={col: f'vif_{pos}' for pos, col in enumerate(X_vif_selection_features_df.columns, start=1)}
        )

        for pc_count, X_pca_extracted_features_df in pca_features_df.items():
            print("--"*40)
            print(f"Step : Constructing Non-Colinear Combined Feature Space | VIF  : {vif_thresh} | PCA : {pc_count}")
            X_pca_extracted_features_df = X_pca_extracted_features_df.rename(
                columns={col: f'pca_{pos}' for pos, col in enumerate(X_pca_extracted_features_df.columns, start=1)}
            )
            combined_df = pd.concat([X_vif_selection_features_df, X_pca_extracted_features_df], axis=1)

            for colinearity_threshold in threshold_list:
                print("Starting for threshold : ", colinearity_threshold)

                # Step 3: within every pool of mutually correlated columns keep
                # the alphabetically first label and drop the rest.
                colinear_groups = check_collinearity_pools(
                    X_vif_selection_features_df,
                    X_pca_extracted_features_df,
                    threshold=colinearity_threshold,
                )
                features_to_drop = [
                    feature
                    for pool in colinear_groups
                    for feature in sorted(pool)[1:]
                ]
                dropped_pct = len(features_to_drop)*100/len(combined_df.columns)
                print(f"Dropping {dropped_pct: 0.2f}% Features")
                final_combined_df = combined_df.drop(columns=features_to_drop)

                # Step 4: three independent 80/20 splits with fixed seeds 12, 22, 32.
                for run in range(3):
                    print("**"*40)
                    print(f"Step : Model Training | Run : {run+1}")
                    # NOTE(review): the split is not stratified although the model uses
                    # class_weight='balanced' and macro-averaged metrics are reported;
                    # consider `stratify=y` (this would change the reported numbers).
                    X_train, X_test, y_train, y_test = train_test_split(
                        final_combined_df, y, test_size=0.2, random_state=(run+1)*10+2
                    )
                    print(f"Model Training :: X_train size : {len(X_train)} | y_train size : {len(y_train)}")
                    print(f"Model Training :: X_test size : {len(X_test)} | y_test size : {len(y_test)}")

                    # Standardise using statistics from the training split only.
                    model_train_scaler = StandardScaler()
                    X_train_scaled = model_train_scaler.fit_transform(X_train)
                    X_test_scaled = model_train_scaler.transform(X_test)

                    model = SVC(kernel='rbf', C=3, class_weight='balanced', gamma='scale')
                    model.fit(X_train_scaled,y_train)

                    # Predictions on both splits (train score is kept to spot over-fitting).
                    y_pred = model.predict(X_test_scaled)
                    y_pred_train = model.predict(X_train_scaled)

                    result = {
                        "run":run+1,
                        "Selection_Threshold":selection_threshold,
                        "VIF_Threshold" : vif_thresh,
                        "PCA_n_Components" : pc_count,
                        "colinearity_threshold":colinearity_threshold,
                        'train_f1_macro':round(f1_score(y_train, y_pred_train, average='macro'),4),
                        'test_f1_macro': round(f1_score(y_test, y_pred, average='macro'),4),
                        'test_balanced_accuracy': round(balanced_accuracy_score(y_test, y_pred),4),
                        'test_f1_weighted': round(f1_score(y_test, y_pred, average='weighted'),4),
                        "precision_weighted" : round(precision_score(y_test, y_pred, average='weighted'),4),
                        "recall_weighted" : round(recall_score(y_test, y_pred, average='weighted'),4),
                    }
                    results.append(result)
                    print(f"Threshold : {colinearity_threshold} | Run {run+1} | RESULTS STORED SUCCESSFULLY")


result_df = pd.DataFrame(results)
# Make sure the output folder exists so a long sweep does not fail on the final write.
Path("results").mkdir(parents=True, exist_ok=True)
result_df.to_csv("results/Detailed_multi_run_pca_added_vif.csv",index=False)

Step : Variance Based Selection | Selection Threshold : 0.014
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
✅ Cached X_temp at threshold 30 with 408 features
Iteration 8
Iteration 9
Iteration 10
Iteration 11
✅ Cached X_temp at threshold 25 with 371 features
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
✅ Cached X_temp at threshold 20 with 329 features
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
✅ Cached X_temp at threshold 15 with 269 features
--------------------------------------------------------------------------------
Step : Constructing Non-Colinear Combined Feature Space | VIF  : 30 | PCA : 40
Starting for threshold :  0.75
Dropping  16.29% Features
********************************************************************************
Step : Model Training | Run : 1
Model Training :: X_train size : 3513 | y_train size : 3513
Model Training :: X_test size : 