In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.decomposition import KernelPCA
from sklearn.metrics import make_scorer, balanced_accuracy_score,accuracy_score
from sklearn.model_selection import GridSearchCV
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from utils import *
random_state = 42



In [2]:
gene_exp_df = read_dataframe_from_pickle("data/processed_data/gene_exp_data.pkl")
label_df = read_dataframe_from_pickle("data/processed_data/label_data.pkl")
print("--" * 80)
print(f"Entries in Gene Expression Dataframe : {len(gene_exp_df)}")
print(f"Entries in Label Dataframe : {len(label_df)}")
labels_with_high_freq_df = remove_low_frequency_labels(label_df,threshold=150)
extracted_data,extracted_label = collect_relevant_data(gene_exp_df_bkp=gene_exp_df,label_df_bkp=labels_with_high_freq_df)
encoded_labels,label_encoder = encode_labels(extracted_label)
print("--" * 80)
print(f"Entries in Extracted Gene Expression Dataframe : {len(extracted_data)}")
print(f"Entries in Extracted Label Dataframe : {len(encoded_labels)}")

DataFrame successfully loaded from data/processed_data/gene_exp_data.pkl
DataFrame successfully loaded from data/processed_data/label_data.pkl
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Entries in Gene Expression Dataframe : 5268
Entries in Label Dataframe : 5268
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Entries in Extracted Gene Expression Dataframe : 4392
Entries in Extracted Label Dataframe : 4392


In [None]:
X_std = StandardScaler().fit_transform(extracted_data)
# Fit PCA
pca = PCA().fit(X_std)

In [None]:

# Create interactive scree plot
fig = go.Figure()

# Cumulative explained variance
fig.add_trace(go.Scatter(
    x=[f"PC{i+1}" for i in range(len(pca.explained_variance_ratio_))],
    y=np.cumsum(pca.explained_variance_ratio_),
    name='Cumulative',
    mode='lines',
    line=dict(color='royalblue', width=2),
    marker=dict(size=8)
))

fig.add_hline(y=0.95, line_dash="dot", 
              annotation_text="95% variance", 
              annotation_position="bottom right")
fig.add_hline(y=0.90, line_dash="dot", 
              annotation_text="90% variance", 
              annotation_position="bottom right")
fig.add_hline(y=0.80, line_dash="dot", 
              annotation_text="80% variance", 
              annotation_position="bottom right")

# Customize layout
fig.update_layout(
    title='PCA Explained Variance Ratio',
    xaxis_title='Principal Components',
    yaxis_title='Explained Variance Ratio',
    hovermode="x unified",
    template='plotly_white',
    height=600
)

# Add range slider
# fig.update_xaxes(rangeslider_visible=True)

fig.show()

## To achive 90% variance of the data we need approximately 1350 pcs

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    extracted_data, encoded_labels, 
    test_size=0.2, 
    stratify=encoded_labels,  # Critical for imbalanced data
    random_state=random_state
)

In [None]:
pipeline_pca = PCA(n_components=1350,random_state=random_state)
pipeline_scaler = StandardScaler()
X_scaled_train = pipeline_scaler.fit_transform(X_train)
X_pca_train = pipeline_pca.fit_transform(X_train)

X_scaled_test = pipeline_scaler.transform(X_test)
X_pca_test = pipeline_pca.transform(X_test)


In [None]:
print("Priliminary Reduction :: X_train feature count : ",X_pca_train.shape[1])
print("Priliminary Reduction :: X_test feature count : ",X_pca_test.shape[1])

In [None]:
kpca = KernelPCA(
    # n_components=50, 
    kernel='rbf', 
    gamma=15,    # RBF kernel parameter
    eigen_solver='dense', # Required for eigenvalues
    fit_inverse_transform=False,
    random_state=random_state
)
X_kpca_train = kpca.fit(X_scaled_train)
eigenvalues = kpca.eigenvalues_
explained_variance_ratio = eigenvalues / np.sum(eigenvalues)
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Create interactive plot
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Bar plot (individual explained variance)
fig.add_trace(
    go.Bar(
        x=list(range(1, len(explained_variance_ratio)+1)),
        y=explained_variance_ratio,
        name="Individual Ratio",
        marker_color='blue',
        opacity=0.6
    ),
    secondary_y=False
)

# Line plot (cumulative explained variance)
fig.add_trace(
    go.Scatter(
        x=list(range(1, len(cumulative_explained_variance)+1)),
        y=cumulative_explained_variance,
        name="Cumulative Ratio",
        mode='lines+markers',
        line=dict(color='lightgreen', width=2)
    ),
    secondary_y=True
)

# Update layout
fig.update_layout(
    title="Kernel PCA: Explained Variance by Component",
    xaxis_title="Principal Component",
    yaxis_title="Explained Variance Ratio (Individual)",
    yaxis2_title="Explained Variance Ratio (Cumulative)",
    hovermode="x unified",
    template="plotly_white",
    showlegend=True,
    height=500,
    width=800
)

# Add grid and customize axes
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray', secondary_y=False)
fig.update_yaxes(showgrid=False, secondary_y=True)

# Display the plot
fig.show()

# Print explained variance ratios
print("Explained Variance Ratios per Component:")
for i, ratio in enumerate(explained_variance_ratio):
    print(f"PC {i+1}: {ratio:.4f} (Cumulative: {cumulative_explained_variance[i]:.4f})")

In [14]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np

def show(kpca):
    eigenvalues = kpca.lambdas_ if hasattr(kpca, 'lambdas_') else kpca.eigenvalues_
    explained_variance_ratio = eigenvalues / np.sum(eigenvalues)
    cumulative_explained_variance = np.cumsum(explained_variance_ratio) * 100

    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Bar plot - soft blue
    fig.add_trace(
        go.Bar(
            x=list(range(1, len(explained_variance_ratio)+1)),
            y=explained_variance_ratio,
            name="Individual Ratio",
            marker_color='rgba(100, 149, 237, 0.6)',  # soft cornflower blue
            opacity=0.8
        ),
        secondary_y=False
    )

    # Line plot - soft wine red
    fig.add_trace(
        go.Scatter(
            x=list(range(1, len(cumulative_explained_variance)+1)),
            y=cumulative_explained_variance,
            name="Cumulative Ratio",
            mode='lines',
            line=dict(color='rgba(128, 0, 64, 0.9)', width=2)
        ),
        secondary_y=True
    )

    # Add bold horizontal threshold lines
    threshold_colors = {90: '#2E8B57', 95: '#B22222'}  # sea green and firebrick
    for threshold, color in threshold_colors.items():
        fig.add_shape(
            type="line",
            x0=1, x1=len(cumulative_explained_variance),
            y0=threshold, y1=threshold,
            line=dict(color=color, width=1),
            yref='y2'
        )
        fig.add_annotation(
            x=len(cumulative_explained_variance),
            y=threshold,
            text=f"{threshold}%",
            showarrow=False,
            yref='y2',
            xanchor='left',
            font=dict(color=color, size=13, family="Arial Black")
        )

    # Update layout with softer aesthetics
    fig.update_layout(
        title="Kernel PCA: Explained Variance by Component",
        xaxis_title="Principal Component",
        yaxis_title="Explained Variance Ratio (Individual)",
        yaxis2_title="Explained Variance Ratio (Cumulative)",
        hovermode="x unified",
        template="plotly_white",
        showlegend=True,
        height=500,
        width=850,
        font=dict(family="Helvetica", size=12)
    )

    # Grid and axis styling
    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray', secondary_y=False)
    fig.update_yaxes(showgrid=False, secondary_y=True)

    fig.show()


In [None]:
kpca_rbf = KernelPCA(
    # n_components=100,
    kernel='rbf',
    gamma=0.1,                  # Rule of thumb: 1/(n_features * X.var())
    eigen_solver='dense',       # More stable for small datasets
    fit_inverse_transform=True  # Enable reconstruction
)
X_kpca_train = kpca_rbf.fit(X_pca_train)
show(kpca_rbf)

In [15]:
kpca_poly = KernelPCA(
    # n_components=50,
    kernel='poly',
    degree=2,                   # Degree of polynomial (e.g., 2, 3)
    coef0=1,                    # Influences higher-order terms
    gamma=1e-4             # Auto-scales with data variance
)
X_kpca_train = kpca_poly.fit(X_train)
show(kpca_poly)

In [None]:

kpca_linear = KernelPCA(
    # n_components=2,
    kernel='linear',            # No extra parameters needed
    eigen_solver='arpack'       # Efficient for large datasets
)
X_kpca_train = kpca_linear.fit(X_pca_train)
show(kpca_linear)

In [None]:
kpca_cosine = KernelPCA(
    # n_components=2,
    kernel='cosine',            # No hyperparameters
    eigen_solver='randomized'   # Scalable for large data
)
X_kpca_train = kpca_cosine.fit(X_pca_train)
show(kpca_cosine)

In [None]:
for degree in range(2,5):
    for gamma in [1e-4,1e-3,1e-2,1e-1,1,10]:
        print(f"With Degree {degree} and Gamma {gamma}")
        kpca_poly = KernelPCA(
            kernel='poly',
            degree=degree,                   # Degree of polynomial (e.g., 2, 3)
            coef0=1,                    # Influences higher-order terms
            gamma=gamma              # Auto-scales with data variance
        )
        test_kpca = kpca_poly.fit(X_pca_train)
        show(kpca_poly)

In [None]:
param_grid = {
            'pca__n_components': list(range(1000, 2001, 500)) ,
            "kpca__n_components":list(range(20,51,10)),
        }

pipe = Pipeline([
        ('scaler', StandardScaler()),
        ("pca", PCA(
                    n_components=1350,
                    random_state=random_state
                    )),
        ("kpca",KernelPCA(  
                        kernel='poly', 
                        coef0=1,
                        random_state=random_state,
                        gamma=1e-4,
                        degree=2
                        )),
        ('svm', SVC(probability=True, 
                    class_weight = "balanced",
                    C= 1, 
                    gamma =  'scale', 
                    kernel= 'rbf',
                    random_state=random_state))
    ])

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
scoring = {
        'balanced_accuracy': make_scorer(balanced_accuracy_score),
        'accuracy': 'accuracy'
    }
    
# Initialize GridSearchCV
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=cv,
    scoring=scoring,
    refit='balanced_accuracy',  # Optimize for balanced accuracy
    n_jobs=-1,
    verbose=1
)



In [5]:
X_train.shape

(3513, 20531)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    extracted_data, encoded_labels, 
    test_size=0.2, 
    stratify=encoded_labels,  # Critical for imbalanced data
    random_state=random_state
)
# Fit the grid
grid.fit(X_train, y_train)

In [11]:
print(grid.best_params_)
print(grid.best_score_)
pca_kpca_esitmator = grid.best_estimator_

{'kpca__n_components': 50, 'pca__n_components': 2000}
0.8440135046725862


In [None]:
import plotly.graph_objects as go
cv_results = pd.DataFrame(grid.cv_results_)
cv_results.dropna(inplace=True)
# X-axis: PCA n_components
x = cv_results['param_kpca__n_components']

# Accuracy & Balanced Accuracy
bal_acc = np.round(cv_results['mean_test_balanced_accuracy'],2)
std_bal_acc = np.round(cv_results['std_test_balanced_accuracy'],2)

# Create figure
fig = go.Figure()

# Balanced Accuracy line (pastel green)
fig.add_trace(go.Scatter(
    x=x,
    y=bal_acc,
    mode='lines+markers',
    name='Balanced Accuracy',
    marker=dict(symbol='square', color='rgba(144, 238, 144, 1)'),  # Pastel green
    line=dict(color='rgba(144, 238, 144, 1)')
))

# Confidence band for Balanced Accuracy (light green)
fig.add_trace(go.Scatter(
    x=list(x) + list(x[::-1]),
    y=list(bal_acc + std_bal_acc) + list(bal_acc - std_bal_acc)[::-1],
    fill='toself',
    fillcolor='rgba(144, 238, 144, 0.2)',  # Lighter green
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip",
    showlegend=False,
    name='±1 std dev'
))

# Layout with soft design
fig.update_layout(
    title='Performance vs PCA Components',
    xaxis_title='PCA n_components',
    yaxis_title='Score',
    legend=dict(x=0.01, y=1.1),
    template='plotly_white',
    font=dict(family="Arial", size=14, color="gray"),
    title_font=dict(size=20, color="black"),
)

fig.show()


In [3]:
param_grid = {
    'reduce__n_components':list(range(20,101,10)),  # For PCA or KPCA
    'reduce__kernel': ['linear', 'rbf', 'poly'],  # Only used for KPCA
    'reduce__gamma': [0.001, 0.01, 0.1],     # Relevant for 'rbf' and 'poly' kernels
    'reduce__degree': [2, 3, 4],   
    'svm__C': [0.1, 1, 10],          # Only used for 'poly' kernel
}

pipe = Pipeline([
        ('scaler', StandardScaler()),
        ("reduce",KernelPCA(  
                        random_state=random_state,
                        )),
        ('svm', SVC(probability=True, 
                    class_weight = "balanced",
                    gamma =  'scale', 
                    kernel= 'rbf',
                    random_state=random_state))
    ])

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)
scoring = {
        'balanced_accuracy': make_scorer(balanced_accuracy_score),
        'accuracy': 'accuracy'
    }
    



In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    extracted_data, encoded_labels, 
    test_size=0.2, 
    stratify=encoded_labels,  # Critical for imbalanced data
    random_state=random_state
)
# Fit the grid


In [5]:
from tqdm import tqdm
import gc

output_file = 'results/kpca_runs.csv'

# Whether to write CSV header
write_header = True

# Loop through 50 runs safely
for run in tqdm(range(3), desc="GridSearchCV Runs"):
    try:
        # Create GridSearchCV instance
        grid = GridSearchCV(
            estimator=pipe,
            param_grid=param_grid,
            cv=cv,
            scoring=scoring,
            refit='balanced_accuracy',  # Optimize for balanced accuracy
            n_jobs=-1,
            verbose=1
        )

        # Fit model
        grid.fit(X_train, y_train)

        # Extract and format results
        cv_results = pd.DataFrame({
            'run': run,
            'reduce__n_components': grid.cv_results_['param_reduce__n_components'].astype(int),
            "reduce__kernel":grid.cv_results_['param_reduce__kernel'],
            'mean_balanced_accuracy': np.round(grid.cv_results_['mean_test_balanced_accuracy'], 3),
            'std_balanced_accuracy': np.round(grid.cv_results_['std_test_balanced_accuracy'], 3)
        })

        # Append to CSV (write header only on first run)
        cv_results.to_csv(output_file, mode='a', header=write_header, index=False)
        write_header = False  # Disable header after first write

    except Exception as e:
        print(f"\n❌ Error occurred at run {run}: {e}")
    
    finally:
        # Cleanup to avoid memory issues
        del grid
        del cv_results
        gc.collect()

GridSearchCV Runs:   0%|          | 0/3 [00:00<?, ?it/s]

Fitting 3 folds for each of 729 candidates, totalling 2187 fits


GridSearchCV Runs:   0%|          | 0/3 [11:56:48<?, ?it/s]


NameError: name 'cv_results' is not defined

In [None]:
print(grid.best_params_)
print(grid.best_score_)
kpca_esitmator = grid.best_estimator_

{'reduce__degree': 2, 'reduce__gamma': None, 'reduce__kernel': 'linear', 'reduce__n_components': 60, 'svm__C': 1}
0.8725595484443393


In [None]:
y_pred = kpca_esitmator.predict(X_test)

In [None]:
balanced_accuracy_score(y_pred=y_pred,y_true=y_test)

np.float64(0.8506378483127074)

In [None]:
for i in range(len(grid.cv_results_['params'])):
    print(
        f"{grid.cv_results_['params'][i]} => "
        f"Balanced Acc: {grid.cv_results_['mean_test_balanced_accuracy'][i]:.3f} ± "
        f"{grid.cv_results_['std_test_balanced_accuracy'][i]:.3f}"
    )