## Approach 2:
### 1. Vary the number of principal components (PCs)



In [2]:
import gc
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, balanced_accuracy_score,accuracy_score
from sklearn.model_selection import GridSearchCV
from utils import *
from tqdm import tqdm
random_state = 42
import plotly.graph_objects as go


In [2]:
# Read the stored gene-expression table and its label table from disk.
# NOTE(review): the helper name suggests pickle is used underneath; pickle can
# execute arbitrary code on load, so only read files from a trusted source.
gene_exp_df = read_dataframe_from_pickle("data/processed_data/gene_exp_data.pkl")
label_df = read_dataframe_from_pickle("data/processed_data/label_data.pkl")

# Report the size of each table, one per line, so a mismatch is easy to spot.
print(
    f"Entries in Gene Expression Dataframe : {len(gene_exp_df)}",
    f"Entries in Label Dataframe : {len(label_df)}",
    sep="\n",
)

DataFrame successfully loaded from data/processed_data/gene_exp_data.pkl
DataFrame successfully loaded from data/processed_data/label_data.pkl
Entries in Gene Expression Dataframe : 5268
Entries in Label Dataframe : 5268


In [3]:
# Step 1: filter the label table with a frequency threshold of 150.
# (presumably drops labels that occur fewer than 150 times; helper lives in utils, verify)
labels_with_high_freq_df = remove_low_frequency_labels(label_df, threshold=150)

# Step 2: pull the expression rows and labels that go with the retained labels.
extracted_data, extracted_label = collect_relevant_data(
    gene_exp_df_bkp=gene_exp_df,
    label_df_bkp=labels_with_high_freq_df,
)

# Step 3: encode the labels; the encoder is kept alongside the encoded values.
encoded_labels, label_encoder = encode_labels(extracted_label)

In [4]:
# Coarse sweep over how many principal components PCA keeps.
param_grid = {"pca__n_components": [10, 30, 60, 90, 200, 280, 350, 400, 450]}

# Pipeline: standardise features -> PCA projection -> RBF-kernel SVM.
# Only the PCA step is tuned; the SVM hyper-parameters are held fixed.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        (
            "svm",
            SVC(
                kernel="rbf",
                C=1,
                gamma="scale",
                class_weight="balanced",
                probability=True,
                random_state=random_state,
            ),
        ),
    ]
)

# Shuffled, stratified 3-fold cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

# Both metrics are recorded for every candidate.
scoring = dict(
    balanced_accuracy=make_scorer(balanced_accuracy_score),
    accuracy="accuracy",
)

# Grid search over the candidates; the final refit uses the candidate with the
# best mean balanced accuracy.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring=scoring,
    refit="balanced_accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)


In [5]:
# Hold out 20% of the samples; stratifying on the labels keeps the class
# proportions comparable between the two parts.
X_train, X_test, y_train, y_test = train_test_split(
    extracted_data,
    encoded_labels,
    stratify=encoded_labels,
    test_size=0.2,
    random_state=random_state,
)

# Run the cross-validated search on the training portion only.
grid.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [6]:
# Tabulate every per-candidate statistic recorded by the grid search.
cv_results = pd.DataFrame(data=grid.cv_results_)

# List the available columns so later cells can pick the ones to plot.
column_names = cv_results.columns.tolist()
print("Available columns in cv_results_:\n", column_names)

# Persist the full table for inspection outside the notebook.
cv_results.to_csv("pca_component_variation_results.csv", index=False)

Available columns in cv_results_:
 ['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_pca__n_components', 'params', 'split0_test_balanced_accuracy', 'split1_test_balanced_accuracy', 'split2_test_balanced_accuracy', 'mean_test_balanced_accuracy', 'std_test_balanced_accuracy', 'rank_test_balanced_accuracy', 'split0_test_accuracy', 'split1_test_accuracy', 'split2_test_accuracy', 'mean_test_accuracy', 'std_test_accuracy', 'rank_test_accuracy']


In [7]:
# Plot mean CV accuracy and balanced accuracy against the number of PCA
# components. `go` (plotly.graph_objects) is imported in the notebook's import
# cell, so it is not re-imported here.

# X-axis: PCA n_components
x = cv_results['param_pca__n_components']

# Accuracy & Balanced Accuracy.
# The values are plotted at full precision: rounding to 2 decimals would
# quantize the curves to 1 percentage point, which is coarser than the
# differences between candidates. Rounding is applied to hover text only.
acc = cv_results['mean_test_accuracy']
bal_acc = cv_results['mean_test_balanced_accuracy']
std_bal_acc = cv_results['std_test_balanced_accuracy']

# Create figure
fig = go.Figure()

# Accuracy line (pastel blue)
fig.add_trace(go.Scatter(
    x=x,
    y=acc,
    mode='lines+markers',
    name='Accuracy',
    marker=dict(symbol='circle', color='rgba(137, 207, 240, 1)'),  # Pastel blue
    line=dict(color='rgba(137, 207, 240, 1)')
))

# Balanced Accuracy line (pastel green)
fig.add_trace(go.Scatter(
    x=x,
    y=bal_acc,
    mode='lines+markers',
    name='Balanced Accuracy',
    marker=dict(symbol='square', color='rgba(144, 238, 144, 1)'),  # Pastel green
    line=dict(color='rgba(144, 238, 144, 1)')
))

# +/- 1 standard deviation band for Balanced Accuracy (light green).
# The polygon walks the upper edge left-to-right, then the lower edge
# right-to-left, so 'toself' fills the area between them.
fig.add_trace(go.Scatter(
    x=list(x) + list(x[::-1]),
    y=list(bal_acc + std_bal_acc) + list(bal_acc - std_bal_acc)[::-1],
    fill='toself',
    fillcolor='rgba(144, 238, 144, 0.2)',  # Lighter green
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip",
    showlegend=False,
    name='±1 std dev'
))

# Layout with soft design
fig.update_layout(
    title='Performance vs PCA Components',
    xaxis_title='PCA n_components',
    yaxis_title='Score',
    legend=dict(x=0.01, y=1.1),
    template='plotly_white',
    font=dict(family="Arial", size=14, color="gray"),
    title_font=dict(size=20, color="black"),
)

# Show three decimals on hover instead of rounding the underlying data.
fig.update_yaxes(hoverformat='.3f')

fig.show()


In [None]:
# Medium-resolution sweep: 40 to 200 components in steps of 20.
param_grid = {"pca__n_components": list(range(40, 201, 20))}

# Same fixed-hyper-parameter SVM as in the coarse sweep; only PCA is tuned.
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    (
        "svm",
        SVC(
            C=1,
            kernel="rbf",
            gamma="scale",
            probability=True,
            class_weight="balanced",
            random_state=random_state,
        ),
    ),
]
pipe = Pipeline(pipeline_steps)

# Stratified 3-fold CV, shuffled with a fixed seed.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

# Metrics recorded for each candidate.
scoring = {
    "balanced_accuracy": make_scorer(balanced_accuracy_score),
    "accuracy": "accuracy",
}

# Search object; the final refit uses the candidate with the best mean
# balanced accuracy.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=scoring,
    cv=cv,
    refit="balanced_accuracy",
    verbose=1,
    n_jobs=-1,
)


In [9]:
# Same 80/20 stratified split as before (identical seed, so identical partition).
split_options = dict(
    test_size=0.2,
    stratify=encoded_labels,
    random_state=random_state,
)
X_train, X_test, y_train, y_test = train_test_split(
    extracted_data, encoded_labels, **split_options
)

# Cross-validated search on the training portion.
grid.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [12]:
# Tabulate the latest grid-search results and plot mean CV accuracy and
# balanced accuracy against the number of PCA components.
cv_results = pd.DataFrame(grid.cv_results_)

# X-axis: PCA n_components
x = cv_results['param_pca__n_components']

# Accuracy & Balanced Accuracy.
# The values are plotted at full precision: rounding to 2 decimals would
# quantize the curves to 1 percentage point, which is coarser than the
# differences between candidates. Rounding is applied to hover text only.
acc = cv_results['mean_test_accuracy']
bal_acc = cv_results['mean_test_balanced_accuracy']
std_bal_acc = cv_results['std_test_balanced_accuracy']

# Create figure
fig = go.Figure()

# Accuracy line (pastel blue)
fig.add_trace(go.Scatter(
    x=x,
    y=acc,
    mode='lines+markers',
    name='Accuracy',
    marker=dict(symbol='circle', color='rgba(137, 207, 240, 1)'),  # Pastel blue
    line=dict(color='rgba(137, 207, 240, 1)')
))

# Balanced Accuracy line (pastel green)
fig.add_trace(go.Scatter(
    x=x,
    y=bal_acc,
    mode='lines+markers',
    name='Balanced Accuracy',
    marker=dict(symbol='square', color='rgba(144, 238, 144, 1)'),  # Pastel green
    line=dict(color='rgba(144, 238, 144, 1)')
))

# +/- 1 standard deviation band for Balanced Accuracy (light green).
# The polygon walks the upper edge left-to-right, then the lower edge
# right-to-left, so 'toself' fills the area between them.
fig.add_trace(go.Scatter(
    x=list(x) + list(x[::-1]),
    y=list(bal_acc + std_bal_acc) + list(bal_acc - std_bal_acc)[::-1],
    fill='toself',
    fillcolor='rgba(144, 238, 144, 0.2)',  # Lighter green
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip",
    showlegend=False,
    name='±1 std dev'
))

# Layout with soft design
fig.update_layout(
    title='Performance vs PCA Components',
    xaxis_title='PCA n_components',
    yaxis_title='Score',
    legend=dict(x=0.01, y=1.1),
    template='plotly_white',
    font=dict(family="Arial", size=14, color="gray"),
    title_font=dict(size=20, color="black"),
)

# Show three decimals on hover instead of rounding the underlying data.
fig.update_yaxes(hoverformat='.3f')

fig.show()


In [None]:
# Fine sweep: 30, 40, ..., 270 principal components.
param_grid = {"pca__n_components": [*range(30, 271, 10)]}

# Scale -> PCA -> RBF SVM; the SVM settings stay fixed across candidates.
pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        (
            "svm",
            SVC(
                class_weight="balanced",
                probability=True,
                kernel="rbf",
                gamma="scale",
                C=1,
                random_state=random_state,
            ),
        ),
    ]
)

# Seeded, shuffled, stratified 3-fold splitter.
cv = StratifiedKFold(shuffle=True, n_splits=3, random_state=random_state)

# Metrics tracked per candidate.
scoring = dict(
    balanced_accuracy=make_scorer(balanced_accuracy_score),
    accuracy="accuracy",
)

# Grid search; the final refit uses the candidate with the best mean
# balanced accuracy.
grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=cv,
    scoring=scoring,
    refit="balanced_accuracy",
    n_jobs=-1,
    verbose=1,
)


In [None]:
# Recreate the 80/20 stratified hold-out split (same seed as the earlier cells).
holdout_parts = train_test_split(
    extracted_data,
    encoded_labels,
    random_state=random_state,
    stratify=encoded_labels,
    test_size=0.2,
)
X_train, X_test, y_train, y_test = holdout_parts
# No fit happens in this cell; the search is run by the repeated-run loop below.


In [15]:
# Candidate component counts for the repeated-run experiment: 30 to 270, step 10.
component_counts = list(range(30, 271, 10))
param_grid = {"pca__n_components": component_counts}

# Estimator shared by every run: scaler, PCA, then a fixed-setting RBF SVM.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA()),
        (
            "svm",
            SVC(
                C=1,
                gamma="scale",
                kernel="rbf",
                class_weight="balanced",
                probability=True,
                random_state=random_state,
            ),
        ),
    ]
)

# Seeded stratified 3-fold splitter shared by every run.
cv = StratifiedKFold(n_splits=3, random_state=random_state, shuffle=True)

# Metrics evaluated for each candidate.
scoring = {
    "balanced_accuracy": make_scorer(balanced_accuracy_score),
    "accuracy": "accuracy",
}
    


In [None]:
# Repeat the grid search 50 times and append each run's per-candidate balanced
# accuracy (mean and std across CV folds) to a CSV file.
output_file = 'gridsearch_pca_runs.csv'

# Write the CSV header only when the file is missing or empty, so re-running
# this cell appends more runs without injecting a second header row in the
# middle of the file (which would corrupt the later pd.read_csv).
write_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0

# NOTE(review): `cv` and the SVC both use a fixed random_state, so every run
# sees the same folds. Any run-to-run variation can only come from unseeded
# steps (PCA() is built without random_state). If the goal is to measure
# split-to-split variance, consider varying the CV seed per run.

# Loop through 50 runs safely
for run in tqdm(range(50), desc="GridSearchCV Runs"):
    # Pre-bind both names so the cleanup in `finally` can never hit an unbound
    # name when an iteration fails before they are assigned.
    grid = None
    cv_results = None
    try:
        # Create GridSearchCV instance
        grid = GridSearchCV(
            estimator=pipe,
            param_grid=param_grid,
            cv=cv,
            scoring=scoring,
            refit='balanced_accuracy',
            n_jobs=2,   # You can reduce this to 1 if needed
            verbose=0
        )

        # Fit model
        grid.fit(X_train, y_train)

        # Extract and format results
        cv_results = pd.DataFrame({
            'run': run,
            'pca__n_components': grid.cv_results_['param_pca__n_components'].astype(int),
            'mean_balanced_accuracy': np.round(grid.cv_results_['mean_test_balanced_accuracy'], 3),
            'std_balanced_accuracy': np.round(grid.cv_results_['std_test_balanced_accuracy'], 3)
        })

        # Append to CSV (header only if the file had no content yet)
        cv_results.to_csv(output_file, mode='a', header=write_header, index=False)
        write_header = False  # Disable header after first successful write

    except Exception as e:
        # Best effort: report the failure and continue with the next run.
        print(f"\n❌ Error occurred at run {run}: {e}")

    finally:
        # Drop the references (instead of `del`, which raises NameError when a
        # name was never bound) and force a collection to limit memory growth.
        grid = None
        cv_results = None
        gc.collect()

32

In [3]:
# Create figure
df= pd.read_csv("gridsearch_pca_runs.csv")

df.groupby("pca__n_components")["mean_balanced_accuracy"].count().tolist()[0]
fig = go.Figure()

x = df["pca__n_components"].unique().tolist()
bal_acc = df.groupby("pca__n_components")["mean_balanced_accuracy"].mean() * 100
std_bal_acc = df.groupby("pca__n_components")["std_balanced_accuracy"].mean() * 100
runs = df.groupby("pca__n_components")["mean_balanced_accuracy"].count().tolist()[0]
# Balanced Accuracy line (pastel green)
fig.add_trace(go.Scatter(
    x=x,
    y=bal_acc,
    mode='lines+markers',
    name='Accuracy',
    marker=dict(symbol='square', color='rgba(144, 238, 144, 1)'),  # Pastel green
    line=dict(color='rgba(144, 238, 144, 1)')
))

# Confidence band for Balanced Accuracy (light green)
fig.add_trace(go.Scatter(
    x=list(x) + list(x[::-1]),
    y=list(bal_acc + std_bal_acc) + list(bal_acc - std_bal_acc)[::-1],
    fill='toself',
    fillcolor='rgba(144, 238, 144, 0.2)',  # Lighter green
    line=dict(color='rgba(255,255,255,0)'),
    hoverinfo="skip",
    showlegend=False,
    name='±1 std dev'
))

# Add horizontal reference lines at 87, 87.5, and 88
for y in [87, 87.5, 88]:
    fig.add_shape(
        type='line',
        x0=min(x), x1=max(x),
        y0=y, y1=y,
        line=dict(color='LightSlateGray', dash='dash'),
        opacity=0.5
    )
    # Optionally add annotations for those lines
    fig.add_annotation(
        x=max(x),
        y=y,
        text=f'{y}%',
        showarrow=False,
        xanchor='left',
        font=dict(color='gray')
    )

# Layout with soft design
fig.update_layout(
    title=f'Average Accuracy vs Number of Principle Components',
    xaxis_title='PCA n_components',
    yaxis_title='Score (%)',
    legend=dict(x=0.01, y=1.1),
    template='plotly_white',
    font=dict(family="Arial", size=14, color="gray"),
    title_font=dict(size=20, color="black"),
)

fig.show()


# Conclusion:
## Algorithm used : Support Vector Classifier (SVC)
## Balanced Accuracy Scores: 
### for 50 components : ~87%
### for 270 components : ~87.5%

## So increasing the number of principal components beyond 50 does not significantly improve balanced accuracy.