In [9]:
# Import all the required libraries.
# pandas lets us load, explore, clean, and analyze tabular datasets.
# numpy provides fast array operations.

import pandas as pd
import numpy as np

from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [10]:

# Load the training features (400 samples).
raw_training_data = pd.read_csv('train.csv')

# Load the labels; only a subset of the training samples (150) is labeled.
cancer_type_labels = pd.read_csv('train_labels.csv')

# Load the test set that predictions are generated for.
unseen_test_data = pd.read_csv('test.csv')

# Attach labels to features using Id as the join key. The inner join keeps only
# the samples that appear in BOTH files, i.e. the samples whose cancer type is
# known; unlabeled training rows are dropped here. Features and labels are
# split apart again later, when X and y are defined.
training_master_df = pd.merge(raw_training_data, cancer_type_labels, on='Id', how='inner')

# If train.csv also carries a Class column, the merge emits Class_x (from the
# features file) and Class_y (from the labels file) and no plain "Class"
# column, which would break later lookups of training_master_df["Class"].
# Prefer the copy from the labels file and restore the expected column name.
if 'Class_x' in training_master_df.columns and 'Class_y' in training_master_df.columns:
    training_master_df = (
        training_master_df
        .drop(columns=['Class_x'])
        .rename(columns={'Class_y': 'Class'})
    )

# Preview the merged frame (rendered as the cell output).
training_master_df.head(5)

In [2]:
# Preprocessing + model pipeline.
# NOTE: this cell reads `training_master_df` and `unseen_test_data`, so the
# data-loading cell must be executed first on a fresh kernel; running this cell
# before it fails with a NameError.

# Tunable settings, named once instead of being scattered as magic numbers.
MISSING_RATIO_THRESHOLD = 0.3   # keep genes whose missing fraction is strictly below this
PCA_VARIANCE_TO_KEEP = 0.99     # fraction of variance retained by PCA
RANDOM_SEED = 42                # shared seed for PCA and the random forest


# Step 1: Select gene-expression features
# Only columns whose name starts with "gene_" are used as features.
gene_feature_cols = [col for col in training_master_df.columns if col.startswith("gene_")]


# Step 2: Drop sparsely populated genes
# To reduce noise and missing-data issues, drop features where 30% or more of
# the values are missing (only genes strictly below the threshold are kept).
missing_ratio = training_master_df[gene_feature_cols].isnull().mean()
selected_gene_cols = missing_ratio[missing_ratio < MISSING_RATIO_THRESHOLD].index.tolist()


# Step 3: Define X and y
X_train = training_master_df[selected_gene_cols].copy()
X_test = unseen_test_data[selected_gene_cols].copy()
y_train = training_master_df["Class"]
test_ids = unseen_test_data["Id"]

# Add a per-sample count of missing gene values as an extra feature (it may
# carry signal). It is computed before imputation, while the NaNs still exist.
X_train["missing_count"] = X_train.isnull().sum(axis=1)
X_test["missing_count"] = X_test.isnull().sum(axis=1)


# Step 4: Fill NaN values
# Median imputation is used instead of mean because gene expression data is
# highly skewed and may contain outliers; the median is more robust there.
# The medians are computed once, from the not-yet-imputed training data, and
# the same values are applied to both train and test so that no statistic is
# ever derived from the test set.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# PCA cannot handle NaN, so fail early with a clear message if any remain.
assert not X_train.isnull().values.any(), "NaN values remain in X_train after imputation"
assert not X_test.isnull().values.any(), "NaN values remain in X_test after imputation"


# Step 5: Scaling
# Put the features on a comparable scale so that the downstream PCA is not
# dominated by features that merely have a larger numeric range even though
# they are not more informative. The scaler is fitted on train only.
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Step 6: Dimensionality reduction (PCA)
# Project the data onto uncorrelated principal components to remove redundant
# features; this helps limit overfitting while keeping the most important
# variance. Enough components are kept to explain 99% of the variance.
pca = PCA(n_components=PCA_VARIANCE_TO_KEEP, random_state=RANDOM_SEED)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"PCA reduced {X_train_scaled.shape[1]} → {X_train_pca.shape[1]} features.")


# Step 7: Model training
# RandomForestClassifier was chosen because it gave good results without much
# tuning and copes well with noisy, high-dimensional data such as gene
# expression. It does not require scaling or PCA, but both are applied anyway
# to reduce noise and shrink thousands of gene features into a smaller set of
# components, which makes training faster.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=25,
    min_samples_leaf=2,
    random_state=RANDOM_SEED,
    n_jobs=-1
)
rf.fit(X_train_pca, y_train)


# Step 8: Evaluation on the training set
# Sanity check only: scoring on the data the model was fitted on does not
# measure real performance; it just confirms that the model is learning
# something rather than predicting at random.
train_preds = rf.predict(X_train_pca)
f1 = f1_score(y_train, train_preds, average='macro')
print(f"Training Macro F1 Score: {f1:.4f}")



NameError: name 'training_master_df' is not defined

In [16]:
# Build the submission table: one row per test sample, pairing its Id with the
# class predicted by the trained forest on the PCA-transformed test features.
submission = test_ids.to_frame(name="Id").assign(Class=rf.predict(X_test_pca))

# Write the table without the index column, then confirm completion.
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv is ready!")

✅ submission.csv is ready!


######
### THOUGHT

'''
The task was a multi-class classification problem based on high-dimensional gene expression data, where each sample (patient) had thousands of gene features.
The goal was to accurately predict the type of cancer a patient has.
The challenge involved working with sparse labels (only 150 labeled out of 400 training samples), handling over 20,000 gene features with many missing values, and avoiding overfitting due to the high dimensionality and small sample size.
'''

### Tried but Failed
''' First tried Logistic Regression, but performance was too poor to be useful, even after tuning.

Initially used mean to fill missing values, but later switched to median since it’s more robust to outliers, which are common in gene expression data.

Started with StandardScaler but switched to RobustScaler because it handled outliers better.

PCA was first set to retain 95% variance, but increasing it to 99% gave slightly better results, so the change was kept.

Linear and polynomial SVMs were both tested; they were too slow and tended to overfit due to the high-dimensional input.

An ensemble combining SVM and RandomForest was also attempted, but it did not outperform RandomForest alone.

Deeper RandomForests with max_depth=40 were tested but showed signs of overfitting, so max_depth=25 was used instead.

Even though RandomForest doesn’t require scaled or PCA-reduced input, both were applied to reduce noise and maintain compatibility across experiments.
'''