In [36]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, f1_score, confusion_matrix
import numpy as np

In [37]:
import gdown

# Google Drive share-page link for the task archive. fuzzy=True asks gdown to
# pull the file id out of this ".../view" URL instead of requiring a direct link.
archive_url = "https://drive.google.com/file/d/1Zsg7ZiTWcpvm9IZl72z0DnOiNFu4QgGo/view"
archive_name = "file.zip"
gdown.download(url=archive_url, output=archive_name, fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1Zsg7ZiTWcpvm9IZl72z0DnOiNFu4QgGo
To: /content/file.zip
100%|██████████| 8.66M/8.66M [00:00<00:00, 67.1MB/s]


'file.zip'

In [None]:
!unzip "/content/file.zip" -d "/content"

Archive:  /content/file.zip
replace /content/TASK_2/blinded_test_set.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Load the CSV files (train, labelled test, and blinded test, in that order)
train_df, test_df, blinded_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/TASK_2/train_set.csv",
        "/content/TASK_2/test_set.csv",
        "/content/TASK_2/blinded_test_set.csv",
    )
)

In [None]:
# Remove rows with nan
# +/-inf is first converted to NaN so that those rows are dropped as well.
# Both frames are modified in place.
# NOTE(review): blinded_df is not sanitised here - confirm it contains no inf/NaN.
for labelled_frame in (train_df, test_df):
    labelled_frame.replace([np.inf, -np.inf], np.nan, inplace=True)
    labelled_frame.dropna(inplace=True)

In [None]:
# 2. Preprocessing function (encoding + scaling)
def preprocess_data(df, fit_scaler=None, fit_encoder=None, drop_columns=('CLASS',)):
    """One-hot encode a frame, align its columns, and scale the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows. Columns listed in ``drop_columns`` are removed if present.
    fit_scaler : object with a ``transform`` method, or None
        Already-fitted scaler to apply. When ``None``, a new ``StandardScaler``
        is created and fitted on ``df``.
    fit_encoder : sequence of column names, or None
        Column layout to align to. Columns missing from ``df`` are added and
        filled with 0; columns not in the layout are discarded. When ``None``,
        the layout is taken from ``df`` after encoding.
    drop_columns : iterable of str, default ``('CLASS',)``
        Columns excluded from the features; names that are absent are ignored.
        NOTE(review): an 'ID' column, if the data has one, is kept as a feature
        by default - confirm that is intended or pass it here.

    Returns
    -------
    tuple
        ``(scaled_features, scaler, columns)``: the scaler's output, the scaler
        that was used (new or the one passed in), and the column layout.
    """
    features = df.drop(columns=list(drop_columns), errors='ignore')
    df_encoded = pd.get_dummies(features)

    # Align columns with the reference layout, or record this frame's layout.
    if fit_encoder is not None:
        df_encoded = df_encoded.reindex(columns=fit_encoder, fill_value=0)
    else:
        fit_encoder = df_encoded.columns

    # Choose fit vs. transform with one explicit `is None` test. Selecting the
    # scaler by truthiness could swap a falsy-but-valid scaler object for a
    # new, unfitted StandardScaler and then call transform() on it.
    if fit_scaler is None:
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df_encoded)
    else:
        scaler = fit_scaler
        df_scaled = scaler.transform(df_encoded)

    return df_scaled, scaler, fit_encoder

In [None]:
# 3. Prepare training data
# Labels come straight from the CLASS column; preprocess_data drops that column
# before encoding. No scaler or column layout is passed, so both are fitted here
# and reused by the later cells.
y_train = train_df['CLASS'].values
X_train, scaler, encoder = preprocess_data(train_df, fit_scaler=None, fit_encoder=None)


In [None]:
# 4. Prepare test data (apply same scaler & encoder)
# Go through preprocess_data with the scaler and column layout fitted on the
# training set, rather than repeating its encode/align/scale steps by hand, so
# the two code paths cannot drift apart.
X_test = preprocess_data(test_df, fit_scaler=scaler, fit_encoder=encoder)[0]
y_test = test_df['CLASS'].values

In [None]:
# 5. Train Random Forest
# Fixed seed for repeatable forests; class_weight='balanced' reweights classes
# by their frequency in y_train.
rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_estimators=100,
)
rf.fit(X_train, y_train)

In [None]:
# 6. Evaluate on test set
# Hard labels plus the probability in column 1 of predict_proba.
# NOTE(review): column 1 is assumed to be the positive class - confirm via rf.classes_.
test_proba_matrix = rf.predict_proba(X_test)
y_pred_proba = test_proba_matrix[:, 1]
y_pred = rf.predict(X_test)

# Evaluate on train set
# x_pred / x_pred_proba hold the same two quantities for the training rows.
train_proba_matrix = rf.predict_proba(X_train)
x_pred_proba = train_proba_matrix[:, 1]
x_pred = rf.predict(X_train)

In [None]:
# Confusion-matrix counts first; unpacking into four names assumes exactly
# two classes.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)  # true-negative rate

# Label-based scores use y_pred; AUROC is computed from the probabilities.
sensitivity = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
auroc = roc_auc_score(y_test, y_pred_proba)

In [None]:
# Test-set summary: one metric per line, four decimal places each.
test_report = "\n".join([
    f"Accuracy:    {accuracy:.4f}",
    f"AUROC:       {auroc:.4f}",
    f"Sensitivity: {sensitivity:.4f}",
    f"Specificity: {specificity:.4f}",
    f"F1 Score:    {f1:.4f}",
])
print(test_report)


In [None]:
# Set up cross-validation
# Five stratified folds; rows are shuffled before splitting and the fixed seed
# keeps the fold assignment the same on every run.
skf = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=5,
)

In [None]:
# Lists to store scores
# One independent, initially empty list per metric; each receives one value per fold.
accuracy_list, auroc_list = [], []
sensitivity_list, specificity_list = [], []
f1_list = []

In [None]:
# Loop through folds
# Start from empty score lists so that re-running this cell never appends a
# second set of fold results on top of the ones already collected.
accuracy_list, auroc_list = [], []
sensitivity_list, specificity_list = [], []
f1_list = []

# NOTE(review): X_train was scaled using all training rows before splitting, so
# each validation fold has already contributed to the scaler's statistics.
for train_idx, val_idx in skf.split(X_train, y_train):
    X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
    y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

    # Train a fresh Random Forest on the current fold (same settings as `rf`)
    rf_cv = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    rf_cv.fit(X_fold_train, y_fold_train)

    # Predict on validation fold
    y_val_pred = rf_cv.predict(X_fold_val)
    y_val_proba = rf_cv.predict_proba(X_fold_val)[:, 1]

    # Metrics. Every name is fold-specific on purpose: reusing `cm`, `tn`, `fp`,
    # `fn`, `tp` or `f1` here would overwrite the test-set values computed
    # earlier and make the test-set printout wrong if it is run again.
    fold_acc = accuracy_score(y_fold_val, y_val_pred)
    fold_auc = roc_auc_score(y_fold_val, y_val_proba)
    fold_rec = recall_score(y_fold_val, y_val_pred)
    fold_cm = confusion_matrix(y_fold_val, y_val_pred)
    # Unpacking into four counts assumes both classes occur in the fold.
    fold_tn, fold_fp, fold_fn, fold_tp = fold_cm.ravel()
    fold_spec = fold_tn / (fold_tn + fold_fp)
    fold_f1 = f1_score(y_fold_val, y_val_pred)

    # Append scores
    accuracy_list.append(fold_acc)
    auroc_list.append(fold_auc)
    sensitivity_list.append(fold_rec)
    specificity_list.append(fold_spec)
    f1_list.append(fold_f1)


In [None]:
# Print average scores
# Each metric is the arithmetic mean of its per-fold values.
cv_report = [
    "Cross-Validation Results (5-fold):",
    f"Avg Accuracy:    {np.mean(accuracy_list):.4f}",
    f"Avg AUROC:       {np.mean(auroc_list):.4f}",
    f"Avg Sensitivity: {np.mean(sensitivity_list):.4f}",
    f"Avg Specificity: {np.mean(specificity_list):.4f}",
    f"Avg F1 Score:    {np.mean(f1_list):.4f}",
]
for report_line in cv_report:
    print(report_line)

In [None]:
# 7. Predict on blinded test set
# Reuse preprocess_data with the training scaler and column layout instead of
# repeating its encode/align/scale steps, so the blinded rows follow exactly the
# same pipeline as the train and test rows.
# NOTE(review): unlike train/test, blinded_df is never cleaned of inf/NaN and its
# rows cannot simply be dropped - confirm the file is finite or decide on an
# imputation strategy.
blinded_scaled = preprocess_data(blinded_df, fit_scaler=scaler, fit_encoder=encoder)[0]
blinded_proba = rf.predict_proba(blinded_scaled)

In [None]:
# 8. Save predictions CSV for train set
# Use the ID column when the data has one; otherwise fall back to 1-based row numbers.
train_ids = train_df['ID'] if 'ID' in train_df.columns else pd.Series(range(1, len(train_df)+1), name='ID')
# Named train_pred_df (not test_pred_df) because these are the training-set
# probabilities that end up in train_predictions.csv.
train_pred_df = pd.DataFrame({
    'ID': train_ids,
    'class_0': 1 - x_pred_proba,  # complement of the class-1 probability
    'class_1': x_pred_proba
})
train_pred_df.to_csv("train_predictions.csv", index=False)

In [None]:
# 9. Save predictions CSV for test set
if 'ID' in test_df.columns:
    test_ids = test_df['ID']
else:
    # No ID column available: number the rows starting at 1.
    test_ids = pd.Series(range(1, len(test_df)+1), name='ID')

# class_0 is the complement of the class-1 probability.
test_columns = {'ID': test_ids, 'class_0': 1 - y_pred_proba, 'class_1': y_pred_proba}
test_pred_df = pd.DataFrame(test_columns)
test_pred_df.to_csv("test_predictions.csv", index=False)

In [None]:
# 10. Save predictions CSV for blinded test set
if 'ID' in blinded_df.columns:
    blind_ids = blinded_df['ID']
else:
    # No ID column available: number the rows starting at 1.
    blind_ids = pd.Series(range(1, len(blinded_df)+1), name='ID')

# Both probability columns are taken directly from predict_proba's output.
blinded_columns = {'ID': blind_ids, 'class_0': blinded_proba[:, 0], 'class_1': blinded_proba[:, 1]}
blinded_pred_df = pd.DataFrame(blinded_columns)
blinded_pred_df.to_csv("blinded_predictions.csv", index=False)