In [None]:
# =============================================================================
# EXPERIMENT: Random Forest Classifier
# Date: 2025-11-22, started 00:47
# Description: Random Forest with hyperparameter tuning
# =============================================================================

In [1]:
import os
import pickle
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [2]:
# =============================================================================
# 1. LOAD PREPROCESSED DATA (no need to preprocess again!)
# =============================================================================
# Single location for the processed artefacts so the four reads cannot drift apart.
PROCESSED_DIR = Path('../../data/processed')

print("Loading preprocessed data...")
# NOTE: pickle files can execute arbitrary code when loaded; only read files
# written by this project's own preprocessing step.
X = pd.read_pickle(PROCESSED_DIR / 'X_train_processed.pkl')
y = pd.read_pickle(PROCESSED_DIR / 'y_train.pkl')
X_test = pd.read_pickle(PROCESSED_DIR / 'X_test_processed.pkl')
test_ids = pd.read_pickle(PROCESSED_DIR / 'test_ids.pkl')

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Fail fast: a row-count mismatch would otherwise only surface after the long
# hyperparameter search (at fit time or when the submission frame is built).
assert len(X) == len(y), f"X has {len(X)} rows but y has {len(y)}"
assert len(X_test) == len(test_ids), (
    f"X_test has {len(X_test)} rows but test_ids has {len(test_ids)}"
)

Loading preprocessed data...
X shape: (20885, 55)
y shape: (20885,)


In [3]:
# =============================================================================
# 2. DEFINE EXPERIMENT PARAMETERS
# =============================================================================
# Short identifier for this run; later cells embed it in output file names.
EXPERIMENT_NAME = 'random_forest_tuned'
# Human-readable summary stored alongside the metrics in the experiment log.
MODEL_DESCRIPTION = 'Random Forest with RandomizedSearchCV for hyperparameter tuning'

In [5]:
# =============================================================================
# 3. HYPERPARAMETER TUNING - FOCUS ON PREVENTING OVERFITTING
# =============================================================================
# Search settings live in one place so the progress message, the CV splitter
# and the search itself cannot drift apart.
SEED = 42      # shared by the forest, the CV splitter and the random search
N_ITER = 40    # number of parameter combinations sampled
N_SPLITS = 5   # stratified CV folds

print(f"\n{'='*70}")
print(f"EXPERIMENT: {EXPERIMENT_NAME}")
print(f"{'='*70}")
print("\nStrategy: Regularize aggressively to close train-test gap")
print("Key changes:")
print("  - Limit tree depth (max_depth)")
print("  - Require more samples per split (min_samples_split)")
print("  - Require more samples per leaf (min_samples_leaf)")
# max_features limits the candidates examined at each split, not per tree.
print("  - Reduce number of features considered per split (max_features)")
print("  - Use fewer trees if needed")

# Parameter distribution focused on REDUCING overfitting
param_dist = {
    'n_estimators': [100, 200, 300],  # Fewer trees (faster to fit)
    'max_depth': [5, 7, 10, 12],  # Shallower trees (was 20 before - too deep!)
    'min_samples_split': [20, 50, 100],  # More samples required (was 2 before!)
    'min_samples_leaf': [10, 20, 30],  # More samples per leaf (was 1 before!)
    'max_features': ['sqrt', 'log2', 0.2],  # Fewer features (was 0.3, 0.5)
    'class_weight': ['balanced', 'balanced_subsample'],
    'max_samples': [0.7, 0.8, 0.9]  # Bootstrap sample size (NEW - adds randomness)
}

cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

print(f"\nRunning RandomizedSearchCV (trying {N_ITER} combinations)...")
print("This may take 10-20 minutes...\n")

random_search = RandomizedSearchCV(
    # The search already parallelises over candidates and folds (n_jobs=-1
    # below). Keeping each forest single-threaded while searching avoids every
    # worker spawning its own full set of threads (CPU oversubscription).
    RandomForestClassifier(random_state=SEED, n_jobs=1),
    param_distributions=param_dist,
    n_iter=N_ITER,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
    random_state=SEED
)

random_search.fit(X, y)

# =============================================================================
# 4. RESULTS
# =============================================================================
print(f"\n{'='*70}")
print("HYPERPARAMETER TUNING RESULTS")
print(f"{'='*70}")

print("\nBest parameters found:")
for param, value in random_search.best_params_.items():
    print(f"  {param}: {value}")

print(f"\nBest CV ROC-AUC: {random_search.best_score_:.4f}")

# Show top 5 (rank_test_score == 1 is the best candidate)
results_df = pd.DataFrame(random_search.cv_results_)
top_5 = results_df.nsmallest(5, 'rank_test_score')[
    ['params', 'mean_test_score', 'std_test_score']
].copy()
top_5['mean_test_score'] = top_5['mean_test_score'].round(4)
top_5['std_test_score'] = top_5['std_test_score'].round(4)

print("\nTop 5 parameter combinations:")
print(top_5.to_string(index=False))

# =============================================================================
# 5. EVALUATE OVERFITTING
# =============================================================================
print(f"\n{'='*70}")
print("OVERFITTING ANALYSIS")
print(f"{'='*70}")

# best_estimator_ is the best candidate refit on all of X, y.
model = random_search.best_estimator_
# The search is over, so the fitted forest can use every core again for
# prediction (n_jobs does not affect the fitted trees).
model.set_params(n_jobs=-1)

# Training performance: in-sample score on the same rows the refit model saw.
y_pred_train = model.predict_proba(X)[:, 1]
train_auc = roc_auc_score(y, y_pred_train)

# CV performance: mean held-out ROC-AUC of the best candidate.
cv_auc = random_search.best_score_

# Calculate gap
gap = train_auc - cv_auc

print(f"\nTraining ROC-AUC: {train_auc:.4f}")
print(f"Cross-Validation ROC-AUC: {cv_auc:.4f}")
print(f"Gap (Train - CV): {gap:.4f}")

if gap < 0.05:
    print("✓ Excellent! Very little overfitting (gap < 0.05)")
elif gap < 0.10:
    print("✓ Good! Acceptable overfitting (gap < 0.10)")
elif gap < 0.15:
    print("⚠ Warning! Moderate overfitting (gap < 0.15)")
else:
    print("🚨 Alert! Significant overfitting (gap >= 0.15)")

# The "Previous" figures are hard-coded from the earlier experiment's run.
print("\nComparison to previous experiment:")
print("  Previous: Train=0.9925, CV=0.8641, Gap=0.1284")
print(f"  Current:  Train={train_auc:.4f}, CV={cv_auc:.4f}, Gap={gap:.4f}")

# =============================================================================
# 6. FEATURE IMPORTANCE
# =============================================================================
print(f"\n{'='*70}")
print("FEATURE IMPORTANCE ANALYSIS")
print(f"{'='*70}")

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 15 most important features:")
print(feature_importance.head(15).to_string(index=False))

# =============================================================================
# 7. GENERATE PREDICTIONS
# =============================================================================
# Column 1 of predict_proba is the probability of the second entry in model.classes_.
test_proba = model.predict_proba(X_test)[:, 1]

print(f"\nGenerated probabilities for {len(test_proba)} test samples")
print(f"Probability range: [{test_proba.min():.4f}, {test_proba.max():.4f}]")
print(f"Mean probability: {test_proba.mean():.4f}")


EXPERIMENT: random_forest_tuned

Strategy: Regularize aggressively to close train-test gap
Key changes:
  - Limit tree depth (max_depth)
  - Require more samples per split (min_samples_split)
  - Require more samples per leaf (min_samples_leaf)
  - Reduce number of features per tree (max_features)
  - Use fewer trees if needed

Running RandomizedSearchCV (trying 40 combinations)...
This may take 10-20 minutes...

Fitting 5 folds for each of 40 candidates, totalling 200 fits

HYPERPARAMETER TUNING RESULTS

Best parameters found:
  n_estimators: 300
  min_samples_split: 20
  min_samples_leaf: 20
  max_samples: 0.8
  max_features: sqrt
  max_depth: 12
  class_weight: balanced

Best CV ROC-AUC: 0.8608

Top 5 parameter combinations:
                                                                                                                                                                    params  mean_test_score  std_test_score
           {'n_estimators': 300, 'min_samples_split': 20,

In [6]:
# =============================================================================
# 7. GENERATE PREDICTIONS (same computation as the prediction step in the tuning cell)
# =============================================================================
# predict_proba returns one column per class; keep the column at index 1.
class_probabilities = model.predict_proba(X_test)
test_proba = class_probabilities[:, 1]

In [7]:
# =============================================================================
# 8. CREATE SUBMISSION FILE
# =============================================================================
# The timestamp is reused by later cells (experiment log, saved model) so that
# all artefacts of one run share the same suffix.
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
submission_filename = f"../../outputs/predictions/RF/{EXPERIMENT_NAME}_{timestamp}.csv"

# to_csv does not create missing directories; make sure the target exists so a
# fresh checkout does not fail after the long tuning run.
Path(submission_filename).parent.mkdir(parents=True, exist_ok=True)

submission = pd.DataFrame({
    'icustay_id': test_ids,
    'prediction': test_proba
})
submission.to_csv(submission_filename, index=False)
print(f"\n✓ Submission saved: {submission_filename}")


✓ Submission saved: ../../outputs/predictions/RF/random_forest_tuned_20251122_0058.csv


In [8]:
# =============================================================================
# 9. LOG EXPERIMENT
# =============================================================================
# One row per run; the Kaggle scores are left empty to be filled in by hand
# after the submission has been uploaded.
experiment_log = {
    'timestamp': timestamp,
    'experiment_name': EXPERIMENT_NAME,
    'model_type': 'RandomForestClassifier (Regularized)',
    'cv_roc_auc_mean': cv_auc,
    # Fold-to-fold std of the winning candidate (row best_index_ of cv_results_).
    'cv_roc_auc_std': results_df.loc[random_search.best_index_, 'std_test_score'],
    'train_roc_auc': train_auc,
    'overfitting_gap': gap,
    'kaggle_public_score': None,
    'kaggle_private_score': None,
    'parameters': str(random_search.best_params_),
    'description': MODEL_DESCRIPTION,
    'submission_file': submission_filename
}

log_file = '../experiment_log.csv'
log_df = pd.DataFrame([experiment_log])

# Append to the shared log; write the header only when the file is new or
# empty. NOTE(review): rows are appended positionally, so this assumes an
# existing log uses the same column order as experiment_log - confirm.
log_path = Path(log_file)
write_header = not log_path.exists() or log_path.stat().st_size == 0
log_df.to_csv(log_file, mode='a', header=write_header, index=False)

print(f"✓ Experiment logged to {log_file}")

✓ Experiment logged to ../experiment_log.csv


In [9]:
# =============================================================================
# 10. SAVE MODEL (OPTIONAL)
# =============================================================================
# Same name/timestamp suffix as the submission file so the two can be matched.
model_filename = f"../../outputs/models/{EXPERIMENT_NAME}_{timestamp}.pkl"

# open(..., 'wb') does not create missing directories.
Path(model_filename).parent.mkdir(parents=True, exist_ok=True)
with open(model_filename, 'wb') as f:
    pickle.dump(model, f)
print(f"✓ Model saved: {model_filename}")

print(f"\n{'='*70}")
print("EXPERIMENT COMPLETE!")
print(f"{'='*70}")
print("\nNext steps:")
print(f"1. Upload {submission_filename} to Kaggle")
print(f"2. Record Kaggle score in {log_file}")
print("3. Copy this notebook and modify for next experiment")

✓ Model saved: ../../outputs/models/random_forest_tuned_20251122_0058.pkl

EXPERIMENT COMPLETE!

Next steps:
1. Upload ../../outputs/predictions/RF/random_forest_tuned_20251122_0058.csv to Kaggle
2. Record Kaggle score in ../experiment_log.csv
3. Copy this notebook and modify for next experiment
