In [None]:
# =============================================================================
# EXPERIMENT: [DESCRIBE YOUR EXPERIMENT HERE]
# Date: 22-11-2025 avond 00:08 started
# Description: Logistic Regression with C=10, L2 penalty
# =============================================================================

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import warnings
from datetime import datetime
from pathlib import Path

In [2]:
# =============================================================================
# 1. LOAD PREPROCESSED DATA (no need to preprocess again!)
# =============================================================================
print("Loading preprocessed data...")
X = pd.read_pickle('../../data/processed/X_train_processed.pkl')
y = pd.read_pickle('../../data/processed/y_train.pkl')
X_test = pd.read_pickle('../../data/processed/X_test_processed.pkl')
test_ids = pd.read_pickle('../../data/processed/test_ids.pkl')

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

Loading preprocessed data...
X shape: (20885, 55)
y shape: (20885,)


In [3]:
# =============================================================================
# 2. DEFINE EXPERIMENT PARAMETERS
# =============================================================================
EXPERIMENT_NAME = "logreg_gridsearch_v1"
MODEL_DESCRIPTION = "logistic regression with grid search for hyperparameters"

In [6]:
# =============================================================================
# EXPERIMENT: GRID SEARCH - LOGISTIC REGRESSION
# =============================================================================
print("="*70)
print("TUNING LOGISTIC REGRESSION")
print("="*70)

from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Define parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],  # These support both L1 and L2
    'class_weight': ['balanced', None]
}

# Grid search
grid_search = GridSearchCV(
    LogisticRegression(max_iter=2000, random_state=42),
    param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2
)

print("Running grid search (this will take 5-10 minutes)...")
grid_search.fit(X, y)

print(f"\n{'='*70}")
print("GRID SEARCH RESULTS")
print(f"{'='*70}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV ROC-AUC: {grid_search.best_score_:.4f}")

# Show top 5 combinations
results_df = pd.DataFrame(grid_search.cv_results_)
top_5 = results_df.nsmallest(5, 'rank_test_score')[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
]
print("\nTop 5 parameter combinations:")
print(top_5.to_string(index=False))

# Train final model with best parameters
best_logreg = grid_search.best_estimator_
test_proba = best_logreg.predict_proba(X_test)[:, 1]

TUNING LOGISTIC REGRESSION
Running grid search (this will take 5-10 minutes)...
Fitting 5 folds for each of 56 candidates, totalling 280 fits

GRID SEARCH RESULTS
Best parameters: {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'saga'}
Best CV ROC-AUC: 0.8405

Top 5 parameter combinations:
                                                                        params  mean_test_score  std_test_score  rank_test_score
     {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'saga'}         0.840512        0.008008                1
    {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'saga'}         0.840496        0.007785                2
{'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}         0.840460        0.008175                3
     {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'saga'}         0.840099        0.007855                4
{'C': 0.1, 'class_weight': 'balanced', 'penalt

In [8]:
# =============================================================================
# 5. CREATE SUBMISSION FILE
# =============================================================================
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
submission_filename = f"../../outputs/predictions/logistic_regression/{EXPERIMENT_NAME}_{timestamp}.csv"

submission = pd.DataFrame({
    'icustay_id': test_ids,
    'prediction': test_proba
})
submission.to_csv(submission_filename, index=False)
print(f"\n✓ Submission saved: {submission_filename}")


✓ Submission saved: ../../outputs/predictions/logistic_regression/logreg_gridsearch_v1_20251122_0020.csv


In [9]:
# =============================================================================
# 6. LOG EXPERIMENT (CRITICAL FOR TRACKING!)
# =============================================================================
experiment_log = {
    'timestamp': timestamp,
    'experiment_name': EXPERIMENT_NAME,
    'model_type': 'LogisticRegression',
    'cv_roc_auc_mean': cv_scores.mean(),
    'cv_roc_auc_std': cv_scores.std(),
    'train_roc_auc': train_auc,
    'kaggle_public_score': None,  # Fill in manually after submission
    'kaggle_private_score': None,
    'parameters': str(model.get_params()),
    'description': MODEL_DESCRIPTION,
    'submission_file': submission_filename
}

# Append to experiment log
log_file = '../experiment_log.csv'
log_df = pd.DataFrame([experiment_log])

import os
if os.path.exists(log_file):
    log_df.to_csv(log_file, mode='a', header=False, index=False)
else:
    log_df.to_csv(log_file, index=False)

print(f"✓ Experiment logged to {log_file}")

NameError: name 'cv_scores' is not defined

In [11]:
# =============================================================================
# 7. SAVE MODEL (OPTIONAL)
# =============================================================================
import pickle


model_filename = f"../../outputs/models/{EXPERIMENT_NAME}_{timestamp}.pkl"
with open(model_filename, 'wb') as f:
    pickle.dump(model, f)
print(f"✓ Model saved: {model_filename}")

print(f"\n{'='*70}")
print("EXPERIMENT COMPLETE!")
print(f"{'='*70}")
print(f"\nNext steps:")
print(f"1. Upload {submission_filename} to Kaggle")
print(f"2. Record Kaggle score in {log_file}")
print(f"3. Copy this notebook and modify for next experiment")

NameError: name 'model' is not defined