# 04 - Baseline Models

Goal: establish baseline classifiers without heavy tuning.

Models:
- Logistic Regression (with class weights)
- Decision Tree (with class weights)

Metrics: Precision, Recall, F1, ROC-AUC, Confusion Matrix.


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from pathlib import Path

# Load the pre-split, pre-scaled feature matrices and their labels.
PROCESSED_DIR = Path('data/processed')
X_train = pd.read_csv(PROCESSED_DIR / 'X_train_scaled.csv')
X_test = pd.read_csv(PROCESSED_DIR / 'X_test_scaled.csv')
# squeeze("columns") always yields a Series; a bare squeeze() would collapse
# a single-row label file all the way down to a scalar.
y_train = pd.read_csv(PROCESSED_DIR / 'y_train.csv').squeeze("columns")
y_test = pd.read_csv(PROCESSED_DIR / 'y_test.csv').squeeze("columns")

# Class weights for imbalance.
# Weights must be inversely proportional to class frequency so the minority
# class counts more; weighting by the frequency itself would do the opposite.
# This is the same heuristic scikit-learn applies for class_weight='balanced':
#     n_samples / (n_classes * count_of_class)
class_counts = y_train.value_counts()
neg, pos = class_counts[0], class_counts[1]
n_samples = len(y_train)
class_weight = {0: n_samples / (2 * neg), 1: n_samples / (2 * pos)}

# Untuned baselines. Both rely on the built-in 'balanced' option, which is
# equivalent to the explicit `class_weight` dict above; the dict is kept so
# the actual weights can be inspected.
models = {
    'log_reg': LogisticRegression(max_iter=1000, class_weight='balanced', n_jobs=-1, random_state=42),
    'decision_tree': DecisionTreeClassifier(class_weight='balanced', random_state=42)
}

# Fit each baseline on the training split and score it on the held-out split.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the second class in
    # model.classes_ (label 1 for 0/1 labels), which ROC-AUC expects.
    probs = model.predict_proba(X_test)[:, 1]
    results[name] = {
        'report': classification_report(y_test, preds, output_dict=True),
        'roc_auc': roc_auc_score(y_test, probs),
        'confusion': confusion_matrix(y_test, preds)
    }

# Bare expression so the notebook displays the collected metrics.
results

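Store these baseline results for later comparison in `07_model_evaluation.ipynb`. Note that the cell above only keeps `results` in memory, so it must be persisted (e.g., written to disk) before that notebook can use it.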