# 04 - Baseline Models

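**Goal:** Train class-weighted Logistic Regression and Decision Tree baselines on the new dataset and evaluate them on the held-out test split.
**Output:** Classification report and ROC AUC per model, saved to `results/new_dataset_analysis/metrics/04_baseline_models_results.json`.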

In [1]:
import pandas as pd
import json
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Input and output locations, relative to the notebook's working directory.
PROCESSED_DIR = Path('../data/processed/new_analysis')
METRICS_DIR = Path('../results/new_dataset_analysis/metrics')
# Create the metrics folder up front so later cells can write into it.
METRICS_DIR.mkdir(parents=True, exist_ok=True)

print("Loading Processed Data...")
X_train = pd.read_csv(PROCESSED_DIR / 'X_train_scaled.csv')
# squeeze("columns") collapses only the column axis, so a one-column label
# file always becomes a Series; a bare squeeze() would also collapse the row
# axis and return a scalar when the file holds a single row.
y_train = pd.read_csv(PROCESSED_DIR / 'y_train.csv').squeeze("columns")
X_test = pd.read_csv(PROCESSED_DIR / 'X_test_scaled.csv')
y_test = pd.read_csv(PROCESSED_DIR / 'y_test.csv').squeeze("columns")

# Fail fast if features and labels are misaligned, instead of surfacing the
# problem later as an opaque error during fitting or scoring.
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"
print("Data Loaded.")

Loading Processed Data...
Data Loaded.


In [2]:
def _evaluate_classifier(model, X, y_true):
    """Score an already-fitted classifier on a labelled feature set.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X: Feature matrix to score.
        y_true: Ground-truth labels aligned row-for-row with ``X``.

    Returns:
        A ``(y_pred, metrics)`` tuple. ``y_pred`` is the output of
        ``model.predict(X)``; ``metrics`` is a dict with the keys
        ``'report'`` (``classification_report`` as a dict) and ``'roc_auc'``.
    """
    y_pred = model.predict(X)
    # Column 1 of predict_proba is the second class in model.classes_;
    # assumes a binary target where that is the positive class — TODO confirm.
    y_score = model.predict_proba(X)[:, 1]
    metrics = {
        'report': classification_report(y_true, y_pred, output_dict=True),
        # Cast to a builtin float so json.dump never receives a NumPy scalar type.
        'roc_auc': float(roc_auc_score(y_true, y_score)),
    }
    return y_pred, metrics


# Metrics for every baseline, keyed by model name; this dict is what gets persisted.
results = {}

# 1. Logistic Regression
print("Training Logistic Regression...")
lr = LogisticRegression(class_weight='balanced', max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr, results['LogisticRegression'] = _evaluate_classifier(lr, X_test, y_test)

# 2. Decision Tree
print("Training Decision Tree...")
# random_state pins the tree's internal randomness so reruns give the same model.
dt = DecisionTreeClassifier(class_weight='balanced', random_state=42)
dt.fit(X_train, y_train)
y_pred_dt, results['DecisionTree'] = _evaluate_classifier(dt, X_test, y_test)

# Persist both models' metrics in a single JSON file.
with open(METRICS_DIR / '04_baseline_models_results.json', 'w') as f:
    json.dump(results, f, indent=4)

print("Baseline Results Saved.")

Training Logistic Regression...
Training Decision Tree...
Baseline Results Saved.
