# Baseline Models

This notebook fits Logistic Regression and Random Forest baselines and scores them on the validation split, before moving to more complex models.


In [None]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
# Blanket filter: silences every warning category for the rest of the
# session, including anything emitted while the models below are fitted.
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Add the parent directory to the import path before importing `utils`.
# NOTE(review): presumably the notebook runs from a subdirectory of the
# repo root and `utils/` lives one level up -- confirm.
sys.path.append('..')
# Star import. The helpers called further down (calculate_all_metrics,
# plot_confusion_matrix, plot_pr_roc_curves, print_metrics_table,
# plot_multiple_pr_curves) are not defined or imported anywhere else in
# this file, so they presumably come from here.
from utils.metrics import *

# Single seed: fed to NumPy's global RNG here and passed as `random_state`
# to both estimators below.
SEED = 42
np.random.seed(SEED)


## Load Data


In [None]:
# Load the train / validation / test splits.
#
# Fast path: read the six pickled splits from DATA_PATH.
# Fallback:  rebuild the splits from the raw CSV with a 70 / 15 / 15
#            positional split and a StandardScaler fitted on train only.
#
# Defines X_train, X_val, X_test, y_train, y_val, y_test for the cells below.
DATA_PATH = '../data/processed/'

# The six cached splits, listed in the order they are loaded.
SPLIT_NAMES = ['X_train', 'X_val', 'X_test', 'y_train', 'y_val', 'y_test']
split_paths = {name: os.path.join(DATA_PATH, f'{name}.pkl') for name in SPLIT_NAMES}
missing = [path for path in split_paths.values() if not os.path.exists(path)]

if not missing:
    # Every split is present, so the cache can be used as-is. Checking all
    # six files (not just X_train.pkl) keeps a half-written cache from
    # crashing with FileNotFoundError partway through loading.
    # NOTE(review): read_pickle executes whatever the file contains; keep
    # DATA_PATH pointed at files this project wrote itself.
    X_train, X_val, X_test, y_train, y_val, y_test = (
        pd.read_pickle(split_paths[name]) for name in SPLIT_NAMES
    )
    print("Loaded preprocessed data")
else:
    # temp preprocessing until 02_preprocessing is done
    print("Preprocessed data not found, doing temp preprocessing...")
    if len(missing) < len(split_paths):
        # Some, but not all, cached files exist: say which ones are absent
        # so the stale cache can be regenerated or removed.
        print(f"  incomplete cache, missing: {', '.join(missing)}")
    from sklearn.preprocessing import StandardScaler

    df = pd.read_csv('../data/creditcard.csv')

    # The split below is positional, so it is only chronological if the rows
    # are in time order. Sort explicitly instead of trusting the file order;
    # mergesort is stable, so an already-sorted file is left untouched.
    # NOTE(review): assumes the CSV's `Time` column is the transaction
    # timestamp -- confirm against the dataset description.
    if 'Time' in df.columns:
        df = df.sort_values('Time', kind='mergesort')

    # chronological split: first 70% train, next 15% validation, rest test
    train_end = int(len(df) * 0.70)
    val_end = int(len(df) * 0.85)

    train_df = df.iloc[:train_end].copy()
    val_df = df.iloc[train_end:val_end].copy()
    test_df = df.iloc[val_end:].copy()

    # Fit the scaler on the training rows only, then reuse its statistics
    # for validation and test so no later-period data leaks into training.
    scaler = StandardScaler()
    train_df['Amount_scaled'] = scaler.fit_transform(train_df[['Amount']])
    val_df['Amount_scaled'] = scaler.transform(val_df[['Amount']])
    test_df['Amount_scaled'] = scaler.transform(test_df[['Amount']])

    # Model inputs: columns V1..V28 plus the scaled amount.
    feat_cols = [f'V{i}' for i in range(1, 29)] + ['Amount_scaled']

    X_train, y_train = train_df[feat_cols], train_df['Class']
    X_val, y_val = val_df[feat_cols], val_df['Class']
    X_test, y_test = test_df[feat_cols], test_df['Class']

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")
# Mean of the training labels, shown as a percentage.
print(f"Fraud rate: {y_train.mean()*100:.3f}%")


## Logistic Regression


In [None]:
# Baseline 1: logistic regression with 'balanced' class weights, fitted on
# the training split and scored on the validation split.
lr_params = {'class_weight': 'balanced', 'max_iter': 1000, 'random_state': SEED}
lr = LogisticRegression(**lr_params).fit(X_train, y_train)

# Hard labels plus the probability column for the second class (index 1).
lr_pred = lr.predict(X_val)
lr_prob = lr.predict_proba(X_val)[:, 1]
lr_metrics = calculate_all_metrics(y_val, lr_pred, lr_prob)

# One line per metric, four decimal places.
print("Logistic Regression:")
for metric_name, metric_value in lr_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")


In [None]:
# Confusion matrix for the logistic-regression validation predictions.
# NOTE(review): plot_confusion_matrix is a project helper (presumably from
# utils.metrics); the third argument looks like the plot title -- confirm.
plot_confusion_matrix(y_val, lr_pred, 'Logistic Regression')
plt.show()


In [None]:
# Curves computed from the logistic-regression validation probabilities.
# NOTE(review): going by its name, the helper draws precision-recall and
# ROC curves; its implementation is not visible here -- confirm.
plot_pr_roc_curves(y_val, lr_prob, 'Logistic Regression')
plt.show()


## Random Forest


In [None]:
# Baseline 2: random forest (100 trees, depth capped at 10, 'balanced'
# class weights), fitted on the training split using all available cores.
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'class_weight': 'balanced',
    'random_state': SEED,
    'n_jobs': -1,
}
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Hard labels plus the probability column for the second class (index 1).
rf_pred = rf.predict(X_val)
rf_prob = rf.predict_proba(X_val)[:, 1]
rf_metrics = calculate_all_metrics(y_val, rf_pred, rf_prob)

# One line per metric, four decimal places.
print("Random Forest:")
for metric_name, metric_value in rf_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")


In [None]:
# Confusion matrix for the random-forest validation predictions.
# NOTE(review): plot_confusion_matrix is a project helper (presumably from
# utils.metrics); the third argument looks like the plot title -- confirm.
plot_confusion_matrix(y_val, rf_pred, 'Random Forest')
plt.show()


In [None]:
# Rank the random forest's feature importances and chart the 15 largest.
feat_imp = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': rf.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

# Horizontal bars: importance on the x-axis, one feature per row.
top_features = feat_imp.head(15)
plt.figure(figsize=(10, 6))
sns.barplot(data=top_features, x='importance', y='feature')
plt.title('Top 15 Features')
plt.tight_layout()
plt.show()


## Comparison


In [None]:
# Side-by-side view of the two validation metric dicts, keyed by the label
# to show for each model.
# NOTE(review): print_metrics_table is a project helper; its output format
# is not visible here.
print_metrics_table({
    'Logistic Regression': lr_metrics,
    'Random Forest': rf_metrics
})


In [None]:
# Compare both models' validation probabilities against the same labels.
# NOTE(review): going by its name, the helper overlays one precision-recall
# curve per dict entry, using the keys as legend labels -- confirm.
plot_multiple_pr_curves({'LR': lr_prob, 'RF': rf_prob}, y_val)
plt.show()


## Save


In [None]:
# Persist both fitted baselines and their validation outputs under ../models.
os.makedirs('../models', exist_ok=True)

# Fitted estimators, one file each (logistic regression first, then forest).
for fitted_model, out_path in [
    (lr, '../models/lr_baseline.joblib'),
    (rf, '../models/rf_baseline.joblib'),
]:
    joblib.dump(fitted_model, out_path)

# Validation labels, per-model probabilities and metric dicts in one bundle.
results = {
    'y_val': y_val,
    'lr_prob': lr_prob,
    'rf_prob': rf_prob,
    'lr_metrics': lr_metrics,
    'rf_metrics': rf_metrics,
}
joblib.dump(results, '../models/baseline_results.joblib')
print("saved")
