In [None]:
import pandas as pd
import json
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

# ---------------------------------------------------------------------------
# Baseline model evaluation
#
# Reads the train/validation feature tables and the list of feature column
# names, fits a class-weighted logistic regression on the training split,
# scores the validation split, and writes the resulting metrics to
# 'data/baseline_metrics.json' (also echoed to stdout).
# ---------------------------------------------------------------------------

# Input tables and the JSON file naming the model input columns.
train = pd.read_csv('data/processed/train_features.csv')
val = pd.read_csv('data/processed/val_features.csv')
with open('data/processed/feature_columns.json', 'r') as f:
    features = json.load(f)

# Separate each split into model inputs (the listed feature columns) and the
# 'Class' target column.
X_train = train[features]
y_train = train['Class']
X_val = val[features]
y_val = val['Class']

# Baseline: Logistic Regression.
# fit() returns the fitted estimator, so construction and training are chained.
lr = LogisticRegression(class_weight='balanced', random_state=42).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second entry in
# lr.classes_ (presumably the positive class if 'Class' is 0/1 -- confirm).
y_pred_lr = lr.predict_proba(X_val)[:, 1]
# Hard labels: 1 where the score is strictly above 0.5, else 0.
y_pred_bin_lr = (y_pred_lr > 0.5).astype(int)

# Alternative baseline: RandomForest (comment out the LR section if using).
# rf = RandomForestClassifier(class_weight='balanced', random_state=42, n_estimators=100)
# rf.fit(X_train, y_train)
# y_pred_rf = rf.predict_proba(X_val)[:, 1]
# y_pred_bin_rf = rf.predict(X_val)

# Metrics (using LR).
# Threshold-dependent scores take the hard labels; the ranking scores
# (ROC AUC, average precision) take the continuous scores.
metrics = {
    metric_name: scorer(y_val, predictions)
    for metric_name, scorer, predictions in (
        ('precision', precision_score, y_pred_bin_lr),
        ('recall', recall_score, y_pred_bin_lr),
        ('f1', f1_score, y_pred_bin_lr),
        ('roc_auc', roc_auc_score, y_pred_lr),
        ('pr_auc', average_precision_score, y_pred_lr),
    )
}

# Persist the metrics, then show them in the cell output.
with open('data/baseline_metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)
print(metrics)