# Model Building and Training

This notebook covers:
- Data preparation and stratified splitting
- Baseline model (Logistic Regression)
- Ensemble models (Random Forest, XGBoost, LightGBM)
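- Cross-validation (planned: `StratifiedKFold` and `cross_val_score` are imported, but no cross-validation step is run in this notebook yet)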
- Model comparison and selection

**Author**: Adey Innovations Inc. Data Science Team  
**Date**: December 2025


## 1. Setup and Data Preparation


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix, 
                            precision_recall_curve, average_precision_score,
                            f1_score, precision_score, recall_score, roc_auc_score)
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Optional: XGBoost and LightGBM
# Each library is imported inside its own try/except so that a missing package
# only prints a notice; the *_AVAILABLE flags record whether the import worked
# and are checked by the later training and comparison cells.
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available")

try:
    from lightgbm import LGBMClassifier
    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False
    print("LightGBM not available")

# Add parent directory
# '..' is resolved relative to the current working directory, so the `src`
# imports below assume the notebook is run from its own folder.
import sys
sys.path.append('..')
# NOTE(review): the star import hides which names this notebook depends on.
# Presumably the helpers called in later cells (stratified_train_test_split,
# apply_smote, train_logistic_regression, train_random_forest, train_xgboost,
# train_lightgbm, evaluate_model, compare_models, save_model) come from here;
# confirm and consider importing them explicitly.
from src.modeling import *
from src.visualization import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve

print("Libraries imported successfully!")


In [None]:
# Build the modeling dataset from the raw CSV files
# (run feature engineering first, or load already-processed data instead)
from src.data_loader import load_fraud_data, load_ip_to_country, map_ip_to_country
from src.feature_engineering import (
    create_time_features,
    create_transaction_velocity_features,
    create_device_features,
    encode_categorical_features,
    prepare_features_for_modeling,
)

# Read the two raw inputs
fraud_df = load_fraud_data('../data/raw/Fraud_Data.csv')
ip_country_df = load_ip_to_country('../data/raw/IpAddress_to_Country.csv')

# Feature engineering: the IP-to-country mapping needs the lookup frame,
# the remaining steps each take and return the fraud frame, applied in order
fraud_df = map_ip_to_country(fraud_df, ip_country_df)
for feature_step in (
    create_time_features,
    create_transaction_velocity_features,
    create_device_features,
):
    fraud_df = feature_step(fraud_df)

# Encode the categorical columns; the second return value is not needed here
categorical_columns = ['source', 'browser', 'sex', 'country']
fraud_df, _ = encode_categorical_features(fraud_df, categorical_columns)

# Split the engineered frame into feature matrix and target
X, y = prepare_features_for_modeling(fraud_df, target_col='class')
print(f"Features: {X.shape}, Target: {y.shape}")


In [None]:
# Hold out 20% of the rows for testing using the stratified split helper
split_parts = stratified_train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Report the size of each split and the class balance of the training part
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print(
    "Data Split:",
    f"Training: {n_train_rows} samples",
    f"Testing: {n_test_rows} samples",
    sep="\n",
)
print(f"\nTraining class distribution:")
print(y_train.value_counts())


In [None]:
# Resample the training split with SMOTE; the test split is not passed in
X_train_smote, y_train_smote = apply_smote(X_train, y_train)

# Show the resampled training size and its class counts
resampled_class_counts = pd.Series(y_train_smote).value_counts()
print("After SMOTE:")
print(f"Training: {X_train_smote.shape[0]} samples")
print(resampled_class_counts)


## 2. Baseline Model - Logistic Regression


In [None]:
# Fit the Logistic Regression baseline on the SMOTE-resampled training data
lr_model = train_logistic_regression(X_train_smote, y_train_smote)

# Score it on the untouched test split
lr_metrics = evaluate_model(lr_model, X_test, y_test)

# Print the headline metrics (display label -> key in the metrics dict)
print("Logistic Regression Results:")
print("=" * 50)
for metric_label, metric_key in (
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1-Score", 'f1_score'),
    ("ROC-AUC", 'roc_auc'),
    ("AUC-PR", 'average_precision'),
):
    print(f"{metric_label}: {lr_metrics[metric_key]:.4f}")

print("\nClassification Report:")
print(lr_metrics['classification_report'])


In [None]:
# Three-panel summary of the Logistic Regression model on the test split
y_pred_lr = lr_model.predict(X_test)
y_proba_lr = lr_model.predict_proba(X_test)[:, 1]  # second predict_proba column

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
ax_cm, ax_pr, ax_coef = axes

# Panel 1: confusion matrix
cm = confusion_matrix(y_test, y_pred_lr)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set(
    xlabel='Predicted',
    ylabel='Actual',
    title='Confusion Matrix - Logistic Regression',
)

# Panel 2: precision-recall curve (thresholds are not needed for the plot)
precision, recall, _ = precision_recall_curve(y_test, y_proba_lr)
ax_pr.plot(recall, precision, color='blue', linewidth=2)
ax_pr.fill_between(recall, precision, alpha=0.3)
ax_pr.set(
    xlabel='Recall',
    ylabel='Precision',
    title=f'PR Curve (AUC={lr_metrics["average_precision"]:.3f})',
)

# Panel 3: ten largest absolute coefficients, used as a feature-importance proxy
coef_magnitudes = np.abs(lr_model.coef_[0])
importance = pd.DataFrame({
    'feature': X.columns,
    'importance': coef_magnitudes,
})
importance = importance.sort_values('importance', ascending=False).head(10)
ax_coef.barh(importance['feature'], importance['importance'])
ax_coef.invert_yaxis()  # largest coefficient at the top
ax_coef.set(xlabel='Absolute Coefficient', title='Top 10 Feature Importances')

plt.tight_layout()
plt.show()


## 3. Ensemble Models


In [None]:
# Fit a Random Forest on the SMOTE-resampled training data and score it on the test split
rf_model = train_random_forest(
    X_train_smote,
    y_train_smote,
    n_estimators=100,
    max_depth=10,
)
rf_metrics = evaluate_model(rf_model, X_test, y_test)

# Print the headline metrics (display label -> key in the metrics dict)
print("Random Forest Results:")
print("=" * 50)
for metric_label, metric_key in (
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1-Score", 'f1_score'),
    ("ROC-AUC", 'roc_auc'),
    ("AUC-PR", 'average_precision'),
):
    print(f"{metric_label}: {rf_metrics[metric_key]:.4f}")


In [None]:
# XGBoost: trained only when the optional import at the top succeeded
if not XGBOOST_AVAILABLE:
    print("XGBoost not available - skipping")
else:
    xgb_model = train_xgboost(
        X_train_smote,
        y_train_smote,
        n_estimators=100,
        max_depth=6,
    )
    xgb_metrics = evaluate_model(xgb_model, X_test, y_test)

    # Print the headline metrics (display label -> key in the metrics dict)
    print("XGBoost Results:")
    print("=" * 50)
    for metric_label, metric_key in (
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F1-Score", 'f1_score'),
        ("ROC-AUC", 'roc_auc'),
        ("AUC-PR", 'average_precision'),
    ):
        print(f"{metric_label}: {xgb_metrics[metric_key]:.4f}")


In [None]:
# LightGBM: trained only when the optional import at the top succeeded
if not LIGHTGBM_AVAILABLE:
    print("LightGBM not available - skipping")
else:
    lgb_model = train_lightgbm(
        X_train_smote,
        y_train_smote,
        n_estimators=100,
        max_depth=6,
    )
    lgb_metrics = evaluate_model(lgb_model, X_test, y_test)

    # Print the headline metrics (display label -> key in the metrics dict)
    print("LightGBM Results:")
    print("=" * 50)
    for metric_label, metric_key in (
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F1-Score", 'f1_score'),
        ("ROC-AUC", 'roc_auc'),
        ("AUC-PR", 'average_precision'),
    ):
        print(f"{metric_label}: {lgb_metrics[metric_key]:.4f}")


## 4. Model Comparison


In [None]:
# Gather every trained model under its display name, in training order
models = {}
models['Logistic Regression'] = lr_model
models['Random Forest'] = rf_model
# The boosted models exist only when their optional import succeeded
if XGBOOST_AVAILABLE:
    models.update({'XGBoost': xgb_model})
if LIGHTGBM_AVAILABLE:
    models.update({'LightGBM': lgb_model})

# Evaluate all of them on the same test split and print the table without its index
comparison_df = compare_models(models, X_test, y_test)
comparison_table = comparison_df.to_string(index=False)
print("Model Comparison:")
print("=" * 70)
print(comparison_table)


In [None]:
# Grouped bar chart: one group per model, one bar per metric
fig, ax = plt.subplots(figsize=(12, 6))
metrics = ['Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'AUC-PR']
x = np.arange(len(comparison_df))
width = 0.15

# Shift each metric's bars right by one bar width so they sit side by side
for i, metric in enumerate(metrics):
    bar_positions = x + i*width
    ax.bar(bar_positions, comparison_df[metric], width, label=metric)

ax.set(xlabel='Model', ylabel='Score', title='Model Performance Comparison')

# Centre the tick under the middle (third) of the five bars in each group
group_centres = x + width * 2
ax.set_xticks(group_centres)
ax.set_xticklabels(comparison_df['Model'])

ax.legend(loc='lower right')
ax.set_ylim([0, 1])
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()


## 5. Model Selection and Justification


In [None]:
# Select best model based on F1-Score (balances precision and recall)
import os

# Rank explicitly by F1-Score instead of relying on whatever row order
# compare_models() returned; taking row 0 only matches the stated criterion if
# the frame happens to be pre-sorted by F1. mergesort is stable, so tied models
# keep their existing relative order, and NaN scores are sorted last.
ranked_df = comparison_df.sort_values('F1-Score', ascending=False, kind='mergesort')
best_model_name = ranked_df.iloc[0]['Model']
best_model = models[best_model_name]

print("="*70)
print("MODEL SELECTION")
print("="*70)
print(f"""
SELECTED MODEL: {best_model_name}

JUSTIFICATION:
1. Performance: Highest F1-Score balances precision (avoiding false positives 
   that frustrate customers) with recall (catching actual fraud).
   
2. For fraud detection, we prioritize:
   - High Recall: Catching as much fraud as possible
   - Reasonable Precision: Minimizing false alarms
   - AUC-PR: Important for imbalanced datasets
   
3. The {best_model_name} achieves the best balance of these metrics.

4. Additional considerations:
   - Interpretability: {'Higher' if 'Logistic' in best_model_name else 'Lower, but SHAP can help'}
   - Training time: Fast enough for production
   - Inference speed: Suitable for real-time scoring
""")

# Save the best model
# Make sure the output directory exists first: an empty models/ folder is not
# tracked by version control, and a missing directory would otherwise fail this
# last cell only after every model has been trained.
os.makedirs('../models', exist_ok=True)
model_path = f'../models/best_model_{best_model_name.lower().replace(" ", "_")}.pkl'
save_model(best_model, model_path)
print(f"\nModel saved to ../models/")


# Model Building and Training

This notebook covers:
- Data preparation and stratified splitting
- Baseline model (Logistic Regression)
- Ensemble models (Random Forest, XGBoost, LightGBM)
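- Cross-validation (planned: `StratifiedKFold` and `cross_val_score` are imported, but no cross-validation step is run in this notebook yet)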
- Model comparison and selection

**Author**: Adey Innovations Inc. Data Science Team  
**Date**: December 2025
