# Task 2 - Model Building and Training

In [1]:
import pandas as pd
import numpy as np
import os
import sys
import ipaddress
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Put the ../Model_Training directory itself on the import path, which makes
# the modules inside it importable by bare name.
# NOTE(review): the later `from Model_Training.<module> import ...` statements
# need the *parent* of Model_Training on the path (they appear to rely on the
# subsequent os.chdir("..") instead) -- confirm whether this line is still needed.
sys.path.append(os.path.abspath("../Model_Training"))
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, average_precision_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Work from the project root so that the relative "Data/", "Models/" and
# "Outputs/" paths used below, and the Model_Training package import, resolve.
# The guard makes the cell idempotent: an unconditional os.chdir("..") climbs
# one directory further every time the cell is re-run, silently breaking all
# of those relative paths.
if not os.path.isdir("Model_Training"):
    os.chdir("..")

In [3]:
from Model_Training.preprocessing import load_data, preprocess_data
from Model_Training.model_utils import train_model, evaluate_model, save_model

In [5]:
# Per-dataset configuration consumed by the training loop:
#   path         - CSV file to load
#   label_col    - name of the target column in that CSV
#   model_paths  - where each trained model is pickled
#   report_paths - where each evaluation report is written
# The "logistic" / "ensemble" keys select the model family.

_creditcard_config = {
    "path": "Data/creditcard.csv",
    "label_col": "Class",
    "model_paths": {
        "logistic": "Models/creditcard_logistic.pkl",
        "ensemble": "Models/creditcard_rf.pkl",
    },
    "report_paths": {
        "logistic": "Outputs/Evaluation_Reports/creditcard_logistic_report.txt",
        "ensemble": "Outputs/Evaluation_Reports/creditcard_rf_report.txt",
    },
}

_fraud_data_config = {
    "path": "Data/fraud_data.csv",
    "label_col": "class",  # lower-case here, unlike the creditcard dataset
    "model_paths": {
        "logistic": "Models/frauddata_logistic.pkl",
        "ensemble": "Models/frauddata_rf.pkl",
    },
    "report_paths": {
        "logistic": "Outputs/Evaluation_Reports/frauddata_logistic_report.txt",
        "ensemble": "Outputs/Evaluation_Reports/frauddata_rf_report.txt",
    },
}

# Insertion order is the processing order: creditcard first, then fraud_data.
datasets = {
    "creditcard": _creditcard_config,
    "fraud_data": _fraud_data_config,
}

In [None]:
# For every configured dataset: load it, preprocess it into train/test splits,
# then train, save and evaluate a logistic-regression model followed by a
# random-forest model.
# NOTE(review): the recorded creditcard output below reports a near-balanced
# test set (56750 vs 56976 rows). If preprocess_data resamples before it
# splits, the metrics are measured on resampled rows -- confirm.
for dataset_name, config in datasets.items():
    print(f"\n--- Processing {dataset_name} ---")

    df = load_data(config["path"])
    if df is None:
        # load_data returned None for this dataset; skip to the next one.
        continue

    X_train, X_test, y_train, y_test = preprocess_data(df, config["label_col"])
    model_paths = config["model_paths"]
    report_paths = config["report_paths"]

    # Logistic Regression
    logistic_model = train_model(X_train, y_train, model_type="logistic")
    save_model(logistic_model, model_paths["logistic"])
    evaluate_model(
        logistic_model,
        X_test,
        y_test,
        dataset_name,
        "Logistic Regression",
        report_paths["logistic"],
    )

    # Random Forest (Ensemble)
    rf_model = train_model(X_train, y_train, model_type="random_forest")
    save_model(rf_model, model_paths["ensemble"])
    evaluate_model(
        rf_model,
        X_test,
        y_test,
        dataset_name,
        "Random Forest",
        report_paths["ensemble"],
    )


--- Processing creditcard ---
Dataset: creditcard
Model: Logistic Regression
F1 Score: 0.9489
AUC-PR: 0.9912
Confusion Matrix:
[[55361  1389]
 [ 4289 52687]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95     56750
           1       0.97      0.92      0.95     56976

    accuracy                           0.95    113726
   macro avg       0.95      0.95      0.95    113726
weighted avg       0.95      0.95      0.95    113726


