# Phase 3: Model Building 



In [2]:
import pandas as pd
import json
import joblib

# Import the models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

In [3]:
# Load the train/test splits, the SMOTE-balanced training set, and the class
# weight mapping from files in the current working directory.
print("Loading data splits...")
X_train = pd.read_parquet('X_train.parquet')
# squeeze("columns") turns a single-column DataFrame into a Series. Unlike a bare
# squeeze(), it never collapses the row axis, so a one-row file still yields a
# Series rather than a scalar.
y_train = pd.read_parquet('y_train.parquet').squeeze("columns")
X_test = pd.read_parquet('X_test.parquet')
y_test = pd.read_parquet('y_test.parquet').squeeze("columns")

print("Loading SMOTE-balanced training data...")
X_train_smote = pd.read_parquet('X_train_smote.parquet')
y_train_smote = pd.read_parquet('y_train_smote.parquet').squeeze("columns")

print("Loading class weight dictionary...")
with open('class_weight_dict.json', 'r') as f:
    # JSON stores object keys as strings, so cast them back to integer class labels
    # (e.g. '0' -> 0, '1' -> 1) before handing the mapping to the estimators.
    class_weight_dict = {int(label): weight for label, weight in json.load(f).items()}

print("All data loaded successfully.")
print("-" * 50)


Loading data splits...
Loading SMOTE-balanced training data...
Loading class weight dictionary...
All data loaded successfully.
--------------------------------------------------


### Step 1: Baseline Model - Logistic Regression 

In [4]:
# Registry of fitted estimators, keyed by a human-readable label.
trained_models = {}
print("--- Training Logistic Regression Models ---")

# Model 1.1: logistic regression on the original training split, with the
# class imbalance handled through per-class weights.
print("Training Logistic Regression with class weights...")
lr_weighted = LogisticRegression(
    solver='liblinear',
    class_weight=class_weight_dict,
    random_state=42,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself
trained_models['Logistic Regression (Weighted)'] = lr_weighted
print("Done.")

# Model 1.2: logistic regression on the SMOTE-balanced training data,
# so no class weights are passed.
print("Training Logistic Regression with SMOTE data...")
lr_smote = LogisticRegression(
    solver='liblinear',
    random_state=42,
).fit(X_train_smote, y_train_smote)
trained_models['Logistic Regression (SMOTE)'] = lr_smote
print("Done.")
print("-" * 50)

--- Training Logistic Regression Models ---
Training Logistic Regression with class weights...




Done.
Training Logistic Regression with SMOTE data...
Done.
--------------------------------------------------




### Step 2: Ensemble Model - Random Forest

In [None]:
print("--- Training Random Forest Models ---")

# Model 2.1: random forest on the original training split with class weights.
# n_jobs=-1 lets scikit-learn use every available CPU core to speed up training.
print("Training Random Forest with class weights...")
rf_weighted = RandomForestClassifier(
    n_estimators=100,
    class_weight=class_weight_dict,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself
trained_models['Random Forest (Weighted)'] = rf_weighted
print("Done.")

# Model 2.2: random forest on the SMOTE-balanced training data (no class weights).
print("Training Random Forest with SMOTE data...")
rf_smote = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train_smote, y_train_smote)
trained_models['Random Forest (SMOTE)'] = rf_smote
print("Done.")
print("-" * 50)

### Step 3: Advanced Ensemble Model - LightGBM

In [None]:
print("--- Training LightGBM Models ---")

# Model 3.1: LightGBM on the original training split.
# LGBMClassifier takes a `class_weight` argument through its scikit-learn-style API,
# so the same weight mapping used for the other models is passed here.
print("Training LightGBM with class weights...")
lgbm_weighted = LGBMClassifier(
    class_weight=class_weight_dict,
    random_state=42,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself
trained_models['LightGBM (Weighted)'] = lgbm_weighted
print("Done.")

# Model 3.2: LightGBM on the SMOTE-balanced training data.
# Note: LightGBM's built-in `is_unbalance=True` parameter is another way to deal with
# imbalance; this notebook sticks to the planned SMOTE-generated data instead.
print("Training LightGBM with SMOTE data...")
lgbm_smote = LGBMClassifier(random_state=42).fit(X_train_smote, y_train_smote)
trained_models['LightGBM (SMOTE)'] = lgbm_smote
print("Done.")
print("-" * 50)

### Step 4: Save Trained Models to File 

In [None]:
# Persist the whole {label: fitted estimator} registry as a single joblib file
# in the current working directory.
model_filename = 'trained_models.joblib'
joblib.dump(trained_models, model_filename)

print(f"All models have been trained and saved to '{model_filename}'")
# The banner previously said "Phase 4"; it now matches this notebook's title
# ("Phase 3: Model Building").
print("\n--- Phase 3: Model Building Complete ---")