In [7]:
import pandas as pd
import os
import joblib

# Read the processed training table from disk, then separate the target
# column ('Label') from the remaining feature columns.
train_df = pd.read_csv("../data/processed/train_dataset.csv")

y_train = train_df['Label']
X_train = train_df.drop(columns='Label')

print(f"Training data loaded. Shape: {X_train.shape}")

Training data loaded. Shape: (2262314, 30)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def _impute_scale_then(estimator):
    """Return a Pipeline: median imputation -> standardisation -> `estimator`.

    Step names ('imputer', 'scaler', 'clf') are fixed so every wrapped model
    exposes the same pipeline layout. Fresh transformer instances are created
    on each call, so pipelines never share fitted state.
    """
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('clf', estimator),
    ])


# Every candidate model, keyed by its display name. Insertion order is the
# order in which a loop over `models.items()` will visit them.
models = {
    # Linear / neural models: wrapped so they see imputed, standardised inputs.
    "Logistic Regression": _impute_scale_then(
        LogisticRegression(max_iter=2000)
    ),
    "Linear SVM": _impute_scale_then(
        LinearSVC(dual="auto", max_iter=5000, random_state=42)
    ),
    "MLP Classifier": _impute_scale_then(
        MLPClassifier(
            hidden_layer_sizes=(100,),
            max_iter=300,
            early_stopping=True,
            validation_fraction=0.1,
            n_iter_no_change=5,
            random_state=42,
        )
    ),

    # Tree ensembles: used bare, with no imputation or scaling step in front.
    "Random Forest": RandomForestClassifier(
        n_estimators=100, n_jobs=-1, random_state=42
    ),
    "XGBoost": XGBClassifier(
        n_estimators=100, eval_metric='logloss', random_state=42
    ),
    "LightGBM": LGBMClassifier(n_estimators=100, random_state=42),
}

In [9]:
print("\nStarting model training...")

# Directory that receives one pickle per fitted model; created if missing.
output_dir = "../models/"
os.makedirs(output_dir, exist_ok=True)

# Fit each estimator in `models` on the full training set and persist the
# fitted object with joblib, in the dictionary's insertion order.
for name, model in models.items():
    print(f"--- Training {name} ---")

    model.fit(X_train, y_train)

    # File name is the display name lower-cased with spaces turned into
    # underscores, e.g. "Linear SVM" -> "linear_svm.pkl". os.path.join keeps
    # the path valid even if output_dir is edited to drop its trailing slash.
    filename = os.path.join(output_dir, f"{name.replace(' ', '_').lower()}.pkl")
    joblib.dump(model, filename)

    print(f"Saved trained model to: {filename}")

print("\nAll models have been trained and saved successfully.")


Starting model training...
--- Training Logistic Regression ---
Saved trained model to: ../models/logistic_regression.pkl
--- Training Linear SVM ---
Saved trained model to: ../models/linear_svm.pkl
--- Training MLP Classifier ---
Saved trained model to: ../models/mlp_classifier.pkl
--- Training Random Forest ---
Saved trained model to: ../models/random_forest.pkl
--- Training XGBoost ---
Saved trained model to: ../models/xgboost.pkl
--- Training LightGBM ---
[LightGBM] [Info] Number of positive: 445223, number of negative: 1817091
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.132944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6388
[LightGBM] [Info] Number of data points in the train set: 2262314, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.196800 -> initscore=-1.406417
[LightGBM] [Info] Start tr