In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os

In [2]:
# Input CSV, resolved relative to the notebook's working directory.
data_path = 'creditcard_10k_balanced.csv'

# Read the complete file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Display the (row count, column count) of the loaded frame as a quick sanity check.
df.shape

(10000, 31)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,81072.0,0.152504,-0.529875,1.176965,-1.202292,-0.797599,0.360677,-0.748678,0.153178,-2.342317,...,-0.257429,-0.426723,-0.139358,-0.98751,0.172512,0.135781,0.012291,-0.062251,26.0,0
1,32563.0,-2.19849,1.111433,0.962725,2.770526,0.265426,1.730794,-0.50549,-2.987673,-1.322962,...,3.05252,-0.083206,0.490815,-0.266094,-0.420221,0.340255,0.66737,-0.063418,136.9,0
2,27304.0,-3.532802,-2.228579,-0.903098,3.380612,0.658313,0.899873,1.017552,0.481798,-1.527926,...,0.135297,-0.083653,-0.024973,-1.254693,-0.267784,0.24626,0.61896,-0.047699,526.19,0
3,82271.0,1.253673,0.100741,-0.198298,1.077423,0.457483,0.596458,-0.00627,0.114836,0.50189,...,-0.199677,-0.377854,-0.230479,-1.340867,0.822117,-0.223347,0.032285,0.000885,12.99,0
4,141715.0,-0.186972,1.083575,-0.393382,1.096913,1.929655,-0.722338,1.385346,-0.378026,-0.594949,...,0.004281,0.254329,-0.353016,0.47072,0.165008,-0.405059,0.004793,-0.023925,1.0,0


In [7]:
# NOTE(review): the slice start (2) is past its stop (1), so no rows are selected
# and the cell renders only the column headers. This looks like leftover
# exploration - confirm whether the cell can be removed.
df[2:1]

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class


In [6]:
# The label is taken to be whatever column comes last in the frame.
target_column = df.columns[-1]

# y holds the label column; X holds every remaining column as features.
y = df[target_column]
X = df.drop(target_column, axis="columns")

In [7]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
split_kwargs = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

In [8]:
# Learn the per-feature mean and standard deviation from the training rows only,
# so no statistics from the held-out rows influence the transform.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transform to both partitions.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [9]:
# Make "mlaas_model_comparison" the active experiment for the runs logged below;
# the returned Experiment object is displayed as the cell output.
mlflow.set_experiment(experiment_name="mlaas_model_comparison")

<Experiment: artifact_location='file:///C:/Users/Annish/Documents/MLflow/mlruns/808561087966813408', creation_time=1743701534301, experiment_id='808561087966813408', last_update_time=1743701534301, lifecycle_stage='active', name='mlaas_model_comparison', tags={}>

In [10]:
# Tracker for the best candidate seen so far: the fitted estimator, its score,
# and its display name. Nothing has been evaluated yet, so start empty.
best_model, best_score, best_model_name = None, 0, ""

In [11]:
# Candidate classifiers to compare, keyed by the name that is logged to MLflow.
# Each estimator gets a fixed random_state so that repeated executions of the
# notebook are comparable.
models = {
    # probability=True enables predict_proba on the fitted SVC; random_state
    # seeds the randomised procedure used to produce those probability estimates.
    "SVM": SVC(probability=True, random_state=42),
    # class_weight='balanced' reweights classes inversely to their frequency.
    "LightGBM": LGBMClassifier(class_weight='balanced', random_state=42),
}


In [None]:
# Train every candidate in `models`, log each one as its own MLflow run, and
# persist the estimator with the highest test accuracy to disk.

# Start from a clean slate so that re-running this cell never compares against
# a best score left over from an earlier execution of the loop.
best_model = None
best_score = 0
best_model_name = ""

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before the first model is written.
best_model_path = "Notebooks/best_model.pkl"
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

for model_name, model in models.items():
    # Naming the run after the model keeps the runs distinguishable in MLflow.
    with mlflow.start_run(run_name=model_name):
        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Evaluate (macro averaging weights both classes equally)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')

        # Log the model name and all metrics for this run
        mlflow.log_param("model", model_name)
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
        })

        # Keep the best model so far. The comparison is strict, so on a tie the
        # earlier candidate is kept.
        # NOTE(review): selection is by accuracy, but the training log reports
        # 394 positives vs 7606 negatives; macro F1 may be a better criterion
        # for data this imbalanced - confirm.
        if accuracy > best_score:
            best_score = accuracy
            best_model = model
            best_model_name = model_name
            # NOTE(review): the pickled estimator was fitted on inputs already
            # transformed by `scaler`, and the scaler is not saved here -
            # confirm how inference code obtains it.
            joblib.dump(model, best_model_path)

[LightGBM] [Info] Number of positive: 394, number of negative: 7606
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000668 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [13]:
# Log the winning estimator to MLflow and report which model won.
# `is not None` is used rather than truthiness because some estimators define
# __len__, which makes a bare `if best_model:` unreliable.
if best_model is not None:
    # Log inside an explicit run. With no active run, the fluent API may start
    # one implicitly and leave it open, which makes the next
    # `mlflow.start_run()` fail with "run is already active".
    with mlflow.start_run(run_name=f"best_model_{best_model_name}"):
        mlflow.log_param("model", best_model_name)
        mlflow.log_metric("accuracy", best_score)
        mlflow.sklearn.log_model(best_model, "best_model")
    print(f"Best model: {best_model_name} with Accuracy: {best_score}")



Best model: LightGBM with Accuracy: 0.991
