In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os

In [2]:
# Read the CSV and keep a reproducible random subset of 1000 rows
# (fixed seed so the same rows are drawn on every run).
data_path = 'creditcard.csv'
df = pd.read_csv(data_path).sample(n=1000, random_state=42)

In [3]:
# Show (rows, columns) of the sampled frame as a quick sanity check.
df.shape

(1000, 31)

In [4]:
# Preview the first rows to eyeball column names and value ranges.
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
43428,41505.0,-16.526507,8.584972,-18.649853,9.505594,-13.793819,-2.832404,-16.701694,7.517344,-8.507059,...,1.190739,-1.12767,-2.358579,0.673461,-1.4137,-0.462762,-2.018575,-1.042804,364.19,1
49906,44261.0,0.339812,-2.743745,-0.13407,-1.385729,-1.451413,1.015887,-0.524379,0.22406,0.899746,...,-0.213436,-0.942525,-0.526819,-1.156992,0.311211,-0.746647,0.040996,0.102038,520.12,0
29474,35484.0,1.39959,-0.590701,0.168619,-1.02995,-0.539806,0.040444,-0.712567,0.002299,-0.971747,...,0.102398,0.168269,-0.166639,-0.81025,0.505083,-0.23234,0.011409,0.004634,31.0,0
276481,167123.0,-0.432071,1.647895,-1.669361,-0.349504,0.785785,-0.630647,0.27699,0.586025,-0.484715,...,0.358932,0.873663,-0.178642,-0.017171,-0.207392,-0.157756,-0.237386,0.001934,1.5,0
278846,168473.0,2.01416,-0.137394,-1.015839,0.327269,-0.182179,-0.956571,0.043241,-0.160746,0.363241,...,-0.238644,-0.6164,0.347045,0.061561,-0.360196,0.17473,-0.078043,-0.070571,0.89,0


In [5]:
# The label is taken to be the last column of the frame; everything else is a feature.
target_column = df.columns[-1]
y = df[target_column]
X = df.drop(target_column, axis="columns")

In [6]:
# Hold out 20% of the rows for evaluation with a fixed seed.
# The split is stratified on the label so that class proportions are preserved as
# closely as possible in both parts, which matters when one class is very rare.
# Stratification needs at least two rows per class, so fall back to a plain random
# split when the rarest class has fewer than that instead of raising.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

In [7]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits so the test rows never influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [8]:
# Select the MLflow experiment that the runs started below will be recorded under.
experiment_name = "mlaas_model_comparison"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='file:///C:/Users/Annish/Documents/MLflow/mlruns/808561087966813408', creation_time=1743701534301, experiment_id='808561087966813408', last_update_time=1743701534301, lifecycle_stage='active', name='mlaas_model_comparison', tags={}>

In [9]:
# Trackers for the best candidate seen so far: the fitted estimator, its score,
# and its display name. Nothing has been evaluated yet.
best_model, best_score, best_model_name = None, 0, ""

In [10]:
# Candidate classifiers to compare, keyed by the name used when logging to MLflow.
# Both are seeded so repeated executions produce the same fitted models:
# SVC(probability=True) shuffles data internally for its probability estimates,
# and LightGBM uses its seed for any randomised sampling options.
models = {
    "SVM": SVC(probability=True, random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
}


In [12]:
# Train every candidate, log its metrics to MLflow, and remember the best one.
#
# The trackers are reset here so that re-running this cell never compares against
# a score or estimator left over from an earlier execution of the notebook.
best_model = None
best_score = 0
best_model_name = ""

for model_name, model in models.items():
    # One MLflow run per candidate, named after the model so the runs are easy to
    # tell apart in the tracking UI.
    with mlflow.start_run(run_name=model_name):
        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Evaluate. Macro averaging gives every class equal weight.
        # zero_division=0 yields the same value as the default (0) when a class is
        # never predicted or never present, but without an UndefinedMetricWarning.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
        recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

        # Log the model name and all metrics for this run in one call each.
        mlflow.log_param("model", model_name)
        mlflow.log_metrics(
            {
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "f1_score": f1,
            }
        )

        # Save best model. Strict '>' means the first model wins a tie.
        # NOTE(review): selection is by accuracy; the training log shows a heavily
        # imbalanced label, where always predicting the majority class also scores
        # close to 1.0 -- consider selecting on macro F1 or recall instead.
        if accuracy > best_score:
            best_score = accuracy
            best_model = model
            best_model_name = model_name
            # Overwrites the file each time a better candidate is found.
            joblib.dump(model, "best_model.pkl")

[LightGBM] [Info] Number of positive: 2, number of negative: 798
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002500 -> initscore=-5.988961
[LightGBM] [Info] Start training from score -5.988961


In [13]:
# Save best model: log the winning estimator to MLflow and report it.
# Compare against None explicitly rather than relying on truthiness, because
# estimators that define __len__ (e.g. pipelines) can evaluate as falsy or raise.
if best_model is not None:
    # Log inside an explicit run. Calling log_model with no active run makes
    # MLflow start an unnamed run implicitly and leave it open afterwards.
    with mlflow.start_run(run_name=f"best_model_{best_model_name}"):
        # Record which candidate won and its score alongside the artifact.
        mlflow.log_param("model", best_model_name)
        mlflow.log_metric("accuracy", best_score)
        mlflow.sklearn.log_model(best_model, "best_model")
    print(f"Best model: {best_model_name} with Accuracy: {best_score}")



Best model: SVM with Accuracy: 1.0
