In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import mlflow
import mlflow.sklearn
import warnings
# Installs a blanket "ignore" filter: every warning category from every module
# is silenced for the rest of the kernel session, including deprecation
# warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [2]:
# Load the engineered per-customer feature table, including the target column.
data = pd.read_csv("../Data/processed_features_with_target.csv")

# Schema / null-count summary first (info() prints directly), then the row
# preview as the cell's final expression so the notebook actually renders it.
# A bare `data.head()` in the middle of a cell is evaluated and thrown away.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3742 entries, 0 to 3741
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   total_amount           3742 non-null   float64
 1   avg_amount             3742 non-null   float64
 2   transaction_count      3742 non-null   float64
 3   std_amount             3742 non-null   float64
 4   avg_transaction_hour   3742 non-null   float64
 5   avg_transaction_day    3742 non-null   float64
 6   avg_transaction_month  3742 non-null   float64
 7   CustomerId             3742 non-null   object 
 8   is_high_risk           3742 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 263.2+ KB


In [3]:
# Target is the binary risk flag; features are every remaining column except
# the customer identifier.
y = data["is_high_risk"]
X = data.drop(["CustomerId", "is_high_risk"], axis=1)

# 80/20 hold-out split, stratified on the target so both partitions keep the
# same class balance; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

for label, frame in (("Training set shape:", X_train), ("Test set shape:", X_test)):
    print(label, frame.shape)

Training set shape: (2993, 7)
Test set shape: (749, 7)


In [5]:
# Learn the standardization statistics from the training split only, then
# apply that same fitted transform to both splits so no test-set information
# leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

print("Feature scaling done")

Feature scaling done


In [6]:
mlflow.set_experiment("CreditRisk")

# Baseline run: logistic regression trained on the standardized features.
# Metrics are computed on the held-out test split and logged to MLflow, and
# the fitted estimator is registered as a new version of "CreditRiskModel".
with mlflow.start_run(run_name="logistic_regression"):
    lr = LogisticRegression(random_state=42)
    lr.fit(X_train_scaled, y_train)

    # Hard labels feed the threshold-based metrics; the positive-class
    # probability (column 1 of predict_proba) feeds ROC-AUC.
    y_pred = lr.predict(X_test_scaled)
    y_proba = lr.predict_proba(X_test_scaled)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)

    mlflow.log_param("model_type", "Logistic Regression")
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("roc_auc", roc_auc)

    # Pass the logged-model name via `name=` (as the random forest cell does)
    # instead of the positional `artifact_path` slot, which MLflow 3 deprecates.
    # NOTE(review): only the estimator is logged; it was fitted on the output
    # of `scaler`, which is not stored with it. Confirm that whatever serves
    # this model applies the same scaling before calling predict.
    mlflow.sklearn.log_model(lr, name="model", registered_model_name="CreditRiskModel")
    print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")

2025/12/23 12:17:02 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/23 12:17:02 INFO mlflow.store.db.utils: Updating database tables
2025/12/23 12:17:02 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/23 12:17:02 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/23 12:17:03 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/23 12:17:03 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/23 12:17:47 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/23 12:17:47 INFO mlflow.store.db.utils: Updating database tables
2025/12/23 12:17:47 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/23 12:17:47 INFO alembic.runtime.migration: Will assume non-transactional DDL.
Successfully registered model 'CreditRiskModel'.
Created version '1' of model 'CreditRiskModel'.


Accuracy: 0.9079, F1 Score: 0.8779, ROC-AUC: 0.9498


In [8]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

mlflow.set_experiment("CreditRiskModelExperiment")

# Random forest run: trained on the standardized features, evaluated on the
# held-out test split, and registered as a new version of "CreditRiskModel".
with mlflow.start_run(run_name="random_forest"):
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    # ROC-AUC is a ranking metric and must be scored on the positive-class
    # probability. Scoring it on the hard 0/1 labels collapses the curve to a
    # single operating point and makes the value incomparable with the
    # logistic regression run, which uses probabilities.
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    mlflow.log_param("model_type", "Random Forest")
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
    mlflow.log_metric("f1_score", f1_score(y_test, y_pred))
    mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_proba))

    # NOTE(review): only the estimator is logged; it was fitted on the output
    # of `scaler`, which is not stored with it. Confirm that whatever serves
    # this model applies the same scaling before calling predict.
    mlflow.sklearn.log_model(
        sk_model=model,
        name="credit_risk_model",
        registered_model_name="CreditRiskModel"
    )

Registered model 'CreditRiskModel' already exists. Creating a new version of this model...
Created version '2' of model 'CreditRiskModel'.


In [9]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Resolve the newest registered version at run time instead of hard-coding it.
# Every execution of the training cells registers new versions, so a literal
# version number is only correct on the very first run against an empty
# registry; on a re-run it would promote a stale model.
registered_versions = client.search_model_versions("name='CreditRiskModel'")
if not registered_versions:
    raise RuntimeError("No registered versions of 'CreditRiskModel' to promote")
latest_version = max(int(mv.version) for mv in registered_versions)

# Move that version to the "Production" stage and archive whatever held the
# stage before. The serving cell loads "models:/CreditRiskModel/Production",
# so the stage-based API is kept here.
# NOTE(review): registry stages are deprecated in recent MLflow releases in
# favour of aliases; migrate this cell and the loader URI together.
client.transition_model_version_stage(
    name="CreditRiskModel",
    version=latest_version,
    stage="Production",
    archive_existing_versions=True
)

<ModelVersion: aliases=[], creation_timestamp=1766482355617, current_stage='Production', deployment_job_state=None, description=None, last_updated_timestamp=1766483389951, metrics=None, model_id=None, name='CreditRiskModel', params=None, run_id='32462da60ccf4731a1fd8f1a09f4f063', run_link=None, source='models:/m-000fe749da7d4c878af6a117ae067eb6', status='READY', status_message=None, tags={}, user_id=None, version=2>

In [10]:
import mlflow
from fastapi import FastAPI
import pandas as pd

# Web application object for the serving endpoints.
app = FastAPI()

# Point MLflow at the local SQLite store before resolving any registry URI,
# then load whichever "CreditRiskModel" version currently holds the
# "Production" stage as a generic pyfunc model.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
model = mlflow.pyfunc.load_model(model_uri="models:/CreditRiskModel/Production")