# Import libraries

In [11]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib
import os


# Load and prepare data

In [8]:
def load_data(filepath):
    """Read a CSV file and return a train/test split of features and target.

    The ``is_high_risk`` column is the target. It and ``CustomerId`` are
    removed from the feature matrix. 20% of the rows are held out for
    testing, using a fixed seed (42) so the split is repeatable.

    Args:
        filepath: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        ``X_train, X_test, y_train, y_test`` as produced by
        ``train_test_split``.
    """
    frame = pd.read_csv(filepath)
    non_features = ['is_high_risk', 'CustomerId']
    features = frame.drop(non_features, axis=1)
    target = frame['is_high_risk']
    return train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
    )

# Split the input DataFrame into train/test features and target

In [29]:
def split_data(df, target_column='is_high_risk'):
    """Split a DataFrame into train/test features and target.

    The target column and a fixed list of identifier/categorical/datetime
    columns are removed from the features; any of them that are absent from
    ``df`` are silently skipped. 20% of the rows are held out for testing,
    using a fixed seed (42).

    Args:
        df: Input DataFrame containing ``target_column``.
        target_column: Name of the label column.

    Returns:
        ``X_train, X_test, y_train, y_test`` as produced by
        ``train_test_split``.
    """
    non_feature_columns = [
        target_column,
        'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
        'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory',
        'ChannelId',
        'TransactionStartTime',  # datetime column is dropped for now
    ]
    features = df.drop(columns=non_feature_columns, errors='ignore')
    labels = df[target_column]
    return train_test_split(features, labels, test_size=0.2, random_state=42)

    

# Train a Logistic Regression model

In [13]:
def train_logistic_regression(X_train, y_train):
    """Grid-search a logistic regression and return the refit best estimator.

    The regularisation strength ``C`` is searched over 0.1, 1.0 and 10.0
    using 5-fold cross-validation scored by F1.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    search = GridSearchCV(
        estimator=LogisticRegression(max_iter=1000),
        param_grid={'C': [0.1, 1.0, 10.0]},
        scoring='f1',
        cv=5,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

# Train a Random Forest Classifier

In [14]:
def train_random_forest(X_train, y_train):
    """Grid-search a random forest and return the refit best estimator.

    ``n_estimators`` (50, 100) and ``max_depth`` (5, 10, unlimited) are
    searched using 5-fold cross-validation scored by F1.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    search_space = {
        'n_estimators': [50, 100],
        'max_depth': [5, 10, None],
    }
    search = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=search_space,
        scoring='f1',
        cv=5,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

# Evaluation function

In [15]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Test features.
        y_test: True test labels.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``roc_auc``. The first four are computed from the predicted labels;
        ``roc_auc`` uses the second column of ``predict_proba``.
    """
    predicted_labels = model.predict(X_test)

    # Metrics that compare hard label predictions against the truth.
    label_scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    results = {
        key: scorer(y_test, predicted_labels)
        for key, scorer in label_scorers.items()
    }

    # ROC AUC needs a continuous score: take column 1 of predict_proba.
    positive_scores = model.predict_proba(X_test)[:, 1]
    results['roc_auc'] = roc_auc_score(y_test, positive_scores)
    return results

# Train a model with grid search and log it to MLflow

In [16]:
def train_and_log_model(name, model, param_grid, X_train, X_test, y_train, y_test):
    """Grid-search ``model``, log the best estimator to MLflow and save it locally.

    A new MLflow run named ``name`` is opened. Inside it the model is tuned
    with 3-fold cross-validation scored by F1, evaluated on the test split,
    and its metrics and the fitted estimator are logged. The estimator is
    also pickled to ``./models/{name}.pkl``.

    Args:
        name: Run name; also used as the MLflow artifact path and file stem.
        model: Unfitted scikit-learn estimator to tune.
        param_grid: Parameter grid passed to ``GridSearchCV``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        Tuple ``(best_model, metrics)`` where ``metrics`` is the dict
        returned by ``evaluate_model``.
    """
    with mlflow.start_run(run_name=name):
        grid = GridSearchCV(model, param_grid, cv=3, scoring='f1')
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_

        # evaluate_model expects (model, X_test, y_test) and derives the
        # predictions itself; passing (y_test, y_pred, y_proba) would fail.
        metrics = evaluate_model(best_model, X_test, y_test)
        mlflow.log_metrics(metrics)

        mlflow.sklearn.log_model(best_model, name)

        model_path = f"./models/{name}.pkl"
        # joblib.dump does not create missing parent directories.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        joblib.dump(best_model, model_path)
        print(f"Saved model to {model_path}")

        return best_model, metrics

# Load the dataset from CSV, train both models, evaluate them, and log the results with MLflow

In [33]:
def run_training_pipeline(data_path):
    """Train, evaluate and log both models, then save and register the best one.

    The CSV at ``data_path`` is split with ``split_data``. A logistic
    regression and a random forest are tuned, evaluated on the test split and
    logged to a single MLflow run. The model with the highest test F1 is
    pickled to ``../data/models/best_model.pkl`` and registered in MLflow as
    ``CreditRiskBestModel``.

    Args:
        data_path: Path of the labeled CSV dataset.
    """
    df = pd.read_csv(data_path)
    X_train, X_test, y_train, y_test = split_data(df)

    #mlflow.set_experiment("Credit Risk Modeling")

    with mlflow.start_run():
        models = {
            'LogisticRegression': train_logistic_regression(X_train, y_train),
            'RandomForest': train_random_forest(X_train, y_train)
        }

        best_name = None
        best_model = None
        # Start below any valid F1 so a model is always selected, even when
        # every candidate scores 0 on the test split.
        best_score = float('-inf')

        for name, model in models.items():
            metrics = evaluate_model(model, X_test, y_test)

            # Both models log into the same run, so keys are prefixed with
            # the model name. Unprefixed, the second model's metrics would
            # shadow the first's, and MLflow rejects a param key that is
            # logged again with a different value.
            mlflow.log_params(
                {f"{name}_{key}": value for key, value in model.get_params().items()}
            )
            mlflow.log_metrics(
                {f"{name}_{key}": value for key, value in metrics.items()}
            )
            mlflow.sklearn.log_model(model, name)

            if metrics['f1'] > best_score:
                best_score = metrics['f1']
                best_model = model
                best_name = name

        # Record which candidate was selected and registered.
        mlflow.set_tag("best_model", best_name)

        # Save the best model locally; joblib.dump does not create missing
        # parent directories.
        best_model_path = '../data/models/best_model.pkl'
        os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
        joblib.dump(best_model, best_model_path)

        # Register the best model to MLflow
        mlflow.sklearn.log_model(best_model, "model", registered_model_name="CreditRiskBestModel")

# Run training pipeline

In [34]:
# Ensure output directories exist: './models' is where train_and_log_model
# writes its pickles, and '../data/models' is where run_training_pipeline
# saves the best model.
for output_dir in ('./models', '../data/models'):
    os.makedirs(output_dir, exist_ok=True)
run_training_pipeline("../data/labeled_data.csv")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Successfully registered model 'CreditRiskBestModel'.
Created version '1' of model 'CreditRiskBestModel'.
