# Task 5 - Model Training and Tracking

### Split the Data

In [None]:
from sklearn.model_selection import train_test_split

# Pull the target column out, then treat every remaining column as a feature.
y = processed_df['is_high_risk']
X = processed_df.drop(columns=['is_high_risk'])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Choose and Train Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Logistic regression; max_iter=1000 sets the solver's iteration limit.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Random forest with a fixed seed so repeated runs build the same trees.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

### Hyperparameter Tuning (Random Search)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameter values for the random forest.
param_dist = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
}

# Sample 3 of the candidate combinations and score each one with 3-fold
# cross-validated F1, using the training split only.
search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=3,
    cv=3,
    scoring='f1',
    random_state=42,
)
search.fit(X_train, y_train)

# Estimator for the best-scoring parameter combination found by the search.
best_model = search.best_estimator_

### Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate(model, X_test, y_test):
    """Compute classification metrics for a fitted model on a held-out set.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature data passed to the model's prediction methods.
        y_test: True labels that the predictions are compared against.

    Returns:
        dict: Metric name to score, with keys ``'accuracy'``, ``'precision'``,
        ``'recall'``, ``'f1'`` and ``'roc_auc'`` (in that order).
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score for ROC AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Metrics that compare hard label predictions against the true labels.
    label_metrics = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    scores = {
        name: metric_fn(y_test, predicted_labels)
        for name, metric_fn in label_metrics.items()
    }

    # ROC AUC is the only metric computed from scores rather than labels.
    scores['roc_auc'] = roc_auc_score(y_test, positive_scores)
    return scores

# Score the model selected by the search on the held-out split and show the metrics.
results = evaluate(model=best_model, X_test=X_test, y_test=y_test)
print(results)

### Track Model with MLflow

In [None]:
import mlflow
import mlflow.sklearn

# Everything logged inside this context is attached to a single MLflow run.
with mlflow.start_run():
    # Store the fitted estimator with the run.
    mlflow.sklearn.log_model(best_model, artifact_path="credit_risk_model")

    # Record each evaluation score under its metric name.
    for metric_name, metric_value in results.items():
        mlflow.log_metric(key=metric_name, value=metric_value)

### Unit Testing

In [None]:
# tests/test_data_processing.py
import pandas as pd
from src.feature_engineering import feature_engineering

def test_feature_engineering_shape():
    """Check that feature_engineering returns two rows for a two-row, two-customer input."""
    transactions = pd.DataFrame(
        {
            'CustomerId': [1, 2],
            'Amount': [100, 200],
            'TransactionStartTime': ['2023-01-01', '2023-01-02'],
        }
    )
    engineered = feature_engineering(transactions)
    row_count = engineered.shape[0]
    assert row_count == 2

def test_feature_engineering_not_empty():
    """Check that feature_engineering yields a non-empty result for a single transaction."""
    single_transaction = pd.DataFrame(
        {
            'CustomerId': [1],
            'Amount': [50],
            'TransactionStartTime': ['2023-01-01'],
        }
    )
    engineered = feature_engineering(single_transaction)
    assert not engineered.empty