## P7 - Prep the data 

---

In [3]:
# fmt: off

import warnings

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a global filter, so warnings raised by any library
# used later in the notebook are hidden as well.
warnings.filterwarnings('ignore')


In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

---

In [5]:
# Final cleaning function
def cleaning(features, test_features, encoding="ohe"):
    """Encode, impute and scale the training and test application tables.

    Parameters
    ----------
    features : pandas.DataFrame
        Training table. Must contain the ``SK_ID_CURR`` and ``TARGET`` columns.
    test_features : pandas.DataFrame
        Test table. Must contain the ``SK_ID_CURR`` column.
    encoding : {"ohe", "le"}, default "ohe"
        ``"ohe"`` one-hot encodes both tables with ``pd.get_dummies`` and keeps
        only the columns they share. ``"le"`` integer-encodes every
        object-dtype column of the training table (and the same column of the
        test table).

    Returns
    -------
    feature_names : list of str
        Column names of the encoded training table, in array column order.
    labels : pandas.Series
        The ``TARGET`` column of the training table.
    features : numpy.ndarray
        Imputed and scaled training matrix.
    test_features : numpy.ndarray
        Imputed and scaled test matrix.
    train_ids, test_ids : pandas.Series
        The ``SK_ID_CURR`` columns of the training and test tables.

    Raises
    ------
    ValueError
        If ``encoding`` is neither ``"ohe"`` nor ``"le"``.
    """
    # Keep the identifiers and the target aside; they must not be used as features.
    train_ids = features["SK_ID_CURR"]
    test_ids = test_features["SK_ID_CURR"]
    labels = features["TARGET"]

    # ``drop`` returns new frames, so the caller's tables are left untouched.
    features = features.drop(columns=["SK_ID_CURR", "TARGET"])
    test_features = test_features.drop(columns=["SK_ID_CURR"])

    if encoding == "ohe":
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)

        # A category present in only one table produces a dummy column the
        # other lacks; the inner join keeps the shared columns only.
        features, test_features = features.align(test_features, join="inner", axis=1)

    elif encoding == "le":
        for col in features:
            if features[col].dtype == "object":
                train_values = features[col].astype(str)
                test_values = test_features[col].astype(str)

                # Fit on the categories of BOTH tables: an encoder fitted on
                # the training column alone raises on any category that only
                # occurs in the test table. When the test categories are a
                # subset of the training ones the resulting codes are the
                # same as with a train-only fit.
                label_encoder = LabelEncoder()
                label_encoder.fit(pd.concat([train_values, test_values]))

                features[col] = label_encoder.transform(train_values)
                test_features[col] = label_encoder.transform(test_values)

    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")

    print("Training Data Shape: ", features.shape)
    print("Testing Data Shape: ", test_features.shape)

    # Capture the column names before the frames are turned into arrays.
    feature_names = list(features.columns)

    # Fill missing values with medians learned on the training table only.
    imputer = SimpleImputer(strategy="median")
    features = imputer.fit_transform(features)
    test_features = imputer.transform(test_features)

    # Rescale with min/max learned on the training table only.
    scaler = MinMaxScaler(feature_range=(0, 1))
    features = scaler.fit_transform(features)
    test_features = scaler.transform(test_features)

    # Guarantee plain ndarrays without forcing an extra copy.
    features = np.asarray(features)
    test_features = np.asarray(test_features)

    return feature_names, labels, features, test_features, train_ids, test_ids


In [6]:
# Training data
app_train = pd.read_csv("../../data/raw/application_train.csv")
print("Training data shape: ", app_train.shape)

# Testing data features
app_test = pd.read_csv("../../data/raw/application_test.csv")
print("Testing data shape: ", app_test.shape)

# Value of DAYS_EMPLOYED that this notebook treats as anomalous.
DAYS_EMPLOYED_ANOMALY = 365243


def add_domain_features(applications):
    """Return a copy of ``applications`` with the domain-knowledge columns added.

    The anomalous ``DAYS_EMPLOYED`` value is replaced by NaN *before* the
    ratios are computed, so ``DAYS_EMPLOYED_PERCENT`` is NaN for those rows
    instead of being derived from the anomalous value.

    Parameters
    ----------
    applications : pandas.DataFrame
        Table with the columns ``AMT_CREDIT``, ``AMT_INCOME_TOTAL``,
        ``AMT_ANNUITY``, ``DAYS_EMPLOYED`` and ``DAYS_BIRTH``.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified. Added columns, in order:
        ``CREDIT_INCOME_PERCENT``, ``ANNUITY_INCOME_PERCENT``, ``CREDIT_TERM``,
        ``DAYS_EMPLOYED_PERCENT``, ``DAYS_EMPLOYED_ANOM`` (int32 flag).
    """
    enriched = applications.copy()

    # Remember which rows were anomalous, then blank the value out. Plain
    # assignment is used because an in-place replace on a selected column is
    # not guaranteed to write back to the frame.
    is_anomalous = enriched["DAYS_EMPLOYED"] == DAYS_EMPLOYED_ANOMALY
    enriched["DAYS_EMPLOYED"] = enriched["DAYS_EMPLOYED"].replace(
        {DAYS_EMPLOYED_ANOMALY: np.nan}
    )

    enriched["CREDIT_INCOME_PERCENT"] = (
        enriched["AMT_CREDIT"] / enriched["AMT_INCOME_TOTAL"]
    )
    enriched["ANNUITY_INCOME_PERCENT"] = (
        enriched["AMT_ANNUITY"] / enriched["AMT_INCOME_TOTAL"]
    )
    enriched["CREDIT_TERM"] = enriched["AMT_ANNUITY"] / enriched["AMT_CREDIT"]
    enriched["DAYS_EMPLOYED_PERCENT"] = (
        enriched["DAYS_EMPLOYED"] / enriched["DAYS_BIRTH"]
    )

    # Added last so the column order matches the previous version of this cell.
    enriched["DAYS_EMPLOYED_ANOM"] = is_anomalous.astype("int32")

    return enriched


# Feature engineering with domain-knowledge variables (the raw frames are kept as loaded)
app_train_domain = add_domain_features(app_train)
app_test_domain = add_domain_features(app_test)

Training data shape:  (307511, 122)
Testing data shape:  (48744, 121)


In [7]:
# Run the cleaning step with one-hot encoding on the engineered tables.
liste_features, y_train, X_train, X_test, train_ids, test_ids = cleaning(
    app_train_domain, app_test_domain, encoding="ohe"
)

# Wrap the cleaned matrices back into labelled frames; the target and the
# identifiers are appended as the last columns.
train_final = pd.DataFrame(data=X_train, columns=liste_features).assign(
    LABELS=y_train,
    SK_ID_CURR=train_ids,
)
test_final = pd.DataFrame(data=X_test, columns=liste_features).assign(
    SK_ID_CURR=test_ids,
)


Training Data Shape:  (307511, 246)
Testing Data Shape:  (48744, 246)


---

## Models

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

### Model scores

In [9]:
from sklearn.metrics import (
    get_scorer,
    make_scorer,
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# Metrics computed on every cross-validation fold.
# ROC AUC must be computed from continuous scores (probabilities or decision
# values). ``make_scorer(roc_auc_score)`` would feed it the hard 0/1 output of
# ``predict``, which collapses the curve to a single point. The built-in
# "roc_auc" scorer uses the estimator's scores instead.
scoring = {
    "roc_auc": get_scorer("roc_auc"),
    "accuracy": make_scorer(accuracy_score),
    "precision": make_scorer(precision_score),
    "recall": make_scorer(recall_score),
    "f1": make_scorer(f1_score),
}


---

### SMOTE

In [10]:
from imblearn.over_sampling import SMOTE

# Oversample the training matrix with SMOTE (fixed seed for repeatability).
# NOTE(review): the resampling is applied to the full training matrix before
# any train/validation split. If these arrays are later cross-validated,
# synthetic rows derived from a validation fold can end up in a training fold.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Show the class balance before and after resampling.
for caption, target in (
    ("Original class distribution:", y_train),
    ("Resampled class distribution:", y_train_resampled),
):
    print(caption, pd.Series(target).value_counts())

Original class distribution: TARGET
0    282686
1     24825
Name: count, dtype: int64
Resampled class distribution: TARGET
1    282686
0    282686
Name: count, dtype: int64


---

In [11]:
# Models
# Each entry is (display name, parameters applied with set_params, estimator,
# (X, y) used for cross-validation).
from imblearn.pipeline import Pipeline as ImbPipeline

models = [
    (
        "Logistic Regression",
        {"C": 10, "tol": 0.001, "random_state": 42, "verbose": 1, "n_jobs": -1},
        LogisticRegression(),
        (X_train, y_train),
    ),
    (
        "Random Forest",
        {"n_estimators": 100, "random_state": 42, "verbose": 1, "n_jobs": -4},
        RandomForestClassifier(),
        (X_train, y_train),
    ),
    (
        "Random Forest with SMOTE",
        # Parameters are routed to the pipeline's "classifier" step.
        {
            "classifier__n_estimators": 100,
            "classifier__random_state": 42,
            "classifier__verbose": 1,
            "classifier__n_jobs": -4,
        },
        # SMOTE lives inside the pipeline so that it is fitted on the training
        # part of each fold only. Cross-validating data that was oversampled
        # up front would score the model on synthetic rows and leak
        # validation information into training. Scores for this entry are
        # therefore computed on the original, imbalanced rows.
        ImbPipeline(
            steps=[
                ("smote", SMOTE(random_state=42)),
                ("classifier", RandomForestClassifier()),
            ]
        ),
        (X_train, y_train),
    ),
]


In [16]:
def evaluate_models(models, cv=5, scoring=scoring):
    """Cross-validate every model specification and collect the results.

    Parameters
    ----------
    models : list of tuple
        Each tuple is ``(model_name, params, model, train_data)`` where
        ``model_name`` is a str, ``params`` a dict passed to
        ``model.set_params``, ``model`` an estimator instance and
        ``train_data`` a sequence whose first two items are the feature
        matrix and the target.
    cv : int, default 5
        Forwarded to ``cross_validate`` as ``cv``.
    scoring : dict
        Forwarded to ``cross_validate`` as ``scoring``; defaults to the
        module-level ``scoring`` mapping as it existed when this function
        was defined.

    Returns
    -------
    dict
        Maps each ``model_name`` to the dictionary returned by
        ``cross_validate`` (fitted estimators included).
    """
    results_by_model = {}

    for name, hyperparams, estimator, dataset in models:
        # set_params configures the estimator in place and returns it.
        configured = estimator.set_params(**hyperparams)
        fold_inputs, fold_targets = dataset[0], dataset[1]

        results_by_model[name] = cross_validate(
            configured,
            fold_inputs,
            fold_targets,
            cv=cv,
            scoring=scoring,
            return_estimator=True,
        )

    return results_by_model


cv_results_dict = evaluate_models(models, scoring=scoring)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-4)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=-4)]: Done  36 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-4)]: Done 100 out of 100 | elapsed:   11.8s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=-4)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=-4)]: Done  36 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-4)]: Done 100 out of 100 | el

---

# MLflow

In [22]:
import mlflow
from mlflow.models import infer_signature

In [12]:
# Walk over the model specifications; once the loop has finished,
# `model_name`, `params` and `model` hold the values of the last entry.
for i, element in enumerate(models):
    model_name, params, model = element[0], element[1], element[2]

In [25]:
# Initialize MLflow.
# The tracking URI has to be set first: set_experiment() looks the experiment
# up on whichever tracking server is active at the moment it is called.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("MLflow try1")

MlflowException: API request to endpoint /api/2.0/mlflow/experiments/get-by-name failed with error code 403 != 200. Response body: ''

In [26]:
# Initialize MLflow.
# The tracking URI has to be set first: set_experiment() looks the experiment
# up on whichever tracking server is active at the moment it is called.
mlflow.set_tracking_uri("http://localhost:5001")
mlflow.set_experiment("MLflow try1")

# One MLflow run per evaluated model.
for i, element in enumerate(models):
    model_name = element[0]
    params = element[1]
    model = element[2]

    # Cross-validation results of THIS model only.
    model_cv_results = cv_results_dict[model_name]

    with mlflow.start_run(run_name=model_name):
        mlflow.log_params(params)

        # cross_validate stores one value per fold under "test_<metric>" and
        # "fit_time"; MLflow metrics must be scalars, so log the fold mean.
        mlflow.log_metrics(
            {
                "roc_auc": float(np.mean(model_cv_results["test_roc_auc"])),
                "accuracy": float(np.mean(model_cv_results["test_accuracy"])),
                "precision": float(np.mean(model_cv_results["test_precision"])),
                "recall": float(np.mean(model_cv_results["test_recall"])),
                "f1": float(np.mean(model_cv_results["test_f1"])),
                "fit_time": float(np.mean(model_cv_results["fit_time"])),
            }
        )

        # Logged inside the `with` block so the artifact is attached to this run.
        # NOTE(review): `model` is the configured estimator from `models`;
        # cross_validate is documented to fit clones, so this object is
        # presumably unfitted. Confirm whether a fitted estimator from
        # model_cv_results["estimator"] should be logged instead.
        mlflow.sklearn.log_model(model, "model")

MlflowException: API request to endpoint /api/2.0/mlflow/experiments/get-by-name failed with error code 403 != 200. Response body: ''

In [None]:
# FIXME(review): unfinished cell. The `for` statement has no trailing ":" and
# no body, and `metrics` is not defined in any visible cell. Complete or
# delete it before running the notebook top to bottom.
for metric in metrics 

In [73]:
# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# Create a new MLflow Experiment
mlflow.set_experiment("MLflow try1")

# NOTE(review): `params`, `cv_results` and `random_forest_domain` are not
# defined in any visible cell of this notebook. Confirm where they come from,
# otherwise this cell fails on a fresh kernel.

# A handful of rows is enough to describe the model input; logging the whole
# training matrix as the example would store a copy of the dataset with the model.
input_sample = X_train[:5]

# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the mean of each cross-validated metric
    mlflow.log_metric("accuracy", cv_results["test_accuracy"].mean())
    mlflow.log_metric("precision", cv_results["test_precision"].mean())
    mlflow.log_metric("recall", cv_results["test_recall"].mean())
    mlflow.log_metric("f1", cv_results["test_f1"].mean())

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "RandomForest for credit classification")

    random_forest_domain.fit(X_train, y_train)

    # Infer the model signature from the small sample; the column count and
    # dtypes are the same as for the full matrix, so predicting on every
    # training row is unnecessary.
    signature = infer_signature(
        input_sample, random_forest_domain.predict(input_sample)
    )

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=random_forest_domain,
        artifact_path="credit_model_test1",
        signature=signature,
        input_example=input_sample,
        registered_model_name="tracking-quickstart",
    )

[Parallel(n_jobs=-4)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=-4)]: Done  36 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-4)]: Done 100 out of 100 | elapsed:   17.4s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.5s
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:    1.2s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.5s
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:    1.3s finished
Registered model 'tracking-quickstart' already exists. Creating a new version of this model...
2025/04/07 16:05:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-quickstart, version 5
Created version '5' of model 'tracking-quickstart'.


🏃 View run clumsy-koi-940 at: http://127.0.0.1:5000/#/experiments/726532937351277124/runs/2e70af58225c446989fa861ada3fd324
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/726532937351277124


---