In [None]:
import os
import warnings

import lightgbm as lgb
import mlflow
import numpy as np
import pandas as pd
import polars as pl
from mlflow.models import infer_signature
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from config import run_name

# Suppress every Python warning for the remainder of the notebook.
warnings.filterwarnings('ignore')

# Root directory of the competition data. Set the HOME_CREDIT_DATA_DIR environment
# variable to point at a different location; the original local path is kept as the
# fallback. `os.path.join(..., "")` guarantees a trailing separator, because the
# loading cells build file paths with plain string concatenation (`dataPath + "csv_files/..."`).
dataPath = os.path.join(
    os.environ.get(
        "HOME_CREDIT_DATA_DIR",
        "/Users/chrisjackson/Downloads/home-credit-credit-risk-model-stability/",
    ),
    "",
)


In [None]:
# Point the MLflow client at a tracking server listening on localhost, port 8080.
# The server must already be running before any later cell creates an experiment or run.
# NOTE: review the links mentioned above for guidance on connecting to a managed tracking server, such as the free Databricks Community Edition

mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

### NOTE: RUN THIS IN BASH TO START THE MLFLOW TRACKING SERVER ###
# mlflow server --host 127.0.0.1 --port 8080


## **Loading the Data**

We will now load the data and get it in the right format. We will conduct exploratory data analysis and create a baseline submission that we can iterate and improve on.

In [None]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    """Cast every column whose name ends in ``P`` or ``A`` to ``pl.Float64``.

    The last letter of a column name is used to pick its dtype; only the two
    suffixes above are handled here, and all other columns are left as they
    are. This is an example rule set: add further suffix rules as needed.

    Parameters:
    - df (pl.DataFrame): Table to normalise.

    Returns:
    - pl.DataFrame: The table with the matching columns cast to Float64, or
      the input frame itself when no column matches.
    """
    float_columns = [name for name in df.columns if name[-1] in ("P", "A")]
    if not float_columns:
        # Nothing to cast: return the input frame itself.
        return df
    return df.with_columns([pl.col(name).cast(pl.Float64) for name in float_columns])

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Turn text columns into ordered categoricals with an extra "Unknown" level.

    Every column whose dtype name is ``object`` or ``string`` is replaced by an
    ordered categorical whose categories are the values observed in that column
    followed by a trailing ``"Unknown"`` category. Other columns are untouched.

    The frame is modified IN PLACE and the same object is returned, so callers
    may ignore the return value.

    Parameters:
    - df (pd.DataFrame): Frame to convert (mutated).

    Returns:
    - pd.DataFrame: The same frame object that was passed in.
    """
    text_columns = [name for name in df.columns if df[name].dtype.name in ("object", "string")]
    for name in text_columns:
        as_category = df[name].astype("string").astype("category")
        levels = as_category.cat.categories.to_list()
        levels.append("Unknown")
        df[name] = as_category.astype(pd.CategoricalDtype(categories=levels, ordered=True))
    return df

In [None]:
# Base table is read as-is; every other table goes through set_table_dtypes.
train_basetable = pl.read_csv(dataPath + "csv_files/train/train_base.csv")

# The static table is split across several CSV files that are stacked vertically.
train_static = pl.concat(
    [
        set_table_dtypes(pl.read_csv(dataPath + relative_path))
        for relative_path in (
            "csv_files/train/train_static_0_0.csv",
            "csv_files/train/train_static_0_1.csv",
        )
    ],
    how="vertical_relaxed",
)

train_static_cb = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv")
)
train_person_1 = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/train/train_person_1.csv")
)
train_credit_bureau_b_2 = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_b_2.csv")
)

In [None]:
train_static

In [None]:
# Same loading steps as for the train tables, applied to the test CSV files.
test_basetable = pl.read_csv(dataPath + "csv_files/test/test_base.csv")

# The test static table is split across three CSV files that are stacked vertically.
test_static = pl.concat(
    [
        set_table_dtypes(pl.read_csv(dataPath + relative_path))
        for relative_path in (
            "csv_files/test/test_static_0_0.csv",
            "csv_files/test/test_static_0_1.csv",
            "csv_files/test/test_static_0_2.csv",
        )
    ],
    how="vertical_relaxed",
)

test_static_cb = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv")
)
test_person_1 = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/test/test_person_1.csv")
)
test_credit_bureau_b_2 = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_2.csv")
)

## Define an MLflow Experiment

To keep the runs of a particular project or idea together, we define an Experiment that groups every iteration (run) under a single name. 
Choosing a unique name that is relevant to what we're working on helps with organization and reduces the searching needed to find our runs later on. 

In [None]:
def get_or_create_experiment(experiment_name):
    """
    Return the ID of the MLflow experiment called `experiment_name`, creating it if needed.

    The experiment is looked up by name first. When the lookup returns an
    experiment, its ID is returned; otherwise a new experiment with that name
    is created and the ID returned by the creation call is handed back.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.

    Returns:
    - str: ID of the existing or newly created MLflow experiment.
    """
    existing = mlflow.get_experiment_by_name(experiment_name)
    if not existing:
        return mlflow.create_experiment(experiment_name)
    return existing.experiment_id


In [None]:
experiment_id = get_or_create_experiment("MLflow Home Credit Test")

In [None]:
experiment_id

## **Feature Engineering**

We will join the tables on case_id. There are additional ways we can work with the data, but we will leave this as is for now.

In [None]:
# Tables with depth > 1 (those containing a num_group1 column, and possibly a num_group2
# column as well) have to be aggregated to one row per case_id before joining.
train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
    [
        pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
        # Max of a boolean expression: true when any row of the case is SELFEMPLOYED.
        (pl.col("incometype_1044T") == "SELFEMPLOYED")
        .max()
        .alias("mainoccupationinc_384A_any_selfemployed"),
    ]
)

# num_group1 == 0 has special meaning: it is the person who applied for the loan.
train_person_1_feats_2 = train_person_1.filter(pl.col("num_group1") == 0).select(
    pl.col("case_id"),
    pl.col("housetype_905L").alias("person_housetype"),
)

# This table has both num_group1 and num_group2, so it is aggregated as well.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    [
        pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
        (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31"),
    ]
)

# Only A-type and M-type columns of the static tables are used in this example.
selected_static_cols = [name for name in train_static.columns if name[-1] in ("A", "M")]
print(selected_static_cols)

selected_static_cb_cols = [name for name in train_static_cb.columns if name[-1] in ("A", "M")]
print(selected_static_cb_cols)

# Left-join every feature table onto the base table, one after another, on case_id.
data = train_basetable
for feature_table in (
    train_static.select(["case_id"] + selected_static_cols),
    train_static_cb.select(["case_id"] + selected_static_cb_cols),
    train_person_1_feats_1,
    train_person_1_feats_2,
    train_credit_bureau_b_2_feats,
):
    data = data.join(feature_table, how="left", on="case_id")

In [None]:
# Same aggregations as for the train tables, applied to the test tables.
test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
    [
        pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
        # Max of a boolean expression: true when any row of the case is SELFEMPLOYED.
        (pl.col("incometype_1044T") == "SELFEMPLOYED")
        .max()
        .alias("mainoccupationinc_384A_any_selfemployed"),
    ]
)

# Keep only the num_group1 == 0 row of each case and expose its house type.
test_person_1_feats_2 = test_person_1.filter(pl.col("num_group1") == 0).select(
    pl.col("case_id"),
    pl.col("housetype_905L").alias("person_housetype"),
)

test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
    [
        pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
        (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31"),
    ]
)

# Left-join every feature table onto the test base table on case_id. The static column
# selections are the ones computed from the train tables, so both frames share a schema.
data_submission = test_basetable
for feature_table in (
    test_static.select(["case_id"] + selected_static_cols),
    test_static_cb.select(["case_id"] + selected_static_cb_cols),
    test_person_1_feats_1,
    test_person_1_feats_2,
    test_credit_bureau_b_2_feats,
):
    data_submission = data_submission.join(feature_table, how="left", on="case_id")

In [None]:
# Set the current active MLflow experiment
mlflow.set_experiment(experiment_id=experiment_id)

In [None]:
# Shuffle the distinct case_ids, then split them 60 / 20 / 20 into train / valid / test:
# 60% goes to train, and the remaining 40% is halved into valid and test.
case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_holdout = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_holdout, train_size=0.5, random_state=1)

# Predictor columns: names made of a lowercase stem followed by one uppercase type letter.
# NOTE(review): the aggregated features created during feature engineering carry aliases
# that end in lowercase (e.g. "..._max", "person_housetype"), so they do NOT pass this
# filter and are not fed to the model. Confirm whether that is intended.
cols_pred = [name for name in data.columns if name[-1].isupper() and name[:-1].islower()]

print(cols_pred)

def from_polars_to_pandas(case_ids) -> "tuple[pd.DataFrame, pd.DataFrame, pd.Series]":
    """Materialise one data split as pandas objects.

    Rows of the module-level polars frame `data` whose `case_id` is contained
    in `case_ids` are selected once and converted to pandas.

    Parameters:
    - case_ids: Collection of case_id values to keep; it is passed straight to
      `pl.Expr.is_in` (here: one of the outputs of `train_test_split`).

    Returns:
    - tuple of three pandas objects, in this order:
        1. base frame with the columns "case_id", "WEEK_NUM" and "target",
        2. feature frame with the columns listed in the module-level `cols_pred`,
        3. the "target" column as a Series.
    """
    # Filter once and reuse the result instead of re-evaluating the same filter
    # for each of the three outputs.
    subset = data.filter(pl.col("case_id").is_in(case_ids))
    return (
        subset[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        subset[cols_pred].to_pandas(),
        subset["target"].to_pandas(),
    )

# Build the pandas base / feature / target objects for each split.
base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

# convert_strings mutates its argument and returns that same frame, so each feature
# frame is converted in place; the names are simply rebound to the same objects.
X_train, X_valid, X_test = (
    convert_strings(frame) for frame in (X_train, X_valid, X_test)
)

In [None]:
print(f"Train: {X_train.shape}")
print(f"Valid: {X_valid.shape}")
print(f"Test: {X_test.shape}")

## **Machine Learning**

We will train a LightGBM model and submit predictions. 

## **Optuna Hyperparameter Optimization**
[Optuna](https://optuna.org) is an open source hyperparameter optimization framework to automate hyperparameter search. It offers an efficient way to find ideal hyperparameters and many visualizations of the results and search space. 

In [None]:
import optuna

# override Optuna's default logging to ERROR only
optuna.logging.set_verbosity(optuna.logging.ERROR)

# define a logging callback that will report on only new challenger parameter configurations if a
# trial has usurped the state of 'best conditions'


def champion_callback(study, frozen_trial):
    """
    Logging callback that reports when a new trial improves upon the existing
    best trial value.

    The best value seen so far is remembered in the study's user attribute
    "winner". Nothing is printed when the study's best value is unchanged.

    Parameters:
    - study: Study-like object exposing `user_attrs`, `best_value` and `set_user_attr`.
    - frozen_trial: The trial that just finished; its `number` and `value` are printed.
    """
    previous_best = study.user_attrs.get("winner", None)
    best_value = study.best_value

    # Compare against None explicitly: 0.0 is a legitimate objective value and must
    # not be mistaken for "no value yet", which a plain truthiness test would do.
    if best_value is None or previous_best == best_value:
        return

    study.set_user_attr("winner", best_value)

    if previous_best is None:
        print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")
        return

    # The relative change is undefined when the new best value is exactly zero.
    if best_value:
        improvement_percent = (abs(previous_best - best_value) / best_value) * 100
    else:
        improvement_percent = float("nan")
    print(
        f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
        f"{improvement_percent: .4f}% improvement"
    )


In [None]:

# Wrap the train and validation splits as LightGBM Datasets. They are built once, at
# module level, and reused by every Optuna trial in `objective` and by the later
# training cells. The validation Dataset is tied to the training one via `reference`.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: train one LightGBM model and return its validation AUC.

    Each call runs inside its own nested MLflow run. Hyperparameters are sampled
    from `trial`, a booster is trained on the module-level `lgb_train` with early
    stopping against `lgb_valid`, and the AUC on `X_valid` / `y_valid` is computed.
    The full parameter dict and the AUC are logged to the nested run.

    Parameters:
    - trial: Optuna trial used to sample the hyperparameters.

    Returns:
    - float: ROC AUC of the trained booster on the validation split.
    """
    with mlflow.start_run(nested=True):
        param = {
            "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt", "rf", "dart"]),
            "objective": "binary",
            "metric": "auc",
            "max_depth": trial.suggest_int("max_depth", 1, 10),
            "num_leaves": trial.suggest_int("num_leaves", 20, 60),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
            # suggest_loguniform is deprecated; suggest_float(..., log=True) is the
            # replacement that samples the same range on a log scale.
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
            "n_estimators": 1000,
            "verbose": -1,
            "feature_pre_filter": False,  # Explicitly disabling feature pre-filtering
        }

        gbm = lgb.train(
            param,
            lgb_train,
            valid_sets=[lgb_valid],
            callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)]
        )

        preds = gbm.predict(X_valid)
        auc = roc_auc_score(y_valid, preds)

        # Log to MLflow
        mlflow.log_params(param)
        mlflow.log_metric("auc", auc)
    return auc


## GINI STABILITY   


In [None]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    """Compute the Gini stability score of the predictions in `base`.

    A Gini coefficient (2 * AUC - 1) is computed per WEEK_NUM, a straight line
    is fitted through the weekly values, and the score is

        mean(weekly gini) + w_fallingrate * min(0, slope) + w_resstd * std(residuals)

    so only a negative slope contributes to the second term.

    Parameters:
    - base (pd.DataFrame): Frame with the columns "WEEK_NUM", "target" and "score".
    - w_fallingrate (float): Weight applied to a negative slope of the fitted line.
    - w_resstd (float): Weight applied to the standard deviation of the residuals.

    Returns:
    - float: The stability score.
    """

    def _weekly_gini(week):
        # Gini coefficient of a single week's predictions.
        return 2 * roc_auc_score(week["target"], week["score"]) - 1

    ordered = base.loc[:, ["WEEK_NUM", "target", "score"]].sort_values("WEEK_NUM")
    weekly = ordered.groupby("WEEK_NUM")[["target", "score"]].apply(_weekly_gini).tolist()

    # Linear trend of the weekly Gini over the week position (0, 1, 2, ...).
    week_index = np.arange(len(weekly))
    slope, intercept = np.polyfit(week_index, weekly, 1)
    fitted = slope * week_index + intercept
    residual_spread = np.std(weekly - fitted)
    mean_gini = np.mean(weekly)
    return mean_gini + w_fallingrate * min(0, slope) + w_resstd * residual_spread

## MLFLOW TRAIN AND LOG

In [None]:

# Initiate the parent run and call the hyperparameter tuning child run logic
with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True):
    # Initialize the Optuna study; each trial logs itself as a nested child run
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=10, callbacks=[champion_callback])

    # Log the best hyperparameters to MLflow
    mlflow.log_params(study.best_params)
    mlflow.log_metric("AUC", study.best_value)

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "Kaggle Home Credit Default Risk",
            "optimizer_engine": "optuna",
            "model_family": "lgbm",
            "feature_set_version": 1,
        }
    )

    # study.best_params only contains the values sampled by Optuna. The fixed settings
    # used inside `objective` (binary objective, AUC metric, ...) must be added back,
    # otherwise the model below would not be trained with the configuration that was tuned.
    final_params = {
        **study.best_params,
        "objective": "binary",
        "metric": "auc",
        "n_estimators": 1000,
        "verbose": -1,
        "feature_pre_filter": False,
    }

    # Fit the model the same way the trials were fitted: early stopping on the validation set
    model = lgb.train(
        final_params,
        lgb_train,
        valid_sets=[lgb_valid],
        callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)],
    )

    # Add the predictions to the base table
    base_train["score"] = model.predict(X_train)

    # Calc the gini stability score (on the training split)
    stability_score_train = gini_stability(base_train)

    # Log the gini stability score to MLflow
    mlflow.log_metric("gini_stability", stability_score_train)

    # Log the model to MLflow
    artifact_path = "model"

    mlflow.lightgbm.log_model(model, artifact_path)

    # Get the logged model uri so that we can load it from the artifact store
    model_uri = mlflow.get_artifact_uri(artifact_path)



## Plot Visuals From Optuna Hyperparameter Optimization

**plot_optimization_history** plots optimization history of all trials in a study. The blue dots show the AUC on each trial, and the red line - the best value attained.

In [None]:
from optuna.visualization import plot_optimization_history

plot_optimization_history(study)

We will now train the final model with the best hyperparameters found by Optuna and use it to make predictions.

In [None]:
# Final parameter set: the values tuned by Optuna plus the settings that were fixed
# (not tuned) inside the objective function, which have to be added back by hand.
best_params = {
    **study.best_params,
    "objective": "binary",
    "metric": "auc",
    "n_estimators": 1000,
    "verbose": -1,
    "feature_pre_filter": False,  # Since it was explicitly set in the objective function
}

# Train the final model, monitoring the validation set and stopping early
# once it has not improved for 10 rounds.
gbm_final = lgb.train(
    params=best_params,
    train_set=lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)],
)

In [None]:
# Score every split with the final model (at its best iteration) and store the
# predictions in the "score" column of the matching base table.
scored_splits = ((base_train, X_train), (base_valid, X_valid), (base_test, X_test))
for split_base, split_features in scored_splits:
    split_base["score"] = gbm_final.predict(
        split_features, num_iteration=gbm_final.best_iteration
    )

auc_train = roc_auc_score(base_train["target"], base_train["score"])
auc_valid = roc_auc_score(base_valid["target"], base_valid["score"])
auc_test = roc_auc_score(base_test["target"], base_test["score"])

print(f'The AUC score on the train set is: {auc_train}')
print(f'The AUC score on the valid set is: {auc_valid}')
print(f'The AUC score on the test set is: {auc_test}')

In [None]:
# `gini_stability` is already defined in the "GINI STABILITY" section earlier in this
# notebook. It is reused here rather than defined a second time, so there is a single
# implementation to maintain.
stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_valid)
stability_score_test = gini_stability(base_test)

print(f'The stability score on the train set is: {stability_score_train}')
print(f'The stability score on the valid set is: {stability_score_valid}')
print(f'The stability score on the test set is: {stability_score_test}')

## **Submission**

Below we score the submission dataset. Categories that did not appear in the training data have to be handled first; the scores are then saved as the last step.

In [None]:
# Build the submission feature frame with the same columns and string handling as training.
X_submission = data_submission[cols_pred].to_pandas()
X_submission = convert_strings(X_submission)
categorical_cols = X_train.select_dtypes(include=['category']).columns

for col in categorical_cols:
    # Reuse the training dtype as-is. It already contains "Unknown" and keeps the
    # category order the model was trained with. Building the dtype from a Python
    # set would give the ordered categorical an arbitrary category order, and
    # X_train must not be recast after the model has been fit.
    train_dtype = X_train[col].dtype
    unseen_categories = set(X_submission[col].cat.categories) - set(train_dtype.categories)

    # Values never seen during training are mapped to the "Unknown" category.
    X_submission.loc[X_submission[col].isin(unseen_categories), col] = "Unknown"
    X_submission[col] = X_submission[col].astype(train_dtype)

y_submission_pred = gbm_final.predict(X_submission, num_iteration=gbm_final.best_iteration)

In [None]:
# One row per case_id (used as the index) with the predicted score, written to CSV.
submission_index = pd.Index(data_submission["case_id"].to_numpy(), name="case_id")
submission = pd.DataFrame({"score": y_submission_pred}, index=submission_index)
submission.to_csv("./submission.csv")

In [None]:
submission.head()

## **Next Steps**

This section outlines the next steps to enhance the model's performance further. These steps include additional feature engineering, feature selection, feature importance analysis, and evaluating other models.

### Additional Feature Engineering

- **Create Interaction Features**: Investigate potential interactions between features that could be predictive of the outcome.
- **Polynomial Features**: Generate polynomial features for the numeric variables to capture non-linear relationships.
- **Encoding Categorical Variables**: Explore different encoding strategies (e.g., one-hot encoding, target encoding) for categorical variables.

### Feature Selection

- **Correlation Analysis**: Remove highly correlated features to reduce multicollinearity.
- **Importance-Based Selection**: Utilize model-based feature importance to retain the most relevant features.
- **Wrapper Methods**: Experiment with forward selection, backward elimination, or recursive feature elimination (RFE) techniques.

### Feature Importance Analysis

- **Model-Based Importance**: Use tree-based models (e.g., Random Forest, XGBoost) to evaluate feature importance.
- **Permutation Importance**: Assess the impact of shuffling each feature on the model's performance to identify crucial features.
- **SHAP Values**: Utilize SHAP (SHapley Additive exPlanations) to interpret the model's predictions and understand the impact of each feature.

### Evaluating Other Models

- **Compare Different Models**: Evaluate the performance of various machine learning models (e.g., SVM, Neural Networks, Ensemble Models).
- **Hyperparameter Tuning**: Perform hyperparameter tuning on other models to find the optimal settings.
- **Stacking/Ensembling**: Explore stacking or ensembling methods to combine predictions from multiple models for improved accuracy.

### Model Evaluation and Validation

- **Cross-Validation**: Use cross-validation techniques to assess the model's performance more reliably.
- **Performance Metrics**: Evaluate the models using appropriate metrics (e.g., AUC, accuracy, F1 score) for your specific problem.
- **Validation Curves**: Plot validation curves to identify overfitting or underfitting.
- **Learning Curves**: Analyze learning curves to understand how well the model is learning from the training data.
