In [1]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 
import warnings
import os
import re

warnings.filterwarnings('ignore')

# For Kaggle make the dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"
# dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"
# For local testing make the dataPath = "/Users/chrisjackson/Downloads/home-credit-credit-risk-model-stability/"
dataPath = "/Users/chrisjackson/Downloads/home-credit-credit-risk-model-stability/"

## Data Cleaning and Aggregation

In [2]:
class Pipeline:
    @staticmethod
    def set_table_dtypes(df): #Standardize the dtype.
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
            elif col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date))            

        return df
    
    @staticmethod
    def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
        for col in df.columns:  
            if df[col].dtype.name in ['object', 'string']:
                df[col] = df[col].astype("string").astype('category')
                current_categories = df[col].cat.categories
                new_categories = current_categories.to_list() + ["Unknown"]
                new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
                df[col] = df[col].astype(new_dtype)
        return df
    
    @staticmethod
    def handle_dates(df): #Change the feature for D to the difference in days from date_decision.
        for col in df.columns:
            if col[-1] in ("D",):
                df = df.with_columns(pl.col(col) - pl.col("date_decision"))
                df = df.with_columns(pl.col(col).dt.total_days())
                
        df = df.drop("date_decision", "MONTH")

        return df
    
    @staticmethod
    def filter_cols(df): #Remove those with an average is_null exceeding 0.95 and those that do not fall within the range 1 < nunique < 200.
        for col in df.columns:
            if col not in ["target", "case_id", "WEEK_NUM"]:
                isnull = df[col].is_null().mean()

                if isnull > 0.95:
                    df = df.drop(col)

        for col in df.columns:
            if (col not in ["target", "case_id", "WEEK_NUM"]) & (df[col].dtype == pl.String):
                freq = df[col].n_unique()

                if (freq == 1) | (freq > 200):
                    df = df.drop(col)

        return df

In [3]:
def data_agg(depth=0, test_train='train') -> pl.DataFrame:
    # List all files in the folder
    all_files = os.listdir(dataPath + f"parquet_files/{test_train}/")

    # Filter files where the first number in the filename is depth
    filtered_files = []
    for file in all_files:
        digit = re.search(r'\d+', file)
        if digit and digit.group() == str(depth):
            filtered_files.append(file)
    
    
    # Read the base table
    aggregated_df = pl.read_parquet(dataPath + f"parquet_files/{test_train}/{test_train}_base.parquet")
    # Add the static tables to the aggregated_df, meaning the first number in the file name is 0 and they should be listed in filetred_files
    for file in filtered_files:
        df = pl.read_parquet(dataPath + f"parquet_files/{test_train}/{file}")

        # Join the aggregated results to the overall aggregated dataframe using the case_id column as the index
        aggregated_df = aggregated_df.join(df, on="case_id", how="left")
    
        # The columns with right in the name might contain info not in the non-right columns.  We need to combine them into one column
        # First we need to find the columns with right in the name
        right_columns = [col for col in aggregated_df.columns if "right" in col]
        # Then we need to find the columns without right in the name
        non_right_columns = [col for col in aggregated_df.columns if "right" not in col]
        # We need to iterate over the right columns and add them to the non-right columns and pick the non-null value
        for col in right_columns:
            # Find the non-right column that corresponds to the right column
            non_right_col = col.replace("_right", "")
            # Add the non-right column to the aggregated_df if the non-right column is null
            aggregated_df = aggregated_df.with_columns(
            pl.when(pl.col(non_right_col).is_null())
            .then(pl.col(col))
            .otherwise(pl.col(non_right_col))
            .alias(non_right_col)
            )
        # We need to drop the right columns
        aggregated_df = aggregated_df.drop(right_columns)
    
    # Process the data with the Pipeline class
    aggregated_df = Pipeline.set_table_dtypes(aggregated_df)
    aggregated_df = Pipeline.handle_dates(aggregated_df)
    aggregated_df = Pipeline.filter_cols(aggregated_df)
    
    # If the depth is 1, then we are interested in the base tables and the tables with a depth of 1
    # if depth == 1:
    #     # As a test lets try to count the number of applications for each case_id
    #     # Gather the files with a depth of 1
    #     depth_1_files = [file for file in filtered_files if file[0] == "1"]
    #     return aggregated_df

    
    # # Remove basetable from the list of parquet files
    # parquet_files = [file for file in parquet_files if file != "train_base.parquet"]
    
    # # Iterate over each parquet file
    # for file in parquet_files:
    #     # Read the parquet file
    #     df = pl.read_parquet(os.path.join(dataPath, file))
        
    #     # Check if the file contains the column num_group1 and is a file with a depth of 1
    #     if "num_group1" in df.columns and "num_group2" not in df.columns:

    #         # Remove the num_group1 column
    #         df = df.drop("num_group1")
            # Join the aggregated results to the overall aggregated dataframe using the case_id column as the index
    #         aggregated_df = aggregated_df.join(df, on="case_id", how="outer")
        
    #     # Check if the file contains the column num_group1 and num_group2 and is a file with a depth of 2
    #     elif "num_group1" in df.columns and "num_group2" in df.columns and num_group2 is not None:
            # # Check if the aggregated df has the num_group1 and num_group2 columns
            # if "num_group2" not in aggregated_df.columns:
            #     # Add the num_group2 column to the aggregated_df
            #     aggregated_df = aggregated_df.join(df, on="case_id", how="outer")
            # # Filter for the desired num_group1 and num_group2 values
            # df = df.filter(pl.col("num_group1") == num_goup1).filter(pl.col("num_group2") == num_group2)
            # # Remove the num_group1 and num_group2 columns
            # df = df.drop("num_group1").drop("num_group2")
            # # Join the aggregated results to the overall aggregated dataframe using the case_id column as the index
            # aggregated_df = aggregated_df.join(df, on="case_id", how="outer")
        
    return aggregated_df
    

In [4]:
data = data_agg(0, 'train')

In [5]:
def dpd_only(data, test_train):
    # Find the columns ending in P as they are the days past due columns
    dpd_columns = [col for col in data.columns if col.endswith("P")]

    # Lets try making a dataframe of the payment history only and see if we can predict the target, might be a path to ensembling
    # Create a dataframe of the payment history
    if test_train == "train":
        payment_history = data.select(dpd_columns + ["target"] + ["case_id"] + ["WEEK_NUM"])
    else:
        payment_history = data.select(dpd_columns + ["case_id"] + ["WEEK_NUM"])
    # Make a run with data as the payment history only
    return payment_history

data = dpd_only(data, 'train')

### Data Processing Pipeline

In [6]:
def split_data(data):
    case_ids = data["case_id"].unique().shuffle(seed=1)
    case_ids_train, case_ids_valid = train_test_split(case_ids, train_size=0.8, random_state=1)

    cols_pred = []
    for col in data.columns:
        if col[-1].isupper() and col[:-1].islower():
            cols_pred.append(col)

    def from_polars_to_pandas(case_ids: pl.DataFrame) -> pl.DataFrame:
        return (
            data.filter(pl.col("case_id").is_in(case_ids))[cols_pred].to_pandas(),
            data.filter(pl.col("case_id").is_in(case_ids))["target"].to_pandas().ravel()
        )
    
    X_train, y_train = from_polars_to_pandas(case_ids_train)
    X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
    
    return X_train, y_train, X_valid, y_valid


In [7]:
import optuna

# override Optuna's default logging to ERROR only
optuna.logging.set_verbosity(optuna.logging.ERROR)

# define a logging callback that will report on only new challenger parameter configurations if a
# trial has usurped the state of 'best conditions'


def champion_callback(study, frozen_trial):
    """
    Logging callback that will report when a new trial iteration improves upon existing
    best trial values.
    """

    winner = study.user_attrs.get("winner", None)

    if study.best_value and winner != study.best_value:
        study.set_user_attr("winner", study.best_value)
        if winner:
            improvement_percent = (abs(winner - study.best_value) / study.best_value) * 100
            print(
                f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
                f"{improvement_percent: .4f}% improvement"
            )
        else:
            print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")


In [8]:
# Get the data from the split_data function
X_train, y_train, X_valid, y_valid = split_data(data)

# Create a dataset for lightgbm
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)


In [9]:
# Define the objective function for Optuna
def objective(trial):
    
    param = {
        "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt", "rf"]),
        "objective": "binary",
        "metric": "auc",
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "num_leaves": trial.suggest_int("num_leaves", 20, 60),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "n_estimators": 1000,
        "verbose": -1,
        "feature_pre_filter": False,  # Explicitly disabling feature pre-filtering
    }
    
    gbm = lgb.train(
        param,
        lgb_train,
        valid_sets=lgb_valid,
        callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)]
    )
    
    preds = gbm.predict(X_valid)
    auc = roc_auc_score(y_valid, preds)

    return auc


In [10]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

## TRAINING

In [11]:
# Initialize the Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10, callbacks=[champion_callback])  
    
# Fit model instance
model = lgb.train(study.best_params, lgb_train)


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.681786
[100]	valid_0's auc: 0.687016
[150]	valid_0's auc: 0.689184
[200]	valid_0's auc: 0.690174
[250]	valid_0's auc: 0.690862
Early stopping, best iteration is:
[276]	valid_0's auc: 0.691251
Initial trial 0 achieved value: 0.69125065849512
Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.693013
Early stopping, best iteration is:
[73]	valid_0's auc: 0.69419
Trial 1 achieved value: 0.6941898892610685 with  0.4234% improvement
Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.680595
[100]	valid_0's auc: 0.68337
[150]	valid_0's auc: 0.685239
Early stopping, best iteration is:
[145]	valid_0's auc: 0.68532
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[7]	valid_0's auc: 0.642878
Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.690484
[100]	valid_0's auc: 0.6929

## Submission


In [15]:
data

actualdpdtolerance_344P,maxdpdfrom6mto36m_3546853P,maxdpdlast12m_727P,maxdpdlast24m_143P,maxdpdlast3m_392P,maxdpdlast6m_474P,maxdpdlast9m_1059P,maxdpdtolerance_374P,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,case_id,WEEK_NUM
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57543,92
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,57549,92
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57551,92
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57552,92
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57569,92
,,,,,,,,,,,57630,92
,,,,,,,,,,,57631,92
,,,,,,,,,,,57632,92
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57633,92
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57634,92


In [17]:
# Load the sample submission file
sub_df = pd.read_csv(dataPath + "sample_submission.csv")
# Load the test data
data = data_agg(0, 'test')
# Make the data the payment history only
data = dpd_only(data, 'test')
# Generate the predictions from the test files
y_sub_pred = model.predict(data, predict_disable_shape_check=True)
# Add the predictions to the submission file
sub_df = sub_df.set_index("case_id")
sub_df['score'] = y_sub_pred
# Save the submission file
sub_df.to_csv("./submission.csv")
sub_df

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.069724
57549,0.069724
57551,0.069724
57552,0.069724
57569,0.069724
57630,0.049073
57631,0.049073
57632,0.049073
57633,0.069645
57634,0.069724
