Load data

In [2]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

dataPath = "C:/Users/Maevex/Desktop/Lujain/home-credit-credit-risk-model-stability/"

Reading the Train and Test Data

In [3]:
# Read CSV files
train_basetable = pd.read_csv(dataPath + "csv_files/train/train_base.csv")

train_static_0_0 = pd.read_csv(dataPath + "csv_files/train/train_static_0_0.csv")

train_static_0_1 = pd.read_csv(dataPath + "csv_files/train/train_static_0_1.csv")

# Concatenate the DataFrames vertically
train_static = pd.concat([train_static_0_0, train_static_0_1], axis=0, ignore_index=True)

train_static_cb = pd.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv")

train_person_1 = pd.read_csv(dataPath + "csv_files/train/train_person_1.csv")

train_credit_bureau_b_2 = pd.read_csv(dataPath + "csv_files/train/train_credit_bureau_b_2.csv")


  train_static_0_0 = pd.read_csv(dataPath + "csv_files/train/train_static_0_0.csv")
  train_static_0_1 = pd.read_csv(dataPath + "csv_files/train/train_static_0_1.csv")
  train_static_cb = pd.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv")
  train_person_1 = pd.read_csv(dataPath + "csv_files/train/train_person_1.csv")


In [4]:
# Read CSV files for test data
test_basetable = pd.read_csv(dataPath + "csv_files/test/test_base.csv")

test_static_0_0 = pd.read_csv(dataPath + "csv_files/test/test_static_0_0.csv")

test_static_0_1 = pd.read_csv(dataPath + "csv_files/test/test_static_0_1.csv")

test_static_0_2 = pd.read_csv(dataPath + "csv_files/test/test_static_0_2.csv")

# Concatenate the DataFrames vertically
test_static = pd.concat([test_static_0_0, test_static_0_1, test_static_0_2], axis=0, ignore_index=True)

test_static_cb = pd.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv")

test_person_1 = pd.read_csv(dataPath + "csv_files/test/test_person_1.csv")

test_credit_bureau_b_2 = pd.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_2.csv")


Finding nulls values

In [5]:
import pandas as pd

# Function to find null values and their percentages
def find_nulls(df):
    null_counts = df.isnull().sum()
    total_counts = len(df)
    null_percentages = (null_counts / total_counts) * 100
    return pd.DataFrame({
        'Null Count': null_counts,
        'Null Percentage': null_percentages
    })


# Finding nulls in each dataframe for the train data
train_nulls = {
    "train_basetable": find_nulls(train_basetable),
    "train_static": find_nulls(train_static),
    "train_static_cb": find_nulls(train_static_cb),
    "train_person_1": find_nulls(train_person_1),
    "train_credit_bureau_b_2": find_nulls(train_credit_bureau_b_2)
}

# Convert the dictionaries to DataFrames
train_nulls_df = pd.concat(train_nulls, axis=1)

# Exporting the nulls count to an Excel file
# with pd.ExcelWriter(dataPath + 'null_counts.xlsx') as writer:
#     train_nulls_df.to_excel(writer, sheet_name='train_nulls', index=True)


Casting types

In [6]:
import pandas as pd

def determine_dtype(col_name):
    if col_name[-1] == 'D':
        return 'datetime64[ns]'
    elif col_name[-1] == 'L':
        return 'object'
    elif col_name[-1] == 'A':
        return 'float64'
    elif col_name[-1] == 'M':
        return 'object'
    elif col_name[-1] == 'P':
        return 'float64'
    elif col_name[-1] == 'T':
        return 'object'
    else:
        return 'object'

def set_table_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:
        dtype = determine_dtype(col)
        df[col] = df[col].astype(dtype)
    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df




In [7]:
# Read CSV files
train_static_0_0 = set_table_dtypes(train_static_0_0)
train_static_0_1 = set_table_dtypes(train_static_0_1)
train_static_cb = set_table_dtypes(train_static_cb)
train_person_1 = set_table_dtypes(train_person_1)
train_credit_bureau_b_2 = set_table_dtypes(train_credit_bureau_b_2)

In [8]:
# Read CSV files for test data
test_static_0_0 = set_table_dtypes(test_static_0_0)
test_static_0_1 = set_table_dtypes(test_static_0_1)
test_static_0_2 = set_table_dtypes(test_static_0_2) # NOT NEEDED
test_static_cb = set_table_dtypes(test_static_cb)
test_person_1 = set_table_dtypes(test_person_1)
test_credit_bureau_b_2 = set_table_dtypes(test_credit_bureau_b_2)

# Merging

In [9]:
import pandas as pd

# Define the aggregation function
def aggregate_train_person_1_feats_1(df):
    agg_df = df.groupby("case_id").agg(
        mainoccupationinc_384A_max=pd.NamedAgg(column="mainoccupationinc_384A", aggfunc="max"),
        mainoccupationinc_384A_any_selfemployed=pd.NamedAgg(column="incometype_1044T", aggfunc=lambda x: (x == "SELFEMPLOYED").max())
    ).reset_index()
    return agg_df

# Apply the aggregation function
train_person_1_feats_1 = aggregate_train_person_1_feats_1(train_person_1)

# Filter and select operations
train_person_1_feats_2 = train_person_1.loc[train_person_1["num_group1"] == 0, ["case_id", "housetype_905L"]]
train_person_1_feats_2.rename(columns={"housetype_905L": "person_housetype"}, inplace=True)

# Define the aggregation function for another table
def aggregate_train_credit_bureau_b_2_feats(df):
    agg_df = df.groupby("case_id").agg(
        pmts_pmtsoverdue_635A_max=pd.NamedAgg(column="pmts_pmtsoverdue_635A", aggfunc="max"),
        pmts_dpdvalue_108P_over31=pd.NamedAgg(column="pmts_dpdvalue_108P", aggfunc=lambda x: (x > 31).max())
    ).reset_index()
    return agg_df

# Apply the aggregation function
train_credit_bureau_b_2_feats = aggregate_train_credit_bureau_b_2_feats(train_credit_bureau_b_2)

# Selecting columns based on their suffix
selected_static_cols = [col for col in train_static.columns if col[-1] in ("A", "M")]
print(selected_static_cols)

selected_static_cb_cols = [col for col in train_static_cb.columns if col[-1] in ("A", "M")]
print(selected_static_cb_cols)

# Joining tables together
data = train_basetable.merge(
    train_static[["case_id"] + selected_static_cols], how="left", on="case_id"
).merge(
    train_static_cb[["case_id"] + selected_static_cb_cols], how="left", on="case_id"
).merge(
    train_person_1_feats_1, how="left", on="case_id"
).merge(
    train_person_1_feats_2, how="left", on="case_id"
).merge(
    train_credit_bureau_b_2_feats, how="left", on="case_id"
)


#1m 51s


['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

In [10]:
import pandas as pd

# Define the aggregation function for test_person_1_feats_1
def aggregate_test_person_1_feats_1(df):
    agg_df = df.groupby("case_id").agg(
        mainoccupationinc_384A_max=pd.NamedAgg(column="mainoccupationinc_384A", aggfunc="max"),
        mainoccupationinc_384A_any_selfemployed=pd.NamedAgg(column="incometype_1044T", aggfunc=lambda x: (x == "SELFEMPLOYED").max())
    ).reset_index()
    return agg_df

# Apply the aggregation function
test_person_1_feats_1 = aggregate_test_person_1_feats_1(test_person_1)

# Filter and select operations for test_person_1_feats_2
test_person_1_feats_2 = test_person_1.loc[test_person_1["num_group1"] == 0, ["case_id", "housetype_905L"]]
test_person_1_feats_2.rename(columns={"housetype_905L": "person_housetype"}, inplace=True)

# Define the aggregation function for test_credit_bureau_b_2_feats
def aggregate_test_credit_bureau_b_2_feats(df):
    agg_df = df.groupby("case_id").agg(
        pmts_pmtsoverdue_635A_max=pd.NamedAgg(column="pmts_pmtsoverdue_635A", aggfunc="max"),
        pmts_dpdvalue_108P_over31=pd.NamedAgg(column="pmts_dpdvalue_108P", aggfunc=lambda x: (x > 31).max())
    ).reset_index()
    return agg_df

# Apply the aggregation function
test_credit_bureau_b_2_feats = aggregate_test_credit_bureau_b_2_feats(test_credit_bureau_b_2)

# Joining tables together
data_submission = test_basetable.merge(
    test_static[["case_id"] + selected_static_cols], how="left", on="case_id"
).merge(
    test_static_cb[["case_id"] + selected_static_cb_cols], how="left", on="case_id"
).merge(
    test_person_1_feats_1, how="left", on="case_id"
).merge(
    test_person_1_feats_2, how="left", on="case_id"
).merge(
    test_credit_bureau_b_2_feats, how="left", on="case_id"
)


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Shuffle the case_ids and split into train, validation, and test sets
case_ids = data["case_id"].unique()
case_ids = pd.Series(case_ids).sample(frac=1, random_state=1).to_numpy()
case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)

# Extract columns for prediction
cols_pred = [col for col in data.columns if col[-1].isupper() and col[:-1].islower()]

print(cols_pred)

# Function to filter data and convert to pandas DataFrame
def from_polars_to_pandas(case_ids, data):
    filtered_data = data[data["case_id"].isin(case_ids)]
    return (
        filtered_data[["case_id", "WEEK_NUM", "target"]],
        filtered_data[cols_pred],
        filtered_data["target"]
    )

# Split data into train, validation, and test sets
base_train, X_train, y_train = from_polars_to_pandas(case_ids_train, data)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid, data)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test, data)

# Convert string columns to category
for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)


['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'previouscontdistrict_112M', 'price_1097A', 'sumoutstandtotal_3546847A', 'sumoutstandtotalest_4493215A', 'totaldebt_9A', 'totalsettled_863A', 'totinstallas

Putting the Dataframees into excel

In [None]:
print(f"Train: {X_train.shape}")
print(f"Valid: {X_valid.shape}")
print(f"Test: {X_test.shape}")

# Exporting the nulls count to an Excel file
with pd.ExcelWriter(dataPath + 'Data_After.xlsx') as writer:
    # X_train.head(int(len(X_train) / 100)).to_excel(writer, sheet_name='Train', index=False)
    # X_valid.to_excel(writer, sheet_name='Valid', index=False)
    # X_test.to_excel(writer, sheet_name='Test', index=False)

Lightgmb model


In [25]:
import lightgbm as lgb

lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 5,
    "num_leaves": 31,
    "learning_rate": 0.01,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 5000,
    "verbose": -1,
}


gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(100), lgb.early_stopping(10)]
    
)



Training until validation scores don't improve for 10 rounds
[100]	valid_0's auc: 0.702002
[200]	valid_0's auc: 0.715652
[300]	valid_0's auc: 0.724347
[400]	valid_0's auc: 0.729243


In [None]:
import matplotlib.pyplot as plt

# Extracting the evaluation results.
results = gbm.evals_result_
epochs = len(results['valid_0']['auc'])
x_axis = range(0, epochs)

# Plotting AUC vs Iteration
plt.figure(figsize=(10, 5))
plt.plot(x_axis, results['valid_0']['auc'], label='AUC')
plt.title('AUC vs Iterations')
plt.xlabel('Iterations')
plt.ylabel('AUC')
plt.legend()
plt.show()


Explaining the LightGBM Model
### General Parameters
- **`boosting_type`**: This specifies the type of boosting to use. Common options are:
  - `"gbdt"`: Gradient Boosting Decision Tree (default).
  - `"dart"`: Dropouts meet Multiple Additive Regression Trees.
  - `"goss"`: Gradient-based One-Side Sampling.
  - `"rf"`: Random Forest.

### Objective and Metric Parameters
- **`objective`**: This defines the learning task. In this case:
  - `"binary"`: For binary classification, which aims to classify inputs into one of two categories.
  
- **`metric`**: This specifies the evaluation metric for validation data. In this case:
  - `"auc"`: Area Under the Curve, commonly used for binary classification to evaluate the model's ability to distinguish between positive and negative classes.

### Tree Parameters
- **`max_depth`**: The maximum depth of a tree. Deeper trees can capture more complex patterns but may lead to overfitting. A `max_depth` of 5 is a moderate depth, balancing complexity and overfitting.

- **`num_leaves`**: Maximum number of leaves in one tree. The number of leaves should be less than `2^(max_depth)`. More leaves can lead to a more complex model. Setting `num_leaves` to 31 controls the model's complexity.

### Learning Control Parameters
- **`learning_rate`**: This controls the step size at each iteration while moving toward a minimum of the loss function. A smaller learning rate requires more boosting iterations but can improve model accuracy. A `learning_rate` of 0.05 is considered moderate.

### Feature and Data Sampling Parameters
- **`feature_fraction`**: The fraction of features to be used for each boosting iteration. It helps to prevent overfitting by randomly selecting a subset of features. A `feature_fraction` of 0.9 means 90% of features are used in each iteration.

- **`bagging_fraction`**: The fraction of data to be used for each boosting iteration (without resampling). This helps to prevent overfitting by introducing randomness. A `bagging_fraction` of 0.8 means 80% of the data is used in each iteration.

- **`bagging_freq`**: Frequency of performing bagging, meaning the model will perform bagging once in every specified number of iterations. A `bagging_freq` of 5 means bagging is done every 5 iterations.

### Training Control Parameters
- **`n_estimators`**: Maximum number of boosting iterations. The model will train up to this number of iterations unless early stopping criteria are met. A high value like 1000 provides ample opportunity for the model to converge.

- **`verbose`**: Controls the verbosity of the output. Setting `verbose` to -1 suppresses all messages, which can be useful to avoid clutter in the output.

### Callbacks
- **`lgb.log_evaluation(50)`**: This callback logs the evaluation results every 50 iterations. It helps in monitoring the training process.
  
- **`lgb.early_stopping(10)`**: This callback stops training if the model's performance on the validation set does not improve for 10 consecutive rounds. This prevents overfitting and saves computational resources by stopping training early when further training is unlikely to yield better results.

### Summary
These parameters collectively control various aspects of the LightGBM training process, such as the type of model to be used, the structure and complexity of the trees, the learning process, and measures to prevent overfitting. Proper tuning of these parameters is crucial to building a performant model.