In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/og-dataset/credit_risk_dataset.csv
/kaggle/input/playground-series-s4e10/sample_submission.csv
/kaggle/input/playground-series-s4e10/train.csv
/kaggle/input/playground-series-s4e10/test.csv


# Import Data

In [2]:
train= pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
original = pd.read_csv('/kaggle/input/og-dataset/credit_risk_dataset.csv')
df = pd.concat([train, original], ignore_index=True)
df = df.fillna('None')
test = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')

# Explore Data

In [3]:
df.columns.values

array(['id', 'person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length',
       'loan_status'], dtype=object)

In [4]:
df.describe()

Unnamed: 0,person_age,person_income,loan_amnt,loan_percent_income,cb_person_cred_hist_length,loan_status
count,91226.0,91226.0,91226.0,91226.0,91226.0,91226.0
mean,27.61648,64770.71,9350.348607,0.163154,5.810219,0.169447
std,6.148115,47936.96,5848.605423,0.097491,4.038412,0.375149
min,20.0,4000.0,500.0,0.0,2.0,0.0
25%,23.0,40000.0,5000.0,0.09,3.0,0.0
50%,26.0,57000.0,8000.0,0.14,4.0,0.0
75%,30.0,78000.0,12000.0,0.22,8.0,0.0
max,144.0,6000000.0,35000.0,0.83,30.0,1.0


In [5]:
df.shape

(91226, 13)

In [6]:
for i in df.columns.values:
    print( i, "...." ,df[i].isna().sum())

id .... 0
person_age .... 0
person_income .... 0
person_home_ownership .... 0
person_emp_length .... 0
loan_intent .... 0
loan_grade .... 0
loan_amnt .... 0
loan_int_rate .... 0
loan_percent_income .... 0
cb_person_default_on_file .... 0
cb_person_cred_hist_length .... 0
loan_status .... 0


# Feature engineering

In [7]:
def preprocess(df):
    df['loan_to_income'] = ((df['loan_amnt'] / df['person_income']) - df['loan_percent_income']).astype('string').astype('category')
    df['age_income_interaction'] = (df['person_age'] * df['person_income']).astype('string').astype('category')
    df['loan_to_emp_length_ratio'] = (df['loan_amnt'] / df['person_emp_length'].replace({'None': train['person_emp_length'].mean()}).astype('float')).astype('string').astype('category')
    monthly_income = df['person_income'] / 12
    df['monthly_debt'] = (df['loan_amnt'] * (1 + df['loan_int_rate'].replace({'None': train['loan_int_rate'].mean()})) / 12)
    df['dti_ratio'] = (df['monthly_debt'] / monthly_income).astype('string').astype('category')
    df['monthly_debt'] = df['monthly_debt'].astype('string').astype('category')
    df['risk_flag'] = (np.where((df['cb_person_default_on_file'] == 'Y') & (df['loan_grade'].isin(['C', 'D', 'E'])), 1, 0))
    df['risk_flag'] = df['risk_flag'].astype('category')
    df['person_home_ownership'] = df['person_home_ownership'].astype('category')
    df['loan_intent'] = df['loan_intent'].astype('category')
    df['loan_grade'] = df['loan_grade'].astype('category')
    df['cb_person_default_on_file'] = df['cb_person_default_on_file'].astype('category')
    df['person_emp_length'] = df['person_emp_length'].astype('string').astype('category')
    df['loan_int_rate'] = (df['loan_int_rate'] * 100).astype('string').astype('category')
    df['loan_percent_income'] = (df['loan_percent_income'] * 100).astype('string').astype('category')
    return df

In [8]:
df = preprocess(df)
test = preprocess(test)

  df['loan_to_emp_length_ratio'] = (df['loan_amnt'] / df['person_emp_length'].replace({'None': train['person_emp_length'].mean()}).astype('float')).astype('string').astype('category')
  df['monthly_debt'] = (df['loan_amnt'] * (1 + df['loan_int_rate'].replace({'None': train['loan_int_rate'].mean()})) / 12)


In [9]:
categorical_cols = [
    'loan_to_income', 'age_income_interaction', 'loan_to_emp_length_ratio', 
    'dti_ratio', 'monthly_debt', 'risk_flag', 'person_home_ownership', 
    'loan_intent', 'loan_grade', 'cb_person_default_on_file', 'person_emp_length', 
    'loan_int_rate', 'loan_percent_income'
]
numerical_cols= [
    'person_age', 'person_income', 'cb_person_cred_hist_length', 'loan_amnt'
]
y = df.pop('loan_status')

In [10]:
X = df.drop(['id'], axis = 1)

In [11]:
X.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file',
       'cb_person_cred_hist_length', 'loan_to_income',
       'age_income_interaction', 'loan_to_emp_length_ratio', 'monthly_debt',
       'dti_ratio', 'risk_flag'],
      dtype='object')

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from category_encoders import BinaryEncoder

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Standardize numerical features
        ('cat', 'passthrough', categorical_cols)
        #('cat', BinaryEncoder(handle_unknown='ignore'), categorical_cols)  # One-hot encode categorical features
    ]
)

In [13]:
X_train_transformed=preprocessor.fit_transform(X)

In [14]:
# Extract column names
# Get numerical column names
num_feature_names = numerical_cols

# Get categorical column names after one-hot encoding
cat_feature_names = categorical_cols

# Combine both sets of feature names
all_feature_names = list(num_feature_names) + list(cat_feature_names)

In [15]:
all_feature_names

['person_age',
 'person_income',
 'cb_person_cred_hist_length',
 'loan_amnt',
 'loan_to_income',
 'age_income_interaction',
 'loan_to_emp_length_ratio',
 'dti_ratio',
 'monthly_debt',
 'risk_flag',
 'person_home_ownership',
 'loan_intent',
 'loan_grade',
 'cb_person_default_on_file',
 'person_emp_length',
 'loan_int_rate',
 'loan_percent_income']

In [16]:
X_train_transformed.shape

(91226, 17)

In [17]:
# Convert the transformed array back to a DataFrame with the correct column names
X = pd.DataFrame(X_train_transformed, columns=all_feature_names, index=X.index)

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
X_train

Unnamed: 0,person_age,person_income,cb_person_cred_hist_length,loan_amnt,loan_to_income,age_income_interaction,loan_to_emp_length_ratio,dti_ratio,monthly_debt,risk_flag,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file,person_emp_length,loan_int_rate,loan_percent_income
12909,0.225033,0.601403,1.037488,-0.572849,0.004102564102564099,2714400,6000.0,0.5692307692307691,4439.999999999999,0,RENT,PERSONAL,A,N,1.0,788.0,6.0
64185,-0.750881,-0.412434,-0.9435,0.111079,0.0022222222222222088,1035000,10000.0,2.9355555555555557,11008.333333333334,0,MORTGAGE,MEDICAL,B,N,1.0,1221.0,22.0
43828,0.225033,-0.308129,0.046994,0.453042,0.0,1450000,12000.0,3.4776,14490.0,1,RENT,HOMEIMPROVEMENT,C,Y,1.0,1349.0,24.0
79080,1.038295,-0.950643,1.037488,-0.786576,-0.0026041666666666574,652800,395.8333333333333,3.174088541666667,5078.541666666667,0,RENT,DEBTCONSOLIDATION,B,N,12.0,1183.0,25.0
18010,-0.750881,-0.349851,-0.448253,-0.059903,-0.0025000000000000022,1104000,4500.0,2.0625,8250.0,0,RENT,EDUCATION,B,N,2.0,1000.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,0.550338,0.074125,1.037488,2.504825,-0.018732509806217434,2118044,12000.0,3.126280662724665,17800.0,0,MORTGAGE,MEDICAL,A,N,2.0,790.0,37.0
54886,0.06238,-0.099521,0.542241,0.111079,-0.003333333333333355,1680000,833.3333333333334,1.07,5350.0,0,MORTGAGE,PERSONAL,A,N,12.0,542.0,17.0
76820,0.550338,2.090868,0.294618,0.965988,0.0009090909090909149,5115000,5000.0,1.2372727272727273,17012.5,0,MORTGAGE,HOMEIMPROVEMENT,C,N,3.0,1261.0,9.0
860,0.71299,0.526304,0.046994,-1.025951,0.00722222222222222,2880000,inf,0.23896666666666666,1792.25,0,MORTGAGE,PERSONAL,A,N,0.0,542.0,3.0


# Modelling

In [20]:
from catboost import Pool, cv, CatBoostClassifier

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [21]:
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Define the objective function for the Optuna study
def objective(trial):
    # Suggest values for the hyperparameters
    depth = trial.suggest_int('depth', 4, 8)
    learning_rate = trial.suggest_float('learning_rate', 0.05, 0.13, log=True)
    l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 0.05, 1.4, log=True)

    # Initialize StratifiedKFold for cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    roc_auc_scores = []

    # Perform cross-validation with early stopping
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # Create the CatBoost model with the suggested hyperparameters
        model = CatBoostClassifier(
            iterations=1000,
            eval_metric = 'AUC',
            loss_function = 'Logloss',
            depth=depth,
            learning_rate=learning_rate,
            l2_leaf_reg=l2_leaf_reg,
            cat_features=categorical_cols,
            task_type='GPU',
            random_strength = 0,
            random_seed=42,
            verbose=False  # Suppress training output
        )

        # Train the model on the training fold with early stopping
        model.fit(
            X_train, y_train,
            eval_set=(X_valid, y_valid),  # Validation set for early stopping
            early_stopping_rounds=50,     # Stop if no improvement in 50 rounds
            use_best_model=True           # Use the best model found during training
        )

        # Predict probabilities on the validation fold
        y_pred_proba = model.predict_proba(X_valid)[:, 1]

        # Calculate the ROC AUC score for the current fold
        roc_auc = roc_auc_score(y_valid, y_pred_proba)
        roc_auc_scores.append(roc_auc)

    # Return the average ROC AUC score across all folds
    return np.mean(roc_auc_scores)

In [22]:
from optuna.visualization import (
    plot_optimization_history,
    plot_param_importances,
    plot_parallel_coordinate,
    plot_contour,
    plot_slice,
    plot_intermediate_values
)


In [23]:
from optuna.samplers import TPESampler
# Create a study object and specify the optimization direction (maximize ROC AUC)
study = optuna.create_study(sampler=TPESampler(n_startup_trials=35, multivariate=True, seed=0),direction='maximize')

# Run the optimization for a specified number of trials
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and the best score
print(f"Best hyperparameters: {study.best_params}")
print(f"Best AUC score: {study.best_value}")

# Visualize results
plot_optimization_history(study).show()
plot_param_importances(study).show()
plot_parallel_coordinate(study).show()
plot_contour(study).show()
plot_slice(study).show()


[I 2024-10-17 11:36:00,928] A new study created in memory with name: no-name-ca566402-d510-40ee-b392-08498babb801
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-10-17 11:38:30,427] Trial 0 finished with value: 0.9628134445060936 and parameters: {'depth': 6, 'learning_rate': 0.09902720698000689, 'l2_leaf_reg': 0.37261802964665525}. Best is trial 0 with value: 0.9628134445060936.
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 b

Best hyperparameters: {'depth': 5, 'learning_rate': 0.10589333021345024, 'l2_leaf_reg': 1.3493314922882345}
Best AUC score: 0.9630307068213589


In [24]:
# Get the best hyperparameters found by Optuna
best_params = study.best_params
print("Best Hyperparameters:", best_params)



Best Hyperparameters: {'depth': 5, 'learning_rate': 0.10589333021345024, 'l2_leaf_reg': 1.3493314922882345}


In [25]:
# Create the final model using the best hyperparameters
best_model = CatBoostClassifier(
    depth=best_params['depth'],
    learning_rate=best_params['learning_rate'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    cat_features=categorical_cols,
    verbose=False  # Suppress training output
)

# Train the model on the entire dataset
best_model.fit(X, y)


<catboost.core.CatBoostClassifier at 0x79f426c568f0>

In [26]:
# Evaluate the model on a test set (if available)
y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_test_pred_proba)
print("Test ROC AUC Score:", roc_auc)

# Save the model for future use
best_model.save_model("best_catboost_model.cbm")



Test ROC AUC Score: 0.9989398633807862


# Prediction

In [27]:
id_column = test['id'].values

In [28]:
X = test.drop(['id'], axis = 1)

In [29]:
X_train_transformed=preprocessor.transform(X)

In [30]:
# Extract column names
# Get numerical column names
num_feature_names = numerical_cols

# Get categorical column names after one-hot encoding
cat_feature_names = categorical_cols

# Combine both sets of feature names
all_feature_names = list(num_feature_names) + list(cat_feature_names)

In [31]:
X = pd.DataFrame(X_train_transformed, columns=all_feature_names, index=X.index)

In [32]:
id_column.shape

(39098,)

In [33]:

y_pred_submission = y_pred_submission = best_model.predict_proba(X)[:, 1]



In [34]:
# Create a new DataFrame with ID and Predictions
predictions_df = pd.DataFrame({'id': id_column, 'loan_status': y_pred_submission})

In [35]:
predictions_df

Unnamed: 0,id,loan_status
0,58645,0.999837
1,58646,0.017433
2,58647,0.807216
3,58648,0.010907
4,58649,0.103507
...,...,...
39093,97738,0.101231
39094,97739,0.007528
39095,97740,0.004900
39096,97741,0.277982


In [36]:
predictions_df.to_csv('submission.csv', index=False)
