In [167]:
# Import necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from zipfile import ZipFile


from sklearn.preprocessing import power_transform, OneHotEncoder, LabelEncoder, OrdinalEncoder, PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')
import os
import joblib

## Load Clean Data

In [168]:
path = r"C:\Users\ncc892\Desktop\kaggle_competition\playground-series-s5e11.zip"

with ZipFile(path , 'r') as zippath:
    zippath.printdir()


with ZipFile(path, 'r') as zipref:
    with zipref.open('train.csv') as file:
        raw_train_data = pd.read_csv(file)
    
    with zipref.open('test.csv') as data:
        test_data = pd.read_csv(data)


File Name                                             Modified             Size
sample_submission.csv                          2025-10-28 23:08:48      2291139
test.csv                                       2025-10-28 23:08:48     23021430
train.csv                                      2025-10-28 23:08:50     55988519


In [169]:
data = pd.read_csv('eda_cleaned_train_data.csv')

df = data.copy()


In [170]:
X = df.drop(columns=['loan_paid_back'])
y = df['loan_paid_back']


In [171]:
# Trim X
numerical_cols = X.select_dtypes(include='number').columns

categorical_cols = X.select_dtypes(include='object').columns

numerical_cols

Index(['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount',
       'interest_rate'],
      dtype='object')

## Handle Skewness

In [172]:

X[numerical_cols] = power_transform(X[numerical_cols], method = 'yeo-johnson')
print(numerical_cols)

Index(['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount',
       'interest_rate'],
      dtype='object')


# Scaling

In [173]:
scaler = MinMaxScaler()

X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
numerical_cols

Index(['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount',
       'interest_rate'],
      dtype='object')

In [174]:
X[numerical_cols]

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate
0,0.280082,0.462105,0.706271,0.070202,0.605632
1,0.222801,0.722375,0.470074,0.129175,0.564250
2,0.395173,0.516688,0.604288,0.415457,0.387295
3,0.382196,0.369964,0.251331,0.131585,0.738331
4,0.251135,0.303016,0.536210,0.312339,0.412778
...,...,...,...,...,...
593989,0.230611,0.689819,0.625806,0.495565,0.452781
593990,0.319020,0.547285,0.304122,0.091855,0.657751
593991,0.383440,0.405764,0.559467,0.016991,0.630909
593992,0.501640,0.380426,0.716190,0.400588,0.393533


## Encode Categorical Data

In [175]:
# Create your list of categorical data for one-hot and ordinal
encode_list = ['gender', 'marital_status', 'employment_status', 'loan_purpose']
ordinal_columns = ['education_level', 'grade_subgrade']

# Assign order to column
education_order = ["Other", "High School", "Bachelor's", "Master's", "PhD"]
grade_order = ['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5']


# Initialize ordinal encoder
ordinal = OrdinalEncoder(categories=[education_order, grade_order], handle_unknown='use_encoded_value', unknown_value=-1)
# Initialize one-hot encoder
one_hot = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)


# Call the column transformer
column_transformer = ColumnTransformer(
    transformers=[
            ("ordinal_enc", ordinal, ordinal_columns),
            ('one_hot_enc', one_hot, encode_list),
            # include the numeric columns(though no operation will be performed on them)
            ("num", 'passthrough', numerical_cols)
            ]
            )

# Fit the column transformer on the training data (X)
column_transformer.fit(X)

# Save the column transformer 
os.makedirs("artifacts", exist_ok=True)
joblib.dump(column_transformer, "artifacts/column_preprocessor.pkl")


['artifacts/column_preprocessor.pkl']

In [176]:
joblib.dump(column_transformer, "artifacts/column_preprocessor.pkl")


['artifacts/column_preprocessor.pkl']

# Model Building

### Second-level Splitting

In [177]:
# Second level splitting
X_train, X_val, y_train, y_val = train_test_split(X, y,  test_size=0.2, stratify=y, random_state=234)


In [178]:
X_train.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
189377,0.408306,0.50056,0.318618,0.40884,0.52932,Female,Single,Other,Unemployed,Debt consolidation,F3
283530,0.392195,0.297097,0.711226,0.218072,0.46625,Male,Married,High School,Self-employed,Business,C1
53299,0.414317,0.291123,0.575884,0.427484,0.570333,Male,Married,Master's,Employed,Debt consolidation,C4
522831,0.288132,0.214949,0.566489,0.364584,0.5143,Female,Single,Bachelor's,Employed,Vacation,C2
282581,0.417099,0.762783,0.61144,0.577766,0.591862,Female,Married,Master's,Employed,Debt consolidation,C2


### Data Transformation

In [179]:
# Call your function

X_train_t = column_transformer.transform(X_train)
X_val_t = column_transformer.transform(X_val)

In [180]:
X_train_t

array([[ 0.        , 27.        ,  0.        , ...,  0.31861829,
         0.40884003,  0.52931983],
       [ 1.        , 10.        ,  1.        , ...,  0.711226  ,
         0.21807155,  0.46624957],
       [ 3.        , 13.        ,  1.        , ...,  0.57588427,
         0.42748353,  0.57033329],
       ...,
       [ 3.        , 18.        ,  0.        , ...,  0.45217554,
         0.37851976,  0.36569336],
       [ 1.        , 13.        ,  0.        , ...,  0.5641465 ,
         0.21090044,  0.60177873],
       [ 1.        , 18.        ,  1.        , ...,  0.52236668,
         0.34035876,  0.60728284]], shape=(475195, 23))

### Model

In [181]:

# define the models to be used
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

# # Loop through
# for name,  model in models.items():
#     # print model name
#     print(f'Running {name} ')
#     model.fit(X_train, y_train)
#     # print(f'{roc_auc_score(y_val}')
    

for name, model in models.items():
    print(f"Running {name}...")

    model.fit(X_train_t, y_train)

    # prediction for ROC-AUC (classifiers only)
   
    y_pred = model.predict_proba(X_val_t)[:, 1]
    
    print(f"ROC-AUC: {roc_auc_score(y_val, y_pred):.4f}\n")


Running Logistic Regression...
ROC-AUC: 0.9058

Running Decision Tree...
ROC-AUC: 0.7698

Running Random Forest...
ROC-AUC: 0.9064



In [182]:
model = RandomForestClassifier(random_state=24, n_estimators=100)
model.fit(X_train_t, y_train)

# prediction for ROC-AUC (classifiers only)
y_pred = model.predict_proba(X_val_t)[:, 1]

print(f"ROC-AUC: {roc_auc_score(y_val, y_pred):.4f}\n")


ROC-AUC: 0.9064



In [183]:
raw_train_data['id']

0              0
1              1
2              2
3              3
4              4
           ...  
593989    593989
593990    593990
593991    593991
593992    593992
593993    593993
Name: id, Length: 593994, dtype: int64

In [184]:
test_data

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,593994,28781.05,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D5
1,593995,46626.39,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C1
2,593996,54954.89,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D1
3,593997,25644.63,0.110,671,6574.30,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C3
4,593998,25169.64,0.081,688,17696.89,12.80,Female,Married,PhD,Employed,Business,C1
...,...,...,...,...,...,...,...,...,...,...,...,...
254564,848558,92835.97,0.068,744,29704.00,13.48,Female,Single,Bachelor's,Employed,Debt consolidation,B2
254565,848559,48846.47,0.091,634,20284.33,9.58,Female,Married,High School,Employed,Debt consolidation,D4
254566,848560,20668.52,0.096,718,26387.55,9.00,Male,Single,Master's,Employed,Debt consolidation,C4
254567,848561,34105.09,0.094,739,11107.36,9.81,Male,Single,Bachelor's,Employed,Business,C2


In [185]:
id = test_data['id']
test_data = test_data.drop(columns='id')
test_transformed = column_transformer.transform(test_data)


In [186]:
test_transformed

array([[1.000000e+00, 1.900000e+01, 0.000000e+00, ..., 6.260000e+02,
        1.146142e+04, 1.473000e+01],
       [3.000000e+00, 1.000000e+01, 0.000000e+00, ..., 7.320000e+02,
        1.549225e+04, 1.285000e+01],
       [2.000000e+00, 1.500000e+01, 1.000000e+00, ..., 6.110000e+02,
        3.796410e+03, 1.329000e+01],
       ...,
       [3.000000e+00, 1.300000e+01, 1.000000e+00, ..., 7.180000e+02,
        2.638755e+04, 9.000000e+00],
       [2.000000e+00, 1.100000e+01, 1.000000e+00, ..., 7.390000e+02,
        1.110736e+04, 9.810000e+00],
       [1.000000e+00, 1.700000e+01, 0.000000e+00, ..., 6.240000e+02,
        1.924614e+04, 1.164000e+01]], shape=(254569, 23))

In [187]:
# prediction 
pred = model.predict_proba(test_transformed)[:, 1]

# Save the model into DataFrame

In [188]:
# Create a submission file
submission = pd.DataFrame({
    'id': id,
    'prediction': pred
})

# Save as a csv file
submission.to_csv('submission.csv', index=False)

## Hyperparameter Tuning