In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the competition's training and test splits from local CSV files.
# NOTE(review): both paths are absolute and machine-specific; the notebook
# will only run where this drive layout exists.
df_train = pd.read_csv(
    filepath_or_buffer="P:/notebooks/competition KAGGLE/data/Predicting Loan Payback/train.csv"
)
df_test = pd.read_csv(
    filepath_or_buffer="P:/notebooks/competition KAGGLE/data/Predicting Loan Payback/test.csv"
)

In [3]:
# Number of missing values in each column of the training frame.
df_train.isna().sum(axis=0)

id                      0
annual_income           0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
interest_rate           0
gender                  0
marital_status          0
education_level         0
employment_status       0
loan_purpose            0
grade_subgrade          0
loan_paid_back          0
dtype: int64

In [4]:
# Pull out the label column and the test-set ids (needed later for the
# submission file), then strip both non-feature columns from the frames.
# Note: the frames are rebound under the same names, so this cell cannot be
# re-run without reloading the data first.
target = df_train["loan_paid_back"]
idk = df_test["id"]
df_train = df_train.drop(["loan_paid_back", "id"], axis=1)
df_test = df_test.drop(["id"], axis=1)

In [5]:
# Display the training feature frame as the cell's output.
df_train

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3
1,22108.02,0.166,636,4593.10,12.92,Male,Married,Master's,Employed,Debt consolidation,D3
2,49566.20,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5
3,46858.25,0.065,533,4682.48,16.10,Female,Single,High School,Employed,Debt consolidation,F1
4,25496.70,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1
...,...,...,...,...,...,...,...,...,...,...,...
593989,23004.26,0.152,703,20958.37,10.92,Female,Single,High School,Employed,Business,C3
593990,35289.43,0.105,559,3257.24,14.62,Male,Single,Bachelor's,Employed,Debt consolidation,F5
593991,47112.64,0.072,675,929.27,14.13,Female,Married,Bachelor's,Employed,Debt consolidation,C1
593992,76748.44,0.067,740,16290.40,9.87,Male,Single,Bachelor's,Employed,Debt consolidation,B2


In [6]:
# Conventional aliases for the feature matrix and the label vector.
# These are plain references to the same objects, not copies.
X, y = df_train, target

In [7]:
# Categorical columns are everything that is left once the explicitly listed
# numeric features are removed from the column index.
cat_cols = df_train.columns.drop(
    ["annual_income", "debt_to_income_ratio", "credit_score", "loan_amount", "interest_rate"]
)
# Numeric columns are the complement of cat_cols, kept in the frame's column order.
num_cols = X.columns[~X.columns.isin(cat_cols)].tolist()

In [8]:
# One-step pipeline that one-hot encodes its input, ignoring categories that
# were not seen during fitting.
# NOTE(review): no other visible cell references cat_inputer; the
# ColumnTransformer builds its own OneHotEncoder — confirm this is still needed.
cat_inputer = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore'))])

In [9]:
# Column-wise preprocessing: one-hot encode the categorical columns (unknown
# categories are ignored at transform time) and pass the numeric columns
# through untouched.
preprocessor = ColumnTransformer(
    [
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols),
    ]
)

In [10]:
# Full modelling pipeline: preprocessing followed by a decision tree with a
# fixed seed so results are repeatable.
pipe = Pipeline(
    [
        ('prep', preprocessor),
        ('model', DecisionTreeClassifier(random_state=42)),
    ]
)

In [11]:
# Sampling distributions for the tree's hyper-parameters. The `model__`
# prefix routes each parameter to the 'model' step of the pipeline.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_RSCV = dict(
    model__max_depth=randint(4, 20),
    model__min_samples_split=randint(2, 30),
    model__min_samples_leaf=randint(1, 30),
    model__max_features=uniform(0.6, 0.4),
    model__ccp_alpha=uniform(0, 0.0001),
)

# Randomized search: 60 sampled configurations, each scored by 5-fold
# cross-validated ROC AUC, using all available cores.
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_RSCV,
    n_iter=60,
    cv=5,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1,
)
search.fit(X, y)

# Report the best mean CV score and the configuration that achieved it.
print(search.best_score_, search.best_params_, sep="\n")

0.9146891745768897
{'model__ccp_alpha': np.float64(1.0789142699330445e-05), 'model__max_depth': 18, 'model__max_features': np.float64(0.9583054382694077), 'model__min_samples_leaf': 17, 'model__min_samples_split': 21}


In [20]:
# Keep the hyper-parameter combination selected by the randomized search.
best_parametr = search.best_params_

In [21]:
# Store the search's best score (scored with 'roc_auc') and display it
# as the cell's output.
best_score = search.best_score_
best_score

np.float64(0.9146891745768897)

In [22]:
# Keep the best estimator found by the search; it is used below to
# predict on the test set.
best_model = search.best_estimator_

In [18]:
# Independent cross-validation check: 5 stratified, shuffled folds with a
# fixed seed, scored by ROC AUC on all available cores.
# NOTE(review): this scores `pipe` as defined above (tree with only
# random_state set), not search.best_estimator_ — confirm that is the
# model you intend to evaluate here.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(pipe, X, y, cv=cv, scoring="roc_auc", n_jobs=-1)
print(scores)

[0.90794632 0.9052111  0.90685735 0.904747   0.90513085]


In [23]:
# Predict class probabilities for the test set and keep the second column
# (index 1), which is used as the submitted score.
class_proba = best_model.predict_proba(df_test)
pred = class_proba[:, 1]

In [24]:
# Assemble the submission table: the test ids alongside their predicted probability.
submission = pd.DataFrame(dict(id=idk, loan_paid_back=pred))

In [25]:
# Write the submission to disk without the DataFrame index column.
# NOTE(review): absolute, machine-specific output path.
submission.to_csv(
    path_or_buf="P:/notebooks/competition KAGGLE/answer/Predicting Loan Payback/submission.csv",
    index=False,
)