In [175]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler

In [176]:
# Load the loan-default dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="loan_default_risk_dataset.csv")

# Preview the first five rows to sanity-check the columns and values.
df.head()

Unnamed: 0,Retirement_Age,Debt_Amount,Monthly_Savings,Loan_Default_Risk
0,60.0,2996.52,2378.49,0
1,66.4,4137.23,1538.92,1
2,58.5,19865.75,2434.8,1
3,49.8,16855.7,2677.82,1
4,67.3,7902.37,2206.72,0


In [177]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

Retirement_Age       0
Debt_Amount          1
Monthly_Savings      2
Loan_Default_Risk    0
dtype: int64

In [178]:
# Print column dtypes, non-null counts and memory usage for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Retirement_Age     300 non-null    float64
 1   Debt_Amount        299 non-null    float64
 2   Monthly_Savings    298 non-null    float64
 3   Loan_Default_Risk  300 non-null    int64  
dtypes: float64(3), int64(1)
memory usage: 9.5 KB


In [179]:
from sklearn.impute import SimpleImputer

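# Disabled, kept for reference: imputing on the full DataFrame here would compute
# the column means from rows that later land in the test split. Missing values
# are handled by the SimpleImputer inside `preprocessor` instead.
# impute = SimpleImputer(strategy="mean")

# df[["Monthly_Savings","Debt_Amount"]] = impute.fit_transform(df[["Monthly_Savings","Debt_Amount"]])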



In [180]:
# Target is the `Loan_Default_Risk` column; features are every other column.
y = df["Loan_Default_Risk"]
x = df.drop(columns="Loan_Default_Risk")

# Keep 80% of the rows for training and hold out the remaining 20% for testing.
# The fixed random_state makes the split reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [181]:
# Base estimator for the hyper-parameter search; deliberately left unfitted here.
# GridSearchCV fits clones of this estimator, so fitting it directly on the raw
# `xtrain` (which still contains missing values and has not been preprocessed)
# never influenced the tuned pipeline, and it raises on NaN input in scikit-learn
# versions whose decision trees lack missing-value support.
model = DecisionTreeClassifier(random_state=42)

In [182]:
# Names of the numeric feature columns (int64 / float64) that get preprocessed.
numeric_dtypes = ["int64", "float64"]
num_col = x.select_dtypes(include=numeric_dtypes).columns

num_col

Index(['Retirement_Age', 'Debt_Amount', 'Monthly_Savings'], dtype='object')

In [183]:
# Hyper-parameter grid for the decision tree: every combination of these
# values is evaluated by the grid search.
params = dict(
    criterion=["entropy", "gini"],
    min_samples_split=[2, 3, 5, 10],
    max_depth=[5, 10, 50, 100, 200],
    min_samples_leaf=[2, 3, 5, 7, 10],
)

In [184]:
# Numeric preprocessing: impute missing values first, then standardise.
#
# The two steps must be chained in a single Pipeline. ColumnTransformer applies
# each of its transformers independently to the original columns and concatenates
# the results, so listing the imputer and the scaler as two separate entries
# would emit an imputed-but-unscaled copy of the columns next to a scaled copy
# that still contains the NaNs.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(
                steps=[
                    ("simp", SimpleImputer()),    # default strategy: column mean
                    ("stand", StandardScaler()),  # zero mean, unit variance
                ]
            ),
            num_col,
        ),
    ],
    # Any column not listed in num_col is forwarded unchanged.
    remainder="passthrough",
)

In [185]:
# Tune the tree with 5-fold cross-validation over the whole
# preprocessing + model chain.
#
# GridSearchCV wraps the Pipeline (instead of sitting inside it as the last
# step) so that the imputer and scaler are re-fitted on the training part of
# every fold; the validation part of a fold then never contributes to the
# preprocessing statistics used to score that fold.
#
# `pipeline` keeps the fit / predict / score methods used by the cells below:
# after fitting, they delegate to the best pipeline refitted on all of the
# training data (GridSearchCV's default refit=True).
pipeline = GridSearchCV(
    estimator=Pipeline(
        steps=[
            ("preprocessing", preprocessor),
            ("model", model),
        ]
    ),
    # Route every tree hyper-parameter to the "model" step of the inner pipeline.
    param_grid={f"model__{name}": values for name, values in params.items()},
    cv=5,
    n_jobs=-1,  # use all available CPU cores
)

In [186]:
# Fit the preprocessing steps and run the hyper-parameter search on the training split.
pipeline.fit(xtrain, ytrain)

In [187]:
# Score the fitted pipeline on the held-out test split.
pipeline.score(xtest, ytest)

0.85

In [188]:
# Score on the training split; compare with the test score to gauge overfitting.
pipeline.score(xtrain, ytrain)


0.9