# Decision Tree


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as snb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [3]:
# Read the loan dataset from a CSV file in the working directory.
csv_path = 'loan.csv'
df = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,age,gender,occupation,education_level,marital_status,income,credit_score,loan_status
0,32,Male,Engineer,Bachelor's,Married,85000.0,720.0,Approved
1,45,Female,Teacher,Master's,Single,62000.0,680.0,Approved
2,28,Male,Student,High School,Single,25000.0,590.0,Denied
3,51,Female,Manager,Bachelor's,Married,105000.0,780.0,Approved
4,36,Male,Accountant,Bachelor's,Married,75000.0,710.0,Approved


In [5]:
# Column dtypes and non-null counts of the raw frame (shows which columns have gaps).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              61 non-null     int64  
 1   gender           61 non-null     object 
 2   occupation       60 non-null     object 
 3   education_level  57 non-null     object 
 4   marital_status   58 non-null     object 
 5   income           60 non-null     float64
 6   credit_score     57 non-null     float64
 7   loan_status      61 non-null     object 
dtypes: float64(2), int64(1), object(5)
memory usage: 3.9+ KB


In [6]:
# Number of missing entries in each column.
df.isna().sum()

age                0
gender             0
occupation         1
education_level    4
marital_status     3
income             1
credit_score       4
loan_status        0
dtype: int64

In [8]:
# Impute missing values column by column: the mean for numeric columns,
# the most frequent value (mode) for everything else.
#
# The filled Series is assigned back to the frame on purpose. Calling
# `df[col].fillna(..., inplace=True)` mutates an intermediate object; pandas 2.x
# emits a FutureWarning for that pattern and under Copy-on-Write it leaves `df`
# unchanged.
#
# NOTE(review): the fill statistics are computed on the full dataset, i.e. before
# the train/validation/test split performed later in this notebook.
for col in df.columns:
    if not df[col].isna().any():
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        fill_value = df[col].mean()
    else:
        fill_value = df[col].mode()[0]
    df[col] = df[col].fillna(fill_value)
            

In [9]:
# Re-check non-null counts after imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              61 non-null     int64  
 1   gender           61 non-null     object 
 2   occupation       61 non-null     object 
 3   education_level  61 non-null     object 
 4   marital_status   61 non-null     object 
 5   income           61 non-null     float64
 6   credit_score     61 non-null     float64
 7   loan_status      61 non-null     object 
dtypes: float64(2), int64(1), object(5)
memory usage: 3.9+ KB


In [11]:
# Create a label encoder and collect the names of all text/categorical columns.
Label_Encoder = LabelEncoder()

categorical_dtypes = ['object', 'category']
categorical_col = df.select_dtypes(include=categorical_dtypes).columns

categorical_col

Index(['gender', 'occupation', 'education_level', 'marital_status',
       'loan_status'],
      dtype='object')

In [12]:
# Replace every categorical column with integer codes.
#
# Each column gets its own fitted LabelEncoder, kept in `label_encoders`, so the
# code -> label mapping stays available afterwards via
# `label_encoders[col].classes_` or `label_encoders[col].inverse_transform(...)`.
# Re-fitting a single shared encoder would keep only the last column's mapping.
#
# NOTE(review): LabelEncoder is documented as a target encoder; applied to
# feature columns it imposes an arbitrary integer order on the categories.
label_encoders = {}
for col in categorical_col:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

In [13]:
# Confirm that every column is numeric after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              61 non-null     int64  
 1   gender           61 non-null     int64  
 2   occupation       61 non-null     int64  
 3   education_level  61 non-null     int64  
 4   marital_status   61 non-null     int64  
 5   income           61 non-null     float64
 6   credit_score     61 non-null     float64
 7   loan_status      61 non-null     int64  
dtypes: float64(2), int64(6)
memory usage: 3.9 KB


In [None]:
# Separate the target column from the feature matrix.
target_col = 'loan_status'
y = df[target_col]
x = df.drop(columns=target_col)

Unnamed: 0,age,gender,occupation,education_level,marital_status,income,credit_score
0,32,1,12,1,0,85000.0,720.0
1,45,0,35,4,1,62000.0,680.0
2,28,1,33,3,1,25000.0,590.0
3,51,0,16,1,0,105000.0,780.0
4,36,1,0,1,0,75000.0,710.0
...,...,...,...,...,...,...,...
56,39,1,2,4,0,100000.0,770.0
57,25,0,27,1,1,32000.0,570.0
58,43,1,4,1,0,95000.0,760.0
59,30,0,37,4,1,55000.0,650.0


In [None]:
# Two-stage split: hold out 30% of the rows, then cut that hold-out in half,
# giving a train / validation / test partition.
# NOTE(review): neither split is stratified on `y`; with small validation and
# test sets the class balance may differ between partitions.
SPLIT_SEED = 42

# Stage 1: training set vs. temporary hold-out.
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.3, random_state=SPLIT_SEED
)

# Stage 2: hold-out divided equally into validation and test sets.
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, random_state=SPLIT_SEED
)


In [17]:
# Step 3: fit a decision tree with default hyperparameters on the training set.
# The seed is fixed so repeated runs build the same tree.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X=x_train, y=y_train)

In [19]:
# Step 4: score the fitted tree on the validation rows.
y_val_pred = dt_model.predict(X=x_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)

val_accuracy

1.0

In [21]:
# Score the same tree on the held-out test rows.
y_test_pred = dt_model.predict(X=x_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.9000


In [23]:
# Standardise the features. The scaler learns its statistics from the training
# rows only and is then applied unchanged to the test rows.
scaler_x = StandardScaler()

x_train_scaled = scaler_x.fit(x_train).transform(x_train)
x_test_scaled = scaler_x.transform(x_test)

In [24]:
# Fit a second tree, this time on the standardised training features.
model = DecisionTreeClassifier(random_state=42)
model.fit(X=x_train_scaled, y=y_train)


In [26]:
# Predict the test labels with the tree trained on scaled features.
y_pred_scaled = model.predict(X=x_test_scaled)

In [28]:
# Test accuracy of the scaled-feature tree.
# Note: this rebinds `test_accuracy`, replacing the unscaled model's score.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_scaled)

test_accuracy

0.9

In [30]:
# 5-fold cross-validation of the default tree, then the mean fold score.
# The folds are drawn from the complete x / y, not just the training split.
cv_scores = cross_val_score(estimator=dt_model, X=x, y=y, cv=5)
avg_cv_scores = cv_scores.mean()

avg_cv_scores

np.float64(0.95)

In [31]:

from sklearn.model_selection import GridSearchCV

# Hyperparameter candidates; every combination is tried
# (2 * 5 * 3 * 3 * 3 = 270 settings).
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
)

# Exhaustive search with 5-fold cross-validation on the training split only,
# using all available CPU cores.
grid_search = GridSearchCV(dt_model, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)

print("Best hyperparameters found: ", grid_search.best_params_)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
Best hyperparameters found:  {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [32]:

# Take the best estimator found by the grid search and score it on the
# held-out test rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X=x_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test accuracy of the best model: {accuracy}")

Test accuracy of the best model: 0.9
