In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split

# Load the raw loan data and drop every row that contains a missing value.
dirty_data = pd.read_csv('loan-train.csv')
data = dirty_data.dropna()

# Split into train and test sets (default split proportions); the fixed seed
# makes the split reproducible. Loan_Status is the target passed as y, and
# Loan_ID is excluded from the features along with it.
X_train, X_test, Y_train, Y_test = train_test_split(
    data.drop(columns=['Loan_ID', 'Loan_Status']),
    data['Loan_Status'],
    random_state=1,
)

# The encoding loop below assigns columns on X_train; take an explicit copy so
# those assignments act on an independent frame and do not trigger pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()

print(f"No of rows in total set: {data.shape[0]}")
print(f"No of rows in train set: {X_train.shape[0]}")
print(f"No of rows in test set: {X_test.shape[0]}")


# Convert categorical (object-dtype) columns to integer codes with label encoding.
# A separate encoder is fitted per column and kept in `encoders`, so the exact
# training-time mapping of every column stays available after the loop.
encoder = LabelEncoder()  # stays defined even when there is no object column
encoders = {}
for feature in X_train.columns:
    if X_train[feature].dtype == 'object':
        encoder = LabelEncoder()
        X_train[feature] = encoder.fit_transform(X_train[feature])
        encoders[feature] = encoder

No of rows in total set: 306
No of rows in train set: 229
No of rows in test set: 77


In [4]:
# Fit a decision tree on the encoded training features.
# random_state pins the estimator's internal randomness so that re-running the
# notebook reproduces the same fitted tree.
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train, Y_train)

# Accuracy measured on the same rows the model was fitted on: this is a sanity
# check of the fit, not an estimate of performance on unseen data.
y_pred = model.predict(X_train)
acc = accuracy_score(Y_train, y_pred)
print(acc)

1.0


In [5]:
# Evaluate the fitted model on the held-out test split.
#
# Each categorical test column is encoded with an encoder fitted on the
# *training* rows of that column, so a category maps to the same integer the
# model saw during fitting. Re-fitting on the test rows would derive the codes
# from whichever categories happen to be present in the test split.
# NOTE(review): assumes `data` is still the unencoded frame that X_train was
# split from (its index labels are used to select the training rows) - confirm.
X_test = X_test.copy()  # assign columns on an independent frame
for feature in X_test.columns:
    if X_test[feature].dtype == 'object':
        train_values = data.loc[X_train.index, feature]
        # transform() raises ValueError for a category absent from the training rows.
        X_test[feature] = LabelEncoder().fit(train_values).transform(X_test[feature])


predictions = model.predict(X_test)

# 'Y' is treated as the positive class for the class-specific metrics.
accuracy = accuracy_score(Y_test, predictions)
precision = precision_score(Y_test, predictions, pos_label='Y')
recall = recall_score(Y_test, predictions, pos_label='Y')
f1 = f1_score(Y_test, predictions, pos_label='Y')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.735632183908046
Precision: 0.8363636363636363
Recall: 0.7666666666666667
F1 Score: 0.8
