In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn import set_config

In [2]:
# Load the loan dataset; the path is relative to the notebook's working directory.
df=pd.read_csv("modified.csv")

In [3]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
2,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
3,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [4]:
# List the distinct values of the categorical `loan_grade` column.
df["loan_grade"].unique()

array(['B', 'C', 'A', 'D', 'E', 'F', 'G'], dtype=object)

In [5]:
# Separate predictors from the `loan_status` target, then hold out 25% of the
# rows for testing with a fixed seed so the split is repeatable.
features = df.drop(columns=['loan_status'])
target = df['loan_status']
X_train, X_test, Y_train, Y_test = train_test_split(
    features, target, test_size=0.25, random_state=100
)

In [6]:
# Names of the object-dtype columns, used later as the one-hot encoded features.
cat_cols = df.select_dtypes(include=["object"]).columns

In [7]:
# Show the selected categorical column names.
cat_cols

Index(['person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file'],
      dtype='object')

In [8]:
# Sub-frame holding only the integer and float columns (still includes the target here).
num_cols = df.select_dtypes(include=["int", "float"])

In [9]:
# Remove the target from the numeric sub-frame so it is not treated as a feature.
# Rebinding to the returned frame avoids mutating a derived DataFrame in place.
num_cols = num_cols.drop(columns="loan_status")

In [10]:
# From here on `num_cols` is the Index of numeric feature names, no longer a DataFrame.
num_cols=num_cols.columns

In [11]:
# Show the selected numeric feature names.
num_cols

Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'],
      dtype='object')

## Creating the pipelines

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [13]:
def make_preprocessor():
    """Return a new, unfitted ColumnTransformer for the loan features.

    Columns listed in ``num_cols`` are standardised with StandardScaler and
    columns listed in ``cat_cols`` are one-hot encoded with the first level
    dropped. A fresh transformer is built on every call so that each model
    pipeline owns its preprocessing step instead of sharing a single object
    that every ``fit`` call would refit in place.
    """
    numeric_steps = Pipeline(steps=[('Standard Scaler', StandardScaler())])
    categorical_steps = Pipeline(steps=[('OneHot Encoder', OneHotEncoder(drop='first'))])
    return ColumnTransformer([('Standardization', numeric_steps, num_cols),
                              ('OneHotEncoder', categorical_steps, cat_cols)])


# Stand-alone preprocessor and its sub-pipelines, kept under their original names.
ct = make_preprocessor()
num_pipeline = ct.transformers[0][1]
cat_pipeline = ct.transformers[1][1]

# Estimators. The decision tree is now seeded so its score is reproducible on re-run.
dec_tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
random_forest = RandomForestClassifier(n_estimators=40, criterion='entropy', random_state=0)
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
svm = SVC(kernel='rbf', random_state=1, C=2.0)
classifier = LogisticRegression(random_state=0)

# One independent preprocessing step per model: make_pipeline stores the object
# it is given, so passing the same `ct` to all five would make them share state.
dectree = make_pipeline(make_preprocessor(), dec_tree)
randfor = make_pipeline(make_preprocessor(), random_forest)
Knnmod = make_pipeline(make_preprocessor(), knn)
svmmod = make_pipeline(make_preprocessor(), svm)
classmod = make_pipeline(make_preprocessor(), classifier)


In [14]:
# Fit the decision-tree pipeline (preprocessing + classifier) on the training split.
dectree.fit(X_train, Y_train)

In [15]:
# Fit the random-forest pipeline on the training split.
randfor.fit(X_train, Y_train)

In [16]:
# Fit the k-nearest-neighbours pipeline on the training split.
Knnmod.fit(X_train, Y_train)

In [17]:
# Fit the RBF-kernel SVM pipeline on the training split.
svmmod.fit(X_train, Y_train)

In [18]:
# Fit the logistic-regression pipeline on the training split.
classmod.fit(X_train, Y_train)

In [19]:
# Decision-tree predictions for the held-out test rows.
mod_pred=dectree.predict(X_test)

In [20]:
# Put true labels and decision-tree predictions side by side, indexed like Y_test.
comparison = {"Actual": Y_test, "predicted": mod_pred}
pd.DataFrame(comparison)

Unnamed: 0,Actual,predicted
28957,0,0
28854,0,0
24591,0,0
551,0,0
24904,0,0
...,...,...
17577,0,0
4799,1,1
25767,1,1
15966,0,0


In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [22]:
# Report the decision tree's test accuracy as a percentage with two decimals.
dectree_accuracy_pct = accuracy_score(Y_test, mod_pred) * 100
print('accuracy of decision tree algorithm :%.2f%%' % dectree_accuracy_pct)

accuracy of decision tree algorithm :89.57%


In [23]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [24]:
# (display name, pipeline) pairs to compare on the held-out test split.
classifiers = [
    ('K-Nearest Neighbors', Knnmod),
    ('Random Forest', randfor),
    ('Logistic Regression', classmod),
    ('SVM', svmmod),
    ('Decision Tree', dectree),
]

accuracies = []

# Train and evaluate each classifier
# NOTE(review): the recorded output of this cell is an AttributeError
# ("'NoneType' object has no attribute 'split'") with no traceback, so the
# failing call cannot be identified from the notebook alone; it may be an
# environment/library-version issue rather than a bug in this code. TODO confirm.
for name, clf in classifiers:
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    accuracies.append({'Algorithm': name, 'Accuracy': accuracy})

# Create a DataFrame for Plotly.
# Stored under its own name: rebinding `df` here would discard the loan dataset
# and break any earlier cell that is re-run afterwards.
accuracy_df = pd.DataFrame(accuracies)

# Create an interactive bar plot using Plotly
fig = px.bar(accuracy_df, x='Algorithm', y='Accuracy', color='Algorithm',
             title='Accuracy of Different Algorithms', height=500)

# Show the plot
fig.show()


AttributeError: 'NoneType' object has no attribute 'split'

In [None]:
pip install yellowbrick

In [None]:
from yellowbrick.classifier import ConfusionMatrix

In [None]:
# Fitted pipelines in the order the following cells index them:
# 0 = decision tree, 1 = random forest, 2 = logistic regression, 3 = SVM, 4 = KNN.
li = [dectree, randfor, classmod, svmmod, Knnmod]

# Confusion matrix and per-class report for the decision tree.
cm = ConfusionMatrix(li[0])
cm.fit(X_train, Y_train)
cm.score(X_test, Y_test)
cm.show()  # finalize the figure (title, axis labels) and render it
mod_pred = li[0].predict(X_test)
print(classification_report(Y_test, mod_pred))

In [None]:
# Confusion matrix and per-class report for the random forest (li[1]).
cm = ConfusionMatrix(li[1])
cm.fit(X_train, Y_train)
cm.score(X_test, Y_test)
cm.show()  # finalize the figure (title, axis labels) and render it
randfor_pred = li[1].predict(X_test)
print(classification_report(Y_test, randfor_pred))

In [None]:
# Confusion matrix and per-class report for logistic regression (li[2]).
cm = ConfusionMatrix(li[2])
cm.fit(X_train, Y_train)
cm.score(X_test, Y_test)
cm.show()  # finalize the figure (title, axis labels) and render it
class_pred = li[2].predict(X_test)
print(classification_report(Y_test, class_pred))

In [None]:
# Confusion matrix and per-class report for the SVM (li[3]).
cm = ConfusionMatrix(li[3])
cm.fit(X_train, Y_train)
cm.score(X_test, Y_test)
cm.show()  # finalize the figure (title, axis labels) and render it
svm_pred = li[3].predict(X_test)
print(classification_report(Y_test, svm_pred))

In [None]:
# Confusion matrix and per-class report for k-nearest neighbours (li[4]).
cm = ConfusionMatrix(li[4])
cm.fit(X_train, Y_train)
cm.score(X_test, Y_test)
cm.show()  # finalize the figure (title, axis labels) and render it
knn_pred = li[4].predict(X_test)
print(classification_report(Y_test, knn_pred))