In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Read the dataset from disk, then preview ten randomly drawn rows.
df = pd.read_csv("data.csv")
df.sample(n=10)

In [None]:
# Object-dtype columns are treated as categorical features; the row
# identifier and the target column are then taken out of that list.
cat_features = [name for name, dtype in df.dtypes.items() if dtype == 'O']
for non_feature in ('Loan_ID', 'Loan_Status'):
    cat_features.remove(non_feature)
cat_features

In [None]:
# Every column whose dtype is not object is treated as a numeric feature.
num_features = [name for name, dtype in df.dtypes.items() if dtype != 'O']
num_features

In [None]:
# One count plot per categorical feature on a 3x2 grid, bars split by Loan_Status.
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))
axs = ax.ravel()
for i, feature in enumerate(cat_features):
    target_axis = axs[i]
    sns.countplot(data=df, x=feature, hue='Loan_Status', ax=target_axis)

In [None]:
# Filled density estimate of each numeric feature on a 3x2 grid,
# one curve per Loan_Status value.
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))
axs = ax.ravel()
for i, feature in enumerate(num_features):
    target_axis = axs[i]
    sns.kdeplot(data=df, x=feature, hue='Loan_Status', fill=True, ax=target_axis)

In [None]:
# Density of ApplicantIncome, drawn once per categorical feature with that
# feature as the hue, on a 3x2 grid.
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))
axs = ax.ravel()
for i, feature in enumerate(cat_features):
    target_axis = axs[i]
    sns.kdeplot(data=df, x='ApplicantIncome', hue=feature, fill=True, ax=target_axis)

In [None]:
# Density of LoanAmount, drawn once per categorical feature with that
# feature as the hue, on a 3x2 grid.
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))
axs = ax.ravel()
for i, feature in enumerate(cat_features):
    target_axis = axs[i]
    sns.kdeplot(data=df, x='LoanAmount', hue=feature, fill=True, ax=target_axis)

In [None]:
# Show the first five rows (head() defaults to n=5).
df.head()

In [None]:
# Count missing values per column before imputation.
df.isna().sum()

In [None]:
# Distinct values present in the Gender column (NaN included if any).
df['Gender'].unique()

In [None]:
# Distinct values present in the Loan_Amount_Term column (NaN included if any).
df['Loan_Amount_Term'].unique()

In [None]:
# Fill missing values in these columns with each column's most frequent value.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series,
# which triggers a FutureWarning in pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is active.
mode_imputed_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Credit_History',
    'Loan_Amount_Term',
]
for column in mode_imputed_columns:
    df[column] = df[column].fillna(df[column].mode()[0])

In [None]:
# Fill missing LoanAmount values with the column mean. Assign the result back
# rather than using a chained `inplace=True`, which does not update `df` under
# pandas Copy-on-Write.
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mean())

In [None]:
# Re-check missing values per column after imputation.
df.isna().sum()

In [None]:
# Combined income: applicant income plus co-applicant income, row by row.
df['TotalIncome'] = df['ApplicantIncome'].add(df['CoapplicantIncome'])

In [None]:
# Drop the row identifier and the two income columns that TotalIncome replaces.
# `columns=` already names the axis, so the redundant `axis=1` is removed, and
# the frame is reassigned instead of mutated in place.
df = df.drop(columns=['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome'])

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still holds object (string) columns at this point, and
# `DataFrame.corr()` raises on them in pandas >= 2.0, so restrict the
# computation to numeric dtypes explicitly.
corr_matrix = df.select_dtypes(include='number').corr()
corr_features = corr_matrix.index
plt.figure(figsize=(15, 15))
# Reuse the matrix computed above instead of recomputing the same correlation.
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Histogram of TotalIncome (25 bins) with grid lines switched off.
df['TotalIncome'].hist(bins=25)
# `plt.grid(b=None)` relied on the `b` keyword, which Matplotlib renamed to
# `visible` and later removed; a positional False works on every version and
# states the intent (no grid) explicitly.
plt.grid(False)

In [None]:
# Log-transform TotalIncome and plot its histogram.
# NOTE(review): np.log returns -inf for a zero income; confirm the data has no
# zero TotalIncome rows.
df['TotalIncome_log'] = np.log(df['TotalIncome'])
df['TotalIncome_log'].hist()
# Positional False instead of the removed `b=` keyword; hides the grid.
plt.grid(False)

In [None]:
# Histogram of LoanAmount (25 bins) with grid lines switched off.
df['LoanAmount'].hist(bins=25)
# Positional False instead of the removed `b=` keyword; hides the grid.
plt.grid(False)

In [None]:
# Log-transform LoanAmount and plot its histogram (25 bins).
# NOTE(review): np.log returns -inf for a zero amount; confirm the data has no
# zero LoanAmount rows.
df['LoanAmount_log'] = np.log(df['LoanAmount'])
df['LoanAmount_log'].hist(bins=25)
# Positional False instead of the removed `b=` keyword; hides the grid.
plt.grid(False)

In [None]:
# Drop the untransformed columns now that their log versions exist.
# `columns=` already names the axis, so the redundant `axis=1` is removed, and
# the frame is reassigned instead of mutated in place.
df = df.drop(columns=['LoanAmount', 'TotalIncome'])

In [None]:
# One-hot encode the remaining non-numeric columns, dropping the first level
# of each so the indicator columns are not redundant.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Display the (rows, columns) shape of the frame.
df.shape

In [None]:
# Separate the target indicator column from the feature matrix.
X = df.drop(columns='Loan_Status_Y')
y = df['Loan_Status_Y']

In [None]:
# Report the dimensions of the feature matrix and the target vector.
for label, part in (('shape of X:', X), ('shape of y:', y)):
    print(label, part.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=5)

# Fit the min-max scaler on the training rows only, then apply that same
# scaling to both the training and the held-out rows.
ss = MinMaxScaler().fit(xtrain)
xtrain, xtest = ss.transform(xtrain), ss.transform(xtest)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score


In [None]:
# Logistic regression baseline: fit on the scaled training split, evaluate on
# the scaled held-out split.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

log_model = LogisticRegression()
log_model.fit(xtrain, ytrain)

ypred = log_model.predict(xtest)

con_matrix = confusion_matrix(ytest, ypred)
acc_score = accuracy_score(ytest, ypred)
report = classification_report(ytest, ypred)

# Cross-validate on the full data with the scaler inside the pipeline.
# Previously the bare model was cross-validated on the unscaled X, which does
# not match how `log_model` was trained above; the pipeline refits the scaler
# on each training fold, so the folds are scaled consistently and without
# leaking validation rows into the scaler.
cv_model = make_pipeline(MinMaxScaler(), LogisticRegression())
cross_score = cross_val_score(cv_model, X, y)
score = np.mean(cross_score)

# Optional diagnostics (uncomment to display):
# print('con_matrix:', con_matrix)
print('acc_score:', acc_score)
# print('report:', report)
# print('score:', score)

In [None]:
# Confusion-matrix heatmap. `fmt='d'` prints the integer counts as-is; the
# default annotation format ('.2g') shows counts of 100 or more in scientific
# notation (e.g. 1.2e+02).
sns.heatmap(con_matrix, annot=True, fmt='d')

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Random forest: sweep max_leaf_nodes over 2..24 and record held-out accuracy.
# NOTE(review): the reported figure is the maximum test-set score over the
# sweep, i.e. it is selected on the same data it is measured on.
leaf_node_grid = range(2, 25)
scoreListRf = []
for i in leaf_node_grid:
    RFclassifier = RandomForestClassifier(
        n_estimators=1000,
        max_leaf_nodes=i,
        random_state=1,
    ).fit(xtrain, ytrain)
    scoreListRf.append(RFclassifier.score(xtest, ytest))

# plt.plot(range(2,25), scoreListRf)
# plt.xticks(np.arange(2,25,1))
# plt.xlabel("RF Value")
# plt.ylabel("score")
# plt.show()
RFAcc = max(scoreListRf)
print("Random Forest Accuracy: {:.2f}%" .format(RFAcc * 100))

In [None]:
from sklearn.metrics import accuracy_score

# RBF-kernel support vector classifier, capped at 500 solver iterations.
SVCclassifier = SVC(kernel='rbf', max_iter=500)
SVCclassifier.fit(xtrain, ytrain)

ypred = SVCclassifier.predict(xtest)

print(classification_report(ytest, ypred))
print(confusion_matrix(ytest, ypred))

# Pass (y_true, y_pred) in the documented order, matching the other metric
# calls; the arguments were previously swapped.
SVCAcc = accuracy_score(ytest, ypred)
print('SVC accuracy: {:.2f}%' .format(SVCAcc*100))

In [None]:
# k-nearest neighbours: sweep k over 1..20 and record held-out accuracy.
# NOTE(review): the reported figure is the maximum test-set score over the
# sweep, i.e. it is selected on the same data it is measured on.
neighbor_grid = range(1, 21)
scoreListknn = []
for i in neighbor_grid:
    KNclassifier = KNeighborsClassifier(n_neighbors=i).fit(xtrain, ytrain)
    scoreListknn.append(KNclassifier.score(xtest, ytest))


# plt.plot(range(1,21), scoreListknn)
# plt.xticks(np.arange(1,21,1))
# plt.xlabel("K Value")
# plt.ylabel("Score")
# plt.show()
KNAcc = max(scoreListknn)
print("KNN best Accuracy: {:.2f}%" .format(KNAcc * 100))

In [None]:
# AdaBoost classifier
# Load libraries

from sklearn.ensemble import AdaBoostClassifier
from sklearn import datasets
# Import train_test_split function
from sklearn.model_selection import train_test_split
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Create the AdaBoost classifier object. A fixed random_state makes the fitted
# ensemble, and therefore the reported accuracy, reproducible across runs.
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=1)
# Train the AdaBoost classifier
model = abc.fit(xtrain, ytrain)

# Predict the response for the test dataset
ypred = model.predict(xtest)

# Model accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(ytest, ypred))


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.linear_model import BayesianRidge
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer, MissingIndicator




In [None]:
# Decision tree: sweep max_leaf_nodes over 2..20 and record held-out accuracy.
# NOTE(review): the reported figure is the maximum test-set score over the
# sweep, i.e. it is selected on the same data it is measured on.
scoreListDT = []
for i in range(2, 21):
    # Use the loop variable: the original hard-coded max_leaf_nodes=2, so every
    # iteration trained the same two-leaf tree and the sweep had no effect.
    # random_state keeps the fitted trees reproducible.
    DTclassifier = DecisionTreeClassifier(max_leaf_nodes=i, random_state=1)
    DTclassifier.fit(xtrain, ytrain)
    scoreListDT.append(DTclassifier.score(xtest, ytest))

# plt.plot(range(2,21), scoreListDT)
# plt.xticks(np.arange(2,21,1))
# plt.xlabel("leaf")
# plt.ylabel("score")
# plt.show()
DTAcc = max(scoreListDT)
# '{:.2f}' (no space flag) matches the other accuracy printouts; '{: .2f}'
# inserted an extra leading space before the number.
print("Decision tree Accuracy: {:.2f}%" .format(DTAcc*100))