## Importing Necessary Libraries

In [None]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
# Set both pandas display limits to None so notebook output is not truncated.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

## Import the dataset

In [None]:
# First workbook (v1), indexed by its Loan_ID column.
raw_dataset_path = '../data/dataset_v1.xlsx'
df = pd.read_excel(raw_dataset_path, index_col="Loan_ID")

# Second workbook (v3), used below as the modelling table.
# NOTE(review): this one is read without index_col -- if the workbook was saved
# with its index, that column ends up among the features; confirm.
engineered_dataset_path = "../data/dataset_v3.xlsx"
full_df_engineered = pd.read_excel(engineered_dataset_path)

# Preview the first rows of the modelling table.
full_df_engineered.head()

## Splitting The Data Into Train and Test Sets

In [None]:
# Target column is separated from the features before splitting.
target_column = 'Loan_Status'
y = full_df_engineered[target_column].copy()
x = full_df_engineered.drop(target_column, axis=1)

# 80/20 hold-out split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=.2,
    random_state=42,
)

# Show the shape of each of the four pieces.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [None]:
# Persist the feature matrices of both splits (the targets are not written here).
for csv_path, feature_frame in (
    ("../data/x_train.csv", x_train),
    ("../data/x_test.csv", x_test),
):
    feature_frame.to_csv(csv_path)

In [None]:
# Bar chart of how many rows fall into each Loan_Status value.
loan_status_counts = df["Loan_Status"].value_counts()
sns.barplot(loan_status_counts)

## Choosing The Best Models

## Steps 
* Choosing Classification Models
* Cross Validation

### Types of Classification Models

#### Importing Classification Models

In [None]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

#### Instantiate the models

In [None]:
# One shared seed for every estimator that accepts random_state.
seed = 42

xgb_classifier = XGBClassifier(random_state=seed)
tree_classifier = DecisionTreeClassifier(random_state=seed)
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=seed,
)
gb_classifier = GradientBoostingClassifier(
    n_estimators=1000,
    random_state=seed,
)
log_classifier = LogisticRegression(random_state=seed)
# probability=True enables predict_proba on the fitted SVC.
svc_classifier = SVC(
    C=2.0,
    kernel="linear",
    probability=True,
    random_state=seed,
)
knn_classifier = KNeighborsClassifier()

In [None]:
# Display names for the models, listed in the order they are evaluated.
columns = [
    'XGBClassifier',
    'DecisionTreeClassifier',
    'RandomForestClassifier',
    'GradientBoostingClassifier',
    'LogisticRegression',
    'SUpport Vector Classifier',
    'KNeighborsClassifier',
]

# Three independent accumulators filled by cal(): accuracy, recall, F1.
result1, result2, result3 = [], [], []

In [None]:
def cal(model):
    """Fit ``model`` on the training split and report its test-set metrics.

    The model is trained on the notebook-level ``x_train``/``y_train`` and
    scored on ``x_test``/``y_test``. Accuracy, recall and F1 are appended to
    the module-level ``result1``, ``result2`` and ``result3`` lists (in that
    order), a confusion-matrix heatmap is drawn, and the scores are printed.

    Args:
        model: An estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        None. Results are recorded through the side effects described above.
    """
    model.fit(x_train, y_train)
    pred = model.predict(x_test)

    # scikit-learn metrics take (y_true, y_pred). Accuracy and binary F1 are
    # symmetric in their arguments, but recall is not: with the arguments
    # swapped, recall_score returns precision instead.
    accuracy = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    result1.append(accuracy)
    result2.append(recall)
    result3.append(f1)

    # Rows are true labels and columns are predictions (y_true first).
    # fmt="d" prints the integer counts as-is rather than in "%.2g" notation.
    sns.heatmap(confusion_matrix(y_test, pred), annot=True, fmt="d")
    print(model)
    print('accuracy is : ' , accuracy)
    print('recall is : ' , recall)
    print('f1 is : ' , f1)

In [None]:
# Show the distinct label values present in the training target.
y_train.unique()

In [None]:
# Evaluate every candidate with cal(). Each call appends one entry to
# result1/result2/result3, so the call order in these cells has to match the
# order of the names in `columns`; re-running a cell appends a duplicate entry.
cal(xgb_classifier)

In [None]:
cal(tree_classifier)

In [None]:
cal(rf_classifier)

In [None]:
cal(gb_classifier)

In [None]:
cal(log_classifier)

In [None]:
cal(svc_classifier)

In [None]:
cal(knn_classifier)

In [None]:
# Accuracy values collected by cal(), one per evaluated model.
result1

In [None]:
# Values cal() stored as recall, one per evaluated model.
result2

In [None]:
# F1 values collected by cal(), one per evaluated model.
result3

In [None]:
# Sanity check: all four sequences must have the same length before they are
# combined into one DataFrame.
len(columns), len(result1), len(result2), len(result3)

In [None]:
# One row per model: its display name followed by the three collected scores.
score_table = {
    'Algorithm': columns,
    'Accuracy': result1,
    'Recall': result2,
    'FScore': result3,
}
finlresult = pd.DataFrame(score_table)

In [None]:
# Line chart comparing the three metrics across all evaluated models.
plt.figure(figsize=(15, 5))
for metric_values, metric_label in (
    (result1, 'Accuracy'),
    (result2, 'Recall'),
    (result3, 'F1score'),
):
    plt.plot(finlresult.Algorithm, metric_values, label=metric_label)
plt.legend()
plt.show()

In [None]:
# Grouped bar chart of every numeric score column, one group per algorithm.
finlresult.plot.bar(x="Algorithm")

In [None]:
# Persist the fitted SVC so it can be reloaded without retraining.
filename = "models_and_encoders/best_loan_model(SVC).pkl"
# A context manager guarantees the file handle is flushed and closed even if
# pickling fails. pickle.dump returns None, so its result is not bound.
with open(filename, 'wb') as model_file:
    pickle.dump(svc_classifier, model_file)

In [None]:
# 5-fold cross-validation scores for the SVC, computed on the training split only.
cross_val_score(
    estimator=svc_classifier,
    X=x_train,
    y=y_train,
    cv=5,
)

In [None]:
# Predict labels for the held-out test features with the SVC and display them.
pred = svc_classifier.predict(x_test)
pred