In [None]:
import  numpy as np
import  pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

from sklearn import feature_selection
from sklearn import model_selection
from sklearn.metrics import accuracy_score 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

# Récupération du dataframe nettoyé

In [None]:
# Load the cleaned training data from a CSV expected in the working directory.
df = pd.read_csv('train_propre.csv')
#test = pd.read_csv('test_propre.csv')

In [None]:
# Preview the first rows of the loaded dataframe.
df.head()

In [None]:
#test.head()

In [None]:
# Feature matrix: every column except the target "Loan_Status".
X = df.drop(columns=["Loan_Status"])
X

In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
# Names of the feature columns.
X.columns

In [None]:
# Target vector: the "Loan_Status" column.
y = df["Loan_Status"]
y

In [None]:
# Number of target values.
y.shape

# TRAIN/TEST Split

In [None]:
# Hold out 20% of the rows for testing; shuffling is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Overlay the target histograms of the full set, the training set and the test set.
plt.figure(figsize=(8, 5), dpi=100)
plt.title("Distribution des données")
for target_values, legend_entry in (
    (y, "jeu total"),
    (y_train, "jeu d'apprentissage"),
    (y_test, "jeu de test"),
):
    plt.hist(target_values, label=legend_entry)
plt.xlabel("Attribution du prêt")
plt.ylabel("Nombre d'exemples")
plt.legend()
plt.show()

In [None]:
# Shape of the training features.
X_train.shape

In [None]:
# Shape of the training target.
y_train.shape

In [None]:
# Shape of the test features.
X_test.shape

In [None]:
# Shape of the test target.
y_test.shape

# Features Scaling 

 Transformer les variables continues en utilisant StandardScaler (centrage-réduction : moyenne 0, écart-type 1)
 
 https://towardsdatascience.com/what-is-feature-scaling-why-is-it-important-in-machine-learning-2854ae877048
 
 https://medium.com/codex/feature-scaling-in-machine-learning-e86b360d1c31

In [None]:
# Fit the standardizer on the training features only, so that no statistics
# from the test set leak into the preprocessing.
scaler = StandardScaler()

scaler.fit(X_train)

In [None]:
# Apply the training-set scaling to both splits.
scaled_X_train, scaled_X_test = map(scaler.transform, (X_train, X_test))

In [None]:
# Side-by-side boxplots of the training features before and after scaling.
plt.figure(figsize=(16, 10), dpi=100)

for panel_index, (panel_title, panel_data) in enumerate(
    [('Raw Data', X_train), ('Scaled Data', scaled_X_train)], start=1
):
    plt.subplot(1, 2, panel_index)
    plt.title(panel_title)
    plt.boxplot(panel_data)
    plt.xlabel('Caractéristiques')

plt.show()

# GridSearchCV MLPClassifier (perceptron)

In [None]:
# Search space for the MLP: one hidden layer of varying width, three activations.
hyperparameters = {
    'hidden_layer_sizes': [(n_units,) for n_units in (1, 3, 5, 7)],
    'activation': ['logistic', 'tanh', 'relu'],
}

In [None]:
# Base MLP passed to the grid search; the seed is fixed for reproducibility.
model = MLPClassifier(
    solver='lbfgs',
    alpha=0.0001,
    max_iter=10000,
    max_fun=15000,
    random_state=0,
)

In [None]:
# 3-fold grid search over the MLP hyperparameters, ranked by F1 score.
#
# `scoring` must be a scorer name or a callable with the signature
# (estimator, X, y). Passing the raw metric `f1_score` (signature
# (y_true, y_pred)) makes every scoring call fail, so the ranking is
# meaningless. The 'f1' scorer name is used instead.
# NOTE(review): 'f1' treats label 1 as the positive class; this assumes
# Loan_Status is encoded as 0/1 in the cleaned data. Confirm.
gridSearchCV = GridSearchCV(model,
                            hyperparameters,
                            n_jobs=-1,
                            refit=True,  # refit the best configuration on the full training set
                            cv=3,
                            return_train_score=True,
                            scoring='f1')

gridSearchCV.fit(scaled_X_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
gridSearchCV.best_params_

In [None]:
# Full cross-validation results, best-ranked candidates first.
pd.DataFrame(gridSearchCV.cv_results_).sort_values(by=['rank_test_score'])

- rank_test_score : je vais prendre rank à 1 et tester avec les deux fonctions d'activation relu et tanh
- mean_train_score : 0.86 et 0.83
- mean_test_score : 0.82 et 0.80
- Le mieux est de retenir la configuration dont les scores train et test présentent le moins d'écart

## Evaluation des performances du modèle sur le jeu d'apprentissage

In [None]:
# Predictions of the refitted best MLP on the (scaled) training set.
y_predict_train = gridSearchCV.best_estimator_.predict(scaled_X_train)

## Matrice de confusion Train

In [None]:
# Confusion matrix normalized over the true labels (each row sums to 1).
cm = confusion_matrix(y_train, y_predict_train, normalize='true')

In [None]:
# Annotate each cell of the training confusion matrix with its role,
# its raw count and its row-normalized share, then draw it as a heatmap.
cell_roles = ['Vrai Négatif (VN)', 'Faux Positif (FP)','Faux Négatif (FN)', 'Vrai Positif (VP)']
raw_counts = confusion_matrix(y_train, y_predict_train, normalize=None).flatten()
row_shares = confusion_matrix(y_train, y_predict_train, normalize='true').flatten()

box_labels = np.asarray([
    f"{role}\n{count:0.0f}\n{share:.2%}"
    for role, count, share in zip(cell_roles, raw_counts, row_shares)
]).reshape(cm.shape)

plt.figure(figsize=(8,5), dpi=100)
plt.title("Matrice de confusion (train dataset)")

class_ticks = ['Non', 'Oui']
sns.heatmap(cm,
            annot=box_labels,
            fmt='',
            cmap='Blues',
            vmin=0.0,
            vmax=1.0,
            xticklabels=class_ticks,
            yticklabels=class_ticks)

plt.xlabel("Valeurs prédites")
plt.ylabel("Valeurs réelles")

plt.show()

In [None]:
# Per-class precision / recall / F1 of the MLP on the training set.
print(classification_report(y_train, y_predict_train))

In [None]:
# Training-set accuracy of the best MLP, printed as a percentage.
MLPAcc = accuracy_score(y_predict_train, y_train)
print(f"MLP accuracy: {MLPAcc * 100:.2f}%")

### Il faudrait afficher les courbes d'apprentissage (accuracy / loss), mais avec GridSearchCV on ne peut pas les tracer : scikit-learn ne fournit pas d'objet `history` comme Keras

## Predict sur le Test

In [None]:
# Predictions of the refitted best MLP on the (scaled) held-out test set.
y_predict_test = gridSearchCV.best_estimator_.predict(scaled_X_test)

In [None]:
# Test-set accuracy of the best MLP (rebinds MLPAcc from the train value).
MLPAcc = accuracy_score(y_test, y_predict_test)
print(f"MLP accuracy: {MLPAcc * 100:.2f}%")

## Matrice de confusion Test

In [None]:
# Row-normalized confusion matrix of the MLP on the test set.
cm = confusion_matrix(y_test, y_predict_test, normalize='true')

In [None]:
# Annotate each cell of the test confusion matrix with its role,
# its raw count and its row-normalized share, then draw it as a heatmap.
cell_roles = ['Vrai Négatif (VN)', 'Faux Positif (FP)','Faux Négatif (FN)', 'Vrai Positif (VP)']
raw_counts = confusion_matrix(y_test, y_predict_test, normalize=None).flatten()
row_shares = confusion_matrix(y_test, y_predict_test, normalize='true').flatten()

box_labels = np.asarray([
    f"{role}\n{count:0.0f}\n{share:.2%}"
    for role, count, share in zip(cell_roles, raw_counts, row_shares)
]).reshape(cm.shape)

plt.figure(figsize=(8,5), dpi=100)
plt.title("Matrice de confusion (test dataset)")

class_ticks = ['Non', 'Oui']
sns.heatmap(cm,
            annot=box_labels,
            fmt='',
            cmap='Blues',
            vmin=0.0,
            vmax=1.0,
            xticklabels=class_ticks,
            yticklabels=class_ticks)

plt.xlabel("Valeurs prédites")
plt.ylabel("Valeurs réelles")

plt.show()

# GridSearchCV Random Forest

In [None]:
# Base random forest for the grid search; seeded for reproducibility.
rfc=RandomForestClassifier(random_state=42)

In [None]:
# Search space for the random forest.
# 'auto' was dropped from max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated a third of the grid) and it is deprecated /
# rejected by recent scikit-learn releases, where those fits would fail.
param_grid = {
    'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8, 10],
    'criterion': ['gini', 'entropy'],
}

In [None]:
# 3-fold grid search over the random forest, on the unscaled features and
# with the estimator's default scorer.
gridSearchCV2 = GridSearchCV(rfc, param_grid, cv=3, n_jobs=-1)
gridSearchCV2.fit(X_train, y_train)

In [None]:
# Best random forest hyperparameters found by the grid search.
gridSearchCV2.best_params_

In [None]:
# Cross-validation results of the random forest search, best-ranked first.
res = pd.DataFrame(gridSearchCV2.cv_results_).sort_values(by=['rank_test_score'])
res

In [None]:
# Print the `params` entry of the row whose index label is 1.
# NOTE(review): `.loc[1]` selects by index label, not by position, so after
# the sort by rank this is not necessarily the top-ranked candidate
# (`res.iloc[0]` would be). Confirm which row is intended.
print(res.loc[1,['params']])

## Evaluation des performances du modèle sur le jeu d'apprentissage

In [None]:
# Predictions of the best random forest on the training set.
y_predict_train2 = gridSearchCV2.best_estimator_.predict(X_train)

## Matrice de confusion Train

In [None]:
# Row-normalized confusion matrix of the random forest on the training set.
cm2 = confusion_matrix(y_train, y_predict_train2, normalize='true')

In [None]:
# Annotate each cell of the random forest training confusion matrix with its
# role, its raw count and its row-normalized share, then draw it as a heatmap.
cell_roles = ['Vrai Négatif (VN)', 'Faux Positif (FP)','Faux Négatif (FN)', 'Vrai Positif (VP)']
raw_counts = confusion_matrix(y_train, y_predict_train2, normalize=None).flatten()
row_shares = confusion_matrix(y_train, y_predict_train2, normalize='true').flatten()

box_labels = np.asarray([
    f"{role}\n{count:0.0f}\n{share:.2%}"
    for role, count, share in zip(cell_roles, raw_counts, row_shares)
]).reshape(cm2.shape)

plt.figure(figsize=(8,5), dpi=100)
plt.title("Matrice de confusion (train dataset)")

class_ticks = ['Non', 'Oui']
sns.heatmap(cm2,
            annot=box_labels,
            fmt='',
            cmap='Blues',
            vmin=0.0,
            vmax=1.0,
            xticklabels=class_ticks,
            yticklabels=class_ticks)

plt.xlabel("Valeurs prédites")
plt.ylabel("Valeurs réelles")

plt.show()

In [None]:
# Per-class precision / recall / F1 of the random forest on the training set.
print(classification_report(y_train, y_predict_train2))

In [None]:
# Accuracy of the best random forest on the training set.
# NOTE(review): the message says "on CV data", but the score is computed on
# the whole training set (y_train), not on cross-validation folds.
print("Train accuracy for Random Forest on CV data: ",accuracy_score(y_train, y_predict_train2))

In [None]:
# Training-set accuracy of the grid-searched random forest.
# This result used to be stored in `MLPAcc` and printed as "MLP accuracy",
# which both mislabelled the output and overwrote the MLP's score.
RFGridTrainAcc = accuracy_score(y_train, y_predict_train2)
print('Random Forest (GridSearchCV) train accuracy: {:.2f}%'.format(RFGridTrainAcc*100))

## Predict sur le Test

In [None]:
# Predictions of the best random forest on the held-out test set.
y_predict_test2 = gridSearchCV2.best_estimator_.predict(X_test)

In [None]:
# Test-set accuracy of the grid-searched random forest.
# This result used to be stored in `MLPAcc` and printed as "MLP accuracy",
# which both mislabelled the output and overwrote the MLP's score.
RFGridTestAcc = accuracy_score(y_test, y_predict_test2)
print('Random Forest (GridSearchCV) test accuracy: {:.2f}%'.format(RFGridTestAcc*100))

## Matrice de confusion Test

In [None]:
# Row-normalized confusion matrix of the random forest on the test set.
cm2 = confusion_matrix(y_test, y_predict_test2, normalize='true')

In [None]:
# Annotate each cell of the random forest test confusion matrix with its
# role, its raw count and its row-normalized share, then draw it as a heatmap.
cell_roles = ['Vrai Négatif (VN)', 'Faux Positif (FP)','Faux Négatif (FN)', 'Vrai Positif (VP)']
raw_counts = confusion_matrix(y_test, y_predict_test2, normalize=None).flatten()
row_shares = confusion_matrix(y_test, y_predict_test2, normalize='true').flatten()

box_labels = np.asarray([
    f"{role}\n{count:0.0f}\n{share:.2%}"
    for role, count, share in zip(cell_roles, raw_counts, row_shares)
]).reshape(cm2.shape)

plt.figure(figsize=(8,5), dpi=100)
plt.title("Matrice de confusion (test dataset)")

class_ticks = ['Non', 'Oui']
sns.heatmap(cm2,
            annot=box_labels,
            fmt='',
            cmap='Blues',
            vmin=0.0,
            vmax=1.0,
            xticklabels=class_ticks,
            yticklabels=class_ticks)

plt.xlabel("Valeurs prédites")
plt.ylabel("Valeurs réelles")

plt.show()

# Training model choice

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import mean_squared_error

## Logistic Regression

In [None]:
# Logistic regression baseline.
# The 'saga' solver only converges reliably when features share a similar
# scale, so the model is fitted and evaluated on the standardized features
# (as the other models in this section are) instead of the raw ones.
LRclassifier = LogisticRegression(solver='saga', max_iter=500, random_state=1)
LRclassifier.fit(scaled_X_train, y_train)

y_pred = LRclassifier.predict(scaled_X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Test-set accuracy, kept for the model comparison table.
LRAcc = accuracy_score(y_test, y_pred)
print('LR accuracy: {:.2f}%'.format(LRAcc*100))

## K-Nearest Neighbour (KNN)

In [None]:
# Sweep k = 1..20 for k-nearest neighbours and record the test accuracy.
# The classifier is fitted on standardized features, so it must be scored on
# the standardized test set too; scoring on the raw X_test compared points
# living on different scales.
scoreListknn = []
for i in range(1,21):
    KNclassifier = KNeighborsClassifier(n_neighbors = i)
    KNclassifier.fit(scaled_X_train, y_train)
    scoreListknn.append(KNclassifier.score(scaled_X_test, y_test))

plt.plot(range(1,21), scoreListknn)
plt.xticks(np.arange(1,21,1))
plt.xlabel("K value")
plt.ylabel("Score")
plt.show()

# Best accuracy over the sweep. It is selected on the test set, so it is an
# optimistic estimate.
KNAcc = max(scoreListknn)
print("KNN best accuracy: {:.2f}%".format(KNAcc*100))

## Support Vector Machine (SVM)

In [None]:
# RBF-kernel support vector classifier trained on the standardized features.
SVCclassifier = SVC(kernel='rbf', max_iter=500)
SVCclassifier.fit(scaled_X_train, y_train)

# Predict on the standardized test set: the model never saw raw-scale
# features, so predicting on the unscaled X_test gave meaningless results.
y_pred = SVCclassifier.predict(scaled_X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Test-set accuracy, kept for the model comparison table.
SVCAcc = accuracy_score(y_test, y_pred)
print('SVC accuracy: {:.2f}%'.format(SVCAcc*100))

## Gaussian NB

In [None]:
# Gaussian naive Bayes trained on the standardized features.
NBclassifier2 = GaussianNB()
NBclassifier2.fit(scaled_X_train, y_train)

# Predict on the standardized test set so that train and test inputs share
# the same scale (the raw X_test was used before).
y_pred = NBclassifier2.predict(scaled_X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Test-set accuracy, kept for the model comparison table.
NBAcc = accuracy_score(y_test, y_pred)
print('Gaussian Naive Bayes accuracy: {:.2f}%'.format(NBAcc*100))

## Decision Tree

In [None]:
# Sweep max_leaf_nodes = 2..20 for a decision tree and record test accuracy.
# The tree's split thresholds are learned on standardized features, so it is
# scored on the standardized test set (the raw X_test was used before).
# random_state is fixed so that re-running the cell gives the same curve.
scoreListDT = []
for i in range(2,21):
    DTclassifier = DecisionTreeClassifier(max_leaf_nodes=i, random_state=1)
    DTclassifier.fit(scaled_X_train, y_train)
    scoreListDT.append(DTclassifier.score(scaled_X_test, y_test))

plt.plot(range(2,21), scoreListDT)
plt.xticks(np.arange(2,21,1))
plt.xlabel("Leaf")
plt.ylabel("Score")
plt.show()

# Best accuracy over the sweep. It is selected on the test set, so it is an
# optimistic estimate.
DTAcc = max(scoreListDT)
print("Decision Tree Accuracy: {:.2f}%".format(DTAcc*100))

## Random Forest

In [None]:
# Sweep max_leaf_nodes = 2..24 for a 1000-tree random forest and record the
# test accuracy. The forest is fitted on standardized features, so it is
# scored on the standardized test set (the raw X_test was used before).
scoreListRF = []
for i in range(2,25):
    RFclassifier = RandomForestClassifier(n_estimators = 1000, random_state = 1, max_leaf_nodes=i)
    RFclassifier.fit(scaled_X_train, y_train)
    scoreListRF.append(RFclassifier.score(scaled_X_test, y_test))

plt.plot(range(2,25), scoreListRF)
plt.xticks(np.arange(2,25,1))
plt.xlabel("RF Value")
plt.ylabel("Score")
plt.show()

# Best accuracy over the sweep. It is selected on the test set, so it is an
# optimistic estimate.
RFAcc = max(scoreListRF)
print("Random Forest Accuracy:  {:.2f}%".format(RFAcc*100))

## Model Comparison

In [None]:
# Summary table of the test accuracies (in %), highest first.
model_names = ['Logistic Regression', 'K Neighbors', 'SVM',
               'Gaussian NB', 'Decision Tree', 'Random Forest']
accuracies_pct = [acc * 100 for acc in (LRAcc, KNAcc, SVCAcc, NBAcc, DTAcc, RFAcc)]

compare = pd.DataFrame({'Model': model_names, 'Accuracy': accuracies_pct})
compare.sort_values(by='Accuracy', ascending=False)

In [None]:
# Rebuild the random forest with the max_leaf_nodes value that scored best in
# the sweep stored in scoreListRF (candidates 2..24), instead of relying on
# whatever value the leaked loop variable `i` happened to hold.
best_max_leaf_nodes = range(2, 25)[int(np.argmax(scoreListRF))]
model = RandomForestClassifier(n_estimators = 1000, random_state = 1, max_leaf_nodes=best_max_leaf_nodes)

In [None]:
# Fit the forest on the standardized training set and predict on the
# standardized test set. Predictions go to their own variable so that `model`
# remains the fitted estimator and the cell can be re-run.
model.fit(scaled_X_train, y_train)
y_pred_model = model.predict(scaled_X_test)

# Pickle

- On garde le RandomForestClassifier du GridSearchCV

In [None]:
# The retained model is the fitted estimator itself, not its predictions on
# X_test; this is the object serialized afterwards.
model_retenu = gridSearchCV2.best_estimator_

In [None]:
import pickle

# Serialize the fitted random forest selected by the grid search.
# The estimator is dumped directly: dumping its predictions would write an
# array of labels rather than a reusable model.
# NOTE: only load this file from a trusted source; unpickling can execute
# arbitrary code.
with open('model_Final.pkl', 'wb') as model_file:
    pickle.dump(gridSearchCV2.best_estimator_, model_file)

In [None]:
# Display the held-out test features.
X_test

In [None]:
# Hand-written single sample with 11 feature values.
# NOTE(review): presumably in the same column order as X; confirm.
test=[[0, 1, 2, 1, 0, 6344443, 4754440, 130, 360, 1, 1]]
#test=model.predict(test)
# `test` is rebound from the input row to the predicted label array.
test=gridSearchCV2.best_estimator_.predict(test)
test