In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
import pickle # pour générer le modèle à déployer dans l'application


In [2]:
# lire la base de données
banktrain = pd.read_csv('/home/bennis/Téléchargements/train_u6lujuX_CVtuZ9i.csv')
banktrain.head(15)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [3]:
banktrain.shape

(614, 13)

In [4]:
banktrain.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [5]:
# On va d'abord nettoyer le dataset en recensant les valeurs manquantes NaN 
# Dans le dataset, nous allons donc utiliser la méthode sort_values

banktrain.isnull().sum().sort_values(ascending=False)

Credit_History       50
Self_Employed        32
LoanAmount           22
Dependents           15
Loan_Amount_Term     14
Gender               13
Married               3
Loan_ID               0
Education             0
ApplicantIncome       0
CoapplicantIncome     0
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
# On voit que l'on a beaucoup de valeurs manquantes, qu'elles soient numériques ou catégorielles
# On a donc intérêt à bien les dissocier

categorie=[]
numerik=[]
for i,j in enumerate(banktrain.dtypes):
    if j==object:
        categorie.append(banktrain.iloc[:,i])
    else :
        numerik.append(banktrain.iloc[:,i])

In [7]:
# On va réorganiser la liste categorie en matrice avec des données visibles et exploitables
# de manière à renseigner les valeurs manquantes

In [8]:
categorie=pd.DataFrame(categorie)

In [9]:
categorie.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,604,605,606,607,608,609,610,611,612,613
Loan_ID,LP001002,LP001003,LP001005,LP001006,LP001008,LP001011,LP001013,LP001014,LP001018,LP001020,...,LP002959,LP002960,LP002961,LP002964,LP002974,LP002978,LP002979,LP002983,LP002984,LP002990
Gender,Male,Male,Male,Male,Male,Male,Male,Male,Male,Male,...,Female,Male,Male,Male,Male,Female,Male,Male,Male,Female
Married,No,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,No
Dependents,0,1,0,0,0,2,0,3+,2,1,...,1,0,1,2,0,0,3+,1,2,0
Education,Graduate,Graduate,Graduate,Not Graduate,Graduate,Graduate,Not Graduate,Graduate,Graduate,Graduate,...,Graduate,Not Graduate,Graduate,Not Graduate,Graduate,Graduate,Graduate,Graduate,Graduate,Graduate
Self_Employed,No,No,Yes,No,No,Yes,No,No,No,No,...,No,No,No,No,No,No,No,No,No,Yes
Property_Area,Urban,Rural,Urban,Urban,Urban,Urban,Urban,Semiurban,Urban,Semiurban,...,Semiurban,Urban,Semiurban,Rural,Rural,Rural,Rural,Urban,Urban,Semiurban
Loan_Status,Y,N,Y,Y,Y,Y,Y,N,Y,N,...,Y,N,Y,Y,Y,Y,Y,Y,Y,N


In [10]:
# En transposant la matrice ci-dessus, nous obtiendrons les catégories en colonnes

In [11]:
categorie=categorie.transpose()

In [12]:
categorie.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,Urban,Y
4,LP001008,Male,No,0,Graduate,No,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,Semiurban,N


In [13]:
# Nous allons procéder de même avec les valeurs numériques

In [14]:
numerik=pd.DataFrame(numerik)
numerik=numerik.transpose()

In [15]:
numerik.head(10)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,5849.0,0.0,,360.0,1.0
1,4583.0,1508.0,128.0,360.0,1.0
2,3000.0,0.0,66.0,360.0,1.0
3,2583.0,2358.0,120.0,360.0,1.0
4,6000.0,0.0,141.0,360.0,1.0
5,5417.0,4196.0,267.0,360.0,1.0
6,2333.0,1516.0,95.0,360.0,1.0
7,3036.0,2504.0,158.0,360.0,0.0
8,4006.0,1526.0,168.0,360.0,1.0
9,12841.0,10968.0,349.0,360.0,1.0


In [16]:
# On peut remplacer les valeurs manquantes par les valeurs qui se répètent le plus

In [17]:
categorie['Married'].value_counts()

Yes    398
No     213
Name: Married, dtype: int64

In [18]:
categorie=categorie.apply(lambda x:x.fillna(x.value_counts().index[0]))
categorie.isnull().any()

Loan_ID          False
Gender           False
Married          False
Dependents       False
Education        False
Self_Employed    False
Property_Area    False
Loan_Status      False
dtype: bool

In [19]:
categorie.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,Urban,Y
4,LP001008,Male,No,0,Graduate,No,Urban,Y


In [20]:
# On peut supprimer la colonne des Loan_ID
categorie.drop('Loan_ID',axis=1,inplace=True)

In [21]:
numerik.fillna(method='bfill',inplace=True)
numerik.isnull().any()

ApplicantIncome      False
CoapplicantIncome    False
LoanAmount           False
Loan_Amount_Term     False
Credit_History       False
dtype: bool

In [22]:
# La variable Loan_status est la colonne target (Crédit accepté ou pas)
# Pour l'exploiter, nous allons les remplacer par des 0 et des 1

In [23]:
categorie['Loan_Status']

0      Y
1      N
2      Y
3      Y
4      Y
      ..
609    Y
610    Y
611    Y
612    Y
613    N
Name: Loan_Status, Length: 614, dtype: object

In [24]:
categorie['Loan_Status']=categorie['Loan_Status'].replace(['Y','N'],['1','0'])

In [25]:
categorie['Loan_Status']

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: object

In [26]:
target=categorie['Loan_Status']

In [27]:
# On va devoir concaténer les datas categorie et numerik dans un seul database

In [28]:
label=LabelEncoder() 
for i in categorie:
    categorie[i]=label.fit_transform(categorie[i])
categorie

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,1,0,0,0,0,2,1
1,1,1,1,0,0,0,0
2,1,1,0,0,1,2,1
3,1,1,0,1,0,2,1
4,1,0,0,0,0,2,1
...,...,...,...,...,...,...,...
609,0,0,0,0,0,0,1
610,1,1,3,0,0,0,1
611,1,1,1,0,0,2,1
612,1,1,2,0,0,2,1


In [29]:
X=pd.concat([categorie,numerik],axis=1)
y=target

In [30]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,1,0,0,0,0,2,1,5849.0,0.0,128.0,360.0,1.0
1,1,1,1,0,0,0,0,4583.0,1508.0,128.0,360.0,1.0
2,1,1,0,0,1,2,1,3000.0,0.0,66.0,360.0,1.0
3,1,1,0,1,0,2,1,2583.0,2358.0,120.0,360.0,1.0
4,1,0,0,0,0,2,1,6000.0,0.0,141.0,360.0,1.0


In [31]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: Loan_Status, dtype: object

In [32]:
target.value_counts()

1    422
0    192
Name: Loan_Status, dtype: int64

In [33]:
# A première vue, il y a plus de crédits acceptés que de crédits refusés 

In [35]:
S=StratifiedShuffleSplit(n_splits=1,test_size=0,2,random_state=42)


SyntaxError: positional argument follows keyword argument (2381291618.py, line 1)