<img src="img/banniere.jpg" />

# Imports

In [1]:
# builtin
import os

# data
import pandas as pd
import numpy as np

# preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
# estimators
from sklearn.linear_model import LogisticRegression

#MOdele selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


# options 
import warnings
warnings.filterwarnings('ignore')

In [2]:
# import des données
df = pd.read_csv("datas/billets_complet.csv", sep = ',')

## Cleaning

In [3]:
df.is_genuine = LabelEncoder().fit_transform(df['is_genuine'])

## Data preparation

In [4]:
X = df.drop(columns='is_genuine')
y = df.is_genuine
# visualisation
X.head()

Unnamed: 0,diagonal,height_left,height_right,margin_low,margin_up,length
0,171.81,104.86,104.95,4.52,2.89,112.83
1,171.46,103.36,103.66,3.77,2.99,113.09
2,172.69,104.48,103.5,4.4,2.94,113.16
3,171.36,103.91,103.94,3.62,3.01,113.51
4,171.73,104.28,103.46,4.04,3.48,112.54


In [5]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: is_genuine, dtype: int64

## Test Train Split : Avec normalisation

In [6]:
X_train, X_test,y_train, y_test = train_test_split(X,
                                                  y,
                                                  test_size=0.3,
                                                  random_state=42)

In [7]:
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Modelisation et recherche des hyper-parametre

In [8]:
estimator = LogisticRegression()

params = { 
    'C' : np.logspace(-3,3,7) ,
    'penalty' : ['l1','l2'],
    'solver' : ['newton-cg','lbfgs','liblinear']
         }

In [9]:
grid = GridSearchCV(estimator,
                    params,
                    cv=10, # nb folds
                    n_jobs=-1, #cpu
                    return_train_score=True,
                    verbose=1)
grid.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 42 candidates, totalling 420 fits


GridSearchCV(cv=10, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'penalty': ['l1', 'l2'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear']},
             return_train_score=True, verbose=1)

In [10]:
# Meilleurs hyperparametres
best_params = grid.best_params_
best_params

{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}

## Régression logistique avec les meilleurs hyper-parametre

In [11]:
reg_log = LogisticRegression(**best_params)
reg_log

LogisticRegression(C=0.1, penalty='l1', solver='liblinear')

In [12]:
# Prediction
reg_log.fit(X_train, y_train)
y_pred = reg_log.predict(X_test)
y_pred

array([0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,

In [13]:
y_prob = reg_log.predict_proba(X_test).round(2)
# Vecteur de probabilté d'appartenir à une classe
y_prob[:10]

array([[0.9 , 0.1 ],
       [0.98, 0.02],
       [0.02, 0.98],
       [0.06, 0.94],
       [0.01, 0.99],
       [0.06, 0.94],
       [0.84, 0.16],
       [0.06, 0.94],
       [0.85, 0.15],
       [0.05, 0.95]])

## Test sur un echantillon

In [14]:
# import du df de test
new = pd.read_csv('datas/billets_production.csv')
# new.head()

In [15]:
# Variables explicatives
X_new = new.drop(columns='id')
# X_new.head()

In [16]:
# variable a expliquer
y_pred_new = reg_log.predict(X_new)
# y_pred_new

In [17]:
# On ajoute le resutat dans un colonne
new["Prediction"] = y_pred_new
# new.head()

In [18]:
# Calcul des probabilités d'appartenir à une classe
proba_new = reg_log.predict_proba(X_new)
# proba_new

In [19]:
# , 0 = faux billet, 1 = true = vrai billet
new['Proba_Faux'] = proba_new[:,0].round(4)*100
new['Proba_Vrai'] = proba_new[:,1].round(4)*100
new.head()

Unnamed: 0,diagonal,height_left,height_right,margin_low,margin_up,length,id,Prediction,Proba_Faux,Proba_Vrai
0,171.76,104.01,103.54,5.21,3.3,111.42,A_1,0,93.04,6.96
1,171.87,104.17,104.13,6.0,3.31,112.09,A_2,0,99.62,0.38
2,172.0,104.58,104.29,4.99,3.39,111.57,A_3,0,88.72,11.28
3,172.49,104.55,104.34,4.44,3.03,113.2,A_4,1,12.08,87.92
4,171.65,103.63,103.56,3.77,3.16,113.33,A_5,1,1.46,98.54


In [20]:
for j, i, k in zip(new.index, new["Prediction"], new["id"]):
    if i == True : 
       print("Le billet {}".format(k),"semble vrai avec une probabilité de "
             + str(new.iloc[j,9]),"%")
    else : 
        print("Le billet {}".format(k),"semble faux avec une probabilité de "
             + str(new.iloc[j,8]),"%")

Le billet A_1 semble faux avec une probabilité de 93.04 %
Le billet A_2 semble faux avec une probabilité de 99.62 %
Le billet A_3 semble faux avec une probabilité de 88.72 %
Le billet A_4 semble vrai avec une probabilité de 87.92 %
Le billet A_5 semble vrai avec une probabilité de 98.54 %


In [22]:
# export
new.to_csv('resultats/monmodel_1_avecCR.csv',index=False)

In [None]:
# test = pd.read_csv('resultats/monmodel_1.csv')
# test.head()