## Part 11 _ Naives Bayes

In [41]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [42]:
# Chargement de la dataset
dataset = pd.read_csv("dataset_CV_labelise_features1_100.csv", delimiter = ";")
dataset.head()

Unnamed: 0,CV_Sentences,Sentences_CV_clean,CV_Number,Sentence_line,Nb_tokens,%texte_lu,%texte_lu_fin_ligne,Is_alpha,Grammar,Label
0,SELMA LAFKIR CORDE 80 CODEUSE ENTHOUSIASTE,SELMA LAFKIR CORDE 80 CODEUSE ENTHOUSIASTE,CV_1,0,6,2.8,97.2,"[True, True, True, False, True, True]","['NOUN', 'PROPN', 'VERB', 'NUM', 'PROPN', 'PRO...",1
1,PROFIL,PROFIL,CV_1,1,1,3.27,96.73,[True],['NOUN'],0
2,PERSONNEL Je suis étudiante au lycée Condorcet.,PERSONNEL Je suis étudiante au lycée Condorcet.,CV_1,2,8,7.01,92.99,"[True, True, True, True, True, True, True, False]","['NOUN', 'PRON', 'AUX', 'ADJ', 'ADP', 'NOUN', ...",0
3,Je code depuis l'âge de 13 ans et j'aime créer...,Je code depuis l'âge de 13 ans et j'aime créer...,CV_1,3,24,18.22,81.78,"[True, True, True, False, True, True, False, T...","['PRON', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP', ...",0
4,RÉALISATIONS,RÉALISATIONS,CV_1,4,1,18.69,81.31,[True],['NOUN'],0


In [43]:
display(100*dataset.isnull().sum()/dataset.shape[0])

CV_Sentences           0.000000
Sentences_CV_clean     0.034507
CV_Number              0.000000
Sentence_line          0.000000
Nb_tokens              0.000000
%texte_lu              0.000000
%texte_lu_fin_ligne    0.000000
Is_alpha               0.000000
Grammar                0.000000
Label                  0.000000
dtype: float64

In [44]:
dataset["Label"].unique()

array([1, 0], dtype=int64)

In [45]:
dataset = dataset.loc[(dataset['CV_Sentences'] != "#NOM?"),:]

In [46]:
dataset.shape 

(2819, 10)

In [47]:
dataset = dataset.loc[(dataset['CV_Sentences'] != ":"),:]

In [48]:
dataset.shape

(2816, 10)

In [49]:
dataset = dataset.dropna(axis =0, how = 'any')

In [50]:
dataset.shape

(2815, 10)

In [51]:
display(100*dataset.isnull().sum()/dataset.shape[0])

CV_Sentences           0.0
Sentences_CV_clean     0.0
CV_Number              0.0
Sentence_line          0.0
Nb_tokens              0.0
%texte_lu              0.0
%texte_lu_fin_ligne    0.0
Is_alpha               0.0
Grammar                0.0
Label                  0.0
dtype: float64

In [52]:
print("Separating labels from features...")
col_list = ['CV_Sentences',"Sentences_CV_clean","CV_Number", "Is_alpha", "Grammar", "Label"]
target_variable = "Label"

X = dataset.drop(col_list, axis = 1)
y = dataset.loc[:,target_variable]

print('y : ')
print(y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
y : 
0    1
1    0
2    0
3    0
4    0
Name: Label, dtype: int64

X :
   Sentence_line  Nb_tokens  %texte_lu  %texte_lu_fin_ligne
0              0          6       2.80                97.20
1              1          1       3.27                96.73
2              2          8       7.01                92.99
3              3         24      18.22                81.78
4              4          1      18.69                81.31


In [53]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['Sentence_line', 'Nb_tokens', '%texte_lu', '%texte_lu_fin_ligne']
Found categorical features  []


In [54]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42, stratify= y)

In [55]:
# Create pipeline for numeric features
numeric_transformer = SimpleImputer(strategy='mean')

In [56]:
# Create pipeline for categorical features
categorical_transformer = OneHotEncoder(drop='first')

In [57]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [58]:
# Perform grid search
print("Grid search...")
classifier = RandomForestClassifier()

# Grid of values to be tested
params = {
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 5],
    'min_samples_split': [2, 4, 8],
    'n_estimators': [10, 20, 40, 60, 80, 100]
}
gridsearch = GridSearchCV(classifier, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 80}
Best validation accuracy :  0.9777990827045421


In [59]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]



In [60]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0

In [61]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [62]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [63]:
y_pred  =  classifier.predict(X_test)

In [64]:
y_test

1045    0
1024    0
217     0
2670    1
2428    0
       ..
1009    0
2742    1
1722    0
806     0
1209    0
Name: Label, Length: 563, dtype: int64

In [65]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, Y_test_pred)
ac = accuracy_score(y_test, Y_test_pred)

In [66]:
cm

array([[533,   6],
       [ 11,  13]], dtype=int64)

In [67]:
ac

0.9698046181172292

In [68]:
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay

In [69]:
print("f1-score on training set : ", f1_score(y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(y_test, Y_test_pred))
print()

f1-score on training set :  0.7065868263473053
f1-score on test set :  0.6046511627906976

