## Part 12 - Naives Bayes

In [140]:
# Chargement des librairies nécessaires
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [141]:
# Chargement de la dataset
## Choix de 4 datasets, ne pas oublier de choisir la features_list adéquate

#dataset = pd.read_csv("dataset_CV_labelise_features1_100.csv", delimiter = ";", encoding = "utf-8") ## 100CV + 4 features numériques

#dataset = pd.read_csv("dataset_CV_labelise_features1_200.csv", delimiter = ";", encoding = "utf-8") ## 200CV + 4 features numériques

#dataset = pd.read_csv("dataset_CV_labelise_features2_100.csv", delimiter = ";", encoding = "utf-8") ## 100CV + 12 features numériques

dataset = pd.read_csv("dataset_CV_labelise_features2_200.csv", delimiter = ";", encoding = "utf-8") ## 200CV + 12 features numériques

In [142]:
# Visualisation des valeurs 'Null' dans la dataset
display(100*dataset.isnull().sum()/dataset.shape[0])

CV_Sentences                  0.000000
Sentences_CV_clean            0.035753
CV_Number                     0.000000
Sentence_line                 0.000000
Nb_tokens                     0.000000
%texte_lu                     0.000000
%texte_lu_fin_ligne           0.000000
Is_alpha                      0.000000
Grammar                       0.000000
Tokenization                  0.035753
Verb_count                    0.000000
Propn_count                   0.000000
Noun_count                    0.000000
Num_count                     0.000000
Pourcentage_verb_sentence     0.000000
Pourcentage_propn_sentence    0.000000
Pourcentage_noun_sentence     0.000000
Pourcentage_num_sentence      0.000000
Label                         0.000000
dtype: float64

Prepocessing

In [143]:
# Vérifier des valeurs de la colonne label
dataset["Label"].unique()

array([1, 0], dtype=int64)

In [144]:
# Suppression des lignes de CV_Sentences avec les '#NOM?'
dataset = dataset.loc[(dataset['CV_Sentences'] != "#NOM?"),:]
dataset.shape

(5416, 19)

In [145]:
# Suppression des lignes de CV_Sentences avec ':'
dataset = dataset.loc[(dataset['CV_Sentences'] != ":"),:]
dataset.shape

(5412, 19)

In [146]:
# Suppresion des lignes sans valeur (avec NaN)
dataset = dataset.dropna(axis =0, how = 'any')
dataset.shape

(5410, 19)

In [147]:
# Vérification de la présence des valeurs 'Null' dans la dataset
display(100*dataset.isnull().sum()/dataset.shape[0])

CV_Sentences                  0.0
Sentences_CV_clean            0.0
CV_Number                     0.0
Sentence_line                 0.0
Nb_tokens                     0.0
%texte_lu                     0.0
%texte_lu_fin_ligne           0.0
Is_alpha                      0.0
Grammar                       0.0
Tokenization                  0.0
Verb_count                    0.0
Propn_count                   0.0
Noun_count                    0.0
Num_count                     0.0
Pourcentage_verb_sentence     0.0
Pourcentage_propn_sentence    0.0
Pourcentage_noun_sentence     0.0
Pourcentage_num_sentence      0.0
Label                         0.0
dtype: float64

Application du modèle Naive Bayes

In [148]:
print("Separating labels from features...")

## Choisir la features_list par rapport au dataset
#features_list = ["CV_Sentences","Sentences_CV_clean","CV_Number", "Is_alpha", "Grammar", "Label"]                      ## Pour les datasets features1
features_list = ["CV_Sentences","Sentences_CV_clean","CV_Number", "Is_alpha", "Label", "Grammar", "Tokenization"]       ## Pour les datasets features2
target_variable = "Label"

X = dataset.drop(features_list, axis = 1)
y = dataset.loc[:,target_variable]

print('y : ')
print(y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
y : 
0    1
1    0
2    0
3    0
4    0
Name: Label, dtype: int64

X :
   Sentence_line  Nb_tokens  %texte_lu  %texte_lu_fin_ligne  Verb_count  \
0              0          6       2.80                97.20           1   
1              1          1       3.27                96.73           0   
2              2          8       7.01                92.99           0   
3              3         24      18.22                81.78           5   
4              4          1      18.69                81.31           0   

   Propn_count  Noun_count  Num_count  Pourcentage_verb_sentence  \
0            3           1          1                   0.166667   
1            0           1          0                   0.000000   
2            1           2          0                   0.000000   
3            1           5          1                   0.208333   
4            0           1          0                   0.000000   

   Pourcentage_propn_sentence  Pou

In [149]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

Found numeric features  ['Sentence_line', 'Nb_tokens', '%texte_lu', '%texte_lu_fin_ligne', 'Verb_count', 'Propn_count', 'Noun_count', 'Num_count', 'Pourcentage_verb_sentence', 'Pourcentage_propn_sentence', 'Pourcentage_noun_sentence', 'Pourcentage_num_sentence']
Found categorical features  []


In [150]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42, stratify= y)

In [151]:
# Create pipeline for numeric features
numeric_transformer = SimpleImputer(strategy='mean')

In [152]:
# Create pipeline for categorical features
categorical_transformer = OneHotEncoder(drop='first')

In [153]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [154]:
# Perform grid search
print("Grid search...")
classifier = RandomForestClassifier()

# Grid of values to be tested
params = {
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 5],
    'min_samples_split': [2, 4, 8],
    'n_estimators': [10, 20, 40, 60, 80, 100]
}
gridsearch = GridSearchCV(classifier, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)


Grid search...
...Done.
Best hyperparameters :  {'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 40}
Best validation accuracy :  0.9738910467065806


In [155]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gridsearch.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[0 0 0 ... 0 0 0]



In [156]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[0 0 0 ... 0 0 0]



In [157]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [158]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [159]:
y_pred  =  classifier.predict(X_test)

In [160]:
y_test

5009    0
2701    0
3437    0
2990    0
2896    0
       ..
2726    0
1138    0
3130    0
5401    0
828     0
Name: Label, Length: 1082, dtype: int64

In [161]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, Y_test_pred)
ac = accuracy_score(y_test, Y_test_pred)

print("Confusion matrix :")
print(cm)
print("Accuracy score :") 

Confusion matrix :
[[1033    7]
 [  24   18]]
Accuracy score :
0.9713493530499075


In [162]:
from sklearn.metrics import accuracy_score, f1_score

In [163]:
print("f1-score on training set : ", f1_score(y_train, Y_train_pred))
print("f1-score on test set : ", f1_score(y_test, Y_test_pred))
print()

f1-score on training set :  0.7430555555555556
f1-score on test set :  0.5373134328358209



Fin Part 12