# **Classifieur n°1 : pertinence globale**

## **Importation**

In [66]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My\ Drive/'Text mining'

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/Text mining


In [0]:
#!pip install -U spacy
#!python -m spacy download fr_core_news_md

In [68]:
#from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import spacy
import os
from collections import defaultdict, Counter
import numpy as np
import re
import glob
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize.regexp import WordPunctTokenizer
from nltk.stem import SnowballStemmer
nltk.download('stopwords')
sw=stopwords.words("french")
sw += ['être','avoir']
nlp = spacy.load('fr_core_news_md')

#chemin=r"C:\Users\HP\Documents\Claire\TEXTMINING\pukanina-chatbot-master\pukanina-chatbot-master\Data\Q&A.csv"
#QA=pd.read_csv(chemin,sep=';')
QA = pd.read_csv('Data/Q&A.csv',sep=";")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **Pré-traitement**

In [0]:
#Définition des fonctions de prétraitement du texte
def lemmatise_text(text):
  tw_nlp = nlp(text)
  list_lem = [token.lemma_ for token in tw_nlp]
  text_lem = ' '.join(list_lem)
  return text_lem

def stem_text(text):
  tokenizer = WordPunctTokenizer()
  stemmer = SnowballStemmer('french')
  liste_racines = [stemmer.stem(token) for token in tokenizer.tokenize(text)]
  return ' '.join(liste_racines)

def normalise(text):
  #stop words, strip accent et lowercase vont être fait automatiquement
  text = text.replace('\n','').replace('\r','').split(" ")
  text = " ".join([i for i in text if i!=""])
  lemmas = lemmatise_text(text) #lemme de notre texte
  stems = stem_text(lemmas) #stem de notre texte A VOIR
  return stems

In [0]:
#Corpus_conv
f = open("Data/OpenSubtitles.fr-is.fr", mode='r', encoding='utf-8')
corpus_conv = f.readlines()[:156]
themes_corpus_conv = ['autres']*len(corpus_conv)

#Rassemblement questions et réponses pour créer un corpus = liste de documents
corpus_QA = list(QA.Answers) + list(QA.Questions)
themes_corpus_QA = ['metier']*len(corpus_QA)

#Création du corpus pour le classifieur de pertinence globale
corpus1 = corpus_conv + corpus_QA
themes1 = themes_corpus_conv + themes_corpus_QA

#Normalisation du corpus
collection1 = pd.Series(corpus1)
collection1 = collection1.apply(normalise)

In [0]:
#Création train et validation
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(collection1,
                                                      themes1,
                                                      train_size=0.75,
                                                      random_state=5)

In [72]:
#Création du vectorizer bin sur X_train : BINAIRE
vectorizer = CountVectorizer(lowercase=True,
                             stop_words=sw,
                             strip_accents='unicode',
                             binary=True).fit(X_train)
X_train_vectorized_bin = vectorizer.transform(X_train)
X_valid_vectorized_bin = vectorizer.transform(X_valid)

  'stop_words.' % sorted(inconsistent))


In [73]:
#Création du vectorizer count sur X_train : COUNT
vectorizer = CountVectorizer(lowercase=True,
                             stop_words=sw,
                             strip_accents='unicode',
                             binary=False).fit(X_train)
X_train_vectorized_count = vectorizer.transform(X_train)
X_valid_vectorized_count = vectorizer.transform(X_valid)

  'stop_words.' % sorted(inconsistent))


In [74]:
#Création du vectorizer tfidf sur X_train : TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words=sw,
                             strip_accents='unicode',
                             max_df=0.5).fit(X_train)
X_train_vectorized_tfidf = vectorizer.transform(X_train)
X_valid_vectorized_tfidf = vectorizer.transform(X_valid)

  'stop_words.' % sorted(inconsistent))


In [0]:
resultat_binaire = {}
resultat_count = {}
resultat_tfidf = {}

## **SVM**
**Vectorizer Binaire**

In [0]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn import model_selection
from sklearn.svm import SVC

param_grid = {'C': np.arange(0.1,10,0.1),'gamma':np.arange(0.009,1,0.1)} 
mvs = SVC(kernel = 'rbf')
grid = model_selection.GridSearchCV(estimator=mvs,param_grid=param_grid,cv = 4, scoring='accuracy', n_jobs = -1)

In [77]:
#Recherche des meilleurs paramètres
grille = grid.fit(X_train_vectorized_bin, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_bin)
predictions_train = grille.predict(X_train_vectorized_bin)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['SVM']=accuracy_score(y_valid, predictions_valid)

{'C': 2.2, 'gamma': 0.109} 0.9060490940970193
0.9914529914529915 0.8717948717948718


**Vectorizer count**

In [78]:
#Recherche des meilleurs paramètres
grille = grid.fit(X_train_vectorized_count, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_count)
predictions_train = grille.predict(X_train_vectorized_count)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['SVM']=accuracy_score(y_valid, predictions_valid)

{'C': 5.6, 'gamma': 0.109} 0.9060490940970193
1.0 0.8846153846153846


**Vectorizer tfidf**

In [79]:
#Recherche des meilleurs paramètres
grille = grid.fit(X_train_vectorized_tfidf, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_tfidf)
predictions_train = grille.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['SVM']=accuracy_score(y_valid, predictions_valid)

{'C': 4.8, 'gamma': 0.20900000000000002} 0.9060490940970193
1.0 0.8974358974358975


## **Régression logistique**
**Vectorizer Binaire**

In [0]:
from sklearn.linear_model import LogisticRegression

param_grid = {'C': np.arange(0.5,5,0.1)} 
mlr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter = 1500)
grid = model_selection.GridSearchCV(estimator=mlr,param_grid=param_grid,cv = 4, scoring='accuracy', n_jobs = -1)

In [81]:
#Recherche des meilleurs paramètres
grille = grid.fit(X_train_vectorized_bin, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_bin)
predictions_train = grille.predict(X_train_vectorized_bin)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['log']=accuracy_score(y_valid, predictions_valid)

{'C': 3.2999999999999994} 0.8847165400350672
1.0 0.8461538461538461


**Vectorizer count**

In [82]:
#Recherche des meilleurs paramètres
grille = grid.fit(X_train_vectorized_count, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_count)
predictions_train = grille.predict(X_train_vectorized_count)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['log']=accuracy_score(y_valid, predictions_valid)

{'C': 2.5999999999999996} 0.8847165400350672
1.0 0.8589743589743589


**Vectorizer tfidf**

In [83]:
#Recherche des meilleurs paramètres
grille = grid.fit(X_train_vectorized_tfidf, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_tfidf)
predictions_train = grille.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['log']=accuracy_score(y_valid, predictions_valid)

{'C': 3.1999999999999993} 0.8933372296902397
1.0 0.8717948717948718


## **Random Forest**
**Vectorizer Binaire**

In [0]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators=1000, oob_score = True)

In [85]:
model_rf.fit(X_train_vectorized_bin, y_train)
predictions_valid = model_rf.predict(X_valid_vectorized_bin)
predictions_train = model_rf.predict(X_train_vectorized_bin)
print(accuracy_score(y_train,predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['rf']=accuracy_score(y_valid, predictions_valid)

1.0 0.8333333333333334


**Vectorizer count**

In [86]:
model_rf.fit(X_train_vectorized_count, y_train)
predictions_valid = model_rf.predict(X_valid_vectorized_count)
predictions_train = model_rf.predict(X_train_vectorized_count)
print(accuracy_score(y_train,predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['rf']=accuracy_score(y_valid, predictions_valid)

1.0 0.8205128205128205


**Vectorizer tfidf**

In [87]:
model_rf.fit(X_train_vectorized_tfidf, y_train)
predictions_valid = model_rf.predict(X_valid_vectorized_tfidf)
predictions_train = model_rf.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train,predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['rf']=accuracy_score(y_valid, predictions_valid)

1.0 0.8461538461538461


## **Ridge Regression**
**Vectorizer Binaire**

In [0]:
from sklearn.linear_model import RidgeClassifier

param_grid = {'alpha': [0.001,0.01,0.01,0.1,1,10,100]}
mridge = RidgeClassifier(tol=0.01)
grid = model_selection.GridSearchCV(estimator=mridge,param_grid=param_grid,cv = 4, scoring='accuracy', n_jobs = -1)

In [89]:
#Recherche des meilleurs paramètres
grille = grid.fit(X_train_vectorized_bin, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_bin)
predictions_train = grille.predict(X_train_vectorized_bin)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['ridge']=accuracy_score(y_valid, predictions_valid)

{'alpha': 1} 0.841832261835184
1.0 0.8205128205128205


**Vectorizer count**

In [90]:
#Recherche des meilleurs paramètres
grille = grid.fit(X_train_vectorized_count, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_count)
predictions_train = grille.predict(X_train_vectorized_count)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['ridge']=accuracy_score(y_valid, predictions_valid)

{'alpha': 1} 0.8204266510812391
1.0 0.8589743589743589


**Vectorizer tfidf**

In [91]:
#Recherche des meilleurs paramètres
grille = grid.fit(X_train_vectorized_tfidf, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_tfidf)
predictions_train = grille.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['ridge']=accuracy_score(y_valid, predictions_valid)

{'alpha': 0.001} 0.8975745178258329
1.0 0.8846153846153846


## **Naif bayésien**
**Vectorizer binaire**

In [0]:
from sklearn.naive_bayes import MultinomialNB

In [93]:
model_nb = MultinomialNB().fit(X_train_vectorized_bin, y_train)
predictions_valid = model_nb.predict(X_valid_vectorized_bin)
predictions_train = model_nb.predict(X_train_vectorized_bin)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['naive']=accuracy_score(y_valid, predictions_valid)

0.9700854700854701 0.9102564102564102


**Vectorizer count**

In [94]:
model_nb = MultinomialNB().fit(X_train_vectorized_count, y_train)
predictions_valid = model_nb.predict(X_valid_vectorized_count)
predictions_train = model_nb.predict(X_train_vectorized_count)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['naive']=accuracy_score(y_valid, predictions_valid)

0.9401709401709402 0.8846153846153846


**Vectorizer tfidf**

In [95]:
model_nb = MultinomialNB().fit(X_train_vectorized_tfidf, y_train)
predictions_valid = model_nb.predict(X_valid_vectorized_tfidf)
predictions_train = model_nb.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['naive']=accuracy_score(y_valid, predictions_valid)

0.9871794871794872 0.9102564102564102


## **Boosting**
**Vectorizer binaire**

In [0]:
from sklearn.ensemble import GradientBoostingClassifier

param_grid = {'learning_rate':[0.01,0.1,1.0,10.0]}
mboosting = GradientBoostingClassifier()
grid = model_selection.GridSearchCV(estimator=mboosting,param_grid=param_grid,cv = 4, scoring='accuracy', n_jobs = -1)

In [97]:
grille = grid.fit(X_train_vectorized_bin, y_train)
predictions_valid = grille.predict(X_valid_vectorized_bin)
predictions_train = grille.predict(X_train_vectorized_bin)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['boosting']=accuracy_score(y_valid, predictions_valid)

0.9914529914529915 0.8076923076923077


**Vectorizer count**

In [98]:
grille = grid.fit(X_train_vectorized_count, y_train)
predictions_valid = grille.predict(X_valid_vectorized_count)
predictions_train = grille.predict(X_train_vectorized_count)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['boosting']=accuracy_score(y_valid, predictions_valid)

1.0 0.8461538461538461


**Vectorizer tfidf**

In [99]:
grille = grid.fit(X_train_vectorized_tfidf, y_train)
predictions_valid = grille.predict(X_valid_vectorized_tfidf)
predictions_train = grille.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['boosting']=accuracy_score(y_valid, predictions_valid)

1.0 0.7948717948717948


## **AdaBoostClassifier**
**Vectorizer binaire**

In [0]:
from sklearn.ensemble import AdaBoostClassifier

param_grid = {'learning_rate':[0.01,0.1,1.0,10.0], 'n_estimators':[50,60,70,80,90,100]}
madaboost = AdaBoostClassifier()
grid = model_selection.GridSearchCV(estimator=madaboost,param_grid=param_grid,cv = 4, scoring='accuracy', n_jobs = -1)

In [101]:
grille = grid.fit(X_train_vectorized_bin, y_train)
predictions_valid = grille.predict(X_valid_vectorized_bin)
predictions_train = grille.predict(X_train_vectorized_bin)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['adaboost']=accuracy_score(y_valid, predictions_valid)

0.9743589743589743 0.8333333333333334


**Vectorizer count**

In [102]:
grille = grid.fit(X_train_vectorized_count, y_train)
predictions_valid = grille.predict(X_valid_vectorized_count)
predictions_train = grille.predict(X_train_vectorized_count)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['adaboost']=accuracy_score(y_valid, predictions_valid)

0.9743589743589743 0.8333333333333334


**Vectorizer tfidf**

In [103]:
grille = grid.fit(X_train_vectorized_tfidf, y_train)
predictions_valid = grille.predict(X_valid_vectorized_tfidf)
predictions_train = grille.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['adaboost']=accuracy_score(y_valid, predictions_valid)

0.9914529914529915 0.8333333333333334


## **Restitution de l'ensemble des résultats**

In [104]:
resultat = [resultat_binaire,resultat_count,resultat_tfidf]
pd.DataFrame(resultat,index=['binaire','count','tfidf'])

Unnamed: 0,SVM,log,rf,ridge,naive,boosting,adaboost
binaire,0.871795,0.846154,0.833333,0.820513,0.910256,0.807692,0.833333
count,0.884615,0.858974,0.820513,0.858974,0.884615,0.846154,0.833333
tfidf,0.897436,0.871795,0.846154,0.884615,0.910256,0.794872,0.833333


## **Apprentissage du meilleur modèle sur l'ensemble du jeu de données**

In [105]:
#Vectoriseur et modèle à ENREGISTRER en pickle
vectorizer1 = TfidfVectorizer(lowercase=True,
                             stop_words=sw,
                             strip_accents='unicode',
                             max_df=0.5).fit(collection1)
collection_vectorized_tfidf = vectorizer1.transform(collection1)
classifieur1 = MultinomialNB().fit(collection_vectorized_tfidf, themes1)

  'stop_words.' % sorted(inconsistent))
