# **Classifieur n°2 : pertinence de thème**

## **Importation**

In [3]:
#!pip install -U spacy
#!python -m spacy download fr_core_news_md

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/text_mining/Notebooks


In [40]:
import pandas as pd
import spacy
from collections import defaultdict, Counter
import numpy as np
from sklearn.externals import joblib

#Importation pré-traitement
import nltk
from nltk.tokenize.regexp import WordPunctTokenizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import unicodedata
nlp = spacy.load('fr_core_news_md')

#Strop words
def strip_accents(texte):
  return(unicodedata.normalize('NFKD', texte).encode('ASCII', 'ignore').decode('ASCII'))
nltk.download('stopwords')
sw=stopwords.words("french")
sw += ['être','avoir','comment']
sw= [strip_accents(w) for w in sw]

#Importation modèles
from sklearn.model_selection import train_test_split
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import model_selection
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

#Importation données
QA = pd.read_csv('../Data/Q&A.csv',sep=";")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **Pré-traitements**

In [0]:
#Définition des fonctions de prétraitement du texte
def lemmatise_text(text):
  tw_nlp = nlp(text)
  list_lem = [token.lemma_ for token in tw_nlp]
  text_lem = ' '.join(list_lem)
  return text_lem

def stem_text(text):
  tokenizer = WordPunctTokenizer()
  stemmer = SnowballStemmer('french')
  liste_racines = [stemmer.stem(token) for token in tokenizer.tokenize(text)]
  return ' '.join(liste_racines)

def normalise(text):
  #stop words, strip accent et lowercase vont être fait automatiquement
  text = text.replace('\n','').replace('\r','').split(" ")
  text = " ".join([i for i in text if i!=""])
  lemmas = lemmatise_text(text) #lemme de notre texte
  stems = stem_text(lemmas) #stem de notre texte A VOIR
  return stems

In [0]:
#Normalisation du corpus
collection2 = pd.Series(list(QA.Answers) + list(QA.Questions))
collection2 = collection2.apply(normalise)
themes2 = list(QA.Themes) + list(QA.Themes)

In [0]:
#Création train et validation
X_train, X_valid, y_train, y_valid = train_test_split(collection2,
                                                      themes2,
                                                      train_size=0.75,
                                                      random_state=5)

In [0]:
#Création du vectorizer bin sur X_train : BINAIRE
vectorizer = CountVectorizer(lowercase=True,
                             stop_words=sw,
                             strip_accents='unicode',
                             binary=True).fit(X_train)
X_train_vectorized_bin = vectorizer.transform(X_train)
X_valid_vectorized_bin = vectorizer.transform(X_valid)

In [0]:
#Création du vectorizer count sur X_train : COUNT
vectorizer = CountVectorizer(lowercase=True,
                             stop_words=sw,
                             strip_accents='unicode',
                             binary=False).fit(X_train)
X_train_vectorized_count = vectorizer.transform(X_train)
X_valid_vectorized_count = vectorizer.transform(X_valid)

In [0]:
#Création du vectorizer count sur X_train : TFIDF
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words=sw,
                             strip_accents='unicode',
                             max_df=0.5).fit(X_train)
X_train_vectorized_tfidf = vectorizer.transform(X_train)
X_valid_vectorized_tfidf = vectorizer.transform(X_valid)

In [0]:
resultat_binaire = {}
resultat_count = {}
resultat_tfidf = {}

## **SVM**

**Vectorizer binaire**

In [0]:
param_grid = {'C': np.arange(0.1,10,0.1),'gamma':np.arange(0.009,1,0.1)} 
mvs = SVC(kernel = 'rbf')
grid = model_selection.GridSearchCV(estimator=mvs,param_grid=param_grid,cv = 4, scoring='accuracy', n_jobs = -1)

In [49]:
grille = grid.fit(X_train_vectorized_bin, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_bin)
predictions_train = grille.predict(X_train_vectorized_bin)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['SVM']=accuracy_score(y_valid, predictions_valid)

{'C': 8.6, 'gamma': 0.009} 0.35028449502133713
0.8609271523178808 0.35294117647058826


**Vectorizer count**

In [50]:
grille = grid.fit(X_train_vectorized_count, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_count)
predictions_train = grille.predict(X_train_vectorized_count)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['SVM']=accuracy_score(y_valid, predictions_valid)

{'C': 8.1, 'gamma': 0.009} 0.35739687055476527
0.847682119205298 0.39215686274509803


**Vectorizer tfidf**

In [51]:
grille = grid.fit(X_train_vectorized_tfidf, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_tfidf)
predictions_train = grille.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['SVM']=accuracy_score(y_valid, predictions_valid)

{'C': 4.0, 'gamma': 0.20900000000000002} 0.5168918918918919
1.0 0.5098039215686274


## **Régression Logistique**
**Vectorizer binaire**

In [0]:
param_grid = {'C': np.arange(0.5,5,0.1)} 
mlr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter = 1500)
grid = model_selection.GridSearchCV(estimator=mlr,param_grid=param_grid,cv = 4, scoring='accuracy', n_jobs = -1)

In [53]:
grille = grid.fit(X_train_vectorized_bin, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_bin)
predictions_train = grille.predict(X_train_vectorized_bin)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['log']=accuracy_score(y_valid, predictions_valid)

{'C': 2.4999999999999996} 0.5096017069701281
1.0 0.45098039215686275


**Vectorizer count**

In [54]:
grille = grid.fit(X_train_vectorized_count, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_count)
predictions_train = grille.predict(X_train_vectorized_count)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['log']=accuracy_score(y_valid, predictions_valid)

{'C': 3.599999999999999} 0.5296941678520626
1.0 0.5098039215686274


**Vectorizer tfidf**

In [55]:
grille = grid.fit(X_train_vectorized_tfidf, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_tfidf)
predictions_train = grille.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['log']=accuracy_score(y_valid, predictions_valid)

{'C': 4.899999999999999} 0.5236486486486487
1.0 0.5294117647058824


## **RandomForest**
**Vectorizer binaire**

In [0]:
model_rf = RandomForestClassifier(n_estimators=1000, oob_score = True)

In [57]:
model_rf.fit(X_train_vectorized_bin, y_train)
predictions_valid = model_rf.predict(X_valid_vectorized_bin)
predictions_train = model_rf.predict(X_train_vectorized_bin)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['rf']=accuracy_score(y_valid, predictions_valid)

1.0 0.47058823529411764


**Vectorizer count**

In [58]:
model_rf.fit(X_train_vectorized_count, y_train)
predictions_valid = model_rf.predict(X_valid_vectorized_count)
predictions_train = model_rf.predict(X_train_vectorized_count)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['rf']=accuracy_score(y_valid, predictions_valid)

1.0 0.49019607843137253


**Vectorizer tfidf**

In [59]:
model_rf.fit(X_train_vectorized_tfidf, y_train)
predictions_valid = model_rf.predict(X_valid_vectorized_tfidf)
predictions_train = model_rf.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['rf']=accuracy_score(y_valid, predictions_valid)

1.0 0.39215686274509803


## **Ridge Classifieur**
**Vectorizer binaire**

In [0]:
param_grid = {'alpha': [0.001,0.01,0.01,0.1,1,10,100]}
mridge = RidgeClassifier(tol=0.01)
grid = model_selection.GridSearchCV(estimator=mridge,param_grid=param_grid,cv = 4, scoring='accuracy', n_jobs = -1)

In [61]:
grille = grid.fit(X_train_vectorized_bin, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_bin)
predictions_train = grille.predict(X_train_vectorized_bin)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['ridge']=accuracy_score(y_valid, predictions_valid)

{'alpha': 10} 0.4767069701280227
1.0 0.5098039215686274


**Vectorizer count**

In [62]:
grille = grid.fit(X_train_vectorized_count, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_count)
predictions_train = grille.predict(X_train_vectorized_count)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['ridge']=accuracy_score(y_valid, predictions_valid)

{'alpha': 10} 0.5033783783783784
0.9933774834437086 0.5098039215686274


**Vectorizer tfidf**

In [63]:
grille = grid.fit(X_train_vectorized_tfidf, y_train)
print(grille.best_params_,grille.best_score_)
predictions_valid = grille.predict(X_valid_vectorized_tfidf)
predictions_train = grille.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['ridge']=accuracy_score(y_valid, predictions_valid)

{'alpha': 0.001} 0.5892603129445235
1.0 0.6470588235294118


## **Naif bayésien**
**Vectorizer binaire**

In [64]:
model_nb = MultinomialNB().fit(X_train_vectorized_bin, y_train)
predictions_valid = model_nb.predict(X_valid_vectorized_bin)
predictions_train = model_nb.predict(X_train_vectorized_bin)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['naive']=accuracy_score(y_valid, predictions_valid)

0.9602649006622517 0.5490196078431373


**Vectorizer count**

In [65]:
model_nb = MultinomialNB().fit(X_train_vectorized_count, y_train)
predictions_valid = model_nb.predict(X_valid_vectorized_count)
predictions_train = model_nb.predict(X_train_vectorized_count)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['naive']=accuracy_score(y_valid, predictions_valid)

0.9337748344370861 0.5882352941176471


**Vectorizer tfidf**

In [66]:
model_nb = MultinomialNB().fit(X_train_vectorized_tfidf, y_train)
predictions_valid = model_nb.predict(X_valid_vectorized_tfidf)
predictions_train = model_nb.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['naive']=accuracy_score(y_valid, predictions_valid)

0.7748344370860927 0.37254901960784315


## **Boosting**
**Vectorizer binaire**

In [0]:
param_grid = {'learning_rate':[0.01,0.1,1.0,10.0]}
mboosting = GradientBoostingClassifier()
grid = model_selection.GridSearchCV(estimator=mboosting,param_grid=param_grid,cv = 4, scoring='accuracy', n_jobs = -1)

In [68]:
grille = grid.fit(X_train_vectorized_bin, y_train)
predictions_valid = grille.predict(X_valid_vectorized_bin)
predictions_train = grille.predict(X_train_vectorized_bin)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['boosting']=accuracy_score(y_valid, predictions_valid)

0.8940397350993378 0.47058823529411764


**Vectorize count**

In [69]:
grille = grid.fit(X_train_vectorized_count, y_train)
predictions_valid = grille.predict(X_valid_vectorized_count)
predictions_train = grille.predict(X_train_vectorized_count)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['boosting']=accuracy_score(y_valid, predictions_valid)

0.8741721854304636 0.43137254901960786


**Vectorizer tfidf**

In [70]:
grille = grid.fit(X_train_vectorized_tfidf, y_train)
predictions_valid = grille.predict(X_valid_vectorized_tfidf)
predictions_train = grille.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['boosting']=accuracy_score(y_valid, predictions_valid)

0.9403973509933775 0.39215686274509803


## **AdaBoost Classifier**
**Vectorizer binaire**

In [0]:
param_grid = {'learning_rate':[0.01,0.1,1.0,10.0], 'n_estimators':[50,60,70,80,90,100]}
madaboost = AdaBoostClassifier()
grid = model_selection.GridSearchCV(estimator=madaboost,param_grid=param_grid,cv = 4, scoring='accuracy', n_jobs = -1)

In [72]:
grille = grid.fit(X_train_vectorized_bin, y_train)
predictions_valid = grille.predict(X_valid_vectorized_bin)
predictions_train = grille.predict(X_train_vectorized_bin)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_binaire['adaboost']=accuracy_score(y_valid, predictions_valid)

0.5827814569536424 0.21568627450980393


**Vectorizer count**

In [73]:
grille = grid.fit(X_train_vectorized_count, y_train)
predictions_valid = grille.predict(X_valid_vectorized_count)
predictions_train = grille.predict(X_train_vectorized_count)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_count['adaboost']=accuracy_score(y_valid, predictions_valid)

0.5364238410596026 0.2549019607843137


**Vectorizer tfidf**

In [74]:
grille = grid.fit(X_train_vectorized_tfidf, y_train)
predictions_valid = grille.predict(X_valid_vectorized_tfidf)
predictions_train = grille.predict(X_train_vectorized_tfidf)
print(accuracy_score(y_train, predictions_train),accuracy_score(y_valid, predictions_valid))
resultat_tfidf['adaboost']=accuracy_score(y_valid, predictions_valid)

0.5894039735099338 0.29411764705882354


## **Restitution de l'ensemble des résultats**

In [75]:
resultat = [resultat_binaire,resultat_count,resultat_tfidf]
pd.DataFrame(resultat,index=['binaire','count','tfidf'])

Unnamed: 0,SVM,log,rf,ridge,naive,boosting,adaboost
binaire,0.352941,0.45098,0.470588,0.509804,0.54902,0.470588,0.215686
count,0.392157,0.509804,0.490196,0.509804,0.588235,0.431373,0.254902
tfidf,0.509804,0.529412,0.392157,0.647059,0.372549,0.392157,0.294118


## **Réapprendre le meilleur sur l'ensemble du corpus :**

In [0]:
vectorizer2 = TfidfVectorizer(lowercase=True,
                              stop_words=sw,
                              strip_accents='unicode',
                              max_df=0.5).fit(collection2)
collection_vect = vectorizer2.transform(collection2)
classifieur2 = RidgeClassifier(tol=0.01,alpha=0.001).fit(collection_vect, themes2)

In [77]:
from sklearn.externals import joblib
joblib.dump(classifieur2, '../Data/classifieur2.pkl')
joblib.dump(vectorizer2, '../Data/vectoriseur2.pkl')

['../Data/vectoriseur2.pkl']