# **Création des vectoriseurs pour chaque thème**
Ces vectoriseurs permettent d'évaluer la similarité entre une question utilisateur et toutes les questions/réponses d'un thème, puis de renvoyer la meilleure réponse.

## **Importation**

In [1]:
#!pip install -U spacy
#!python -m spacy download fr_core_news_md

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/text_mining/Notebooks


In [2]:
import unicodedata
from collections import defaultdict, Counter

import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize.regexp import WordPunctTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# sklearn.externals.joblib was removed in scikit-learn 0.23; prefer the
# standalone joblib package and fall back only on old installations.
try:
  import joblib
except ImportError:
  from sklearn.externals import joblib
# Load the French spaCy pipeline once at import time; lemmatise_text() calls
# it on every document. The model must be installed beforehand
# (python -m spacy download fr_core_news_md).
nlp = spacy.load('fr_core_news_md')

# Stop words
def strip_accents(texte):
  """Return *texte* reduced to plain ASCII.

  The text is NFKD-decomposed so that an accented letter becomes a base
  letter followed by combining marks; every character that is not ASCII
  (the marks, and any letter without an ASCII decomposition) is dropped.
  """
  decomposed = unicodedata.normalize('NFKD', texte)
  ascii_only = decomposed.encode('ASCII', 'ignore')
  return ascii_only.decode('ASCII')
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# French stop words plus three project-specific additions, all passed through
# strip_accents(); the vectorizers below are built with
# strip_accents='unicode', so presumably this keeps the list comparable with
# the accent-free tokens they produce.
sw = [
  strip_accents(word)
  for word in stopwords.words("french") + ['être','avoir','comment']
]

# Load the question/answer dataset (semicolon-separated CSV). The code below
# reads its Themes, Questions and Answers columns.
QA = pd.read_csv('../Data/Q&A.csv',sep=";")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **Fonctions de pré-traitement**

In [0]:
#Définition des fonctions de prétraitement du texte
def lemmatise_text(text):
  """Return *text* with each spaCy token replaced by its lemma.

  The text is run through the module-level ``nlp`` pipeline and the lemmas
  of all resulting tokens are joined with single spaces.
  """
  return ' '.join(token.lemma_ for token in nlp(text))

def stem_text(text):
  """Return *text* with each token replaced by its French Snowball stem.

  Tokens come from NLTK's WordPunctTokenizer; the stems are joined with
  single spaces.
  """
  french_stemmer = SnowballStemmer('french')
  tokens = WordPunctTokenizer().tokenize(text)
  return ' '.join(map(french_stemmer.stem, tokens))

def normalise(text):
  """Prepare a raw text for TF-IDF vectorization.

  Whitespace is collapsed, then the text is lemmatised and finally stemmed.
  Stop-word removal, accent stripping and lowercasing are NOT done here;
  they are left to the vectorizer that consumes the result.

  Args:
    text: raw string (a question or an answer).

  Returns:
    A string of space-separated stems.
  """
  # split() without argument cuts on any run of whitespace, so line breaks
  # act as word separators. Deleting '\n' / '\r' outright (as was done
  # before) glued the last word of a line to the first word of the next one.
  text = " ".join(text.split())
  lemmas = lemmatise_text(text)
  # NOTE(review): stemming on top of lemmatisation was marked "to be
  # reviewed" by the original author.
  return stem_text(lemmas)

## **Vectoriseur pour chaque thème**

In [0]:
# One TF-IDF model per theme. vectorizer_themes maps each theme to
# [fitted vectorizer, document-term matrix, row labels of that theme in QA].
vectorizer_themes = {}
for t in QA.Themes.unique():
  QA_themes = QA[QA.Themes == t]
  ind = list(QA_themes.index)
  # Corpus order fixes the row order of the matrix: all answers of the theme
  # first, then all of its questions, each one passed through normalise().
  corpus_QA_themes = pd.Series(
      [*QA_themes.Answers, *QA_themes.Questions]
  ).apply(normalise)
  # A fresh vectorizer per theme, fitted only on that theme's documents.
  vectorizer = TfidfVectorizer(
      lowercase=True,
      stop_words=sw,
      strip_accents='unicode',
      norm='l2',
  )
  dtm = vectorizer.fit_transform(corpus_QA_themes)
  vectorizer_themes[t] = [vectorizer, dtm, ind]

In [5]:
# sklearn.externals.joblib was removed in scikit-learn 0.23; use the
# standalone joblib package and fall back only on old installations.
try:
  import joblib
except ImportError:
  from sklearn.externals import joblib
# Persist the {theme: [vectorizer, dtm, ind]} dictionary to disk.
# NOTE: the file is a pickle; only load it back from a trusted location.
joblib.dump(vectorizer_themes, '../Data/vectorizer_themes.pkl')

['../Data/vectorizer_themes.pkl']