<a href="https://colab.research.google.com/github/Aurelien07/stackoverflowquestions/blob/main/Projet_5_topic_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Stack Overflow part : 2/3

Utilisation de la librairie Nb Extend pour mettre le code au format PEP 8.

In [None]:
# Pour les installations de certaines librairies via pip ou upgrade 
!pip install gensim==4.2.0 # -> pour relancer le modéle 
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import IPython.display
import re

# Permettra de créer des accés avec google drive
import os 

# from contractions import CONTRACTION_MAP  # pour les verbes contractées
# source : https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72

# Pour la visualisation graphique :
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import seaborn as sns
import plotly.express as px

# pour le modelling des mots :
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
import lxml
import html5lib
from bs4 import BeautifulSoup

# Pour l'optimisation des algos :
from sklearn.model_selection import GridSearchCV

# Pour le BOW :
from nltk.tokenize import word_tokenize

# Pour le tf-idf :
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Pour la PCA : 
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler # classe pour standardisation
from sklearn import preprocessing
from sklearn import decomposition

# Pour la LDA : 
from sklearn.decomposition import LatentDirichletAllocation

# Pour la NMF :
from sklearn.decomposition import NMF

# pour les algorithmes supervisés :
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier

# pour les algorithmes supervisés pré-entrainés :
import tensorflow_hub as tf_hub

# Pour word2vec :
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics as kmetrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
import gensim
from gensim.models import Word2Vec,word2vec

# Bert
import os
import transformers
from transformers import *

# Pour la visualisation des tokens :
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer

# Pour les scores :
from sklearn import metrics
from sklearn.metrics import accuracy_score, jaccard_score, hamming_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score

# Pour supprimer les warnings :
import warnings
warnings.filterwarnings("ignore")

## Fonctions utilitaires pour la modélisation :

In [None]:
# Permet de visualiser les NaN du DF de façon globale : 

def pct_vals_miss(data: pd.DataFrame) -> float:
    """
    Return the overall percentage of missing cells in a DataFrame.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    float
        Share of missing (NaN/None) cells over all cells (rows x columns),
        expressed in percent and rounded to 2 decimals. An empty frame
        (no rows or no columns) yields 0.0.
    """
    total_cells = data.shape[0] * data.shape[1]
    if total_cells == 0:
        # Nothing to inspect: avoid dividing by zero on an empty frame.
        return 0.0
    # Counting on the boolean mask covers every column in one pass and stays
    # correct even when column labels are duplicated.
    missing_cells = int(data.isna().to_numpy().sum())
    return round((missing_cells / total_cells) * 100, 2)

In [None]:
# DataFrame de visualisation des manquants :

def miss_value(data: pd.DataFrame):
    """
    Display a per-column completeness report, then the global missing rate.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to inspect.

    Notes
    -----
    The displayed table has one row per column of ``data``:

    * ``Pourcentages``: share of NON-null values in the column, in percent.
      NOTE(review): the surrounding titles talk about missing data; confirm
      that a fill rate (and not a missing rate) is what is wanted here.
    * ``Données manquantes``: number of null values in the column.

    Rows are sorted by increasing number of missing values. Nothing is
    returned; the report is rendered with ``display`` and ``print``.
    """
    separator = '-'*100
    print(separator)
    print(" "*41,'\033[1m'+ "Données manquantes :"+'\033[0m')
    print(separator)

    n_rows = data.shape[0]
    # One [fill rate, missing count] pair per column.
    stats = {
        col: [round((data[col].notnull().sum()/n_rows)*100,2),
              data[col].isnull().sum()]
        for col in data.columns
    }

    report = pd.DataFrame.from_dict(
        data=stats,
        orient="index",
        columns=["Pourcentages", "Données manquantes"],
    )
    report = report.sort_values(by="Données manquantes", ascending=True)
    display(report)

    print(separator)
    print(" "*25,'\033[1m'+f"Le pourcentage de données manquantes est de",pct_vals_miss(data),"%"+'\033[0m')
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html => En cas de modif

In [None]:
# potentiellement à optimisé pour une meilleure visibilité :
def visu_yellow(data):
    """
    Plot the token frequency distribution of a collection of texts.

    Parameters
    ----------
    data : iterable of str
        Raw documents; they are tokenised and counted with a default
        ``CountVectorizer``.

    The chart is drawn vertically with Yellowbrick's ``FreqDistVisualizer``.
    The figure size is passed through ``size`` (in pixels) because
    ``plt.figure(figsize=...)`` did not work with this visualizer.
    """
    counter = CountVectorizer()
    doc_term_matrix = counter.fit_transform(data)
    vocabulary = counter.get_feature_names_out()

    freq_plot = FreqDistVisualizer(
        features=vocabulary,
        orient='v',
        size=(1080, 720),
    )
    freq_plot.fit(doc_term_matrix)
    freq_plot.show()

In [None]:
def multiscore(y_test, y_pred):
    """
    Print a formatted scoreboard comparing predictions with the ground truth.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_test``.

    Printed metrics (all rounded to 2 decimals): accuracy, Hamming loss
    (share of wrongly predicted labels), weighted Jaccard score, macro/micro
    F1 and micro/macro recall. Nothing is returned.
    """
    # (label, value, width of the closing "<" banner), in display order.
    # The recall rows use recall_score: they previously repeated the F1 score.
    rows = [
        ("Accuracy :",
         metrics.accuracy_score(y_test, y_pred), 20),
        ("Hamming loss :",
         metrics.hamming_loss(y_test, y_pred), 17),
        ("Jaccard_score :",
         metrics.jaccard_score(y_test, y_pred, average="weighted"), 16),
        ("f1_macro_score :",
         metrics.f1_score(y_test, y_pred, average='macro'), 16),
        ("f1_micro_score :",
         metrics.f1_score(y_test, y_pred, average='micro'), 16),
        ("Recall_micro_score :",
         metrics.recall_score(y_test, y_pred, average='micro'), 16),
        ("Recall_macro_score :",
         metrics.recall_score(y_test, y_pred, average='macro'), 16),
    ]

    rule = "-"*53
    for label, value, closing_width in rows:
        print(rule)
        print('')
        print(">"*15, label,
              '\033[1m' + str(round(value, 2)) + '\033[0m', "<"*closing_width)
        print('')
    print(rule)

    

In [None]:
def dico_metric(dico, col, y_true=None, y_pred=None):
    """
    Store the evaluation scores of one model under ``dico[col]``.

    source : https://scikit-learn.org/stable/modules/model_evaluation.html

    Parameters
    ----------
    dico : dict
        Dictionary collecting the scores of the different algorithms; it is
        updated in place and also returned.
    col : hashable
        Key under which the scores are stored (typically the model name).
    y_true, y_pred : array-like, optional
        True and predicted labels. When omitted, the notebook-level
        variables ``y_test`` and ``y_pred`` are used, which is what the
        two-argument call has always done.

    Returns
    -------
    dict
        ``dico`` with ``dico[col]`` set to a ``{metric name: score}`` dict,
        every score rounded to 2 decimals.

    Raises
    ------
    NameError
        If labels are neither passed nor available as notebook globals.
    """
    # Backward compatibility: fall back on the notebook globals when the
    # labels are not given explicitly.
    scope = globals()
    if y_true is None:
        y_true = scope.get("y_test")
    if y_pred is None:
        y_pred = scope.get("y_pred")
    if y_true is None or y_pred is None:
        raise NameError(
            "dico_metric needs y_true/y_pred arguments or global y_test/y_pred")

    dico[col] = {
        'Accuracy': round(metrics.accuracy_score(y_true, y_pred), 2),
        'Hamming loss': round(metrics.hamming_loss(y_true, y_pred), 2),
        'Jaccard_score_macro': round(metrics.jaccard_score(y_true, y_pred, average="macro"), 2),
        'Jaccard_score_micro': round(metrics.jaccard_score(y_true, y_pred, average="micro"), 2),
        'f1_macro_score': round(metrics.f1_score(y_true, y_pred, average='macro'), 2),
        'f1_micro_score': round(metrics.f1_score(y_true, y_pred, average='micro'), 2),
        # Recall entries now use recall_score (they used to duplicate the F1 values).
        'Recall_micro_score': round(metrics.recall_score(y_true, y_pred, average='micro'), 2),
        'Recall_macro_score': round(metrics.recall_score(y_true, y_pred, average='macro'), 2),
        'precision_score': round(metrics.precision_score(y_true, y_pred, average='macro'), 2),
        # ROC AUC (macro / micro) intentionally left out for now.
    }
    return dico

## Importation des données

Pour les besoins du notebook, nous ne conservons que les titres, les corps de texte et les tags des documents importés.

Nous créons également un chemin pour utiliser nos fichiers dans le drive.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
path = "/content/drive/My Drive/Colab Notebooks/Projet_5/"

### Pré-visualisation avant traitement :

In [None]:
# Build the CSV location from the shared `path` prefix defined in the previous
# cell, so the Drive folder is configured in a single place.
data = pd.read_csv(os.path.join(path, "P5_exploration.csv"))
data.head()

## Pré-traitement

Pour chacune de nos opérations, nous allons utiliser la commande magique `%%time` afin de connaître le temps d'exécution de chaque cellule.

En considérant le temps entre chaque opération, si l'opération est trop longue, il conviendra de faire un pickle pour le garder en mémoire.

### Suppression des balises Tags :

In [None]:
# Normalise the raw "<tag1><tag2>" markup: lowercase everything, turn the
# "><" boundary between two tags into a space, then strip the remaining
# opening and closing angle brackets.
data["Tags"] = (
    data["Tags"]
    .str.lower()
    .replace({"><" : " "}, regex=True)
    .replace({"<" : ""}, regex=True)
    .replace({">" : ""}, regex=True)
)

### Création de la colonne corpus :

Nous commençons par créer une nouvelle variable associant le titre (Title) et le corps du texte (Body)

In [None]:
%%time
data['Corpus'] = data['Title'] + ' ' + data['Body']
# Certains algorithmes auront besoin d'une liste
corpus = data['Corpus'].to_list()
# Certains algorithmes auront besoin d'une liste
tags = data['Tags'].to_list()
display(data.head(5))

In [None]:
print("Visualisation pré-nettoyage du Corpus :")
print('')
display(data.Corpus[0])
print('')

### Nettoyage HTML via beautiful soup :

Nous allons maintenant nettoyer les données en rapport avec les balises HTML.

In [None]:
%%time

def clean_html(text_html):
    """
    Strip the HTML markup from a document and drop every ``<code>`` element.

    Parameters
    ----------
    text_html : str
        HTML source, parsed with the ``html5lib`` parser.

    Returns
    -------
    str
        Text content of the document without the code snippets, with every
        newline replaced by a space.
    """
    parsed = BeautifulSoup(text_html, "html5lib")
    # Code snippets are removed entirely (tag and content) so that only the
    # natural-language part of the document remains.
    for code_tag in parsed.find_all("code"):
        code_tag.decompose()
    plain_text = parsed.get_text()
    return plain_text.replace("\n", " ")


# Parse every document only once: `corpus` is the list version of
# data['Corpus'] (same texts, same order), so the cleaned list can be written
# back to the column instead of running BeautifulSoup a second time.
corpus_del_bal = [clean_html(text) for text in corpus]
data['Corpus'] = corpus_del_bal

In [None]:
print("Visualisation du nettoyage Beautiful Soup :")
print('')
display(corpus_del_bal[0])
print('')

#### Visualisation Yellowbricks en token :

In [None]:
visu_yellow(corpus_del_bal)

On voit qu'il y a toujours des mots redondants qui ne servent à rien à l'analyse.

### Nettoyage du texte (Suppression des fins de lignes et des chiffres) :

ici, nous supprimons les fins de lignes et les chiffres.

In [None]:
%%time


def text_cleaning(text):
    """
    Remove digit-bearing words, newlines and redundant whitespace from a text.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text. Leading/trailing whitespace is collapsed to a
        single space, not stripped.
    """
    # Raw strings: '\w' and '\d' are invalid escape sequences in a plain
    # string literal and trigger a warning on recent Python versions.
    # Drops every word containing at least one digit (not only the digits).
    text = re.sub(r'\w*\d\w*', '', text)
    # Newlines are deleted without inserting a space, so two words separated
    # only by a newline end up joined.
    text = re.sub(r'\n', '', text)
    # Collapse any remaining run of whitespace into a single space.
    text = re.sub(r'\s+', ' ', text)
    return text


corpus_x = [text_cleaning(text) for text in corpus_del_bal]  # pour la liste
tags_x = [text_cleaning(text).strip() for text in tags]  # pour la liste
data['Corpus'] = data['Corpus'].apply(lambda x : text_cleaning(x))

In [None]:
print('-'*45)
print("Visualisation du nettoyage texte sur le corpus :")
print('-'*45)
display(corpus_x[0])
print("")

print('-'*45)
print("Visualisation du nettoyage texte sur les tags :")
print('-'*45)
display(tags_x[0])
print("")

#### Visualisation Yellowbricks en token :

In [None]:
visu_yellow(corpus_x)

Le corpus pré-nettoyé reste actuellement inutilisable.

In [None]:
visu_yellow(tags_x)

Ici, on peut voir que les Tags sont bien représentés.

### Suppression des verbes contractées :

Remplacement des formes contractées des verbes (par exemple « don't ») par leur forme développée (« do not »).

In [None]:
# Mapping from English contractions to their expanded form.
# Keys are matched case-insensitively by expand_contractions.
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",  # was "he he will have" (duplicated word)
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

In [None]:
def expand_contractions(text, contraction_mapping=None):
    """
    Replace English contractions by their expanded form, then drop apostrophes.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict, optional
        ``{contraction: expansion}`` pairs. Defaults to the notebook-level
        ``CONTRACTION_MAP``, looked up at call time so that re-running the
        cell defining the map is taken into account.

    Returns
    -------
    str
        Text with every known contraction expanded (case-insensitive match,
        the capitalisation of the first letter is carried over to the
        expansion) and every remaining apostrophe removed.
    """
    if contraction_mapping is None:
        contraction_mapping = CONTRACTION_MAP

    expanded_text = text
    # An empty mapping would compile to a pattern matching the empty string
    # everywhere; skip the substitution in that case.
    if contraction_mapping:
        # Longest keys first: in a regex alternation the first alternative that
        # matches wins, so "can't" must not pre-empt "can't've".
        ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            # Exact-case entry first ("I'd"), then the lowercase one.
            expanded_contraction = (contraction_mapping.get(match)
                                    or contraction_mapping.get(match.lower()))
            if not expanded_contraction:
                return match
            # Only the capitalisation is carried over. Splicing the matched
            # first character into the expansion breaks entries whose first
            # letters differ ("ain't" -> "is not", "'cause" -> "because").
            if match[0].isupper():
                expanded_contraction = (expanded_contraction[0].upper()
                                        + expanded_contraction[1:])
            return expanded_contraction

        expanded_text = contractions_pattern.sub(expand_match, expanded_text)

    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [None]:
%%time
# Chain on the output of the text-cleaning step (corpus_x) instead of starting
# again from corpus_del_bal, which silently discarded the removal of digits
# and line breaks. Re-running this cell is harmless: once expanded, a text
# contains no apostrophe left to match.
corpus_x = [expand_contractions(text) for text in corpus_x]
# NOTE(review): tags are still expanded from the raw `tags` list, as before.
# Chaining on the cleaned tags would also drop tags containing digits such as
# "html5"; confirm which behaviour is wanted.
tags_x = [expand_contractions(text).strip() for text in tags]
# data['Corpus'] = data['Corpus'].apply(lambda x : expand_contractions(x))

In [None]:
print('-'*60)
print("Visualisation de la suppression des verbes contractés sur le corpus :")
print('-'*60)
display(corpus_x[0])
print("")

print('-'*60)
print("Visualisation de la suppression des verbes contractés sur les tags :")
print('-'*60)
display(tags_x[0])
print("")

#### Visualisation Yellowbricks en token :

In [None]:
visu_yellow(corpus_x)

Peu de différence ici : les formes contractées ne figurent vraisemblablement pas parmi les 50 tokens les plus fréquents.

In [None]:
visu_yellow(tags_x)

Idem les tags étant déjà des termes spécifiques, pas de raison de retrouver une difference entre les 2.

### Tokenization + suppression des stopwords :

La tokenisation consiste essentiellement à diviser une phrase, un paragraphe ou un document en unités plus petites, telles que des mots ou des termes individuels. On appelle ces unités des tokens, d'où le terme de tokenisation.

Un stopword est un mot qui est tellement commun qu'il est inutile de l'indexer ou de l'utiliser dans une recherche.

In [None]:
import string  # permet d'avoir accés à toute les ponctuations.
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

In [None]:
def tokenize(text):
    """
    Split a text into word tokens, without punctuation nor English stop words.

    Parameters
    ----------
    text : str
        Text to tokenise.

    Returns
    -------
    list of str
        Remaining tokens, in their original order and case. Input that the
        tokenizer rejects with a ``TypeError`` (non-text values) yields an
        empty list, so downstream steps always receive a list.
    """
    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    try:
        tokens = word_tokenize(text, language='english')
    except TypeError:
        # Best effort on non-text input: no tokens rather than the raw value.
        return []

    # The NLTK stop-word list is lowercase, so the comparison is done on the
    # lowercased token; otherwise "The", "How" or "I" slip through.
    return [token for token in tokens
            if token not in punctuation and token.lower() not in stop_words]

In [None]:
%%time
corpus_token = [tokenize(text) for text in corpus_x]
tag_token = [tokenize(text) for text in tags_x]
# data['Corpus'] = data['Corpus'].apply(lambda x : tokenize(x))

In [None]:
print('-'*65)
print("Visualisation de la suppression de la tokennisation sur le corpus :")
print('-'*65)
display(corpus_token[0])
print("")

print('-'*65)
print("Visualisation de la suppression de la tokennisation sur les tags :")
print('-'*65)
display(tag_token[0])
print('')

###  POS tagging :

Parts of Speech (POS) Tagging. Parts of speech tagging simply refers to assigning parts of speech to individual words in a sentence, which means that, unlike phrase matching, which is performed at the sentence or multi-word level, parts of speech tagging is performed at the token level.

source : https://stackabuse.com/python-for-nlp-parts-of-speech-tagging-and-named-entity-recognition/

NN: noun, common, singular or mass

In [None]:
nltk.download('averaged_perceptron_tagger')

In [None]:
def filtering_nouns(text, allowed_tags=('NN',)):
    """
    Keep only the tokens whose part-of-speech tag is in ``allowed_tags``.

    Parameters
    ----------
    text : list of str
        Tokens to tag with ``nltk.pos_tag``.
    allowed_tags : str or iterable of str, optional
        POS tags to keep. Defaults to ``('NN',)``, the previously hard-coded
        value; pass e.g. ``('NN', 'NNS', 'RB')`` to keep more categories.

    Returns
    -------
    list of str
        Tokens carrying one of the allowed tags, in their original order.
    """
    # A single tag given as a string must not be split into characters.
    if isinstance(allowed_tags, str):
        allowed_tags = (allowed_tags,)
    kept_tags = set(allowed_tags)

    return [word for word, pos in nltk.pos_tag(text) if pos in kept_tags]

In [None]:
%%time
nn_corpus = [filtering_nouns(tokens) for tokens in corpus_token]

In [None]:
print('-'*65)
print("Visualisation du POS Tagging sur le corpus :")
print('-'*65)
display(nn_corpus[0])
print("")

### Lemmatisation

La lemmatisation désigne un traitement lexical apporté à un texte en vue de son classement dans un index ou de son analyse. Ce traitement consiste à appliquer aux occurrences des lexèmes sujets à flexion un codage renvoyant à leur entrée lexicale commune, que l'on désigne sous le terme de lemme.

source : Wikipedia

In [None]:
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
def lemmatisation(text):
    """
    Lemmatise a list of tokens with WordNet.

    Parameters
    ----------
    text : list of str
        Tokens to lemmatise.

    Returns
    -------
    list of str
        Tokens after four successive lemmatisation passes over the whole
        list: as verb, as adjective, as noun, then as adverb.
    """
    wn_lemmatizer = WordNetLemmatizer()

    lemmas = text
    # Each pass works on the result of the previous one, in this exact order.
    for pos in (wordnet.VERB, wordnet.ADJ, wordnet.NOUN, wordnet.ADV):
        lemmas = [wn_lemmatizer.lemmatize(word, pos) for word in lemmas]
    return lemmas

In [None]:
%%time
# Lemmatise the noun-filtered corpus and the tokenised tags.
corpus_lem = [lemmatisation(tokens) for tokens in nn_corpus]
tags_lem = [lemmatisation(tokens) for tokens in tag_token]

# Drop one-character tokens from every tag list.
tags_liste = [
    [token for token in tokens if len(token) > 1]
    for tokens in tags_lem
]

In [None]:
print('-'*65)
print("Visualisation de la lemmatisation sur le corpus :")
print('-'*65)
display(corpus_lem[0])
print("")

print('-'*65)
print("Visualisation de la lemmatisation sur les Tags")
print('-'*65)
display(tags_lem[0])
print("")

# Features Engineering :

## Création des colonnes de preprocessing et création d'un nouveau DF :

In [None]:
# remettre sous forme de phrase et non sous forme de liste
corpus_df = [" ".join(text) for text in corpus_lem]
# On crée un dataframe
corpus_df = pd.DataFrame(corpus_df, columns=['corpus_preprocessing'])

In [None]:
corpus_df

In [None]:
tags_df = [" ".join(tags) for tags in tags_liste]
tags_df = pd.DataFrame(tags_df, columns=['tags_preprocessing'])

In [None]:
tags_df

In [None]:
df_final = pd.concat([corpus_df, tags_df], axis=1)

In [None]:
df_final['corpus_preprocessing'] = df_final['corpus_preprocessing'].apply(
    lambda x: x.split(' '))
df_final['tags_preprocessing'] = df_final['tags_preprocessing'].apply(
    lambda x: x.split(' '))

In [None]:
corpus_final = df_final['corpus_preprocessing'].to_list()
tags_final = df_final['tags_preprocessing'].to_list()
data_corpus_base = data['Corpus'].to_list()

In [None]:
df_visualisation = pd.concat([data['Corpus'],
                              data['Tags'],
                              df_final['corpus_preprocessing'],
                              df_final['tags_preprocessing']],
                             axis=1)

In [None]:
display(df_visualisation.head())
# Ici on voit bien qu'on a nos tags modifiés et non modifiés,.
# on pourra supprimer les balises via regex par la suite pour les tags

In [None]:
# Faire un CSV to dataframe pour éviter de tout relancer

Avant de calculer le bag of Word, le tf idf et le nmf, on va réduire le nombre de tags pour notre analyse.

## Comptage des Tags :

In [None]:
df_visualisation["Tags_count"] = df_visualisation["Tags"].apply(lambda x : len(x.split()))
df_visualisation.head()

In [None]:
# Distribution of the number of tags per question.
# value_counts() orders by frequency, so the counts are re-ordered by number of
# tags and the labels are derived from the index. Each slice is then guaranteed
# to carry the right label. The result gets its own name so the `data`
# DataFrame is not overwritten.
plt.figure(figsize=(12,12))
tags_count_dist = df_visualisation["Tags_count"].value_counts().sort_index()
labels = [f"{n} tag" if n == 1 else f"{n} tags" for n in tags_count_dist.index]

# One colour of the Seaborn palette per slice
colors = sns.color_palette('bright')[0:len(labels)]

# Pie chart
plt.pie(tags_count_dist, labels = labels, colors = colors, autopct='%.0f%%')
plt.legend(labels)
plt.show()

In [None]:
print( "Le nombre de Tags dans une question est au nombre de : ", '\033[1m'+ str(round(df_visualisation["Tags_count"].mean(),2)) + '\033[0m')

# Enregistrer le dataframe dans un CSV :

In [None]:
#df_visualisation.to_csv("/content/drive/My Drive/Colab Notebooks/Projet_5/P5_xxx.csv", index=True) # Cela permettra d'éviter d'attendre de relancer le notebook.

In [None]:
#data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Projet_5/P5_algorithme.csv")

# Analyse non supervisée :

## BOW :

### Pour les tags : 

In [None]:
vectorizer = CountVectorizer(tokenizer = lambda x: x.split())
tag_bow = vectorizer.fit_transform(df_visualisation['Tags'])

In [None]:
print("Nombres de questions :", tag_bow.shape[0])
print("Nombres de tags uniques :", tag_bow.shape[1])

Visualisation des 10 premiers Tags :

In [None]:
tags_name = vectorizer.get_feature_names_out()
print("Visualisation des 10 premiers tags pour le BOW :", tags_name[:10])

In [None]:
# Total number of occurrences of each tag: column sums of the BOW matrix,
# flattened to a 1-D array.
frequence = tag_bow.sum(axis=0).A1
# The counts are aligned with the vectorizer vocabulary (tags_name), not with
# `tags`, which holds one tag string per question.
tag_dict = dict(zip(tags_name, frequence))

In [None]:
# Flatten the tag -> count mapping into [tag, count] rows.
liste = [[key, value] for key, value in tag_dict.items()]

In [None]:
tag_counting = pd.DataFrame(liste, columns=['Tags', 'Counts'])
tag_counting.head(10)

In [None]:
display(tag_counting.max())
display(tag_counting.min())

In [None]:
tag_counting_graph = tag_counting.sort_values(['Counts'], ascending=False)
plt.plot(tag_counting_graph['Counts'].values)
plt.grid(True)
plt.title("Distribution des Tags :")
plt.xlabel("Nombre de Tags sur les plus fréquents")
plt.ylabel("Fréquence")

In [None]:
plt.plot(tag_counting_graph['Counts'][0:100].values)
plt.grid(True)
plt.title("Distribution des top 100 Tags :")
plt.xlabel("Nombre de Tags sur les plus fréquents")
plt.ylabel("Fréquence")

In [None]:
plt.plot(tag_counting_graph['Counts'][0:10].values)
plt.grid(True)
plt.title("Distribution des top 10 Tags :")
plt.xlabel("Nombre de Tags sur les plus fréquents")
plt.ylabel("Fréquence")

On peut voir qu'à partir du 8e tag, la fréquence devient inférieure à 1000 ; on aurait donc tendance à retenir 1000 comme seuil de fréquence.

Et 8 en tag number max

In [None]:
print("{} tags qui sont utilisés plus de 10 fois".format(tag_counting[tag_counting["Counts"]>10].shape[0]))
print("{} tags qui sont utilisés plus de 25 fois".format(tag_counting[tag_counting["Counts"]>25].shape[0]))
print("{} tags qui sont utilisés plus de 50 fois".format(tag_counting[tag_counting["Counts"]>50].shape[0]))
print("{} tags qui sont utilisés plus de 100 fois".format(tag_counting[tag_counting["Counts"]>100].shape[0]))
print("{} tags qui sont utilisés plus de 200 fois".format(tag_counting[tag_counting["Counts"]>200].shape[0]))
print("{} tags qui sont utilisés plus de 500 fois".format(tag_counting[tag_counting["Counts"]>500].shape[0]))

On a confirmation que les 206 tags les plus fréquents sont utilisés plus de 50 fois ; il est donc intéressant de les retenir.

In [None]:
"""df_visualisation['Corpus'] = df_visualisation['Corpus'].str.lower()
df_visualisation['Corpus'] = df_visualisation['Corpus'].apply(lambda x : tokenize(x))
df_visualisation['Corpus'] = df_visualisation['Corpus'].apply(lambda x : filtering_nouns(x))
df_visualisation['Corpus'] = df_visualisation['Corpus'].apply(lambda x : lemmatisation(x))"""

In [None]:
df_visualisation["Corpus"] = df_visualisation["corpus_preprocessing"].apply(lambda x : " ".join(x))

In [None]:
df_visualisation

In [None]:
def bag_of_words(texts, min_df=200):
    """
    Build a bag-of-words matrix and a term-frequency summary.

    Parameters
    ----------
    texts : iterable of str
        Documents to vectorise.
    min_df : int or float, optional
        Minimum document frequency passed to ``CountVectorizer``
        (default 200, the previously hard-coded value).

    Returns
    -------
    (pd.DataFrame, sparse matrix)
        * summary frame indexed by term, sorted by decreasing ``Frequency``
          (total number of occurrences of the term in the corpus), with a
          ``percent`` column giving the share of each term in all counted
          occurrences, rounded to 2 decimals;
        * the document-term count matrix.
    """
    cv = CountVectorizer(min_df=min_df)
    bow = cv.fit_transform(texts)

    # cv.vocabulary_ maps each term to its COLUMN INDEX in the matrix, not to
    # a count. The real frequencies are the column sums of the matrix.
    # The former "> 200" filter acted on those indices; it is dropped so the
    # summary has one row per matrix column (min_df already does the pruning).
    frequencies = np.asarray(bow.sum(axis=0)).ravel()
    summary = pd.DataFrame({'Frequency': frequencies},
                           index=cv.get_feature_names_out())
    summary = summary.sort_values(by=['Frequency'], ascending=False)

    summary['percent'] = (summary['Frequency']*100 / summary['Frequency'].sum()).round(2)

    return (summary, bow)

### Création d'un DataFrame de visualisation + une matrice :

In [None]:
%%time
data_bow,bow = bag_of_words(df_visualisation["Corpus"])

In [None]:
print("Nombres de questions pour le BOW:", bow.shape[0])
print("Nombres de tags uniques pour le BOW :", bow.shape[1])

#### Visualisation : 

In [None]:
data_bow

### Vectorisation :

In [None]:
%%time
cv = CountVectorizer(min_df = 200)
data_bow_2 = cv.fit_transform(df_visualisation["Corpus"])
data_bow_vec = pd.DataFrame(data_bow_2.toarray(), columns=cv.get_feature_names_out())
data_bow_vec.index = df_final.index

In [None]:
data_bow_vec

In [None]:
%%time
# Sparsity = share of ZERO cells. The previous computation printed the share
# of non-zero cells under this label. The non-zero entries are counted directly
# on the sparse matrix, which avoids building a dense copy of the whole matrix.
n_cells_bow = data_bow_2.shape[0] * data_bow_2.shape[1]
score_Sparsicity_bow = round((1 - data_bow_2.count_nonzero() / n_cells_bow) * 100, 2)
print("Sparsicity: ", '\033[1m'+ str(score_Sparsicity_bow) + '\033[0m' , "%")

Afficher la Sparsicity (sous forme de nombre ou de proportion) d'une matrice.

Par exemple, une Sparsicity de 99 % signifie que 99 % des valeurs sont nulles. De même, une Sparsicity de 0 signifie que la matrice est entièrement dense.

## TF-IDF :

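La formule du TF-IDF est : $\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$, avec $\text{idf}(t) = \log\frac{N}{\text{df}(t)}$, où $\text{tf}(t, d)$ est la fréquence du terme $t$ dans le document $d$, $N$ le nombre de documents et $\text{df}(t)$ le nombre de documents contenant $t$. Autrement dit : __poids = fréquence du terme × fréquence inverse de document__.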

__Pour extraire les informations on utilise :__

__NER (Named Entity Recognition) :__ reconnaître des personnes, endroits, entreprises, etc.

__Extraction de relations :__ essayer d'extraire des relations sémantiques entre différents termes du texte. Par exemple, des relations familiales ("Marie est l'enfant de Patrick") spatiales ("Le piano est devant la fenêtre"), etc. Ces informations peuvent ensuite être stockées dans une base de données relationnelles ou un graphe.

__Extraction d'événements :__ extraire des actions qui arrivent à nos entités. Par exemple "le cours de l'action X a augmenté de 5%" ou bien "le président à déclaré X dans son discours"

__POS Tagging (Part-of-Speech Tagging) :__ désigne les méthodes qui récupèrent la nature grammaticale des mots d’une phrase (nom, verbe, adjectif, etc.). Ce sont des propriétés qui peuvent servir de caractéristiques utiles lors de la création de certains modèles.

In [None]:
def Tfidf(texts, min_df=200):
    """
    Build a TF-IDF matrix and a per-term weight summary.

    Parameters
    ----------
    texts : iterable of str
        Documents to vectorise.
    min_df : int or float, optional
        Minimum document frequency passed to ``TfidfVectorizer``: with the
        default of 200, only terms present in at least 200 documents are
        kept (the previously hard-coded value).

    Returns
    -------
    (pd.DataFrame, sparse matrix)
        * summary frame indexed by term, sorted by decreasing ``Frequency``.
          The column name is kept for compatibility; it holds the TF-IDF
          weight of the term summed over all documents. The ``percent``
          column is the share of each term in the total weight, rounded to
          2 decimals;
        * the document-term TF-IDF matrix.
    """
    vectoriser = TfidfVectorizer(min_df=min_df)
    weights = vectoriser.fit_transform(texts)

    # vocabulary_ maps each term to its COLUMN INDEX in the matrix, not to a
    # frequency. The importance of a term is measured on the matrix itself.
    # The former "> 200" filter acted on those indices and is dropped.
    totals = np.asarray(weights.sum(axis=0)).ravel()
    summary = pd.DataFrame({'Frequency': totals},
                           index=vectoriser.get_feature_names_out())
    summary = summary.sort_values(by=['Frequency'], ascending=False)

    summary['percent'] = (summary['Frequency']*100 / summary['Frequency'].sum()).round(2)

    return (summary, weights)

### Création d'un DataFrame de visualisation + une matrice :

In [None]:
%%time
Data_idf,idf = Tfidf(df_visualisation["Corpus"])

In [None]:
print("Nombres de questions pour le TF-IDF :", idf.shape[0])
print("Nombres de tags uniques pour le TF-IDF :", idf.shape[1])

#### Visualisation :

In [None]:
Data_idf

### Vectorisation :

In [None]:
tfidf = TfidfTransformer()
data_tfidf = tfidf.fit_transform(bow)
data_tfidf_vec = pd.DataFrame(data_tfidf.toarray(), columns=cv.get_feature_names_out())
data_tfidf_vec.index = df_final.index
data_tfidf_vec.head()

In [None]:
%%time
# Sparsity = share of ZERO cells. The previous computation printed the share
# of non-zero cells under this label. The non-zero entries are counted directly
# on the sparse matrix, which avoids building a dense copy of the whole matrix.
n_cells_tfidf = data_tfidf.shape[0] * data_tfidf.shape[1]
score_Sparsicity_tfidf = round((1 - data_tfidf.count_nonzero() / n_cells_tfidf) * 100, 2)
print("Sparsicity: ", '\033[1m'+ str(score_Sparsicity_tfidf) + '\033[0m' , "%")

On a donc seulement 2.29 % de valeurs non nulles dans notre matrice tf-idf, soit environ 97.7 % de zéros.

## ACP :

Le code provient d'un cours d'OpenClassrooms.

In [None]:
%%time
vectorizer = TfidfVectorizer(min_df = 200)
X = vectorizer.fit_transform(df_visualisation["Corpus"])

In [None]:
def display_circles(pcs, n_comp, pca, axis_ranks, labels=None, label_rotation=0, lims=None):
    """Display correlation circles, one for each factorial plane.

    Parameters
    ----------
    pcs : 2-D array
        Indexed as ``pcs[component, variable]``; one arrow/segment is drawn
        per column. Presumably ``pca.components_`` -- confirm with callers.
    n_comp : int
        Number of available components; a plane ``(d1, d2)`` is skipped
        when ``d2 >= n_comp``.
    pca : object
        Must expose ``explained_variance_ratio_`` (used for axis labels).
    axis_ranks : iterable of (int, int)
        Zero-based component index pairs, one figure per pair.
    labels : sequence, optional
        Variable names, indexed like the columns of ``pcs``.
    label_rotation : float, optional
        Rotation applied to the variable labels.
    lims : tuple, optional
        ``(xmin, xmax, ymin, ymax)``; when omitted the limits are
        ``[-1, 1]`` for fewer than 30 variables, else the data range.
    """

    # For each factorial plane
    for d1, d2 in axis_ranks: 
        if d2 < n_comp:

            # Initialise the matplotlib figure
            fig, ax = plt.subplots(figsize=(10,10))

            # Determine the limits of the chart
            if lims is not None :
                xmin, xmax, ymin, ymax = lims
            elif pcs.shape[1] < 30 :
                xmin, xmax, ymin, ymax = -1, 1, -1, 1
            else :
                xmin, xmax, ymin, ymax = min(pcs[d1,:]), max(pcs[d1,:]), min(pcs[d2,:]), max(pcs[d2,:])

            # Add arrows
            # If there are more than 30 arrows, we do not display the triangle at the end
            if pcs.shape[1] < 30 :
                plt.quiver(np.zeros(pcs.shape[1]), np.zeros(pcs.shape[1]),
                   pcs[d1,:], pcs[d2,:], 
                   angles='xy', scale_units='xy', scale=1, color="grey")
                # (see the doc : https://matplotlib.org/api/_as_gen/matplotlib.pyplot.quiver.html)
            else:
                # Plain, mostly transparent segments from the origin to each variable
                lines = [[[0,0],[x,y]] for x,y in pcs[[d1,d2]].T]
                ax.add_collection(LineCollection(lines, axes=ax, alpha=.1, color='black'))
            
            # Display variable names (only those falling inside the chart limits)
            if labels is not None:  
                for i,(x, y) in enumerate(pcs[[d1,d2]].T):
                    if x >= xmin and x <= xmax and y >= ymin and y <= ymax :
                        plt.text(x, y, labels[i], fontsize='14', ha='center', va='center', rotation=label_rotation, color="blue", alpha=0.5)
            
            # Display circle (unit circle centred on the origin)
            circle = plt.Circle((0,0), 1, facecolor='none', edgecolor='b')
            plt.gca().add_artist(circle)

            # Define the limits of the chart
            plt.xlim(xmin, xmax)
            plt.ylim(ymin, ymax)
        
            # Display grid lines
            plt.plot([-1, 1], [0, 0], color='grey', ls='--')
            plt.plot([0, 0], [-1, 1], color='grey', ls='--')

            # Label the axes, with the percentage of variance explained
            plt.xlabel('PC{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
            plt.ylabel('PC{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))

            plt.title("Correlation Circle (PC{} and PC{})".format(d1+1, d2+1))
            plt.show(block=False)
        
def display_factorial_planes(X_projected, n_comp, pca, axis_ranks, labels=None, alpha=1, illustrative_var=None):
    '''Display a scatter plot of the observations on each requested factorial plane.

    Parameters
    ----------
    X_projected : 2-D array
        Observations projected on the principal components (indexed as
        ``X_projected[:, d]``).
    n_comp : int
        Number of components available; a plane ``(d1, d2)`` is skipped when
        ``d2 >= n_comp``.
    pca : fitted PCA-like object
        Only ``explained_variance_ratio_`` is read, for the axis labels.
    axis_ranks : iterable of (int, int)
        Zero-based component index pairs; one figure is opened per pair.
    labels : sequence, optional
        Text drawn on each point (``labels[i]`` for row ``i``).
    alpha : float, optional
        Marker transparency passed to ``plt.scatter``.
    illustrative_var : array-like, optional
        One value per observation; points are drawn per distinct value and a
        legend is added.

    The figures are left open (no ``plt.show`` call), so the caller decides
    when to render them.
    '''
    # Convert once, before the loop, instead of on every plane
    groups = None if illustrative_var is None else np.array(illustrative_var)

    for d1, d2 in axis_ranks:
        if d2 >= n_comp:
            continue

        # One figure per factorial plane
        plt.figure(figsize=(7, 6))

        # Display the points
        if groups is None:
            plt.scatter(X_projected[:, d1], X_projected[:, d2], alpha=alpha)
        else:
            for value in np.unique(groups):
                mask = groups == value
                plt.scatter(X_projected[mask, d1], X_projected[mask, d2], alpha=alpha, label=value)
            plt.legend()

        # Display the labels on the points
        if labels is not None:
            for i, (x, y) in enumerate(X_projected[:, [d1, d2]]):
                plt.text(x, y, labels[i],
                         fontsize='14', ha='center', va='center')

        # Symmetric limits with a 10% margin around the largest absolute coordinate
        boundary = np.max(np.abs(X_projected[:, [d1, d2]])) * 1.1
        plt.xlim([-boundary, boundary])
        plt.ylim([-boundary, boundary])

        # Axis guide lines: axhline/axvline span the whole axes whatever the
        # limits are (fixed [-100, 100] segments stop short on larger projections)
        plt.axhline(0, color='grey', ls='--')
        plt.axvline(0, color='grey', ls='--')

        # Label the axes, with the percentage of variance explained
        plt.xlabel('PC{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1], 1)))
        plt.ylabel('PC{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2], 1)))

        plt.title("Projection of points (on PC{} and PC{})".format(d1+1, d2+1))
   
def display_scree_plot(pca):
    '''Draw the scree plot of a fitted PCA.

    Bars show the percentage of variance explained by each component; the red
    line with markers shows the cumulative percentage.
    '''
    explained_pct = pca.explained_variance_ratio_ * 100
    ranks = np.arange(len(explained_pct)) + 1  # components numbered from 1

    plt.bar(ranks, explained_pct)
    plt.plot(ranks, explained_pct.cumsum(), c="red", marker='o')

    plt.xlabel("Number of principal components")
    plt.ylabel("Percentage explained variance")
    plt.title("Scree plot")
    plt.show(block=False)

In [None]:
# Build the dense dataset used by the PCA.
# `X` is rebound to a dense array at the end of this cell, so accept both the
# sparse matrix (first run) and the dense array (re-run): the cell stays idempotent.
X_dense = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
df_acp = pd.DataFrame(data=X_dense,
                      columns=list(vectorizer.get_feature_names_out()))

n_comp = 200
scaler = StandardScaler()
# Seeded so that the decomposition is reproducible if PCA picks a randomized solver
pca = PCA(n_components=n_comp, random_state=0)

X = df_acp.values
features = df_acp.columns
X_scaled = scaler.fit_transform(X)          # zero mean / unit variance per feature
pca_components = pca.fit_transform(X_scaled)  # observations projected on the components
pcs = pca.components_                       # loadings, read by the correlation circle

In [None]:
%%time
# Fit a 200-component PCA on the scaled matrix; the explained-variance report below reads it.
# NOTE(review): `pca` is already fitted on X_scaled with the same number of components — this second fit looks redundant, confirm.
PCA_decomposition = decomposition.PCA(n_components = 200)
PCA_decomposition.fit(X_scaled)

In [None]:
# Per-component and cumulative explained variance of the 200-component PCA
# (the \033[1m ... \033[0m escapes switch bold on and off in the output)
variance_ratio = PCA_decomposition.explained_variance_ratio_
cumulative_pct = round(variance_ratio.sum() * 100, 2)

print('\033[1m' + 'Le pourcentage de variance expliqué pour 200 composantes est de :' + '\033[0m')
print('')
print(variance_ratio * 100)
print('')
print("Le pourcentage de variance expliqué cumulé pour 200 composantes est de :",
      '\033[1m' + str(cumulative_pct) + '\033[0m',
      '%')

### Visualisation des dimensions :

In [None]:
# Open a 12x12 figure, then draw the scree plot of `pca` on it
plt.figure(figsize=(12, 12))
display_scree_plot(pca)

On peut voir que la variance expliquée par chaque composante décroît progressivement, mais que le pourcentage de variance expliquée reste relativement faible.

### Cercle de corrélation :

In [None]:
# Correlation circle of the features on the first factorial plane (PC1, PC2)
# NOTE(review): if display_circles opens its own figure, this 20x20 figure stays empty — confirm.
plt.figure(figsize=(20, 20))
display_circles(pcs, n_comp, pca, [(0,1)], labels = np.array(features))

### Vectorisation de la PC1 et PC2 :

In [None]:
# Project the scaled observations on the principal components
X_projected = pca.transform(X_scaled)

# display_factorial_planes opens its own figure for every plane, so no
# plt.figure() call here: it would only leave an empty 30x30 figure in the output.
display_factorial_planes(X_projected, n_comp, pca, [(0,1)], alpha = 0.1)
plt.show()

On peut voir ici que la PCA n'est pas pertinente, de ce fait on ne retiendra pas l'acp.

## LDA :

Hypothèses de la LDA à confirmer :

    - Chaque document du corpus est un ensemble de mots sans ordre (bag-of-words)
    
    - Chaque document aborde un certain nombre de thèmes dans différentes proportions qui lui sont propres p(θm)

    - Chaque mot possède une distribution associée à chaque thème p(ϕk)
    
    - Zn représente le thème du mot Wn


### lda opti pour le BOW :

In [None]:
"""def lda_opti (texts) :

    score = []
    perplexity = []
    N = [5, 6, 7, 8, 9, 10, 11, 12, 15, 18, 20, 30]
    
    for n_comp in N :
        
        lda_model = LatentDirichletAllocation(random_state=0, n_components= n_comp, verbose = 2) # no random
        lda_fit = lda_model.fit(texts)
        lda_output = lda_fit.transform(texts)
        lda_score =  lda_fit.score(texts)
        score.append(lda_score)
        #print("Le score de cohérence est de :", score)
        lda_perplexity =lda_fit.perplexity(texts)
        perplexity.append(lda_perplexity)
        #print("Le score de perplexité est de :", perplexity)

    return(score, perplexity)"""

In [None]:
"""def lda_opti (texts) :

    lda_model = LatentDirichletAllocation(random_state=0) # no random
    score = []
    perplexity = []
    
    # Hyperparameters :
    params = { 
        'n_components': [5, 6, 7, 8, 9, 10, 11, 12, 15, 18, 20, 30], # number of component
        'learning_decay': [0.75, 0.80, 0.85] # learning rate from online method
    }

    # GridSearchCV :
    lda_search = GridSearchCV(lda_model,
                              param_grid=params,
                              #n_jobs=-1,
                              cv=5,
                              verbose=2
                             )
    
    
    lda_fit = lda_search.fit(texts) # fit data
    lda_output = lda_fit.transform(texts) # tranform data
    lda_score =  lda_fit.score(texts)
    score.append(lda_score)
    lda_perplexity =lda_fit.perplexity(texts)
    perplexity.append(lda_perplexity)
    
    return(score, perplexity, lda_output)"""

# Une méthode plus rapide a été utilisée.

In [None]:
"""
%%time
coherence_bow, perplexity_bow = lda_opti(bow)

import pickle
# Pour stock les données du lda bow :
pickle.dump(coherence_bow, open('score_bow.pkl', 'wb'))
pickle.dump(perplexity_bow, open('perplexity_bow.pkl', 'wb'))
# XXX.to_csv("P5_bow_lda.csv", index=False) # Cela permettra d'éviter de relancer le lda
# Une méthode plus rapide a été utilisée.
"""

In [None]:
"""%%time
# Define Search Param
params = {'n_components': [5, 6, 7, 8, 9, 10, 20, 30],
          'learning_decay': [.5, .7, .9]
          }

# Init the Model
lda = LatentDirichletAllocation(random_state=0)

# Init Grid Search Class
model = GridSearchCV(lda,
                     param_grid=params,
                     cv=5,
                     verbose=2,
                     n_jobs=-1,
                     )

# Do the Grid Search
model.fit(bow)
lda_output = model.transform(bow)

# Modéle à choisir
best_lda_model = model.best_estimator_

# meilleure paramétres :
print("Meilleurs paramétres : ", model.best_params_)

# Score de cohérence :
print("Meilleur Score de cohérence : ", model.best_score_)

# Score de perplexité :
print("Meilleur score de perplexité : ", best_lda_model.perplexity(bow))

# A mettre en commentaire une fois le best parameter trouvé
## Wall time: 17min 39s"""

In [None]:
%%time
# Search grid, reduced to a single combination (the full grid is kept, disabled, in the cell above)
params = {'n_components': [5],
          'learning_decay': [.5]
         }

# Base estimator, seeded for reproducibility
lda = LatentDirichletAllocation(random_state=0)

# 5-fold cross-validated grid search, run on all available cores
model = GridSearchCV(lda,
                     param_grid=params,
                     cv=5,
                     verbose=2,
                     n_jobs=-1,
                    )

# Run the search, then compute the document-topic matrix with the refitted best model
model.fit(bow)
lda_output = model.transform(bow)

# Selected model
best_lda_model = model.best_estimator_

# No `scoring` is passed to GridSearchCV, so best_score_ is the mean cross-validated
# LatentDirichletAllocation.score, i.e. an approximate log-likelihood (higher is
# better) — it is not a coherence measure, hence the corrected label.
print("Meilleur score (log-vraisemblance approchée) : ", model.best_score_)

# Perplexity of the selected model on the full BOW matrix (lower is better)
print("Meilleur score de perplexité : ", best_lda_model.perplexity(bow))

On a confirmation que le nombre de topics optimal de la LDA pour le BOW est 5.

#### Visualisation : 

In [None]:
"""# Get Log Likelyhoods from Grid Search Output
n_topics = [5, 6, 7, 8, 9, 10, 20, 30]

log_likelyhoods_5 = [round(model.cv_results_['mean_test_score'][index]) for index, gscore in enumerate(
    model.cv_results_['params']) if gscore['learning_decay'] == 0.5]


log_likelyhoods_7 = [round(model.cv_results_['mean_test_score'][index]) for index, gscore in enumerate(
    model.cv_results_['params']) if gscore['learning_decay'] == 0.7]

log_likelyhoods_9 = [round(model.cv_results_['mean_test_score'][index]) for index, gscore in enumerate(
    model.cv_results_['params']) if gscore['learning_decay'] == 0.9]

# Show graph
plt.figure(figsize=(12, 8))
plt.plot(n_topics, log_likelyhoods_5, label='0.5')
plt.plot(n_topics, log_likelyhoods_7, label='0.7')
plt.plot(n_topics, log_likelyhoods_9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()"""

#### Topic dominant :

Une partie du code provient de ce site :

source : https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/

In [None]:
%%time
# Document-topic matrix produced by the selected BOW model
lda_output = best_lda_model.transform(bow)

# Column names: one per topic
topicnames = ['topic' + str(i) for i in range(best_lda_model.n_components)]

# Row names: one per document. The row count is read from the matrix shape,
# which avoids building a dense copy of the BOW matrix just to count rows.
docnames = ['Document' + str(i) for i in range(bow.shape[0])]

# Rounded weights, for display
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Dominant topic of each document, taken from the unrounded weights so that
# rounding cannot create ties between two close topics
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling helpers, applied cell by cell
def color_green(val):
    """Return the CSS colour rule of a cell: green above 0.1, black otherwise."""
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    """Return the CSS font-weight rule of a cell: bold (700) above 0.1, normal (400) otherwise."""
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Styled preview of the first 15 documents
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

#### Review des topics en fonction des documents :

In [None]:
# Number of documents per dominant topic (BOW model), as a two-column table
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

#### Visualisation :

In [None]:
%%time
# Topic-keyword matrix: the `components_` of the selected BOW model, one row per topic
df_topic_keywords = pd.DataFrame(best_lda_model.components_)

# Name the columns after the vocabulary of `cv` and the rows after the topics
# NOTE(review): assumes `cv` is the vectorizer that produced `bow` — confirm.
df_topic_keywords.columns = cv.get_feature_names_out()
df_topic_keywords.index = topicnames

# Preview
df_topic_keywords.head(10)

#### Visualisation du top des mots par topic : 

In [None]:
# Print, for every topic of the BOW model, its 15 highest-weighted words
for topic, (_, tmp) in enumerate(df_topic_keywords.iterrows()):
    header = f'Pour le Topic {topic+1}, les mots avec les plus hautes valeurs sont :'
    print(header)
    print(tmp.nlargest(15))
    print('\n')

### lda opti pour le IDF :

On peut voir ici que nos topics commencent à être plutôt bien définis.

In [None]:
"""%%time
# Define Search Param
params = {'n_components': [5, 6, 7, 8, 9, 10, 20, 30],
          'learning_decay': [.5, .7, .9]
          }

# Init the Model
lda_idf = LatentDirichletAllocation(random_state=0)

# Init Grid Search Class
model_idf = GridSearchCV(lda_idf,
                     param_grid=params,
                     cv=5,
                     verbose=2,
                     #n_jobs=-1,
                     )

# Do the Grid Search
model_idf.fit(idf)
lda_output = model_idf.transform(idf)

# Modéle à choisir
best_lda_model_idf = model_idf.best_estimator_

# meilleure paramétres :
print("Meilleurs paramétres : ", model_idf.best_params_)

# Score de cohérence :
print("Meilleur Score de cohérence : ", model_idf.best_score_)

# Score de perplexité :
print("Meilleur score de perplexité : ", best_lda_model_idf.perplexity(idf))

# A mettre en commentaire une fois le best parameter trouvé
## Wall time: 50min 43s"""

In [None]:
%%time
# Search grid, reduced to a single combination (the full grid is kept, disabled, in the cell above)
params = {'n_components': [5],
          'learning_decay': [.5]
          }

# Base estimator, seeded for reproducibility
lda_idf = LatentDirichletAllocation(random_state=0)

# 5-fold cross-validated grid search (single process: n_jobs is left disabled)
model_idf = GridSearchCV(lda_idf,
                     param_grid=params,
                     cv=5,
                     verbose=2,
                     #n_jobs=-1,
                     )

# Run the search, then compute the document-topic matrix with the refitted best model
model_idf.fit(idf)
lda_output = model_idf.transform(idf)

# Selected model
best_lda_model_idf = model_idf.best_estimator_

# Best hyper-parameters
print("Meilleurs paramétres : ", model_idf.best_params_)

# No `scoring` is passed to GridSearchCV, so best_score_ is the mean cross-validated
# LatentDirichletAllocation.score, i.e. an approximate log-likelihood (higher is
# better) — it is not a coherence measure, hence the corrected label.
print("Meilleur score (log-vraisemblance approchée) : ", model_idf.best_score_)

# Perplexity of the selected model on the full TF-IDF matrix (lower is better)
print("Meilleur score de perplexité : ", best_lda_model_idf.perplexity(idf))

#### Topic dominant :

In [None]:
%%time
# Document-topic matrix of the TF-IDF model. It is computed on `idf`, the matrix
# the model was fitted on (the BOW matrix was passed here before, a copy-paste slip).
lda_output = best_lda_model_idf.transform(idf)

# Column names: one per topic
topicnames = ['topic' + str(i) for i in range(best_lda_model_idf.n_components)]

# Row names: one per document. The row count is read from the matrix shape,
# which avoids building a dense copy of the TF-IDF matrix just to count rows.
docnames = ['Document' + str(i) for i in range(idf.shape[0])]

# Rounded weights, for display
df_document_topic_2 = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Dominant topic of each document, taken from the unrounded weights so that
# rounding cannot create ties between two close topics
dominant_topic_2 = np.argmax(lda_output, axis=1)
df_document_topic_2['dominant_topic'] = dominant_topic_2

# Styling helpers, applied cell by cell
def color_green(val):
    """Return the CSS colour rule of a cell: green above 0.1, black otherwise."""
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    """Return the CSS font-weight rule of a cell: bold (700) above 0.1, normal (400) otherwise."""
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Styled preview of the first 15 documents. It gets its own name so that
# df_document_topic_2 stays a DataFrame instead of being replaced by a Styler.
df_document_topics_2 = df_document_topic_2.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics_2

#### Recherche des topics en fonction des documents :

In [None]:
# Number of documents per dominant topic for the TF-IDF model.
# Built from `dominant_topic_2` (computed in the TF-IDF "Topic dominant" cell);
# the BOW frame `df_document_topic` was read here before, which simply repeated
# the BOW distribution.
df_topic_distribution = pd.Series(dominant_topic_2).value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

#### Visualisation :

In [None]:
%%time
# Topic-keyword matrix: the `components_` of the selected TF-IDF model, one row per topic
df_topic_keywords_2 = pd.DataFrame(best_lda_model_idf.components_)

# Name the columns after the vocabulary of `cv` and the rows after the topics
# NOTE(review): assumes `idf` shares the vocabulary (and column order) of `cv` — confirm.
df_topic_keywords_2.columns = cv.get_feature_names_out()
df_topic_keywords_2.index = topicnames

# Preview
df_topic_keywords_2.head(10)

#### Visualisation du top des mots par topic : 

In [None]:
# Print, for every topic of the TF-IDF model, its 15 highest-weighted words
for topic, (_, tmp) in enumerate(df_topic_keywords_2.iterrows()):
    header = f'Pour le Topic {topic+1}, les mots avec les plus hautes valeurs sont :'
    print(header)
    print(tmp.nlargest(15))
    print('\n')

On peut voir ici que les topics commencent également à être bien identifiés.

## NMF :

En algèbre linéaire et en analyse à plusieurs variables, la factorisation matricielle non négative est un groupe d’algorithmes qui permet de factoriser une matrice V en deux matrices (W et H) qui ne contiennent que des valeurs positives ou nulles et dont le produit est proche de V.

source : https://datafranca.org/wiki/Factorisation_matricielle_non_négative
source : https://predictivehacks.com/topic-modelling-with-nmf-in-python/

In [None]:
# Number of NMF topics, and the matrix to factorise (the TF-IDF matrix)
top_topics = 5
X = idf

In [None]:
%%time
# Create the NMF model (seeded)
# its `top_topics` components will be the topics
model_nmf = NMF(n_components=top_topics, random_state= 0)
 
# Fit the model on the TF-IDF matrix
model_nmf.fit(X)
 
# Document-topic weights: one row per document, one column per topic
nmf_features = model_nmf.transform(X)

In [None]:
# Show the shapes of the input matrix and of the two NMF factors
for titre, forme in (("Shape de X :", X.shape),
                     ("Shape des features de NMF :", nmf_features.shape)):
    print(titre)
    display(forme)
    print('')
print("Shape des composantes de NMF :")
display(model_nmf.components_.shape)

#### Visualisation du DataFrame :

In [None]:
# Create a DataFrame: components_df
components_df = pd.DataFrame(model_nmf.components_, columns=cv.get_feature_names_out())
components_df

#### Visualisation du top des mots par topic : 

In [None]:
# Print, for every NMF topic, its 10 highest-weighted words
for topic, (_, tmp) in enumerate(components_df.iterrows()):
    header = f'Pour le Topic {topic+1}, les mots avec les plus hautes valeurs sont :'
    print(header)
    print(tmp.nlargest(10))
    print('\n')

On peut commencer à voir ici que chaque Topic commence à être bien défini en fonction du type de demande.

Ce lien m'a fortement aidé : https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial/notebook

# Approche supervisée :

## Création de colonne pour le futur X et y :

In [None]:
df_visualisation

In [None]:
# Keep only the two preprocessed columns. The explicit copy makes df_modellisation
# an independent frame, so the column assignments and row drops done on it below
# neither raise SettingWithCopyWarning nor depend on df_visualisation.
df_modellisation = df_visualisation[['corpus_preprocessing','tags_preprocessing']].copy()

In [None]:
# Re-assemble the token lists into space-separated strings (one string per document)
for colonne_source, colonne_cible in (('corpus_preprocessing', 'corpus_finish'),
                                      ('tags_preprocessing', 'tags_finish')):
    df_modellisation[colonne_cible] = df_modellisation[colonne_source].apply(" ".join)

In [None]:
df_modellisation

In [None]:
# df_modellisation.to_csv("/content/drive/My Drive/Colab Notebooks/Projet_5/P5_yyy.csv", index=True) # Cela permettra d'éviter d'attendre de relancer le notebook.

## Reduction du nombre de tags :

In [None]:
from nltk import FreqDist

In [None]:
# Count how often each individual tag occurs. The counts are taken over the tag
# lists: counting the joined 'tags_finish' strings would rank whole tag
# combinations ("python pandas ...") instead of tags, and the top-50 filter below
# compares single tags against these keys.
frequence = FreqDist(tag
                     for tags in df_modellisation['tags_preprocessing']
                     for tag in tags)

In [None]:
# on stock un dictionnaire du nombre de tags que l'on souhaite garder
from collections import Counter

# Keep the 50 most frequent entries of `frequence`, as {entry: count}
compteur_tags = Counter(frequence)
top_50 = dict(compteur_tags.most_common(50))

In [None]:
# Keep only the keys of top_50, in the same order
top_50_tags = list(top_50.keys())

In [None]:
def _keep_top_tags(tags):
    """Return the tags of one document that belong to top_50_tags, order preserved."""
    return [tag for tag in tags if tag in top_50_tags]

df_modellisation['tags_final'] = df_modellisation['tags_preprocessing'].apply(_keep_top_tags)

In [None]:
# Rows left without any tag once the top-50 filter has been applied
sans_tag = df_modellisation['tags_final'].apply(len) == 0
liste_index = df_modellisation.index[sans_tag].tolist()

# The stray "f" that was typed inside two of these messages is removed
print("Le nombre de lignes qui seront supprimés est de :", len(liste_index))
print("Le nombre de ligne du dataframe avant suppression est de :", df_modellisation.shape[0])
# Rebind instead of dropping in place: no mutation of a frame displayed earlier
df_modellisation = df_modellisation.drop(index=liste_index)
print("Le nombre de ligne du dataframe aprés suppression est de :", df_modellisation.shape[0])

In [None]:
df_modellisation

# CSV pour les algo : 

In [None]:
# Styling
def color_yellow(val):
    """Return the CSS colour rule of a score cell: yellow above 0.2, black otherwise."""
    if val > .2:
        return 'color: yellow'
    return 'color: black'

def make_bold(val):
    """Return the CSS font-weight rule of a score cell.

    Bold (700) above 0.2, normal (400) otherwise. Both branches used to
    return 700, which made every cell bold and the test pointless.
    """
    weight = 700 if val > .2 else 400
    return 'font-weight: {weight}'.format(weight=weight)

In [None]:
#df_modellisation.to_csv("/content/drive/My Drive/Colab Notebooks/Projet_5/P5_xxx.csv", index=True, encoding='utf-8',na_rep='NULL') # Cela permettra d'éviter d'attendre de relancer le notebook.

In [None]:
#data_test = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Projet_5/P5_test.csv")

## Algorithme utilisant le BOW : 

In [None]:
# Bag-of-words matrix of the cleaned corpus; min_df=200 drops terms present in fewer than 200 documents
vectorizer = CountVectorizer(min_df = 200)
corpus_bow = vectorizer.fit_transform(df_modellisation['corpus_finish'])

### Train test via BOW :

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix

In [None]:
# Binarise the tag lists into a multi-label indicator matrix Y,
# with one column per entry of top_50_tags (in that order)
y = df_modellisation['tags_final']
multi_lab = MultiLabelBinarizer(classes= top_50_tags)
Y = multi_lab.fit_transform(y)

print("Affichage des classes du multilabel :")
display(multi_lab.classes_)

In [None]:
# 80/20 train/test split on the BOW features (seeded)
X = corpus_bow # swapped for the TF-IDF and Word2Vec features in the later sections
y = Y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
"""model_metrics['Regression_logistique']  = {'Accuracy': round(metrics.accuracy_score(y_test, y_pred), 2) ,
                  'Hamming loss' : round(metrics.hamming_loss(y_test, y_pred), 2),
                  'Jaccard_score' : round(metrics.jaccard_score(y_test, y_pred, average="weighted"), 2),
                  'f1_macro_score' : round(metrics.f1_score(y_test, y_pred, average='macro'), 2),
                  'f1_micro_score' : round(metrics.f1_score(y_test, y_pred, average='micro'), 2),
                  'Recall_micro_score' : round(metrics.f1_score(y_test, y_pred, average='micro'), 2),
                  'Recall_macro_score' : round(metrics.f1_score(y_test, y_pred, average='macro'), 2)
                 }
model_metrics"""
# mis en fonction

### Création d'une liste pour les scores des métrics :

In [None]:
model_metrics_bow = {} # Holds the metric scores of the BOW models

### Regression logistique :

In [None]:
def regression_log (X_train, y_train, X_test) :
  """Fit a one-vs-rest logistic regression on the training split and return its predictions for X_test."""
  classifier = OneVsRestClassifier(LogisticRegression())
  return classifier.fit(X_train, y_train).predict(X_test)

In [None]:
%%time
# Logistic regression on the BOW split; dico_metric (defined elsewhere) receives the "reg_log" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = regression_log (X_train, y_train, X_test)
#multiscore (y_test, y_pred)
dico_metric (model_metrics_bow, "reg_log")

In [None]:
"""%%time
model_log = OneVsRestClassifier(LogisticRegression())
model_log.fit(X_train, y_train)
y_pred = model_log.predict(X_test)
# Mis en fonction """

### Random Forest :

In [None]:
def random_forest (X_train, y_train, X_test, random_state=0) :
  """Fit a one-vs-rest random forest on the training split and return its predictions for X_test.

  random_state seeds the forest so that repeated runs give the same scores,
  like the other seeded steps of the notebook; pass None for an unseeded forest.
  """
  model_rf = OneVsRestClassifier(RandomForestClassifier(random_state=random_state))
  model_rf.fit(X_train, y_train)
  y_pred = model_rf.predict(X_test)

  return y_pred

In [None]:
%%time
# Random forest on the BOW split; dico_metric (defined elsewhere) receives the "random_forest" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = random_forest (X_train, y_train, X_test)
#multiscore (y_test, y_pred)
dico_metric (model_metrics_bow, "random_forest")

In [None]:
"""%%time
model_rf = OneVsRestClassifier(RandomForestClassifier())
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)"""
# mis en fonction

### Decision tree : 

In [None]:
def decision_tree (X_train, y_train, X_test, random_state=0) :
  """Fit a one-vs-rest decision tree on the training split and return its predictions for X_test.

  random_state seeds the tree so that repeated runs give the same scores,
  like the other seeded steps of the notebook; pass None for an unseeded tree.
  """
  model_tree = OneVsRestClassifier(DecisionTreeClassifier(random_state=random_state))
  model_tree.fit(X_train, y_train)
  y_pred = model_tree.predict(X_test)

  return y_pred

In [None]:
%%time
# Decision tree on the BOW split; dico_metric (defined elsewhere) receives the "decision_tree" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = decision_tree (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics_bow, "decision_tree")

In [None]:
"""%%time
model_tree = OneVsRestClassifier(DecisionTreeClassifier())
model_tree.fit(X_train, y_train)
y_pred = model_tree.predict(X_test)"""
# Mis en fonction

### KNN : 

In [None]:
def KNeighbors (X_train, y_train, X_test) :
  """Fit a one-vs-rest k-nearest-neighbours classifier on the training split and return its predictions for X_test."""
  classifier = OneVsRestClassifier(KNeighborsClassifier())
  return classifier.fit(X_train, y_train).predict(X_test)

In [None]:
%%time
# k-nearest neighbours on the BOW split; dico_metric (defined elsewhere) receives the "knn" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = KNeighbors (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics_bow, "knn")

In [None]:
"""%%time
model_knn = OneVsRestClassifier(KNeighborsClassifier())
model_knn.fit(X_train, y_train)
y_pred = model_knn.predict(X_test)"""
# Mis en fonction

### Xgboost :

In [None]:
def xgboost (X_train, y_train, X_test) :
  """Fit a one-vs-rest wrapper around XGBRegressor on the training split and return its predictions for X_test.

  NOTE(review): the wrapped estimator is a regressor placed inside a classifier
  meta-estimator; XGBClassifier may have been intended — confirm.
  """
  ovr_model = OneVsRestClassifier(XGBRegressor())
  return ovr_model.fit(X_train, y_train).predict(X_test)

In [None]:
%%time
# XGBoost on the BOW split; dico_metric (defined elsewhere) receives the "xgboost" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = xgboost (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics_bow, "xgboost")

In [None]:
"""%%time
model_xgb = OneVsRestClassifier(XGBRegressor())
model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)"""
# mis en fonction

### Gradient Boosting :

In [None]:
def gradient_boost (X_train, y_train, X_test) :
  """Fit a one-vs-rest gradient boosting classifier on the training split and return its predictions for X_test."""
  classifier = OneVsRestClassifier(GradientBoostingClassifier())
  return classifier.fit(X_train, y_train).predict(X_test)

In [None]:
%%time
# Gradient boosting on the BOW split; dico_metric (defined elsewhere) receives the "gradient_boost" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = gradient_boost (X_train, y_train, X_test)
# model_svc = multiscore (y_test, y_pred)
dico_metric (model_metrics_bow, "gradient_boost")

#### Dataframe des scores pour le BOW :

In [None]:
# Turn the BOW metrics dictionary into a table and show it under a 100-character banner
score_bow = pd.DataFrame.from_dict(model_metrics_bow)

ligne = "-" * 100
marge = "-" * 44
print(ligne)
print(marge + "Via le BOW :" + marge)
print(ligne)
display(score_bow)

In [None]:
score_bow.to_csv("/content/drive/My Drive/Colab Notebooks/Projet_5/Resultat_score_bow.csv", index=True) # Saved so the scores can be reused without re-running the notebook.

## Algorithme utilisant le tf-idf :

In [None]:
# Holds the metric scores of the TF-IDF models
model_metrics_tfidf = {}

### Train test via tf-idf :

In [None]:
# TF-IDF matrix of the cleaned corpus; min_df=200 drops terms present in fewer than 200 documents
tf_idf_vec = TfidfVectorizer(min_df = 200)
corpus_idf = tf_idf_vec.fit_transform(df_modellisation['corpus_finish'])

In [None]:
# 80/20 train/test split on the TF-IDF features (same seed as the BOW split)
X = corpus_idf # TF-IDF features replace the BOW features here
y = Y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Regression Logistique :

In [None]:
%%time
# Logistic regression on the TF-IDF split; dico_metric (defined elsewhere) receives the "reg_log" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = regression_log (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics_tfidf, "reg_log")

### Random Forest :

In [None]:
%%time
# Random forest on the TF-IDF split; dico_metric (defined elsewhere) receives the "random_forest" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = random_forest (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics_tfidf, "random_forest")

### Decision tree : 

In [None]:
%%time
# Decision tree on the TF-IDF split; dico_metric (defined elsewhere) receives the "decision_tree" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = decision_tree (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics_tfidf, "decision_tree")

### KNN : 

In [None]:
%%time
# k-nearest neighbours on the TF-IDF split; dico_metric (defined elsewhere) receives the "knn" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = KNeighbors (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics_tfidf, "knn")

### Xgboost :

In [None]:
%%time
# XGBoost on the TF-IDF split; dico_metric (defined elsewhere) receives the "xgboost" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = xgboost (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics_tfidf, "xgboost")

### Gradient Boosting :

In [None]:
%%time
# Gradient boosting on the TF-IDF split; dico_metric (defined elsewhere) receives the "gradient_boost" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = gradient_boost (X_train, y_train, X_test)
# model_svc = multiscore (y_test, y_pred)
dico_metric (model_metrics_tfidf, "gradient_boost")

In [None]:
"""score_tfidf = pd.DataFrame.from_dict(model_metrics_tfidf)
score_tfidf = score_tfidf.style.applymap(color_yellow).applymap(make_bold)"""

In [None]:
# Turn the TF-IDF metrics dictionary into a table and show it under a 100-character banner
score_tfidf = pd.DataFrame.from_dict(model_metrics_tfidf)

ligne = "-" * 100
marge = "-" * 43
print(ligne)
print(marge + "Via le TFIDF :" + marge)
print(ligne)
display(score_tfidf)

#### Dataframe des scores pour le tf-idf :

In [None]:
score_tfidf.to_csv("/content/drive/My Drive/Colab Notebooks/Projet_5/Resultat_score_tfidf.csv") # Saved so the scores can be reused without re-running the notebook.

## Algorithme utilisant le word2vec :

### Connaître le nombre de mots max :

In [None]:
# Pour connaitre le max len :
df_modellisation['length_corpus'] = df_modellisation['corpus_finish'].apply(lambda x : len(tokenize(x)))
print(f"Dans le corpus, le nombre de mots maximum est de : ", df_modellisation['length_corpus'].max())

### Train test via word2vec :

In [None]:
# Cleaned documents, one string each
corpus = df_modellisation['corpus_finish']
# NOTE(review): 270 matches the maxlen / seq_len used further down; no active cell in this section reads corpus_count — confirm it is still needed.
corpus_count = 270

In [None]:
"""%%time
# Création d'une boucle pour ne pas relancer le modéle :
if not os.path.exists("/content/drive/My Drive/Colab Notebooks/Projet_5/w2v_model_train") :

  # Création d'un modéle :
  w2v = Word2Vec(min_count=30,
                vector_size = 300)

  # Nourrir le modéle avec nos phrases :
  w2v.build_vocab(corpus, progress_per = 1000)

  # Entrainement du modéle :
  w2v.train(corpus, total_examples=w2v.corpus_count, epochs=100)

  # Sauvegarder le modéle :
  w2v.save("/content/drive/My Drive/Colab Notebooks/Projet_5/w2v_model_train")

else :
  # si le modéle est dans la racine, lance directement le modéle  
  w2v = Word2vec.load("/content/drive/My Drive/Colab Notebooks/Projet_5/w2v_model_train")"""

In [None]:
"""
# librairie à installer :
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D

embedding_dim = 300

2_vec_model = Sequential(df_modellisation['corpus_finish'])
2_vec_model.add(Embedding(vocab_size, embedding_dim))
2_vec_model.add(GlobalAveragePooling1D())
2_vec_model.add(Dense(vocab_size, activation='softmax'))



2_vec_model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

2vec_sentences = 2_vec_model.fit(X, y,
    batch_size = 256,
    epochs=10)


"""


Les données du cours OpenClassrooms m'ont fortement aidé à créer cet algorithme.




In [None]:
!pip install gensim==4.2.0 # -> pour relancer le modéle 

In [None]:
print(gensim.__version__)

In [None]:
# Word2Vec hyper-parameters: vector size, context window, minimum word count, training epochs
w2v_size=300
w2v_window=5
w2v_min_count=25
w2v_epochs=100
maxlen = 270 # padded sentence length; to be checked against the longest document
# One token list per document, produced by gensim's simple_preprocess
sentences = df_modellisation['corpus_finish'].tolist()
sentences = [gensim.utils.simple_preprocess(text) for text in sentences]

In [None]:
%%time
# Build and train the Word2Vec model

print("Build & train Word2Vec model ...")
# Fixed seed and a single worker thread
w2v_model = gensim.models.Word2Vec(min_count=w2v_min_count, window=w2v_window,
                                                vector_size=w2v_size,
                                                seed=42,
                                                workers=1)
#                                                workers=multiprocessing.cpu_count())
# Build the vocabulary from the token lists, then train on the same sentences
w2v_model.build_vocab(sentences)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_epochs)
# Keyed vectors and the list of words they cover
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")

In [None]:
# Persist the trained model, then reload it from disk
w2v_model.save("/content/drive/My Drive/Colab Notebooks/Projet_5/word2vec.model")
# The class is reached through `gensim.models`, the spelling used to build the
# model; a bare `Word2Vec` name is not bound in the visible part of the notebook.
w2v_model = gensim.models.Word2Vec.load("/content/drive/My Drive/Colab Notebooks/Projet_5/word2vec.model")

In [None]:
%%time
# Prepare the sentences (tokenization)

print("Fit Tokenizer ...")
# Learn the word -> integer index mapping on the token lists
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
# Integer sequences, padded at the end ('post') to `maxlen`
x_sentences = pad_sequences(tokenizer.texts_to_sequences(sentences),
                                                     maxlen=maxlen,
                                                     padding='post') 
                                                   
# +1 on top of the number of indexed words — presumably because index 0 is reserved for padding, confirm
num_words = len(tokenizer.word_index) + 1
print("Number of unique words: %i" % num_words)

In [None]:
# Build the embedding matrix: row `idx` holds the Word2Vec vector of the word the
# tokenizer mapped to `idx`; words without a Word2Vec vector keep a zero row.

print("Create Embedding matrix ...")
w2v_size = 300
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, w2v_size))

# `w2v_words` is the index_to_key list: testing membership against it costs one
# scan per word. A set makes each test constant-time.
w2v_vocab = set(w2v_words)

i = len(word_index)  # words known to the tokenizer
j = 0                # words that also have a Word2Vec vector
for word, idx in word_index.items():
    if word in w2v_vocab:
        j += 1
        embedding_matrix[idx] = model_vectors[word]

# Share of tokenizer words covered by Word2Vec (0.0 when the vocabulary is empty)
word_rate = np.round(j/i, 4) if i else 0.0
print("Word embedding rate : ", word_rate)
print("Embedding matrix: %s" % str(embedding_matrix.shape))

In [None]:
# Build the embedding model: padded token ids -> Word2Vec vectors -> mean vector per document

# The former `input = Input(...)` tensor is dropped: it was never connected to the
# model and it shadowed the built-in `input`.
word_input=Input(shape=(maxlen,),dtype='float64')
# Embedding layer initialised with the Word2Vec matrix
word_embedding=Embedding(input_dim=vocab_size,
                         output_dim=w2v_size,
                         weights = [embedding_matrix],
                         input_length=maxlen)(word_input)
# Average the word vectors over the sequence axis
word_vec=GlobalAveragePooling1D()(word_embedding)
embed_model = Model([word_input],word_vec)

embed_model.summary()

In [None]:
# Document features = mean Word2Vec vector of each padded sentence, produced by
# `embed_model`. The raw token ids (x_sentences) were used here before, which made
# the classifiers treat vocabulary indices as numeric features and left the
# Word2Vec embeddings unused.
X = embed_model.predict(x_sentences)
y = Y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Holds the metric scores of the Word2Vec models
model_metrics_word2vec = {}

### Regression Logistique :

In [None]:
%%time
# Logistic regression on the Word2Vec split; dico_metric (defined elsewhere) receives the "word2vec_reg_log" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = regression_log (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics_word2vec, "word2vec_reg_log")

### Random Forest :

In [None]:
%%time
# Random forest on the Word2Vec split; dico_metric (defined elsewhere) receives the "word2vec_random_forest" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = random_forest (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics_word2vec, "word2vec_random_forest")

### Decision Tree Classifier :

In [None]:
%%time
# Decision tree on the Word2Vec split; dico_metric (defined elsewhere) receives the "word2vec_decision_tree" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = decision_tree (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics_word2vec, "word2vec_decision_tree")

### K.N.N :

In [None]:
%%time
# k-nearest neighbours on the Word2Vec split; dico_metric (defined elsewhere) receives the "word2vec_knn" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = KNeighbors (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics_word2vec, "word2vec_knn")

### Xgboost :

In [None]:
%%time
# XGBoost on the Word2Vec split; dico_metric (defined elsewhere) receives the "word2vec_xgboost" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = xgboost (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics_word2vec, "word2vec_xgboost")

### Gradient Boosting :

In [None]:
%%time
# Gradient boosting on the Word2Vec split; dico_metric (defined elsewhere) receives the "word2vec_gradient_boost" entry — presumably it reads the global y_test / y_pred, confirm.
y_pred = gradient_boost (X_train, y_train, X_test)
# model_svc = multiscore (y_test, y_pred)
dico_metric (model_metrics_word2vec, "word2vec_gradient_boost")

In [None]:
# Turn the Word2Vec metrics dictionary into a table and show it under a banner
score_word2vec = pd.DataFrame.from_dict(model_metrics_word2vec)

ligne = "-" * 100
marge = "-" * 42
print(ligne)
print(marge + "Via le word2vec :" + marge)
print(ligne)
display(score_word2vec)

#### Dataframe des scores pour le word2vec :

In [None]:
score_word2vec.to_csv("/content/drive/My Drive/Colab Notebooks/Projet_5/Resultat_score_word2vec.csv") # Saved so the scores can be reused without re-running the notebook.

## Algorithme utilisant Bert : 

### Train test via Bert :

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# source : https://towardsdatascience.com/multi-class-classification-with-transformers-6cf7b59a033a

In [None]:
# Fixed sequence length (in tokens) and number of documents to encode.
seq_len = 270
num_samples = len(df_modellisation)

# Pre-allocate the token-id and attention-mask matrices.
# They hold integer ids / 0-1 flags, so they are created as int32: this matches the
# dtype declared by the model's Input layers and avoids storing them as float64.
Xids = np.zeros((num_samples, seq_len), dtype='int32')
Xmask = np.zeros((num_samples, seq_len), dtype='int32')

# check shape
display(Xids.shape)
display(Xmask.shape)

# initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

for i, phrase in enumerate(df_modellisation['corpus_finish']):
    # Every document is truncated / padded to exactly seq_len tokens.
    tokens = tokenizer.encode_plus(phrase, max_length=seq_len, truncation=True,
                                   padding='max_length', add_special_tokens=True,
                                   return_tensors='tf')
    # assign tokenized outputs to respective rows in numpy arrays
    Xids[i, :] = tokens['input_ids']
    Xmask[i, :] = tokens['attention_mask']

In [None]:
# n_tags = len(df_modellisation['tags_preprocessing'])
# Multi-hot target matrix: one column per distinct tag, one row per document.
tag_num = df_modellisation['tags_final'].explode().nunique()
labels = np.zeros(shape=(num_samples, tag_num))
# Map every distinct tag to its column position.
tag_to_num = {
    tag: column
    for tag, column in zip(df_modellisation['tags_final'].explode().unique(),
                           range(tag_num))
}
for index, tag_list in enumerate(df_modellisation['tags_final']):
    # Replace the tag names of this document by their column positions ...
    tag_list = [tag_to_num[tag] for tag in tag_list]
    # ... and switch the matching columns on.
    for n_tags in tag_list:
        labels[index, n_tags] = 1

In [None]:
# Slice the three row-aligned arrays together: each dataset element is one
# (ids, mask, labels) triple.
dataset = tf.data.Dataset.from_tensor_slices(
    tensors=(Xids, Xmask, labels)
)
display(dataset.take(1))

def map_func(input_ids, masks, labels):
    """Repack one dataset element as a ``(features, labels)`` pair.

    The three-item element is turned into a two-item tuple whose first item is a
    dictionary keyed by 'input_ids' and 'attention_mask'; ``labels`` is passed
    through untouched.
    """
    features = {'input_ids': input_ids, 'attention_mask': masks}
    return features, labels

# Apply map_func to every element so the dataset yields
# ({'input_ids': ..., 'attention_mask': ...}, labels) pairs.
dataset = dataset.map(map_func)

In [None]:
# we will split into batches of 16
batch_size = 16

# Shuffle ONCE, then batch (dropping the samples that do not fill a whole batch).
# reshuffle_each_iteration=False is required because the dataset is later cut into
# train / validation parts with take() / skip(): with the default (reshuffle on every
# iteration) the cut would fall on different samples at each epoch, so validation
# samples would leak into training.
dataset = (
    dataset
    .shuffle(10000, reshuffle_each_iteration=False)
    .batch(batch_size, drop_remainder=True)
)
display(dataset.take(1))

In [None]:
# Share of the batches given to training (0.33 -> the first 33% of the batches).
# NOTE(review): this leaves 67% of the data for validation -- confirm that the
# proportions are not inverted.
split = 0.33
size = int(split * (Xids.shape[0] / batch_size))

# The first `size` batches form the training set, the remaining ones the validation set.
train_ds, test_ds = dataset.take(size), dataset.skip(size)

del dataset  # free memory

In [None]:
# TFAutoModel for TensorFlow
from transformers import TFAutoModel

# Load the pre-trained cased BERT checkpoint (same name as the tokenizer above).
bert = TFAutoModel.from_pretrained('bert-base-cased')

# summary() prints the table itself and returns None, so it is called bare:
# wrapping it in display() only added a stray "None" to the output.
bert.summary()

In [None]:
# Two input layers; their names must match the dictionary keys of the TF dataset.
# The length comes from seq_len (set in the tokenization cell) instead of a second
# hard-coded 270, so both always agree.
input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

# we access the transformer model within our bert object using the bert attribute (eg bert.bert instead of bert)
embeddings = bert.bert(input_ids, attention_mask=mask)[1]  # access pooled activations with [1]

# Classification head: one hidden layer, then one output unit per tag.
# tag_num is the number of columns of `labels` (set in the label cell), which the
# output layer has to match.
# NOTE(review): `labels` can contain several 1s per row, while softmax assumes a single
# class per sample -- confirm the activation / loss choice.
# NOTE(review): `y` is rebound here from the label matrix to the output tensor.
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(tag_num, activation='softmax', name='outputs')(x)

In [None]:
# initialize model
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)
# Freeze the BERT layer so only the dense head is trained.
# NOTE(review): index 2 is expected to be the BERT layer (after the two Input
# layers) -- check it in the summary below.
model.layers[2].trainable = False
# summary() prints the table and returns None; calling it bare avoids the stray
# "None" that display() produced.
model.summary()

In [None]:
# `learning_rate` is the supported argument name (`lr` is a deprecated alias).
# NOTE(review): recent TensorFlow releases only accept `decay` on the legacy
# optimizers -- confirm against the installed version.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
# NOTE(review): `acc` is created but not passed to compile() below.
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

# Metric actually tracked during training: area under the curve.
auc_area = tf.keras.metrics.AUC()

model.compile(optimizer=optimizer, loss=loss, metrics=[auc_area])

In [None]:
# Stand-alone Keras metric objects.
# NOTE(review): none of them is used in the cells visible here, and rebinding
# `auc_area` does not change the metric the model was already compiled with.
accuracy = tf.keras.metrics.Accuracy()
Recall = tf.keras.metrics.Recall()
auc_area = tf.keras.metrics.AUC()

In [None]:
%%time
# Train for three epochs on train_ds and validate on test_ds; verbose=2 selects
# Keras' one-line-per-epoch logging.
history = model.fit(train_ds, validation_data=test_ds, epochs=3, verbose=2)

In [None]:
%%time
# 5 questions of the test set are evaluated
tag_list = df_modellisation['tags_final'].explode().unique()

# Materialise ONE batch and predict it once, so that the tags, the text and the
# prediction printed below all belong to the same question. Iterating the dataset
# several times can return different batches.
data = test_ds.take(1)
batch_inputs, batch_labels = next(iter(data))
batch_preds = model.predict(batch_inputs)

for index in range(0, 5):
    # Recovering tags: keep the names whose label column is set for this row
    label_arr = batch_labels[index]
    tag_doc = (label_arr.numpy().astype(bool) * tag_list)
    tag_doc = tag_doc[tag_doc != '']

    # Recovering doc: decode the token ids of the SAME row, skipping padding (id 0)
    token_ids = batch_inputs['input_ids'][index].numpy()
    doc = tokenizer.decode([int(n) for n in token_ids if n != 0])

    # Predicted value: highest-scoring tag for the same row
    pred_arr = batch_preds[index]
    pred_tag = tag_list[pred_arr.argmax()]

    print('-'*50)
    print('Tag : ', tag_doc)
    print('-'*50)
    print('Prediction : ', pred_tag)
    print('-'*50)
    print(doc)
    print('')
    print('-'*50)
    print('')
    print('')

In [None]:
"""!pip install transformers
os.environ["TF_KERAS"]='1'"""

In [None]:
"""# Fonction de préparation des sentences
def bert_inp_fct(sentences, bert_tokenizer, max_length) :
    input_ids=[]
    token_type_ids = []
    attention_mask=[]
    bert_inp_tot = []

    for sent in sentences:
        bert_inp = bert_tokenizer.encode_plus(sent,
                                              add_special_tokens = True,
                                              max_length = max_length,
                                              padding='max_length',
                                              return_attention_mask = True, 
                                              return_token_type_ids=True,
                                              truncation=True,
                                              return_tensors="tf")
    
        input_ids.append(bert_inp['input_ids'][0])
        token_type_ids.append(bert_inp['token_type_ids'][0])
        attention_mask.append(bert_inp['attention_mask'][0])
        bert_inp_tot.append((bert_inp['input_ids'][0], 
                             bert_inp['token_type_ids'][0], 
                             bert_inp['attention_mask'][0]))

    input_ids = np.asarray(input_ids)
    token_type_ids = np.asarray(token_type_ids)
    attention_mask = np.array(attention_mask)
    
    return input_ids, token_type_ids, attention_mask, bert_inp_tot
    

# Fonction de création des features
def feature_BERT_fct(model, model_type, sentences, max_length, b_size, mode='HF') :
    batch_size = b_size
    batch_size_pred = b_size
    bert_tokenizer = AutoTokenizer.from_pretrained(model_type)
    time1 = time.time()

    for step in range(len(sentences)//batch_size) :
        idx = step*batch_size
        input_ids, token_type_ids, attention_mask, bert_inp_tot = bert_inp_fct(sentences[idx:idx+batch_size], 
                                                                      bert_tokenizer, max_length)
        
        if mode=='HF' :    # Bert HuggingFace
            outputs = model.predict([input_ids, attention_mask, token_type_ids], batch_size=batch_size_pred)
            last_hidden_states = outputs.last_hidden_state

        if mode=='TFhub' : # Bert Tensorflow Hub
            text_preprocessed = {"input_word_ids" : input_ids, 
                                 "input_mask" : attention_mask, 
                                 "input_type_ids" : token_type_ids}
            outputs = model(text_preprocessed)
            last_hidden_states = outputs['sequence_output']
             
        if step ==0 :
            last_hidden_states_tot = last_hidden_states
            last_hidden_states_tot_0 = last_hidden_states
        else :
            last_hidden_states_tot = np.concatenate((last_hidden_states_tot,last_hidden_states))
    
    features_bert = np.array(last_hidden_states_tot).mean(axis=1)
    
    time2 = np.round(time.time() - time1,0)
    print("temps traitement : ", time2)
     
    return features_bert, last_hidden_states_tot"""

In [None]:
"""max_length = 270 # taille des phrases à vérifier pour le max
batch_size = 5
model_type = 'bert-base-uncased'
model = TFAutoModel.from_pretrained(model_type)
sentences = df_modellisation['corpus_finish'].tolist()"""

In [None]:
"""%%time
import time
# Création des features

features_bert, last_hidden_states_tot = feature_BERT_fct(model, model_type, sentences, 
                                                         max_length, batch_size, mode='HF')"""

In [None]:
"""features_bert"""

In [None]:
"""X = features_bert
y = Y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)"""

### Régression logistique :

In [None]:
"""%%time
y_pred = regression_log (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics, "BERT_gradient_boost")"""

### Random Forest :

In [None]:
"""%%time
y_pred = random_forest (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics, "BERT_gradient_boost")"""

### Decision Tree Classifier :

In [None]:
"""%%time
y_pred = decision_tree (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics, "BERT_gradient_boost")"""

### K.N.N :

In [None]:
"""%%time
y_pred = KNeighbors (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics, "BERT_gradient_boost")"""

### Xgboost :

In [None]:
"""%%time
y_pred = xgboost (X_train, y_train, X_test)
# multiscore (y_test, y_pred)
dico_metric (model_metrics, "BERT_gradient_boost")"""

### Gradient Boosting :

In [None]:
"""%%time
y_pred = gradient_boost (X_train, y_train, X_test)
# model_svc = multiscore (y_test, y_pred)
dico_metric (model_metrics, "BERT_gradient_boost")"""

## Algorithme utilisant USE :

In [None]:
# Score container handed to every dico_metric call of the USE section.
model_metrics_use = dict()

### Train test via USE :

In [None]:
%%time
# Load the Universal Sentence Encoder (v4) from TensorFlow Hub.
# NOTE(review): `model` previously held the Keras BERT classifier; it is rebound here.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = tf_hub.load(module_url)
print("module %s loaded" % module_url)
def embed(input):
    """Call the module-level ``model`` on ``input`` and return its output unchanged."""
    embeddings = model(input)
    return embeddings

In [None]:
# Embed the whole 'corpus_finish' column in a single call
# (presumably one vector per document -- the shape is checked two cells below).
x1 = embed(df_modellisation['corpus_finish'].tolist())

In [None]:
# Convert the embedding output to a NumPy array before it is passed to train_test_split.
x1 = np.array(x1)

In [None]:
# Sanity check: dimensions of the embedding array.
x1.shape

In [None]:
# Feature matrix for the USE method.
X = x1
y = Y
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Régression logistique :

In [None]:
%%time
# Logistic-regression run on the USE split (helper defined outside this section).
y_pred = regression_log(X_train, y_train, X_test)
# multiscore(y_test, y_pred)  -- left disabled by the author
# dico_metric only receives the score dict and a label; presumably it reads
# y_test / y_pred from the notebook globals -- TODO confirm.
dico_metric(model_metrics_use, "USE_reg_log")

### Random Forest :

In [None]:
%%time
# Random-forest run on the USE split (helper defined outside this section).
y_pred = random_forest(X_train, y_train, X_test)
# multiscore(y_test, y_pred)  -- left disabled by the author
# dico_metric only receives the score dict and a label; presumably it reads
# y_test / y_pred from the notebook globals -- TODO confirm.
dico_metric(model_metrics_use, "USE_random_forest")

### Decision Tree Classifier :

In [None]:
%%time
# Decision-tree run on the USE split (helper defined outside this section).
y_pred = decision_tree(X_train, y_train, X_test)
# multiscore(y_test, y_pred)  -- left disabled by the author
# dico_metric only receives the score dict and a label; presumably it reads
# y_test / y_pred from the notebook globals -- TODO confirm.
dico_metric(model_metrics_use, "USE_decision_tree")

### K.N.N :

In [None]:
%%time
# K-nearest-neighbours run on the USE split (helper defined outside this section).
y_pred = KNeighbors(X_train, y_train, X_test)
# multiscore(y_test, y_pred)  -- left disabled by the author
# dico_metric only receives the score dict and a label; presumably it reads
# y_test / y_pred from the notebook globals -- TODO confirm.
dico_metric(model_metrics_use, "USE_knn")

### Xgboost :

In [None]:
%%time
# XGBoost run on the USE split (helper defined outside this section).
y_pred = xgboost(X_train, y_train, X_test)
# multiscore(y_test, y_pred)  -- left disabled by the author
# dico_metric only receives the score dict and a label; presumably it reads
# y_test / y_pred from the notebook globals -- TODO confirm.
dico_metric(model_metrics_use, "USE_xgboost")

### Gradient Boosting :

In [None]:
%%time
# Gradient-boosting run on the USE split (helper defined outside this section).
y_pred = gradient_boost(X_train, y_train, X_test)
# model_svc = multiscore(y_test, y_pred)  -- left disabled by the author
# dico_metric only receives the score dict and a label; presumably it reads
# y_test / y_pred from the notebook globals -- TODO confirm.
dico_metric(model_metrics_use, "USE_gradient_boost")

In [None]:
# Turn the collected USE scores into a table and show it under a banner.
score_use = pd.DataFrame.from_dict(model_metrics_use)
print("-" * 100)
print("-" * 42 + "Via le use :" + "-" * 42)
print("-" * 100)
display(score_use)

#### Dataframe des scores pour le USE :

In [None]:
# Persist the scores so they can be reloaded without re-running the notebook.
# NOTE(review): index=False drops the row labels of the table, whereas the word2vec
# export keeps them -- confirm which of the two layouts is intended.
score_use.to_csv(
    "/content/drive/My Drive/Colab Notebooks/Projet_5/Resultat_score_use.csv",
    index=False,
)

## Choix du meilleur algorithme :

In [None]:
# Styling
def color_green(val):
    """Return the CSS colour rule for one cell: green above 0.2, black otherwise."""
    color = 'black'
    if val > .2:
        color = 'green'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    """Return the CSS font-weight rule for one cell.

    Values above 0.2 (the same threshold as ``color_green``) are rendered bold
    (700); every other value gets the normal weight (400). Previously both
    branches returned 700, so the threshold had no effect.
    """
    weight = 700 if val > .2 else 400
    return 'font-weight: {weight}'.format(weight=weight)

In [None]:
# Build the score table from model_metrics and apply both cell stylers in one chain.
# After this statement score_metrics is the styled object, not the raw DataFrame.
score_metrics = (
    pd.DataFrame.from_dict(model_metrics)
    .style
    .applymap(color_green)
    .applymap(make_bold)
)
score_metrics

In [None]:
# FAIRE UN DICTIONNAIRE POUR LES ALGORITHMES 
## Algo BOW algo tf-idf Algo USE algo word2vec SONT OKAY
### Faire un dictionnaire pour chaque utilisation ou global ?