In [None]:
# !pip install spacy gensim pyLDAvis nltk

In [None]:
# General stuff | trucs généraux
import re
import numpy as np
import pandas as pd
from pprint import pprint
from pathlib import Path
import json


#NLTK
import nltk
nltk.download('stopwords')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel, LdaMulticore

# spacy for lemmatization | spacy pour lemmatisation
import spacy


# Plotting tools | outils graphiques
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional | activé le registre pour gensim - en option
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# Load the NLTK stop-word list for the corpus language (French here).
from nltk.corpus import stopwords
# For an English corpus use instead: stop_words = stopwords.words('english')
stop_words = stopwords.words('french')
print(stop_words)  # show the default list
stop_words.extend([])  # add your custom stop words inside the brackets
print(stop_words)  # show the final list, including any custom additions

In [None]:
# Gather the paths of every .txt file under the data folder (searched recursively).
# For an English-named folder use instead: Path('data_folder/').rglob('*.txt')
txt_folder = Path('donnee/').rglob('*.txt')
# Sort the paths so the document order (and therefore the document numbers
# shown later in the notebook) is the same on every run and every machine.
files = sorted(txt_folder)
files

In [None]:
# Create a dictionary which contains all the file names and matches them
# to their text contents.
papers = {'target_names': [], 'content': []}
for name in files:
    # The context manager closes the file even when reading raises.
    with open(name, 'r', encoding='utf-8') as f:
        # Each line keeps its own newline; lines are joined with one space.
        content = ' '.join(f.readlines())
    # Path.name is the final path component on every operating system, so the
    # file name is found regardless of the path separator or folder depth.
    file_name = Path(name).name
    print(file_name)
    # Append both values only after a successful read so the lists stay aligned.
    papers['target_names'].append(file_name)
    papers['content'].append(content)

In [None]:
# Convert the dictionary to a pandas DataFrame; its columns are the
# dictionary keys 'target_names' and 'content'.
df = pd.DataFrame.from_dict(papers)
df.head()

In [None]:
# Convert the text content to a list with one string per document.
data = df.content.values.tolist()

# Remove Roman numerals that are followed by a period (e.g. "XIV.").
# The pattern is a raw string so the backslashes reach the regex engine as
# written: in a normal string literal `\b` is a backspace character, and
# `\.` / `\w` are invalid escape sequences.
# NOTE(review): the `\b\w\n` alternative cannot match, because there is no
# word boundary between a numeral letter and a following word character.
# It is kept unchanged until the intended pattern is confirmed.
data = [re.sub(r'[MDCLXVI]+(\.|\b\w\n)', ' ', sent) for sent in data]

# Collapse every run of whitespace (new lines included) into a single space.
data = [re.sub(r'\s+', ' ', sent) for sent in data]


# Optional: remove distracting quote characters.
#data = [re.sub("\'", "", sent) for sent in data]

pprint(data[0:5])

In [None]:
#remove the punctuation and collect all the individual words
#supprimer la ponctuation et collecter tous les mots individuels
def sent_to_words(sentences):
    """Lazily yield one list of tokens per input document.

    Every item is converted with ``str()`` and tokenised by
    ``gensim.utils.simple_preprocess``. ``deacc=True`` asks gensim to strip
    accent marks from the tokens; punctuation is discarded by the tokeniser
    itself.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)

# Materialise the generator: one list of tokens per document.
data_words = list(sent_to_words(data))

# Preview the tokens of the first document.
print(data_words[:1])

In [None]:
# Build the bigram and trigram phrase models from the tokenised documents.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10)  # higher threshold, fewer phrases
# The trigram model is trained on the documents after bigram detection.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)

# Wrap the trained models in Phraser objects: a faster way to get a
# sentence identified as a trigram/bigram.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See a trigram example on the first document.
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
## Définir des fonctions pour les mots vides, les bigrammes, les trigrammes et la lemmatisation
def remove_stopwords(texts):
    """Drop stop words from every document.

    Each document is re-tokenised with ``simple_preprocess`` and a token is
    kept only when it is not a stop word from the module-level ``stop_words``
    list. Stop words are matched in both their original and accent-stripped
    forms: tokens created with ``deacc=True`` (as ``sent_to_words`` does) have
    lost their accents and would otherwise never match accented entries such
    as "été" or "même".

    Parameters
    ----------
    texts : iterable
        Documents; each one is passed through ``str()`` before tokenising.

    Returns
    -------
    list of list of str
        One filtered token list per document, in input order.
    """
    # Build the lookup once per call; a set makes each membership test O(1).
    blocked = set(stop_words)
    blocked.update(gensim.utils.deaccent(word) for word in stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]

def make_bigrams(texts):
    """Return each document after passing it through the module-level ``bigram_mod``."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Return each document after applying ``bigram_mod`` and then ``trigram_mod``.

    Both phrasers are read from module level.
    """
    def _apply(tokens):
        return trigram_mod[bigram_mod[tokens]]

    return list(map(_apply, texts))

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise documents, keeping only tokens with an allowed part of speech.

    Each document's tokens are joined with single spaces and processed by the
    module-level spaCy pipeline ``nlp``. The lemma of a token is kept when
    its ``pos_`` tag is in ``allowed_postags``.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenised documents.
    allowed_postags : container of str
        Part-of-speech tags to keep.

    Returns
    -------
    list of list of str
        One list of lemmas per document, in input order.
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(sent) for sent in texts]

In [None]:
# Remove stop words
data_words_nostops = remove_stopwords(data_words)

# Form bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Form trigrams
# NOTE(review): make_trigrams applies bigram_mod again to documents that
# already went through make_bigrams - confirm the double pass is intended.
data_words_trigrams = make_trigrams(data_words_bigrams)

# Load the French spaCy model with its default pipeline.
# For English: nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# To keep only the tagger component (for efficiency):
#nlp = spacy.load('fr_core_news_sm', disable=['parser', 'ner'])
nlp = spacy.load("fr_core_news_sm")
# Lemmatise, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Preview the lemmas of the first document.
print(data_lemmatized[:1])

In [None]:
# Create the gensim Dictionary from the lemmatised documents
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatised documents are the texts of the corpus
texts = data_lemmatized

# Term-document frequency: convert every document to its bag-of-words form
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# Human-readable view of the first four documents: (word, frequency) pairs
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:4]]

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
    """
    Train one LDA Mallet model per candidate number of topics and compute its c_v coherence

    Parameters:
    ----------
    dictionary : Gensim dictionary, used both to train each model and to score it
    corpus : Gensim corpus
    texts : List of input texts
    limit : Ceiling for num of topics, model will evaluate up to but not including this number
    start : First number of topics to evaluate
    step : Increment between two successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics

    Notes:
    -----
    The location of the Mallet program is read from the module-level
    ``mallet_path``, which must be defined before this function is called.
    NOTE(review): ``gensim.models.wrappers`` is not shipped with Gensim 4.0+;
    confirm the installed gensim version provides it.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train with the dictionary that was passed in (not a notebook-level
        # variable) so the model and its coherence score share one vocabulary.
        model = gensim.models.wrappers.LdaMallet(mallet_path=mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
set_limit = 11  # exclusive ceiling for the number of topics: the largest model has at most one topic less
set_start = 2  # minimum number of topics to try
set_step = 2  # step between two successive numbers of topics
# NOTE(review): absolute, machine-specific path - update it to the location of your Mallet program.
mallet_path = 'C:/Users/lydia/mallet-2.0.8/bin/mallet'
# Train and score one model per number of topics in range(set_start, set_limit, set_step).
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=set_start, limit=set_limit, step=set_step)

In [None]:
# Show the coherence score obtained for each candidate number of topics

x = range(set_start, set_limit, set_step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The trailing comma makes this a one-element tuple. A bare parenthesised
# string is iterated character by character, which labels the line "c".
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
# Print the coherence score of each model next to its number of topics.
# `x` is the range of topic counts built in the plotting cell.
for m, cv in zip(x, coherence_values):
    print("Num Topics|Numero de Théme =", m, " has Coherence Value of|a une cohérence de", round(cv, 4))

In [None]:
# Select the model you think is best and print its topics.
# model_list is zero-indexed and holds one model per value of
# range(set_start, set_limit, set_step); with the settings 2, 11, 2 used
# above, index 3 is the 8-topic model.
optimal_model = model_list[3]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:

# Now run just one model with the exact number of topics you want.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=8, id2word=id2word)

In [None]:
# Show the topics of the selected model
pprint(ldamallet.show_topics(formatted=False))

# Compute and print the c_v coherence score of the selected model
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

In [None]:
# Visualize the topics
# Can't use the gensim method for MALLET directly so converting LdaMallet Model to LdaModel as per https://radimrehurek.com/gensim/models/wrappers/ldamallet.html
# Note that a "by hand" version of doing this can be found at https://jeriwieringa.com/2018/07/17/pyLDAviz-and-Mallet/

lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet, gamma_threshold=0.01, iterations=20)

In [None]:
# Render the interactive topic visualisation inside the notebook.
# NOTE(review): recent pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models - confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [None]:
def format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=df):
    """Find the dominant topic of every document and attach the original texts.

    Parameters
    ----------
    ldamodel : topic model
        Must support ``ldamodel[corpus]`` (one list of ``(topic, weight)``
        pairs per document) and ``show_topic(topic)``.
    corpus : bag-of-words corpus
        Documents to assign topics to.
    texts : pandas DataFrame or Series
        Concatenated column-wise to the right of the topic columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to four
        decimals) and ``Topic_Keywords``, followed by the columns of ``texts``.
        A document with no topic assignment gets NaN / NaN / '' so rows stay
        aligned with ``texts``.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # keyword string of each topic, computed once
    records = []

    # Get the main topic of each document
    for row in ldamodel[corpus]:
        if len(row) == 0:
            records.append((float('nan'), float('nan'), ''))
            continue
        # max() returns the first of several equal weights, the same pair a
        # stable descending sort would place first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame in one step: DataFrame.append no longer exists in
    # pandas 2.0+, and growing a frame row by row copies it on every iteration.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = texts
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df



In [None]:

# Compute the dominant topic of every document with the selected model.
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=df)

# Format: move the index into a column and give all six columns readable names
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document number','Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'file_name','Text']

In [None]:
# Show the dominant-topic table (one row per document)
df_dominant_topic