# TOPIC MODELLING - TOP2VEC & GENSIM

## Initialisation : séparation de l'ensemble du corpus et du sous-corpus d'urgence en 3 périodes (1789-1792, 1792-1795, 1795-1799)

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

#pour l'approche top2vec : 
import json
import os
import ipywidgets as widgets
from IPython.display import clear_output, display
import hdbscan
from top2vec import Top2Vec

#pour représenter dans l'espace les topics top2vec :
import umap
import umap.plot

#pour sauvegarder le wordcloud top2vec: 
import matplotlib.pyplot
from wordcloud import WordCloud

In [2]:
import spacy

In [3]:
# Read the two source corpora (TSV) and keep only the decree texts, dropping empty rows.
ensemble_decrets = pd.read_csv('sous_corpus/Ensemble_des_vols_concatenes.tsv', sep='\t')['texte'].dropna()
decrets_urgence = pd.read_csv('sous_corpus/Ensemble_decrets_urgence.tsv', sep='\t')['texte'].dropna()

# Positional cut-offs for the 1789-1795 span.
# NOTE(review): presumably the last row before the Directoire in each file - confirm.
ensemble_decrets_1789_1795 = ensemble_decrets[0:19877]
decrets_urgence_1789_1795 = decrets_urgence[0:1102]

# Decrees of the span that never mention "urgence" (case-insensitive).
decrets_non_urgents = ensemble_decrets[0:19877]
mask = decrets_non_urgents.str.contains('urgence', case=False, na=False)
decrets_non_urgent = decrets_non_urgents[~mask]

# Decrees of the span that mention a "courrier extraordinaire".
decrets_courrier_extraordinaire = ensemble_decrets[0:19877]
mask = decrets_courrier_extraordinaire.str.contains(r'courrier.? extraordinaire.?', case=False, na=False)
decrets_courrier_extraordinaire = ensemble_decrets_1789_1795[mask]

# Decrees of the span that contain "au bulletin".
decrets_bulletins = ensemble_decrets[0:19877]
mask = decrets_bulletins.str.contains(r'au bulletin.?', case=False, na=False)
decrets_bulletins = ensemble_decrets_1789_1795[mask]

In [4]:
# Split the corpora into period sub-corpora using positional row ranges.
# NOTE(review): [0:5038] stops at row 5037 and [5039:19877] starts at row 5039, so row 5038
# is in neither range - confirm whether that gap is intended.
ensemble_decrets_1789_1792 = ensemble_decrets[0:5038]
ensemble_decrets_1792_1795 = ensemble_decrets[5039:19877]
# NOTE(review): this bound (1101) is one row short of the 1789-1795 urgency slice (1102)
# taken in the initialisation cell - verify the 1789-1792 cut-off.
decrets_urgence_1789_1792 = decrets_urgence[0:1101]

In [5]:
# Convert each sub-corpus to a plain Python list of texts; these lists are what the
# Top2Vec training cells pass as `documents`.
liste_ensemble_decrets_1789_1795 = ensemble_decrets_1789_1795.tolist()
liste_decrets_urgence_1789_1795 = decrets_urgence_1789_1795.tolist()
# Full urgency corpus (no positional cut-off applied).
liste_ensemble_decrets_urgence_avec_Directoire = decrets_urgence.tolist()
liste_decrets_non_urgent = decrets_non_urgent.tolist()
liste_decrets_courrier_extraordinaire = decrets_courrier_extraordinaire.tolist()
liste_decrets_bulletins = decrets_bulletins.tolist()

In [None]:
# Save the sub-corpora as UTF-8, tab-separated files in sous_corpus/.
ensemble_decrets_1789_1795.to_csv('sous_corpus/ensemble_decrets_1789_1795.tsv', sep='\t', encoding="utf-8")
decrets_urgence_1789_1795.to_csv('sous_corpus/decrets_urgence_1789_1795.tsv', sep='\t', encoding="utf-8")
decrets_non_urgent.to_csv('sous_corpus/decrets_non_urgent.tsv', sep='\t', encoding="utf-8")
decrets_courrier_extraordinaire.to_csv('sous_corpus/decrets_courrier_extraordinaire.tsv', sep='\t', encoding="utf-8")
decrets_bulletins.to_csv('sous_corpus/decrets_bulletins.tsv', sep='\t', encoding="utf-8")

In [None]:
# Reload the saved sub-corpora from sous_corpus/.
# NOTE: pd.read_csv returns DataFrames, so from here on these names no longer hold the
# Series of texts built in the initialisation cell; the gensim cells read the texts from
# the 'texte' column.
ensemble_decrets_1789_1795 = pd.read_csv('sous_corpus/ensemble_decrets_1789_1795.tsv', sep='\t', encoding="utf-8")
decrets_urgence_1789_1795 = pd.read_csv('sous_corpus/decrets_urgence_1789_1795.tsv', sep='\t', encoding="utf-8")
decrets_non_urgent = pd.read_csv('sous_corpus/decrets_non_urgent.tsv', sep='\t', encoding="utf-8")
decrets_courrier_extraordinaire =  pd.read_csv('sous_corpus/decrets_courrier_extraordinaire.tsv', sep='\t', encoding="utf-8")
decrets_bulletins = pd.read_csv('sous_corpus/decrets_bulletins.tsv', sep='\t', encoding="utf-8")

# Approches Top2Vec

In [None]:
def dictionnaires_topics(model, corpus_liste):
    '''Return one {word: score} dictionary per topic detected by Top2Vec.

    Words are inserted in the order in which ``model.get_topics`` lists them; when a word
    appears twice for the same topic, the first score is kept.

    model : trained Top2Vec model (uses ``get_num_topics`` and ``get_topics``).
    corpus_liste : list of the corpus texts. Not needed to build the dictionaries; the
                   parameter is kept so that existing calls keep working.

    Returns a list of dictionaries, one per topic number returned by the model.
    '''
    topic_words, word_scores, topic_nums = model.get_topics(model.get_num_topics())

    liste_dicts = []
    for numero_topic in topic_nums:
        # Convert the scores once per topic, then pair each word with the score at the same rank.
        scores = word_scores[numero_topic].tolist()
        mots_scores = {}
        for mot, score in zip(topic_words[numero_topic], scores):
            mots_scores.setdefault(mot, score)
        liste_dicts.append(mots_scores)
    return liste_dicts

def titres_dico_topics(model, corpus_liste):
    '''Return one plot title per topic, giving the share of documents assigned to it.

    model : trained Top2Vec model (uses ``get_num_topics``, ``get_topic_sizes`` and
            ``get_topics``).
    corpus_liste : list of the corpus texts; only its length is used, as the denominator
                   of the percentage.

    Returns a list of strings "Topic <n> (<share> %  des documents)", one per topic number
    returned by the model. Each share, rounded to two decimals, is also printed.
    '''
    taille_topics = model.get_topic_sizes()
    _, _, topic_nums = model.get_topics(model.get_num_topics())

    titres_plt = []
    for numero_topic in topic_nums:
        # First element of get_topic_sizes() holds the sizes, indexed by topic number.
        topic_size = taille_topics[0][numero_topic]
        pourcentage_topic = round(topic_size / len(corpus_liste) * 100, 2)
        print(pourcentage_topic)
        titres_plt.append(f"Topic {numero_topic} ({pourcentage_topic} %  des documents)")
    return titres_plt

def spatialisation_topic(model, topic_reduction, path_plot_sortie):
    '''Project the documents in 2D, coloured by reduced topic, and save the figure.

    Returns nothing; the plot is written to ``path_plot_sortie``.

    model : trained Top2Vec model; the document vectors are read from the model itself.
    topic_reduction : integer, number of topics to keep after the hierarchical topic
                      reduction required for the plot. Cannot exceed the total number
                      of topics.
    path_plot_sortie : path and file name of the saved figure, e.g. path/file.jpg.
    '''
    # Printing the result shows which original topics were merged into each reduced topic.
    hierarchie = model.hierarchical_topic_reduction(topic_reduction)
    print(hierarchie)

    # Two components (rather than 5) so that the embedding can be plotted.
    vecteurs_documents = model._get_document_vectors(norm=False)
    projection = umap.UMAP(n_neighbors=15, n_components=2, metric="cosine").fit(vecteurs_documents)

    axes = umap.plot.points(projection, labels=model.doc_top_reduced)
    axes.figure.savefig(path_plot_sortie)

    # The reduced topics are queried but the result is not used or returned.
    model.get_topics(topic_reduction, reduced=True)
    
def wordclouds(model, corpus_liste, savepath, background_color="white", colormap="matplotlib.cm.ocean"):
    '''Draw and save one word cloud per Top2Vec topic.

    Each cloud is built from the {word: score} dictionary of the topic and saved as
    ``<savepath>/topic_<n>.png``, with the topic share as title.

    model : trained Top2Vec model.
    corpus_liste : list of the corpus texts (used to compute the share of each topic).
    savepath : existing directory in which the images are saved.
    background_color : background colour passed to WordCloud.
    colormap : passed to WordCloud. NOTE: a ``color_func`` is also passed (all words in
               black) and WordCloud documents ``color_func`` as overriding ``colormap``,
               so this argument currently has no visible effect.
    '''
    dictionnaires = dictionnaires_topics(model, corpus_liste)
    titres = titres_dico_topics(model, corpus_liste)

    for count, (dico, titre) in enumerate(zip(dictionnaires, titres)):
        wordcloud = WordCloud(
            background_color=background_color,
            width=1600,
            height=400,
            color_func=lambda *args, **kwargs: (0, 0, 0),
            colormap=colormap,
        ).generate_from_frequencies(dico)

        # A single figure per topic, created with the intended size. A second bare
        # plt.figure() used to replace it, so figsize/dpi were never applied and an
        # empty figure was left behind for every topic.
        plt.figure(figsize=(16, 4), dpi=200)
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis('off')
        plt.title(titre)
        plt.savefig(f'{savepath}/topic_{count}.png')

### Par période (plus précis mais moins de données en entrée)

In [None]:
def _textes_en_liste(corpus):
    """Return the decree texts of *corpus* as a plain list.

    Accepts either a Series of texts or a DataFrame carrying the texts in its 'texte'
    column. The sub-corpora are Series when they come straight from the initialisation
    cell and DataFrames once they have been reloaded with pd.read_csv, and a DataFrame
    has no ``tolist`` method.
    """
    if isinstance(corpus, pd.DataFrame):
        corpus = corpus['texte']
    return corpus.tolist()


# Plain lists of texts, used as `documents` by the Top2Vec training cells below.
liste_ensemble_decrets_1789_1795 = _textes_en_liste(ensemble_decrets_1789_1795)
liste_decrets_urgence_1789_1795 = _textes_en_liste(decrets_urgence_1789_1795)
liste_ensemble_decrets_urgence_avec_Directoire = _textes_en_liste(decrets_urgence)
liste_decrets_non_urgent = _textes_en_liste(decrets_non_urgent)
liste_decrets_courrier_extraordinaire = _textes_en_liste(decrets_courrier_extraordinaire)
liste_decrets_bulletins = _textes_en_liste(decrets_bulletins)

In [None]:
# Train a Top2Vec model on all 1789-1795 decrees (speed="learn", 4 workers) and save it.
top2vec_ensemble_decrets_1789_1795 = Top2Vec(documents=liste_ensemble_decrets_1789_1795, speed="learn", workers=4)
top2vec_ensemble_decrets_1789_1795.save("modeles/top2vec_ensemble_decrets_1789_1795")

In [None]:
# Train a Top2Vec model on the 1789-1795 urgency decrees (speed="learn", 4 workers) and save it.
top2vec_decrets_urgence_1789_1795 = Top2Vec(documents=liste_decrets_urgence_1789_1795, speed="learn", workers=4)
top2vec_decrets_urgence_1789_1795.save("modeles/top2vec_decrets_urgence_1789_1795")

In [None]:
# Same urgency corpus as the "learn" model, trained with speed="deep-learn" and saved
# under a separate name.
top2vec_decrets_urgence_1789_1795_deeplearn = Top2Vec(documents=liste_decrets_urgence_1789_1795, speed="deep-learn", workers=4)
top2vec_decrets_urgence_1789_1795_deeplearn.save("modeles/top2vec_decrets_urgence_1789_1795_deeplearn")

In [None]:
# Train a Top2Vec model on the decrees that do not mention "urgence" (speed="learn") and save it.
top2vec_liste_decrets_non_urgent = Top2Vec(documents=liste_decrets_non_urgent, speed="learn", workers=4)
top2vec_liste_decrets_non_urgent.save("modeles/top2vec_liste_decrets_non_urgent")

In [None]:
# Train a Top2Vec model on the "courrier extraordinaire" decrees (speed="learn") and save it.
top2vec_liste_decrets_courrier_extraordinaire = Top2Vec(documents=liste_decrets_courrier_extraordinaire, speed="learn", workers=4)
top2vec_liste_decrets_courrier_extraordinaire.save("modeles/top2vec_liste_decrets_courrier_extraordinaire")

In [None]:
# Train a Top2Vec model on the "au bulletin" decrees and save it.
# Unlike most of the other models, this one uses speed="deep-learn".
top2vec_decrets_bulletins = Top2Vec(documents=liste_decrets_bulletins, speed="deep-learn", workers=4)
top2vec_decrets_bulletins.save("modeles/top2vec_decrets_bulletins")

# CHARGEMENT ET ANALYSE

In [None]:
# Reload the saved sub-corpora; each read_csv call returns a DataFrame whose decree texts
# are in the 'texte' column.
ensemble_decrets_1789_1795 = pd.read_csv('sous_corpus/ensemble_decrets_1789_1795.tsv', sep='\t', encoding="utf-8")
decrets_urgence_1789_1795 = pd.read_csv('sous_corpus/decrets_urgence_1789_1795.tsv', sep='\t', encoding="utf-8")
decrets_non_urgent = pd.read_csv('sous_corpus/decrets_non_urgent.tsv', sep='\t', encoding="utf-8")
decrets_bulletins = pd.read_csv('sous_corpus/decrets_bulletins.tsv', sep='\t', encoding="utf-8")


# Reload the trained Top2Vec models.
top2vec_decrets_bulletins = Top2Vec.load("modeles/top2vec_decrets_bulletins")
top2vec_liste_decrets_non_urgent = Top2Vec.load("modeles/top2vec_liste_decrets_non_urgent")
top2vec_decrets_urgence_1789_1795 = Top2Vec.load("modeles/top2vec_decrets_urgence_1789_1795")
top2vec_decrets_urgence_1789_1795_deeplearn = Top2Vec.load("modeles/top2vec_decrets_urgence_1789_1795_deeplearn")
top2vec_ensemble_decrets_1789_1795 = Top2Vec.load("modeles/top2vec_ensemble_decrets_1789_1795")
# NOTE(review): this model is loaded from a different directory than the others - confirm the path.
top2vec_ensemble_decrets_urgence = Top2Vec.load("modeles_Top2Vec/Top2Vec_learn_4workers_ensemble_decrets_urgence")

# Plain lists of texts. The reloaded objects are DataFrames, which have no tolist()
# method, so the 'texte' column is selected first.
liste_ensemble_decrets_1789_1795 = ensemble_decrets_1789_1795['texte'].tolist()
liste_decrets_urgence_1789_1795 = decrets_urgence_1789_1795['texte'].tolist()
# decrets_urgence is not reloaded in this cell: it must still be the Series of texts
# built by the initialisation cell.
liste_ensemble_decrets_urgence_avec_Directoire = decrets_urgence.tolist()
liste_decrets_non_urgent = decrets_non_urgent['texte'].tolist()
liste_decrets_bulletins = decrets_bulletins['texte'].tolist()

## Creation des wordclouds

In [None]:
# One output folder per model. Each model is paired with the corpus whose length is used
# to compute the topic shares shown in the titles.
_taches_wordclouds = [
    (top2vec_decrets_bulletins, liste_decrets_bulletins, 'WORDCLOUDS/inscription_bulletin_deeplearn'),
    (top2vec_liste_decrets_non_urgent, liste_decrets_non_urgent, 'WORDCLOUDS/non_urgents'),
    (top2vec_ensemble_decrets_urgence, liste_ensemble_decrets_urgence_avec_Directoire, 'WORDCLOUDS/urgence_avec_directoire'),
    (top2vec_decrets_urgence_1789_1795, liste_decrets_urgence_1789_1795, 'WORDCLOUDS/urgence_1789_1795'),
    (top2vec_decrets_urgence_1789_1795_deeplearn, liste_decrets_urgence_1789_1795, 'WORDCLOUDS/urgence_1789_1795_deeplearn'),
    (top2vec_ensemble_decrets_1789_1795, liste_ensemble_decrets_1789_1795, 'WORDCLOUDS/ensemble_decrets_1789_1795'),
]
for _modele, _corpus, _dossier in _taches_wordclouds:
    wordclouds(_modele, _corpus, _dossier, background_color="white", colormap="copper")

In [None]:
# Number of topics found by each model, printed on a single line in this order.
# NOTE(review): top2vec_liste_decrets_courrier_extraordinaire is only created by its
# training cell; the loading cell has no Top2Vec.load call for it.
_modeles_top2vec = (
    top2vec_decrets_bulletins,
    top2vec_liste_decrets_courrier_extraordinaire,
    top2vec_liste_decrets_non_urgent,
    top2vec_decrets_urgence_1789_1795,
    top2vec_ensemble_decrets_1789_1795,
    top2vec_ensemble_decrets_urgence,
    top2vec_decrets_urgence_1789_1795_deeplearn,
)
print(*[modele.get_num_topics() for modele in _modeles_top2vec])

In [None]:
# Topic labels (as strings) and the colour assigned to each of the 9 topics.
_topic_labels = [str(numero) for numero in range(9)]
_topic_colors = ['#FF0000', '#FF8000', '#00FF00', '#00FFFF', '#0000FF', '#8000FF', '#FF00FF', '#0080FF', '#FFFF00']
_GREY = '#CECECE'


def _palette_un_topic(numero):
    """Return the colour table in which only topic *numero* keeps its colour (others grey)."""
    couleurs = [_GREY] * len(_topic_colors)
    couleurs[numero] = _topic_colors[numero]
    return {'doc_top': list(_topic_labels), 'color': couleurs}


# Every topic in its own colour.
dataall = {'doc_top': list(_topic_labels), 'color': list(_topic_colors)}
dfall = pd.DataFrame(dataall)

# One table per topic, highlighting that topic only.
data0 = _palette_un_topic(0)
df0 = pd.DataFrame(data0)

data1 = _palette_un_topic(1)
df1 = pd.DataFrame(data1)

data2 = _palette_un_topic(2)
df2 = pd.DataFrame(data2)

data3 = _palette_un_topic(3)
df3 = pd.DataFrame(data3)

data4 = _palette_un_topic(4)
df4 = pd.DataFrame(data4)

data5 = _palette_un_topic(5)
df5 = pd.DataFrame(data5)

data6 = _palette_un_topic(6)
df6 = pd.DataFrame(data6)

data7 = _palette_un_topic(7)
df7 = pd.DataFrame(data7)

data8 = _palette_un_topic(8)
df8 = pd.DataFrame(data8)

# Fit one 2-D UMAP projection of the document vectors of the deep-learn urgency model,
# then draw it once per colour table: all topics first, then each topic highlighted in turn.
# The word lists printed before each plot are hard-coded text, not read from the model.
umap_args_model = {
"n_neighbors": 15,
"n_components": 2,
"metric": "cosine",
'min_dist':0,
}
umap_model = umap.UMAP(**umap_args_model).fit(top2vec_decrets_urgence_1789_1795_deeplearn._get_document_vectors(norm=False))
# All topics, each drawn with its own colour (dfall).
model2 = umap.plot.points(umap_model, labels = top2vec_decrets_urgence_1789_1795_deeplearn.doc_top, color_key = dfall['color'], background='white')
print("""All the topics """)
umap.plot.show(model2)


# Topic 0 highlighted (df0 leaves every other topic grey).
model2 = umap.plot.points(umap_model, labels = top2vec_decrets_urgence_1789_1795_deeplearn.doc_top, color_key = df0['color'], background='white')
print("""Projection du topic 0 :'patrie', 'empire', 'defense', 'armes', 'citoyens', 'frontieres',
         'requisitions', 'pouvoir', 'zele', 'executif', 'ennemis',
         'requisition', 'liberte', 'peuple', 'publique', 'moyens',
         'mesures', 'surete', 'importe', 'ennemi', 'seroit', 'force',
         'formation', 'prompte', 'colonies', 'armement', 'champ', 'salut',
         'tous', 'tranquillite', 'promptement', 'servir', 'sections',
         'gardes', 'bataillons', 'se', 'plus', 'nationales', 'sont',
         'francaise', 'constitution', 'representans', 'administratifs',
         'considerant', 'territoire', 'convenable', 'nation', 'poste',
         'volontaires', 'faire' """)
umap.plot.show(model2)

# Topic 1 highlighted.
model2 = umap.plot.points(umap_model, labels = top2vec_decrets_urgence_1789_1795_deeplearn.doc_top, color_key = df1['color'], background='white')
print("""Projection du topic 1 :'fonds', 'recettes', 'depenses', 'secours', 'extraordinaires',
         'somme', 'tresorerie', 'disposition', 'interieur', 'millions',
         'extraordinaire', 'finances', 'avance', 'comites', 'caisse',
         'avances', 'tiendra', 'travaux', 'ordinaire', 'mille',
         'accordes', 'besoins', 'livres', 'montant', 'publics', 'urgence',
         'accorder', 'decrete', 'payeurs', 'liv', 'sieurs', 'enfans',
         'tresor', 'interet', 'ministre', 'nationale', 'vu', 'rapport',
         'considerant', 'public', 'due', 'sieur', 'mois', 'hopitaux',
         'entendu', 'derniers', 'proportion', 'liquidation', 'mis',
         'compte'""")
umap.plot.show(model2)

# Topic 2 highlighted.
model2 = umap.plot.points(umap_model, labels = top2vec_decrets_urgence_1789_1795_deeplearn.doc_top, color_key = df2['color'], background='white')
print("""Projection du topic 2 :'gendarmerie', 'gendarmes', 'rang', 'anciennete', 'grade',
         'lieutenans', 'grades', 'lieutenant', 'logis', 'vaisseaux',
         'places', 'marechal', 'colonel', 'capitaines', 'choix',
         'organisation', 'compagnies', 'officiers', 'divisions',
         'service', 'services', 'militaire', 'activite', 'marine',
         'colonels', 'jouiront', 'adjudans', 'supplement', 'appointemens',
         'parmi', 'division', 'major', 'capitaine', 'formation', 'age',
         'nomination', 'servi', 'artillerie', 'remplir', 'commissions',
         'augmentation', 'regimens', 'camp', 'pied', 'campagne',
         'troupes', 'infanterie', 'moitie', 'traitement', 'ans'""")
umap.plot.show(model2)

# Topic 3 highlighted.
model2 = umap.plot.points(umap_model, labels = top2vec_decrets_urgence_1789_1795_deeplearn.doc_top, color_key = df3['color'], background='white')
print("""Projection du topic 3 :'juges', 'tribunal', 'jures', 'tribunaux', 'fonctions',
         'remplir', 'procureur', 'election', 'membres', 'syndic',
         'nomination', 'assemblees', 'jugement', 'proceder', 'commune',
         'haute', 'exercice', 'paix', 'jugemens', 'sections', 'defaut',
         'cour', 'aupres', 'president', 'arrondissement', 'tranquillite',
         'commis', 'section', 'juge', 'nommes', 'seul', 'ville', 'loi',
         'epoux', 'pourvoir', 'directoire', 'conseil', 'instruction',
         'heures', 'sauf', 'paris', 'sans', 'district', 'nomme', 'police',
         'commissaires', 'expedition', 'parmi', 'conduite', 'liste'""")
umap.plot.show(model2)

# Topic 4 highlighted.
model2 = umap.plot.points(umap_model, labels = top2vec_decrets_urgence_1789_1795_deeplearn.doc_top, color_key = df4['color'], background='white')
print("""Projection du topic 4 :'rentes', 'creanciers', 'liquidation', 'biens', 'payeurs',
         'contrats', 'emigres', 'titres', 'enregistrement', 'paiement',
         'dues', 'quittances', 'edit', 'certificats', 'domaines',
         'remboursement', 'rente', 'receveurs', 'certificat', 'femme',
         'etats', 'formalites', 'supprimes', 'montant', 'pensions',
         'epoux', 'contrat', 'profit', 'acte', 'caisse', 'bureaux',
         'obtenir', 'mains', 'residence', 'reconnoissance',
         'extraordinaire', 'francais', 'tenus', 'presenter', 'etoient',
         'jugemens', 'prescrite', 'agent', 'marie', 'derniers',
         'recettes', 'personne', 'regie', 'inscrite', 'interet'""")
umap.plot.show(model2)

# Topic 5 highlighted.
model2 = umap.plot.points(umap_model, labels = top2vec_decrets_urgence_1789_1795_deeplearn.doc_top, color_key = df5['color'], background='white')
print("""Projection du topic 5 :'vu', 'adjudication', 'loire', 'departement', 'commune', 'avis',
         'etablissement', 'ville', 'directoire', 'district', 'saint',
         'proceder', 'domaines', 'maison', 'envoye', 'finances',
         'administrateurs', 'administration', 'oui', 'rapport', 'bureaux',
         'haute', 'demande', 'marche', 'tribunal', 'administratifs',
         'regie', 'assemblees', 'definitivement', 'necessite', 'acte',
         'due', 'biens', 'autorise', 'arrete', 'depense', 'petition',
         'montant', 'municipalite', 'interieur', 'urgence', 'ladite',
         'comite', 'vente', 'arretes', 'apres', 'contributions', 'avoir',
         'son', 'etablir'""")
umap.plot.show(model2)

# Topic 6 highlighted.
model2 = umap.plot.points(umap_model, labels = top2vec_decrets_urgence_1789_1795_deeplearn.doc_top, color_key = df6['color'], background='white')
print("""Projection du topic 6 :'monnoies', 'especes', 'publiques', 'fabrication',
         'contributions', 'hotel', 'commission', 'poids', 'etablissemens',
         'directeurs', 'matieres', 'administrations', 'argent',
         'surveillance', 'pieces', 'quantite', 'commissaires', 'comptes',
         'assignats', 'etablissement', 'regie', 'employer', 'receveurs',
         'circulation', 'travaux', 'maisons', 'remis', 'operations',
         'enregistrement', 'objets', 'convenable', 'demeure', 'vivres',
         'concerne', 'bureau', 'examen', 'distribution', 'responsabilite',
         'etats', 'passer', 'etablis', 'directeur', 'arrondissement',
         'supprimes', 'particuliers', 'travail', 'peuvent', 'armees',
         'art', 'remettre'""")
umap.plot.show(model2)

# Topic 7 highlighted.
model2 = umap.plot.points(umap_model, labels = top2vec_decrets_urgence_1789_1795_deeplearn.doc_top, color_key = df7['color'], background='white')
print("""Projection du topic 7 :'assignats', 'fabrication', 'circulation', 'papier', 'millions',
         'extraordinaire', 'caisse', 'monnoies', 'archives', 'valeur',
         'dessous', 'publiques', 'directeurs', 'caisses', 'papiers',
         'contributions', 'procede', 'receveurs', 'tresorerie',
         'cinquante', 'directeur', 'marches', 'quantite', 'finances',
         'tresorier', 'districts', 'recettes', 'livres', 'mesure',
         'administrateur', 'operations', 'cinq', 'cent', 'remis',
         'responsabilite', 'domaines', 'decrets', 'dix', 'livre',
         'sections', 'hotel', 'assemblee', 'commissaires', 'sols',
         'etablissement', 'somme', 'an', 'ordonne', 'remettre', 'quinze'""")
umap.plot.show(model2)

# Topic 8 highlighted.
model2 = umap.plot.points(umap_model, labels = top2vec_decrets_urgence_1789_1795_deeplearn.doc_top, color_key = df8['color'], background='white')
print("""Projection du topic 8 :'commune', 'interets', 'vu', 'avis', 'ladite', 'employee',
         'vente', 'remboursement', 'directoire', 'adjudication',
         'contributions', 'administratifs', 'district', 'ville',
         'necessite', 'departement', 'loire', 'montant', 'oui',
         'employer', 'territoire', 'charge', 'grains', 'municipalite',
         'ses', 'biens', 'pourvoir', 'somme', 'subsistances', 'paiement',
         'ordinaire', 'extraordinaire', 'finances', 'demande', 'acte',
         'annees', 'regie', 'pourroit', 'domaines', 'besoins', 'procurer',
         'subsistance', 'surveillance', 'sols', 'definitivement',
         'autorise', 'interieur', 'faire', 'etablis', 'caisse'""")
umap.plot.show(model2)

# TOPIC MODELLING - GENSIM

In [6]:
import numpy as np
import pandas as pd
import json
import glob
import tqdm.notebook as tqdm
#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
def lemmatization(texts, allowed_postags=["NOUN"], stopwords = ['décréter','citoyen','assemblee', 'decret', 'urgence', 'que', 'ne', 'il', 'ci', 'loi', 'teneur', 'ce', 'celle', 'assemblée', 'considérer', 'approuve', 'acte', 'conseil', 'adopter', 'signer', 'preceder', 'suivant', 'suivre','au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut', 'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez', 'eussent', 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'XVII', 'XVIII', 'XIX', 'XX', 'XXI', 'XXII', 'XXIII', 'XXIV', 'XXV', 'XXVI', 'XXVII', 'XXVIII', 'XXIX', 'XXX', 'XXXI', 'XXXII', 'XXXIII', 'XXXIV', 'XXXV', 'XXXVI', 'XXXVII', 'XXXVIII', 'XXXIX', 'XL', 'XLI', 'XLII', 'XLIII', 'XLIV', 'XLV', 'XLVI', 'XLVII', 'XLVIII', 'XLIX', 'L', 'LI', 'LII', 'LIII', 'LIV', 'LV', 'LVI', 'LVII', 'LVIII', 'LIX', 'LX', 
'LXI', 'LXII', 'LXIII', 'LXIV', 'LXV', 'LXVI', 'LXVII', 'LXVIII', 'LXIX', 'LXX', 'LXXI', 'LXXII', 'LXXIII', 'LXXIV', 'LXXV', 'LXXVI', 'LXXVII', 'LXXVIII', 'LXXIX', 'LXXX', 'LXXXI', 'LXXXII', 'LXXXIII', 'LXXXIV', 'LXXXV', 'LXXXVI', 'LXXXVII', 'LXXXVIII', 'LXXXIX', 'XC', 'XCI', 'XCII', 'XCIII', 'XCIV', 'XCV', 'XCVI', 'XCVII', 'XCVIII', 'XCIX', 'C']):
    '''Lemmatise and clean texts with spaCy, in preparation for LDA/NMF topic modelling.

    texts : texts to lemmatise (e.g. df['texte']).
    allowed_postags : part-of-speech tags (as detected by spaCy) of the tokens to keep.
    stopwords : lemmas to drop from the lemmatised corpus. The default lists are never
                modified by this function.

    Returns a list with one string per input text: the kept lemmas joined by spaces.
    The spaCy model "fr_core_news_md" is loaded on every call.
    '''
    nlp = spacy.load("fr_core_news_md", disable=["parser", "ner"])
    # The stop list has several hundred entries and is tested for every kept token:
    # a set makes each membership test constant-time.
    mots_exclus = frozenset(stopwords)

    texts_out = []
    # nlp.pipe processes the texts as a stream, which spaCy recommends over one
    # nlp(text) call per document.
    for doc in nlp.pipe(texts):
        lemmes = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags and token.lemma_ not in mots_exclus
        ]
        texts_out.append(" ".join(lemmes))
    return texts_out

def gen_words(texts):
    '''Tokenise each text with gensim's simple_preprocess (deacc=True).

    texts : iterable of strings.

    Returns a list with one list of tokens per input text.
    '''
    return [gensim.utils.simple_preprocess(texte, deacc=True) for texte in texts]

def id2word_construction(data_words):
    '''Build the bag-of-words corpus of tokenised texts.

    Despite its name, the function returns the corpus, not the dictionary: a gensim
    Dictionary is built from ``data_words`` and each text is converted with ``doc2bow``.

    data_words : list of token lists (one per text).

    Returns a list with one bag-of-words per text.
    '''
    dictionnaire = corpora.Dictionary(data_words)
    return [dictionnaire.doc2bow(mots) for mots in data_words]

def lda_model_construction(corpus, id2word, nbre_topics: int):
    '''Train a gensim LDA model on a bag-of-words corpus.

    corpus : bag-of-words corpus (list of ``doc2bow`` results).
    id2word : gensim Dictionary mapping ids to words.
    nbre_topics : number of topics to extract; to be matched with the number found by
                  Top2Vec. It used to default to the builtin type ``int`` (a misplaced
                  annotation), which is not a usable topic count, so it is now required.

    Returns the trained LdaModel.
    '''
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=nbre_topics,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha="auto",
    )
    return lda_model

# Module-level stop list: custom corpus terms, French function words and verb forms, and
# Roman numerals I to C.
# NOTE(review): this rebinds the name `stopwords` imported from nltk.corpus in the import
# cell, and appears to repeat the default `stopwords` argument of lemmatization(); the
# functions visible in this section rely on that default rather than on this variable.
stopwords = ['décréter', 'citoyen', 'assemblee', 'decret', 'urgence', 'que', 'ne', 'il', 'ci', 'loi', 'teneur', 'ce', 'celle', 'assemblée', 'considérer', 'approuve', 'acte', 'conseil', 'adopter', 'signer', 'preceder', 'suivant', 'suivre','au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 'auront', 'aurais', 'aurait', 'aurions', 'auriez', 'auraient', 'avais', 'avait', 'avions', 'aviez', 'avaient', 'eut', 'eûmes', 'eûtes', 'eurent', 'aie', 'aies', 'ait', 'ayons', 'ayez', 'aient', 'eusse', 'eusses', 'eût', 'eussions', 'eussiez', 'eussent', 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'XVII', 'XVIII', 'XIX', 'XX', 'XXI', 'XXII', 'XXIII', 'XXIV', 'XXV', 'XXVI', 'XXVII', 'XXVIII', 'XXIX', 'XXX', 'XXXI', 'XXXII', 'XXXIII', 'XXXIV', 'XXXV', 'XXXVI', 'XXXVII', 'XXXVIII', 'XXXIX', 'XL', 'XLI', 'XLII', 'XLIII', 'XLIV', 'XLV', 'XLVI', 'XLVII', 'XLVIII', 'XLIX', 'L', 'LI', 'LII', 'LIII', 'LIV', 'LV', 'LVI', 'LVII', 'LVIII', 'LIX', 'LX', 'LXI', 'LXII', 'LXIII', 'LXIV', 'LXV', 'LXVI', 
'LXVII', 'LXVIII', 'LXIX', 'LXX', 'LXXI', 'LXXII', 'LXXIII', 'LXXIV', 'LXXV', 'LXXVI', 'LXXVII', 'LXXVIII', 'LXXIX', 'LXXX', 'LXXXI', 'LXXXII', 'LXXXIII', 'LXXXIV', 'LXXXV', 'LXXXVI', 'LXXXVII', 'LXXXVIII', 'LXXXIX', 'XC', 'XCI', 'XCII', 'XCIII', 'XCIV', 'XCV', 'XCVI', 'XCVII', 'XCVIII', 'XCIX', 'C']

def topic_modelling_LDA(data, nombre_topics, path_model):
    '''Run the whole LDA pipeline on raw texts: lemmatise, tokenise, train.

    data : texts to model (e.g. df['texte']).
    nombre_topics : number of topics requested from the LDA model.
    path_model : accepted but not used by this function; the model is not saved here.

    Returns the trained LDA model.
    '''
    mots_par_texte = gen_words(lemmatization(data))
    dictionnaire = corpora.Dictionary(mots_par_texte)
    sacs_de_mots = id2word_construction(mots_par_texte)
    return lda_model_construction(sacs_de_mots, dictionnaire, nombre_topics)

def nmf_model_construction(corpus, id2word, nbre_topics: int):
    '''Train a gensim NMF model on a bag-of-words corpus.

    corpus : bag-of-words corpus (list of ``doc2bow`` results).
    id2word : gensim Dictionary mapping ids to words.
    nbre_topics : number of topics to extract. It used to default to the builtin type
                  ``int`` (a misplaced annotation), which is not a usable topic count,
                  so it is now required.

    Returns the trained Nmf model (random_state fixed to 100).
    '''
    nmf_model = gensim.models.nmf.Nmf(
        corpus=corpus,
        id2word=id2word,
        num_topics=nbre_topics,
        random_state=100,
        chunksize=100,
    )
    return nmf_model

def topic_modelling_NMF(data, nombre_topics, path_model):
    '''Run the whole NMF pipeline on raw texts: lemmatise, tokenise, train.

    data : texts to model (e.g. df['texte']).
    nombre_topics : number of topics requested from the NMF model.
    path_model : accepted but not used by this function; the model is not saved here.

    Returns the trained NMF model.
    '''
    mots_par_texte = gen_words(lemmatization(data))
    dictionnaire = corpora.Dictionary(mots_par_texte)
    sacs_de_mots = id2word_construction(mots_par_texte)
    return nmf_model_construction(sacs_de_mots, dictionnaire, nombre_topics)
    

def wordcloud_gensim(model, savepath, number_of_words, nombre_topics=-1,
                     title_prefix="decrets_bulletins"):
    """Render one word cloud per topic of a gensim model and save it as PNG.

    Args:
        model: Trained gensim topic model exposing ``show_topics``.
        savepath: Directory receiving ``topic_<index>.png`` files; created
            if it does not exist yet.
        number_of_words: Number of top words drawn for each topic.
        nombre_topics: Number of topics to render. The default ``-1`` asks
            gensim for every topic of the model (the previous default was
            the builtin type ``int``, which made three-argument calls fail).
        title_prefix: Text placed before ``_topic_<index>`` in the figure
            title; the default keeps the historical title.

    Returns:
        None. Figures are left open so that a notebook can still display
        them after the cell finishes.
    """
    if savepath:
        os.makedirs(savepath, exist_ok=True)

    topics = model.show_topics(num_topics=nombre_topics, formatted=False,
                               num_words=number_of_words)
    for index, topic in topics:
        # With formatted=False each topic is a sequence of (word, weight).
        unique_string = " ".join(word for word, _weight in topic)
        wordcloud = WordCloud(
            background_color="white",
            width=1600,
            height=400,
            color_func=lambda *args, **kwargs: (0, 0, 0),
        ).generate(unique_string)

        # One figure per topic: previously a second plt.figure() call
        # discarded the sized figure and left it open and empty.
        plt.figure(figsize=(16, 4), dpi=200)
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis('off')
        plt.title(f"{title_prefix}_topic_{index}")
        plt.savefig(f'{savepath}/topic_{index}.png', bbox_inches='tight')

## Initialisation des textes

In [29]:
# Sub-corpora previously exported to TSV; kept as DataFrames because the
# training cells select the 'texte' column themselves.
ensemble_decrets_1789_1795 = pd.read_csv(
    'sous_corpus/ensemble_decrets_1789_1795.tsv', sep='\t', encoding="utf-8"
)
decrets_urgence_1789_1795 = pd.read_csv(
    'sous_corpus/decrets_urgence_1789_1795.tsv', sep='\t', encoding="utf-8"
)
decrets_non_urgent = pd.read_csv(
    'sous_corpus/decrets_non_urgent.tsv', sep='\t', encoding="utf-8"
)
decrets_bulletins = pd.read_csv(
    'sous_corpus/decrets_bulletins.tsv', sep='\t', encoding="utf-8"
)

# Full corpora: reduced straight away to their non-null 'texte' column.
ensemble_decrets = pd.read_csv(
    'sous_corpus/Ensemble_des_vols_concatenes.tsv', sep='\t'
)['texte'].dropna()
decrets_urgence = pd.read_csv(
    'sous_corpus/Ensemble_decrets_urgence.tsv', sep='\t'
)['texte'].dropna()

## Entraînement des modèles Gensim

In [None]:
# Train 4-topic LDA and NMF models on the "bulletin" decrees, render their
# word clouds, then persist both models.
# The topic count is passed explicitly to wordcloud_gensim: omitting it
# handed the parameter's placeholder default to gensim's show_topics.
LDA_decrets_bulletins = topic_modelling_LDA(decrets_bulletins['texte'], 4, 'NEW_GENSIM_MODEL/decrets_bulletins')
wordcloud_gensim(LDA_decrets_bulletins, 'NEW_GENSIM_WORDCLOUDS/LDA/inscription_bulletin_deeplearn', 25, 4)
NMF_decrets_bulletins = topic_modelling_NMF(decrets_bulletins['texte'], 4, 'NEW_GENSIM_MODEL/decrets_bulletins')
wordcloud_gensim(NMF_decrets_bulletins, 'NEW_GENSIM_WORDCLOUDS/NMF/inscription_bulletin_deeplearn', 25, 4)
LDA_decrets_bulletins.save('NEW_GENSIM_MODELS/LDA/inscription_bulletin')
NMF_decrets_bulletins.save('NEW_GENSIM_MODELS/NMF/inscription_bulletin')

In [31]:
from gensim import corpora, models, similarities

In [None]:
# Train 157-topic LDA and NMF models on the non-urgent decrees and save
# each one right after training.
LDA_decrets_non_urgent = topic_modelling_LDA(decrets_non_urgent['texte'], 157, 'NEW_GENSIM_MODEL/non_urgent')
LDA_decrets_non_urgent.save('NEW_GENSIM_MODELS/LDA/decrets_non_urgents')
NMF_decrets_non_urgent = topic_modelling_NMF(decrets_non_urgent['texte'], 157, 'NEW_GENSIM_MODEL/non_urgent')
NMF_decrets_non_urgent.save('NEW_GENSIM_MODELS/NMF/decrets_non_urgents')

In [None]:
# NOTE(review): same object and same path as the save that ends the training
# cell for the non-urgent decrees — looks redundant; confirm before removing.
NMF_decrets_non_urgent.save('NEW_GENSIM_MODELS/NMF/decrets_non_urgents')

In [None]:
# Reload the saved non-urgent models from disk and render the word clouds of
# their 157 topics (25 words each).
LDA_decrets_non_urgent = models.LdaModel.load('NEW_GENSIM_MODELS/LDA/decrets_non_urgents')
NMF_decrets_non_urgent = models.Nmf.load('NEW_GENSIM_MODELS/NMF/decrets_non_urgents')
wordcloud_gensim(LDA_decrets_non_urgent, 'NEW_GENSIM_WORDCLOUDS/LDA/decrets_non_urgents', 25, 157)
wordcloud_gensim(NMF_decrets_non_urgent, 'NEW_GENSIM_WORDCLOUDS/NMF/decrets_non_urgents', 25, 157)

In [None]:
# Train 10-topic LDA and NMF models on the 1789-1795 urgency decrees, then
# save both models.
LDA_decrets_urgence_1789_1795 = topic_modelling_LDA(decrets_urgence_1789_1795['texte'], 10, 'NEW_GENSIM_MODEL/decrets_urgence_1789_1795')
NMF_decrets_urgence_1789_1795 = topic_modelling_NMF(decrets_urgence_1789_1795['texte'], 10, 'NEW_GENSIM_MODEL/decrets_urgence_1789_1795')
LDA_decrets_urgence_1789_1795.save('NEW_GENSIM_MODELS/LDA/decrets_urgence_1789_1795')
NMF_decrets_urgence_1789_1795.save('NEW_GENSIM_MODELS/NMF/decrets_urgence_1789_1795')

In [None]:
# Reload the saved 1789-1795 urgency models and render the word clouds of
# their 10 topics (25 words each), NMF first.
LDA_decrets_urgence_1789_1795 = models.LdaModel.load('NEW_GENSIM_MODELS/LDA/decrets_urgence_1789_1795')
NMF_decrets_urgence_1789_1795 = models.Nmf.load('NEW_GENSIM_MODELS/NMF/decrets_urgence_1789_1795')
wordcloud_gensim(NMF_decrets_urgence_1789_1795, 'NEW_GENSIM_WORDCLOUDS/NMF/decrets_urgence_1789_1795', 25, 10)
wordcloud_gensim(LDA_decrets_urgence_1789_1795, 'NEW_GENSIM_WORDCLOUDS/LDA/decrets_urgence_1789_1795', 25, 10)

In [None]:
# Train 165-topic LDA and NMF models on the whole 1789-1795 corpus, then
# save both models.
LDA_ensemble_decrets_1789_1795 = topic_modelling_LDA(ensemble_decrets_1789_1795['texte'], 165, 'NEW_GENSIM_MODEL/ensemble_decrets_1789_1795')
NMF_ensemble_decrets_1789_1795 = topic_modelling_NMF(ensemble_decrets_1789_1795['texte'], 165, 'NEW_GENSIM_MODEL/ensemble_decrets_1789_1795')
LDA_ensemble_decrets_1789_1795.save('NEW_GENSIM_MODELS/LDA/ensemble_decrets_1789_1795')
NMF_ensemble_decrets_1789_1795.save('NEW_GENSIM_MODELS/NMF/ensemble_decrets_1789_1795')

In [None]:
# Reload the saved whole-corpus 1789-1795 models and render the word clouds
# of their 165 topics (25 words each), NMF first.
LDA_ensemble_decrets_1789_1795 = models.LdaModel.load('NEW_GENSIM_MODELS/LDA/ensemble_decrets_1789_1795')
NMF_ensemble_decrets_1789_1795 = models.Nmf.load('NEW_GENSIM_MODELS/NMF/ensemble_decrets_1789_1795')
wordcloud_gensim(NMF_ensemble_decrets_1789_1795, 'NEW_GENSIM_WORDCLOUDS/NMF/ensemble_decrets_1789_1795', 25, 165)
wordcloud_gensim(LDA_ensemble_decrets_1789_1795, 'NEW_GENSIM_WORDCLOUDS/LDA/ensemble_decrets_1789_1795', 25, 165)

In [None]:
# Train 40-topic LDA and NMF models on the full urgency corpus, then save
# both models. `decrets_urgence` is passed directly (no ['texte'] lookup)
# because the loading cell already reduced it to its 'texte' column.
LDA_decrets_urgence = topic_modelling_LDA(decrets_urgence, 40, 'NEW_GENSIM_MODEL/decrets_urgence')
NMF_decrets_urgence = topic_modelling_NMF(decrets_urgence, 40, 'NEW_GENSIM_MODEL/decrets_urgence')
LDA_decrets_urgence.save('NEW_GENSIM_MODELS/LDA/decrets_urgence')
NMF_decrets_urgence.save('NEW_GENSIM_MODELS/NMF/decrets_urgence')

In [None]:
# Reload the saved full-urgency models and render the word clouds of their
# 40 topics (25 words each), NMF first.
LDA_decrets_urgence = models.LdaModel.load('NEW_GENSIM_MODELS/LDA/decrets_urgence')
NMF_decrets_urgence = models.Nmf.load('NEW_GENSIM_MODELS/NMF/decrets_urgence')
wordcloud_gensim(NMF_decrets_urgence, 'NEW_GENSIM_WORDCLOUDS/NMF/decrets_urgence', 25, 40)
wordcloud_gensim(LDA_decrets_urgence, 'NEW_GENSIM_WORDCLOUDS/LDA/decrets_urgence', 25, 40)

In [None]:
# Second 40-topic LDA run on the full urgency corpus, saved under a separate
# "_2" name so the first run's file is not overwritten.
LDA_decrets_urgence2 = topic_modelling_LDA(decrets_urgence, 40, 'NEW_GENSIM_MODEL/decrets_urgence_2')
LDA_decrets_urgence2.save('NEW_GENSIM_MODELS/LDA/decrets_urgence_2')

In [None]:
# Reload the second LDA run and render its 40 topics.
# NOTE(review): this rebinds `LDA_decrets_urgence` (not `LDA_decrets_urgence2`)
# to the "_2" model, so the name no longer refers to the first run afterwards
# — confirm this is intended.
LDA_decrets_urgence = models.LdaModel.load('NEW_GENSIM_MODELS/LDA/decrets_urgence_2')
wordcloud_gensim(LDA_decrets_urgence, 'NEW_GENSIM_WORDCLOUDS/LDA/decrets_urgence_2', 25, 40)

In [None]:
# Second 40-topic NMF run on the full urgency corpus, saved under its own
# file name.
NMF_decrets_urgence2 = topic_modelling_NMF(decrets_urgence, 40, 'NEW_GENSIM_MODEL/nmfdecrets_urgence2')
NMF_decrets_urgence2.save('NEW_GENSIM_MODELS/NMF/decrets_urgence2')

In [None]:
# Reload the second NMF run and render the word clouds of its 40 topics
# (25 words each).
NMF_decrets_urgence2 = models.Nmf.load('NEW_GENSIM_MODELS/NMF/decrets_urgence2')
wordcloud_gensim(NMF_decrets_urgence2, 'NEW_GENSIM_WORDCLOUDS/NMF/decrets_urgence2', 25, 40)

# ESSAI CONCATENATION BULLETIN ET URGENCE

In [None]:
# Stack the 1789-1795 urgency decrees and the "bulletin" decrees into one
# frame with a fresh 0..n-1 index. pd.concat is used because
# DataFrame.append was removed in pandas 2.0.
urgence_et_bulletin = pd.concat([decrets_urgence_1789_1795, decrets_bulletins], ignore_index=True)

In [None]:
# Keep only the non-null texts of the combined frame and turn them into a
# plain list, as passed to Top2Vec's `documents` argument.
urgence_et_bulletin = urgence_et_bulletin['texte'].dropna()
liste_urgence_et_bulletin = urgence_et_bulletin.tolist()
# Train Top2Vec with the "deep-learn" speed setting on 4 workers, save the
# model, and print how many topics it found.
top2vec_urgence_et_bulletin = Top2Vec(documents=liste_urgence_et_bulletin, speed="deep-learn", workers=4)
top2vec_urgence_et_bulletin.save("modeles/top2vec_urgence_et_bulletin")
print(top2vec_urgence_et_bulletin.get_num_topics())
# `wordclouds` is defined outside this section of the notebook.
wordclouds(top2vec_urgence_et_bulletin, liste_urgence_et_bulletin, 'WORDCLOUDS/urgence_et_bulletin', background_color="white", colormap="copper")