In [3]:
import itertools
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models 
import bertopic

from gensim import corpora
from gensim.models import CoherenceModel,LdaModel,LsiModel
from sklearn.metrics import silhouette_samples, silhouette_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):


## Chargement des données 

In [4]:
def force_format(texts):
    """Return a new list holding the ``str()`` conversion of every item of ``texts``."""
    return list(map(str, texts))

def compute_word_occurences(texts):
    """Count how often each word appears across all tokenised documents.

    ``texts`` is an iterable of word iterables. The result is a DataFrame with
    a "Word" column and a "Count" column, in the order produced by
    ``Series.value_counts`` (most frequent first).
    """
    # Flatten the nested documents into a single stream of words.
    flat_words = pd.Series(itertools.chain.from_iterable(texts))
    frequencies = flat_words.value_counts()
    return pd.DataFrame({"Word": frequencies.index, "Count": frequencies.values})

def get_l_texts(text_file):
    """Load the preprocessed documents saved in ``text_file``.

    ``text_file`` is a .txt file written by the preprocessing step (so that
    preprocessing does not have to be run again). Every non-blank line must
    hold one Python literal, which is parsed with ``ast.literal_eval``.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped; previously they made ``ast.literal_eval``
    raise ``SyntaxError``.

    Returns the parsed values as a list, in file order.
    Raises ``ValueError``/``SyntaxError`` if a non-blank line is not a valid
    Python literal.
    """
    l_texts = []
    with open(text_file, "r") as f:
        # Iterate over the file lazily instead of loading every line up front.
        for raw_line in f:
            literal = raw_line.strip()
            if not literal:
                continue
            l_texts.append(ast.literal_eval(literal))
    return l_texts

In [5]:
# Raw dataset: a JSON-lines file, with the "headline" column read as strings.
dataset = pd.read_json(
    "News_Category_Dataset_v2.json",
    dtype={"headline": str},
    lines=True,
)
# Tokenised headlines saved by the preprocessing step (one literal per line).
l_texts = get_l_texts("l_texts.txt")
# Headlines as a plain list of str.
texts = force_format(dataset["headline"])
# Quick look at the first ten entries of each representation.
print(l_texts[:10], "\n", texts[:10], "\n", dataset["headline"].head(10))

[['mass_shooting', 'texas', 'week', 'tv'], ['smith', 'join', 'diplo', 'nicky', 'jam', 'world_cup', 'official', 'song'], ['hugh', 'grant', 'marries', 'time', 'age'], ['jim_carrey', 'blasts', 'castrato', 'adam', 'schiff', 'democrats', 'artwork'], ['julianna', 'margulie', 'donald', 'poop', 'bag', 'pick', 'dog'], ['morgan_freeman', 'devastate', 'sexual_harassment', 'claim', 'undermine', 'legacy'], ['donald', 'lovin', 'mcdonald', 'jingle', 'tonight', 'bit'], ['watch', 'amazon', 'prime', 'week'], ['mike', 'myers', 'reveal', 'fourth', 'austin', 'power', 'film'], ['watch', 'hulu', 'week']] 
 ['There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV', "Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song", 'Hugh Grant Marries For The First Time At Age 57', "Jim Carrey Blasts 'Castrato' Adam Schiff And Democrats In New Artwork", 'Julianna Margulies Uses Donald Trump Poop Bags To Pick Up After Her Dog', "Morgan Freeman 'Devastated' That Sexual Harassment Claims Cou

In [6]:
# Token -> integer id mapping built from the tokenised documents.
dictionary = corpora.Dictionary(l_texts)
# Bag-of-words encoding of every document, using that mapping.
corpus = list(map(dictionary.doc2bow, l_texts))

# Size of the vocabulary and of the corpus.
print("Number of unique tokens: {}".format(len(dictionary)))
print("Number of documents: {}".format(len(corpus)))

Number of unique tokens: 47649
Number of documents: 200853


## Pour les modèles disponibles dans la bibliothèque gensim

### LDA

In [7]:
# Sweep the number of LDA topics and record both coherence measures for each
# trained model, so the best topic count can be selected afterwards.
result = {"num_topics": [], "coherence_score_cv": [], "coherence_score_npmi": []}

for n in range(50, 65):
    # LDA training is randomly initialised: a fixed seed keeps the scores
    # comparable between runs instead of changing on every execution.
    lda_model = LdaModel(corpus, num_topics=n, id2word=dictionary, random_state=42)
    coherence_model_cv = CoherenceModel(model=lda_model, texts=l_texts, dictionary=dictionary, coherence='c_v')
    coherence_score_cv = coherence_model_cv.get_coherence()
    coherence_model_npmi = CoherenceModel(model=lda_model, texts=l_texts, dictionary=dictionary, coherence='c_npmi')
    coherence_score_npmi = coherence_model_npmi.get_coherence()
    # Printed order: c_npmi first, then c_v.
    print("Number of topics: ", n, " Coherence Score: ", coherence_score_npmi,coherence_score_cv)
    result["num_topics"].append(n)
    result["coherence_score_cv"].append(coherence_score_cv)
    result["coherence_score_npmi"].append(coherence_score_npmi)

Number of topics:  50  Coherence Score:  -0.282094881502408 0.5094184357556886
Number of topics:  51  Coherence Score:  -0.2887068055109819 0.5170890273807071
Number of topics:  52  Coherence Score:  -0.28654223883856744 0.5146926482367074
Number of topics:  53  Coherence Score:  -0.2921405308571968 0.5217060201315133
Number of topics:  54  Coherence Score:  -0.2915939252852845 0.5240190432138078
Number of topics:  55  Coherence Score:  -0.2963163306976359 0.5276565855854817
Number of topics:  56  Coherence Score:  -0.29463126864080774 0.5252333748127519
Number of topics:  57  Coherence Score:  -0.2967325421189156 0.5275971462989143
Number of topics:  58  Coherence Score:  -0.296684626854237 0.5267835173715181
Number of topics:  59  Coherence Score:  -0.3010507549171909 0.5323793578015006
Number of topics:  60  Coherence Score:  -0.30262437122217595 0.536823468826923


KeyboardInterrupt: 

In [None]:
# Topic count with the highest c_v coherence among the values tried in the sweep.
optimal_n = result["num_topics"][np.argmax(result["coherence_score_cv"])]
# Train the final model with `optimal_n`. The loop variable `n` must not be
# used here: it only holds the last topic count the sweep reached, not the best.
# A fixed seed makes the reported model reproducible.
lda_model = LdaModel(corpus, num_topics=optimal_n, id2word=dictionary, random_state=42)
coherence_model_cv = CoherenceModel(model=lda_model, texts=l_texts, dictionary=dictionary, coherence='c_v')
coherence_score_cv = coherence_model_cv.get_coherence()
coherence_model_npmi = CoherenceModel(model=lda_model, texts=l_texts, dictionary=dictionary, coherence='c_npmi')
coherence_score_npmi = coherence_model_npmi.get_coherence()
# Printed order: c_npmi first, then c_v.
print("Optimal number of topics: ", optimal_n, " Coherence Score: ", coherence_score_npmi,coherence_score_cv)

In [None]:
# Visualize the topics
# prepare() receives the trained LDA model, the bag-of-words corpus and the
# dictionary; the object it returns is kept in `vis` and, being the last
# expression of the cell, is what the notebook displays.
# NOTE(review): enable_notebook() presumably switches pyLDAvis to inline
# notebook rendering - confirm against the pyLDAvis documentation.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

### LSI


In [None]:
# Train an LSI model with the topic count selected for LDA, then score it with
# the same two coherence measures so both models can be compared.
lsi_model = LsiModel(corpus, num_topics=optimal_n, id2word=dictionary)
coherence_model_cv = CoherenceModel(model=lsi_model, texts=l_texts, dictionary=dictionary, coherence='c_v')
coherence_score_cv = coherence_model_cv.get_coherence()
coherence_model_npmi = CoherenceModel(model=lsi_model, texts=l_texts, dictionary=dictionary, coherence='c_npmi')
coherence_score_npmi = coherence_model_npmi.get_coherence()
# Report `optimal_n`, the topic count this model was actually built with
# (not the leftover loop variable `n`). Printed order: c_npmi first, then c_v.
print("Number of topics: ", optimal_n, " Coherence Score: ", coherence_score_npmi,coherence_score_cv)

In [None]:
# Visualize the topics
# Not possible with LSI, so there is nothing to display for this model.

## Avec BERTopic


In [None]:
# KeyBERT-inspired representation model, used by BERTopic to pick topic words.
representation = bertopic.representation.KeyBERTInspired()
# NOTE(review): nr_topics=55 is hard-coded - presumably taken from an earlier
# LDA coherence sweep; confirm it is still the intended value.
model_trained_representation = bertopic.BERTopic(
    nr_topics=55,
    representation_model=representation,
)
# Fit on the raw headlines and keep the per-document topics and probabilities.
topics, probs = model_trained_representation.fit_transform(dataset["headline"])

In [None]:
# Coherence of the BERTopic topics, measured with gensim on the same
# tokenised documents and dictionary as the LDA / LSI models.
dictionary = corpora.Dictionary(l_texts)

# gensim's CoherenceModel cannot read topics out of a BERTopic object passed as
# `model=` (it expects a gensim-style topic-term matrix from get_topics()), so
# the topic words are extracted explicitly and handed over through `topics=`.
bertopic_topics = []
for topic_id, word_scores in model_trained_representation.get_topics().items():
    if topic_id == -1:
        # -1 is BERTopic's outlier bucket, not a real topic.
        continue
    # Keep only words known to the dictionary: CoherenceModel cannot score
    # tokens that are absent from it (this also drops empty padding strings).
    topic_words = [word for word, _ in word_scores if word in dictionary.token2id]
    # Pairwise measures such as c_npmi need at least two words per topic.
    if len(topic_words) >= 2:
        bertopic_topics.append(topic_words)

coherence_model = CoherenceModel(topics=bertopic_topics, texts=l_texts, dictionary=dictionary, coherence='c_v')
coherence_score_cv = coherence_model.get_coherence()
coherence_model = CoherenceModel(topics=bertopic_topics, texts=l_texts, dictionary=dictionary, coherence='c_npmi')
coherence_score_c_npmi = coherence_model.get_coherence()
# Printed order: c_npmi first, then c_v, matching the LDA and LSI cells.
print("Coherence Score: ", coherence_score_c_npmi,coherence_score_cv)

In [None]:
# Call BERTopic's topic visualization on the fitted model; as the last
# expression of the cell, the returned object is what the notebook displays.
model_trained_representation.visualize_topics()