In [1]:
import itertools
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models 
import bertopic

from gensim import corpora
from gensim.models import CoherenceModel,LdaModel,LsiModel
from sklearn.metrics import silhouette_samples, silhouette_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):


## Chargement des données 

In [2]:
def force_format(texts):
    return [str(t) for t in texts]

def compute_word_occurences(texts):
    words = itertools.chain.from_iterable(texts)
    word_count = pd.Series(words).value_counts()
    word_count = pd.DataFrame({"Word": word_count.index, "Count": word_count.values})
    return word_count

def get_l_texts(text_file): #text_file is a .txt file from preprocessing to avoid doing it again
    l_texts=[]
    with open(text_file, "r") as f:
        line = f.readlines()
        list_line = [l.strip() for l in line]
        for l in list_line:
            l_texts.append(ast.literal_eval(l))
    return l_texts

In [3]:
dataset = pd.read_json("News_Category_Dataset_v2.json", lines=True, dtype={"headline": str})
texts = force_format(dataset["headline"])
l_texts = get_l_texts("l_texts.txt")
print(l_texts[:10],"\n",texts[:10],"\n",dataset["headline"][:10])

[['mass_shooting', 'texas', 'week', 'tv'], ['smith', 'join', 'diplo', 'nicky', 'jam', 'world_cup', 'official', 'song'], ['hugh', 'grant', 'marries', 'time', 'age'], ['jim_carrey', 'blasts', 'castrato', 'adam', 'schiff', 'democrats', 'artwork'], ['julianna', 'margulie', 'donald', 'poop', 'bag', 'pick', 'dog'], ['morgan_freeman', 'devastate', 'sexual_harassment', 'claim', 'undermine', 'legacy'], ['donald', 'lovin', 'mcdonald', 'jingle', 'tonight', 'bit'], ['watch', 'amazon', 'prime', 'week'], ['mike', 'myers', 'reveal', 'fourth', 'austin', 'power', 'film'], ['watch', 'hulu', 'week']] 
 ['There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV', "Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song", 'Hugh Grant Marries For The First Time At Age 57', "Jim Carrey Blasts 'Castrato' Adam Schiff And Democrats In New Artwork", 'Julianna Margulies Uses Donald Trump Poop Bags To Pick Up After Her Dog', "Morgan Freeman 'Devastated' That Sexual Harassment Claims Cou

In [4]:
# Create a dictionary
dictionary = corpora.Dictionary(l_texts)
# Create a corpus
corpus = [dictionary.doc2bow(text) for text in l_texts]

print("Number of unique tokens: {}".format(len(dictionary)))
print("Number of documents: {}".format(len(corpus)))

Number of unique tokens: 47649
Number of documents: 200853


## Pour les modèles disponibles dans la librairie gensim

### LDA

In [5]:
result={"num_topics":[], "coherence_score_cv":[],"coherence_score_npmi":[]}

for n in range(50, 65):
    lda_model = LdaModel(corpus, num_topics=n, id2word=dictionary)
    coherence_model_cv= CoherenceModel(model=lda_model, texts=l_texts, dictionary=dictionary, coherence='c_v')
    coherence_score_cv = coherence_model_cv.get_coherence()
    coherence_model_npmi= CoherenceModel(model=lda_model, texts=l_texts, dictionary=dictionary, coherence='c_npmi')
    coherence_score_npmi = coherence_model_npmi.get_coherence()
    print("Number of topics: ", n, " Coherence Score: ", coherence_score_npmi,coherence_score_cv)
    result["num_topics"].append(n)
    result["coherence_score_cv"].append(coherence_score_cv)
    result["coherence_score_npmi"].append(coherence_score_npmi)

Number of topics:  50  Coherence Score:  -0.2824209588809716 0.5098044333816348
Number of topics:  51  Coherence Score:  -0.28828241735395443 0.5181689460125675
Number of topics:  52  Coherence Score:  -0.2874787327215478 0.5153331178821003
Number of topics:  53  Coherence Score:  -0.2943429399351826 0.5255018014061624
Number of topics:  54  Coherence Score:  -0.29591946141787323 0.5268862888920597
Number of topics:  55  Coherence Score:  -0.2960636395784447 0.5284503792971005
Number of topics:  56  Coherence Score:  -0.3000395592161475 0.5323435274476381
Number of topics:  57  Coherence Score:  -0.29919743955948835 0.5319902171309276
Number of topics:  58  Coherence Score:  -0.2971918680648036 0.5283476888252082
Number of topics:  59  Coherence Score:  -0.29820176195152787 0.5307283765564802
Number of topics:  60  Coherence Score:  -0.30102524437381867 0.5353576336774502
Number of topics:  61  Coherence Score:  -0.2973235104383298 0.5288722685081988
Number of topics:  62  Coherence Sc

In [6]:
optimal_n=result["num_topics"][np.argmax(result["coherence_score_cv"])]
lda_model = LdaModel(corpus, num_topics=n, id2word=dictionary)
coherence_model_cv= CoherenceModel(model=lda_model, texts=l_texts, dictionary=dictionary, coherence='c_v')
coherence_score_cv = coherence_model_cv.get_coherence()
coherence_model_npmi= CoherenceModel(model=lda_model, texts=l_texts, dictionary=dictionary, coherence='c_npmi')
coherence_score_npmi = coherence_model_npmi.get_coherence()
print("Optimal number of topics: ", optimal_n, " Coherence Score: ", coherence_score_npmi,coherence_score_cv)

Optimal number of topics:  60  Coherence Score:  -0.2963964682921284 0.5238451755756386


In [7]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis

### LSI


In [8]:
lsi_model = LsiModel(corpus, num_topics=optimal_n, id2word=dictionary)
coherence_model_cv= CoherenceModel(model=lsi_model, texts=l_texts, dictionary=dictionary, coherence='c_v')
coherence_score_cv = coherence_model_cv.get_coherence()
coherence_model_npmi= CoherenceModel(model=lsi_model, texts=l_texts, dictionary=dictionary, coherence='c_npmi')
coherence_score_npmi = coherence_model_npmi.get_coherence()
print("Number of topics: ", optimal_n, " Coherence Score: ", coherence_score_npmi,coherence_score_cv)

  sparsetools.csc_matvecs(
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  sparsetools.csc_matvecs(
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  sparsetools.csc_matvecs(
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  sparsetools.csc_matvecs(
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.shape[0], m.shape[1], k - 1, m.dtype.char)) * m
  out = (1 - tri(m.sh

Number of topics:  60  Coherence Score:  -0.0426014760788819 0.23731591431451662


In [None]:
# Visualize the topics
#Not possible with LSI

## BERTOPIC

In [None]:
representation=bertopic.representation.KeyBERTInspired()
model_trained_representation= bertopic.BERTopic(representation_model=representation,nr_topics=optimal_n)
topics,probs = model_trained_representation.fit_transform(dataset['headline'])

In [None]:
dictionary = corpora.Dictionary(l_texts)
coherence_model= CoherenceModel(model=model_trained_representation, texts=l_texts, dictionary=dictionary, coherence='c_v')
coherence_score_cv = coherence_model.get_coherence()
coherence_model= CoherenceModel(model=model_trained_representation, texts=l_texts, dictionary=dictionary, coherence='c_npmi')
coherence_score_c_npmi = coherence_model.get_coherence()
print("Coherence Score: ", coherence_score_cv,coherence_score_c_npmi)

In [9]:
model_trained_representation.visualize_topics()

In [20]:
(model_trained_representation.get_topic_info())

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,82942,-1_life_mom_love_wedding,"[life, mom, love, wedding, house, donald, day,...",[John Legend Says It Would Take A Gun To His H...
1,0,18986,0_fashion_dress_style_makeup,"[fashion, dress, style, makeup, miley, beauty,...",[Lena Dunham's McQ Dress On 'Girls' Shows Hann...
2,1,15829,1_trump_colbert_donald_trumps,"[trump, colbert, donald, trumps, bernie, presi...",[Seth Meyers Calls Out Donald Trump Jr. As The...
3,2,7611,2_foods_healthy_food_diet,"[foods, healthy, food, diet, nutrition, eating...","[Fast Food the Healthy Way: A Guide, Healthy F..."
4,3,6092,3_meditation_meditations_meditate_mindfulness,"[meditation, meditations, meditate, mindfulnes...","[Meditation Tips for the Day 1, How We Can Hel..."
5,4,4945,4_police_cops_officers_officer,"[police, cops, officers, officer, cop, violenc...",[Florida Cops On What Ferguson Can Learn From ...
6,5,4707,5_isis_syria_syrias_yemen,"[isis, syria, syrias, yemen, syrian, iran, isr...",[Israel Has Lost the War Against Hamas in Gaza...
7,6,4659,6_parenting_mothers_tweets_twitter,"[parenting, mothers, tweets, twitter, parental...",[Best Parenting Tweets: What Moms And Dads Sai...
8,7,4527,7_trumpcare_obamacare_health_medicaid,"[trumpcare, obamacare, health, medicaid, medic...",[Republicans Could Actually Pass This Health C...
9,8,3858,8_destinations_vacation_travelers_travel,"[destinations, vacation, travelers, travel, tr...","[Best New Travel Gadgets for 2014 (PHOTOS), 7 ..."
