In [1]:
# import necessary libraries
import itertools
import pandas as pd
import numpy as np
import ast
from gensim import corpora
from gensim.models import CoherenceModel
from sklearn.metrics import silhouette_samples, silhouette_score

In [2]:
def force_format(texts):
    return [str(t) for t in texts]

In [3]:
def compute_word_occurences(texts):
    words = itertools.chain.from_iterable(texts)
    word_count = pd.Series(words).value_counts()
    word_count = pd.DataFrame({"Word": word_count.index, "Count": word_count.values})
    return word_count

In [4]:
def get_l_texts(text_file):
    l_texts=[]
    with open(text_file, "r") as f:
        line = f.readlines()
        list_line = [l.strip() for l in line]
        for l in list_line:
            l_texts.append(ast.literal_eval(l))
    return l_texts

In [5]:
dataset = pd.read_json("News_Category_Dataset_v2.json", lines=True, dtype={"headline": str})

In [6]:
texts = force_format(dataset["headline"])

In [7]:
l_texts = get_l_texts("l_texts.txt")

# BERT

In [8]:
#Import
import bertopic


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
representation=bertopic.representation.KeyBERTInspired()
model_trained_representation= bertopic.BERTopic(representation_model=representation,nr_topics=55)
topics,probs = model_trained_representation.fit_transform(dataset['headline'])
model_trained_representation.visualize_topics()


In [10]:
dictionary = corpora.Dictionary(l_texts)
coherence_model= CoherenceModel(model=model_trained_representation, texts=l_texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print("Coherence Score: ", coherence_score)

Coherence Score:  1.0


In [20]:
(model_trained_representation.get_topic_info())

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,82942,-1_life_mom_love_wedding,"[life, mom, love, wedding, house, donald, day,...",[John Legend Says It Would Take A Gun To His H...
1,0,18986,0_fashion_dress_style_makeup,"[fashion, dress, style, makeup, miley, beauty,...",[Lena Dunham's McQ Dress On 'Girls' Shows Hann...
2,1,15829,1_trump_colbert_donald_trumps,"[trump, colbert, donald, trumps, bernie, presi...",[Seth Meyers Calls Out Donald Trump Jr. As The...
3,2,7611,2_foods_healthy_food_diet,"[foods, healthy, food, diet, nutrition, eating...","[Fast Food the Healthy Way: A Guide, Healthy F..."
4,3,6092,3_meditation_meditations_meditate_mindfulness,"[meditation, meditations, meditate, mindfulnes...","[Meditation Tips for the Day 1, How We Can Hel..."
5,4,4945,4_police_cops_officers_officer,"[police, cops, officers, officer, cop, violenc...",[Florida Cops On What Ferguson Can Learn From ...
6,5,4707,5_isis_syria_syrias_yemen,"[isis, syria, syrias, yemen, syrian, iran, isr...",[Israel Has Lost the War Against Hamas in Gaza...
7,6,4659,6_parenting_mothers_tweets_twitter,"[parenting, mothers, tweets, twitter, parental...",[Best Parenting Tweets: What Moms And Dads Sai...
8,7,4527,7_trumpcare_obamacare_health_medicaid,"[trumpcare, obamacare, health, medicaid, medic...",[Republicans Could Actually Pass This Health C...
9,8,3858,8_destinations_vacation_travelers_travel,"[destinations, vacation, travelers, travel, tr...","[Best New Travel Gadgets for 2014 (PHOTOS), 7 ..."


In [11]:
model_trained= bertopic.BERTopic(nr_topics=55)
topics,probs = model_trained.fit_transform(dataset['headline'])
dictionary = corpora.Dictionary(l_texts)
coherence_model= CoherenceModel(model=model_trained, texts=l_texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print("Coherence Score: ", coherence_score)

Coherence Score:  1.0


In [13]:
model_trained.visualize_topics()

In [21]:
(model_trained.get_topic_info())

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,82306,-1_the_to_in_of,"[the, to, in, of, and, for, is, on, with, photos]",[What To Expect The First Day Home From The Ho...
1,0,19160,0_photos_the_and_is,"[photos, the, and, is, fashion, her, of, on, i...",[16 Awesome Fashion Editors You Should Get To ...
2,1,18919,1_trump_donald_trumps_to,"[trump, donald, trumps, to, on, clinton, gop, ...","[Donald Trump: The President Of Id, The One Th..."
3,2,6802,2_meditation_your_you_to,"[meditation, your, you, to, the, how, of, life...","[Daily Meditation: Be Still, What Your 'Life S..."
4,3,6764,3_recipes_halloween_food_the,"[recipes, halloween, food, the, thanksgiving, ...",[What The Kitchen Staff Eat: Recipes From The ...
5,4,6384,4_abortion_parenting_parenthood_planned,"[abortion, parenting, parenthood, planned, col...",[Best Parenting Tweets: What Moms And Dads Sai...
6,5,5503,5_photos_travel_home_weekly,"[photos, travel, home, weekly, hotels, gps, ho...",[Weekly Roundup of eBay Vintage Home Finds (PH...
7,6,5171,6_iran_muslim_in_isis,"[iran, muslim, in, isis, syria, refugees, isra...","[Top ISIS Leader In Syria Killed In U.S. Raid,..."
8,7,3677,7_tax_homeless_money_wage,"[tax, homeless, money, wage, to, the, minimum,...","[Should I Give Money to the Homeless?, When Is..."
9,8,3212,8_holiday_christmas_day_valentines,"[holiday, christmas, day, valentines, holidays...","[This Christmas, Have A Nice Day, Holiday Gift..."


In [12]:
"""result=pd.DataFrame(columns=['min_size_topic','Coherence Score',"Number of topics"])
for min_size_topic in range(10, 200, 10):
    model = bertopic.BERTopic(representation_model=representation,min_topic_size=min_size_topic, verbose=True)
    topics,probs = model.fit_transform(dataset['headline'])
    print("min_size_topic =", min_size_topic, "Number of topics :", len(np.unique(topics)))
    coherence_model= CoherenceModel(model=model, texts=l_texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print("min_size_topic =", min_size_topic, "Coherence Score: ", coherence_score)
    result=result.append({'min_size_topic':min_size_topic,'Coherence Score':coherence_score,"Number of topics":len(np.unique(topics))},ignore_index=True)"""

'result=pd.DataFrame(columns=[\'min_size_topic\',\'Coherence Score\',"Number of topics"])\nfor min_size_topic in range(10, 200, 10):\n    model = bertopic.BERTopic(representation_model=representation,min_topic_size=min_size_topic, verbose=True)\n    topics,probs = model.fit_transform(dataset[\'headline\'])\n    print("min_size_topic =", min_size_topic, "Number of topics :", len(np.unique(topics)))\n    coherence_model= CoherenceModel(model=model, texts=l_texts, dictionary=dictionary, coherence=\'c_v\')\n    coherence_score = coherence_model.get_coherence()\n    print("min_size_topic =", min_size_topic, "Coherence Score: ", coherence_score)\n    result=result.append({\'min_size_topic\':min_size_topic,\'Coherence Score\':coherence_score,"Number of topics":len(np.unique(topics))},ignore_index=True)'