In [1]:
# import necessary libraries
import itertools
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
from gensim import corpora
from gensim.models import CoherenceModel,LdaModel
from sklearn.metrics import silhouette_samples, silhouette_score


In [2]:
def force_format(texts):
    return [str(t) for t in texts]

In [3]:
dataset = pd.read_json("News_Category_Dataset_v2.json", lines=True, dtype={"headline": str})

In [4]:
texts = force_format(dataset["headline"])

In [5]:
def compute_word_occurences(texts):
    words = itertools.chain.from_iterable(texts)
    word_count = pd.Series(words).value_counts()
    word_count = pd.DataFrame({"Word": word_count.index, "Count": word_count.values})
    return word_count

In [6]:
def get_l_texts(text_file):
    l_texts=[]
    with open(text_file, "r") as f:
        line = f.readlines()
        list_line = [l.strip() for l in line]
        for l in list_line:
            l_texts.append(ast.literal_eval(l))
    return l_texts

In [7]:
l_texts = get_l_texts("l_texts.txt")
print(l_texts[:10])

[['mass_shooting', 'texas', 'week', 'tv'], ['smith', 'join', 'diplo', 'nicky', 'jam', 'world_cup', 'official', 'song'], ['hugh', 'grant', 'marries', 'time', 'age'], ['jim_carrey', 'blasts', 'castrato', 'adam', 'schiff', 'democrats', 'artwork'], ['julianna', 'margulie', 'donald', 'poop', 'bag', 'pick', 'dog'], ['morgan_freeman', 'devastate', 'sexual_harassment', 'claim', 'undermine', 'legacy'], ['donald', 'lovin', 'mcdonald', 'jingle', 'tonight', 'bit'], ['watch', 'amazon', 'prime', 'week'], ['mike', 'myers', 'reveal', 'fourth', 'austin', 'power', 'film'], ['watch', 'hulu', 'week']]


# LDA

In [8]:
# Create a dictionary
dictionary = corpora.Dictionary(l_texts)

# Create a corpus
corpus = [dictionary.doc2bow(text) for text in l_texts]

In [9]:
result={"num_topics":[], "coherence_score":[]}

for n in range(2, 2500):
    lda_model = LdaModel(corpus, num_topics=n, id2word=dictionary)
    coherence_model= CoherenceModel(model=lda_model, texts=l_texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print("Number of topics: ", n, " Coherence Score: ", coherence_score)
    result["num_topics"].append(n)
    result["coherence_score"].append(coherence_score)

Number of topics:  2  Coherence Score:  0.22841177018334805
Number of topics:  3  Coherence Score:  0.22898828390480822


KeyboardInterrupt: 

In [None]:
# Plot the coherence scores for elbow method

plt.plot(result["num_topics"], result["coherence_score"])
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.legend(("coherence_values"), loc='best')
plt.show()