In [None]:

import ast

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from gensim import corpora, models
from gensim.models import CoherenceModel

# Load the preprocessed documents. The 'Tokens' column comes back from the CSV
# as text and is parsed into Python objects here.
df = pd.read_csv("preprocessed_data.csv")
# SECURITY: this previously used eval(), which executes arbitrary code found in
# the CSV. ast.literal_eval only accepts Python literals (lists, strings,
# numbers, ...) and raises on anything else.
# NOTE(review): assumes each cell is the repr of a list of token strings -- confirm
# against the preprocessing step that wrote this file.
df['Tokens'] = df['Tokens'].apply(ast.literal_eval)


In [None]:

# Build the gensim vocabulary from the tokenised documents, then convert every
# document to its bag-of-words form with dictionary.doc2bow.
token_docs = df['Tokens']
dictionary = corpora.Dictionary(token_docs)
corpus = list(map(dictionary.doc2bow, token_docs))


In [None]:

# Settings shared by every candidate model; only num_topics changes per run.
# random_state is fixed so the sweep is repeatable.
lda_params = dict(
    corpus=corpus,
    id2word=dictionary,
    random_state=42,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Train one LDA model per topic count in 2..15 and record its c_v coherence.
# Both lists are index-aligned: position i holds the result for i + 2 topics.
coherence_scores = []
models_list = []
for num_topics in range(2, 16):
    lda_model = models.LdaModel(num_topics=num_topics, **lda_params)
    score = CoherenceModel(
        model=lda_model,
        texts=df['Tokens'],
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()
    coherence_scores.append(score)
    models_list.append(lda_model)


In [None]:

# Line chart of c_v coherence against the candidate topic counts (2..15);
# the figure is also written to coherence_scores.png.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(2, 16), coherence_scores, marker='o')
ax.set_title('Coherence Scores by Number of Topics')
ax.set_xlabel('Number of Topics')
ax.set_ylabel('Coherence Score (c_v)')
ax.grid(True)
fig.savefig("coherence_scores.png")
plt.show()


In [None]:

# Evaluate every trained model on the same corpus it was fitted on.
# NOTE(review): per the gensim docs, log_perplexity returns a per-word
# likelihood bound rather than the perplexity itself -- confirm the axis
# wording is what you want before publishing this figure.
perplexity_scores = []
for fitted_model in models_list:
    perplexity_scores.append(fitted_model.log_perplexity(corpus))

# Line chart of the scores against the candidate topic counts (2..15);
# the figure is also written to perplexity_scores.png.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(2, 16), perplexity_scores, marker='o', color='red')
ax.set_title('Perplexity Scores by Number of Topics')
ax.set_xlabel('Number of Topics')
ax.set_ylabel('Log Perplexity')
ax.grid(True)
fig.savefig("perplexity_scores.png")
plt.show()


In [None]:

# Pick the model with the highest c_v coherence (the first one wins on ties).
best_score = max(coherence_scores)
optimal_index = coherence_scores.index(best_score)
best_model = models_list[optimal_index]
# The sweep starts at 2 topics, so list position i corresponds to i + 2 topics.
best_num_topics = 2 + optimal_index
print(f"Optimal number of topics: {best_num_topics}")


In [None]:

# Show the 10 highest-weighted words of each topic in the selected model.
topics = best_model.print_topics(num_words=10)
for topic_entry in topics:
    topic_id, top_terms = topic_entry
    print(f"Topic {topic_id}: {top_terms}")


In [None]:

def get_dominant_topic(ldamodel, bow):
    """Return the id of the most probable topic for one document.

    Parameters
    ----------
    ldamodel
        Any object exposing ``get_document_topics(bow)`` that returns an
        iterable of ``(topic_id, probability)`` pairs.
    bow
        The document, in the representation ``ldamodel`` expects.

    Returns
    -------
    The ``topic_id`` with the highest probability, or ``None`` when the model
    reports no topics for the document. On ties, the topic listed first wins.
    """
    topics = ldamodel.get_document_topics(bow)
    if not topics:
        return None
    # max() finds the top entry in one pass, instead of sorting the whole list,
    # and keeps the first entry on equal probabilities just as the stable
    # descending sort did.
    dominant_id, _ = max(topics, key=lambda pair: pair[1])
    return dominant_id

# One dominant topic per document, in corpus order, stored as a new column.
dominant_topic_ids = [get_dominant_topic(best_model, bow) for bow in corpus]
df['Dominant_Topic'] = dominant_topic_ids


In [None]:

# Grouped bar chart: number of documents per year, split by dominant topic.
# This cell needs seaborn (imported as `sns` in the import cell); without that
# import it fails with NameError on a fresh kernel.
# NOTE(review): assumes the CSV provides a 'Year' column -- nothing in this
# notebook creates it.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Year', hue='Dominant_Topic', palette='tab10', ax=ax)
ax.set_title('Topic Frequency Distribution by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Document Count')
ax.legend(title='Topic')
ax.grid(True)
fig.tight_layout()
fig.savefig("topic_year_distribution.png")
plt.show()


In [None]:

# Persist the frame, now including the Dominant_Topic column, without the
# row index.
output_path = "topic_model_output.csv"
df.to_csv(output_path, index=False)
