In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from collections import Counter
from textblob import Word
import nltk
from gensim import corpora, models
from gensim.models import CoherenceModel


In [None]:

# Load the news dataset from the Excel workbook located in the working directory.
df = pd.read_excel(io='318NewsDataSet.xlsx')


In [None]:

# Normalise the column labels: trim surrounding whitespace and replace every
# space with an underscore.
df.columns = df.columns.map(lambda label: label.strip().replace(' ', '_'))


In [None]:

# Begin with NLTK's English stopword list, then add extra words that this
# notebook also treats as noise when preprocessing documents.
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.update((
    'said', 'also', 'sh', 'r', 'one', 'would', 'get', 'could', 'us',
    'like', 'make', 'many', 'however', 'must', 'still', 'even', 'much', 'new', 'take',
    'two', 'use', 'may', 'well', 'back', 'around', 'another', 'since', 'year', 'yet',
    'without', 'first', 'mr', 'can',
))

# Translation table that deletes ASCII punctuation and digits. It is identical
# for every document, so it is built once instead of on every call.
_STRIP_PUNCT_DIGITS = str.maketrans('', '', string.punctuation + string.digits)


def preprocess(text):
    """Normalise one raw document into a space-joined string of lemmas.

    Steps: lowercase, delete ASCII punctuation and digits, split on
    whitespace, drop stopwords, lemmatise each remaining token with
    ``Word(...).lemmatize()``, then drop any lemma that is itself a stopword.

    Parameters
    ----------
    text : object
        Raw cell value. Strings are used as-is. Missing scalars (``None``,
        ``NaN``, ``pd.NA``, ``NaT``) produce an empty string. Any other
        value is converted with ``str``.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; ``''`` when nothing
        survives or the input is missing.
    """
    if not isinstance(text, str):
        # A missing cell must not be stringified: str(float('nan')) is 'nan',
        # which would otherwise enter the corpus as a real token.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)

    cleaned = text.lower().translate(_STRIP_PUNCT_DIGITS)

    # Filter before lemmatising so stopwords are never lemmatised at all.
    lemmas = (
        Word(token).lemmatize()
        for token in cleaned.split()
        if token not in stopwords
    )
    # Filter again afterwards: an inflected form such as "years" passes the
    # first check but lemmatises to the listed stopword "year".
    return ' '.join(lemma for lemma in lemmas if lemma not in stopwords)


In [None]:

# Clean every document, then split each cleaned string on whitespace to get
# the per-document token lists.
df['Processed_Content'] = df['Content'].map(preprocess)
df['Tokens'] = df['Processed_Content'].map(str.split)


In [None]:

# Number of tokens in each document.
df['Doc_Length'] = df['Tokens'].map(len)

# Histogram of document lengths (30 bins, KDE overlay). The figure is written
# to disk before it is displayed.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['Doc_Length'], bins=30, kde=True, ax=ax)
ax.set_title('Document Length Distribution')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
ax.grid(True)
fig.savefig("length_distribution.png")
plt.show()


In [None]:

# Build the token <-> id vocabulary from all token lists, then convert each
# document to its bag-of-words representation.
dictionary = corpora.Dictionary(df['Tokens'])
corpus = list(map(dictionary.doc2bow, df['Tokens']))


In [None]:

# Sweep the number of topics from 2 to 10. For each candidate, train an LDA
# model with a fixed seed and record its c_v coherence. Both lists are filled
# in the same order, so position i corresponds to num_topics = i + 2.
coherence_scores, models_list = [], []
for num_topics in range(2, 11):
    lda_model = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=df['Tokens'],
        dictionary=dictionary,
        coherence='c_v',
    )
    score = coherence_model.get_coherence()
    coherence_scores += [score]
    models_list += [lda_model]


In [None]:

# Coherence score against number of topics (2 through 10). The figure is
# saved to disk before it is displayed.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(range(2, 11), coherence_scores, marker='o')
ax.set_title('LDA Coherence Scores')
ax.set_xlabel('Number of Topics')
ax.set_ylabel('Coherence Score')
ax.grid(True)
fig.savefig("coherence_scores.png")
plt.show()


In [None]:

# models_list is ordered by topic count starting at 2, so index 4 is the model
# trained with 6 topics. Adjust based on coherence score results.
best_model = models_list[4]
topics = best_model.print_topics(num_words=10)
# Print each topic id together with its ten highest-weighted words.
for topic_id, description in topics:
    print(f"Topic {topic_id}: {description}")


In [None]:

def get_dominant_topic(ldamodel, bow):
    """Return the id of the most probable topic for one document.

    Parameters
    ----------
    ldamodel : object
        Model exposing ``get_document_topics(bow)``, which is expected to
        return a sequence of ``(topic_id, probability)`` entries.
    bow : object
        Bag-of-words representation of the document, passed through unchanged.

    Returns
    -------
    The topic id of the entry with the largest probability, or ``None`` when
    the model returns no topics for the document. On ties the entry that
    appears first in the model's output wins.
    """
    doc_topics = ldamodel.get_document_topics(bow)
    if not doc_topics:
        return None
    # max() keeps the first maximal entry, so ties resolve to the earliest one.
    return max(doc_topics, key=lambda entry: entry[1])[0]

# Label every document with the dominant topic of the selected model; corpus
# is iterated in row order so the labels line up with the DataFrame rows.
df['Dominant_Topic'] = [
    get_dominant_topic(best_model, doc_bow)
    for doc_bow in corpus
]


In [None]:

# Write the enriched DataFrame to CSV without the index column.
df.to_csv(path_or_buf="topic_model_output.csv", index=False)
