In [1]:
import pandas as pd
import nltk
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import re

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Azif\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Azif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azif\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Azif\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [5]:
df = pd.read_csv('news_dataset.csv')
df = df[['text']]  # Keep only the text column
df.dropna(inplace=True)  # Remove null values

In [7]:
df.head()

Unnamed: 0,text
0,I was wondering if anyone out there could enli...
1,I recently posted an article asking what kind ...
2,\nIt depends on your priorities. A lot of peo...
3,an excellent automatic can be found in the sub...
4,: Ford and his automobile. I need information...


In [9]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()  # lowercase
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    tokens = word_tokenize(text)  # tokenize
    tokens = [t for t in tokens if t not in stop_words and len(t) > 3]  # remove stopwords and short tokens
    tokens = [lemmatizer.lemmatize(t) for t in tokens]  # lemmatize
    return tokens

df['tokens'] = df['text'].apply(preprocess)

In [11]:
dictionary = corpora.Dictionary(df['tokens'])
corpus = [dictionary.doc2bow(text) for text in df['tokens']]

In [13]:
lda_model = gensim.models.LdaModel(corpus=corpus,
                                   id2word=dictionary,
                                   num_topics=4,
                                   random_state=100,
                                   update_every=1,
                                   chunksize=100,
                                   passes=10,
                                   alpha='auto',
                                   per_word_topics=True)

In [15]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['tokens'], dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_score}')

Coherence Score: 0.5066238862749219


In [17]:
topics = lda_model.print_topics(num_words=10)
for i, topic in topics:
    print(f"Topic {i+1}: {topic}")

Topic 1: 0.003*"blbh" + 0.002*"trinomials" + 0.002*"boomer" + 0.002*"maxaxaxaxaxaxaxaxaxaxaxaxaxaxax" + 0.002*"scorer" + 0.002*"calgary" + 0.001*"winnipeg" + 0.001*"bhcssimaxbyte" + 0.001*"drake" + 0.001*"scand"
Topic 2: 0.005*"istanbul" + 0.003*"cable" + 0.003*"ankara" + 0.003*"bayonet" + 0.002*"serdar" + 0.002*"argic" + 0.002*"hojali" + 0.002*"negev" + 0.002*"azeri" + 0.002*"erzurum"
Topic 3: 0.009*"would" + 0.008*"people" + 0.006*"dont" + 0.005*"government" + 0.005*"know" + 0.005*"think" + 0.005*"right" + 0.005*"time" + 0.005*"like" + 0.004*"could"
Topic 4: 0.015*"encryption" + 0.015*"chip" + 0.012*"system" + 0.010*"information" + 0.009*"key" + 0.008*"privacy" + 0.008*"file" + 0.007*"program" + 0.007*"data" + 0.007*"message"


#The coherence score obtained for the LDA model is 0.5066, which shows an okay level of topic interpetration. 
Typically coherence scores ranges from 0 to 1. What this means is that higher values suggest that the topics 
generated by the model are more semantically consistent and meaningful. Scoring above 0.5 is
generally acceptable, especially  for text data like public articles, which can be noisy or covers a lot of themes. 
The current score suggests that the topics make some logical sense there is still room for improvement
by cleaning the text further or experimenting with a different number of topics.