In [1]:
#Kamal Adeem Bin Kamaruddin IS01081937
#Muhammad Hafiz SW01081229

import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

# Download NLTK resources (only needed once; packages that are already
# present are reported as up-to-date).
# 'punkt_tab' is the tokenizer data that word_tokenize loads on
# NLTK >= 3.8.2; 'punkt' is kept for older releases.
# NOTE(review): on an NLTK release whose index lacks one of these names,
# nltk.download() is expected to print an error and return False rather
# than raise -- confirm if you pin a very old version.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load dataset
df = pd.read_csv('news_dataset.csv')

# Drop nulls and keep only 'text' column
df = df[['text']].dropna()

# Initialize NLP tools
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Text preprocessing
def preprocess(text):
    """Turn one raw document into a list of normalized tokens.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, and the result is split with ``word_tokenize``.
    Tokens that are English stop words or have three or fewer characters are
    dropped; each surviving token is Porter-stemmed and the stem is then
    passed through the WordNet lemmatizer.

    Args:
        text: The raw document as a string.

    Returns:
        list[str]: The processed tokens, in document order.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    kept = []
    for token in word_tokenize(letters_only):
        # Skip short tokens and stop words before doing any stemming work.
        if len(token) <= 3 or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(stemmer.stem(token)))
    return kept

# Tokenize every document and store the token lists next to the raw text
df['tokens'] = df['text'].map(preprocess)
print(df['tokens'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0        [wonder, anyon, could, enlighten, door, sport,...
1        [recent, post, articl, ask, kind, rate, singl,...
2        [depend, prioriti, peopl, higher, prioriti, mi...
3        [excel, automat, found, subaru, legaci, switch...
4        [ford, automobil, need, inform, whether, ford,...
                               ...                        
11309    [secreci, clipper, chip, serial, number, clipp...
11310    [interest, sourc, feal, encrypt, algorithm, so...
11311    [actual, algorithm, classifi, howev, main, thr...
11312    [appear, gener, call, upon, name, anti, christ...
11313    [probabl, keep, quiet, take, lest, kneecap, bust]
Name: tokens, Length: 11096, dtype: object


In [11]:
# Prepare dictionary and corpus
# gensim expects plain iterables of token lists, so convert the Series once
# and reuse it for the dictionary, the corpus and the coherence model.
tokenized_docs = df['tokens'].tolist()
dictionary = corpora.Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

# Train LDA model; NUM_TOPICS is the single place to change the topic count
NUM_TOPICS = 2
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS, passes=15, random_state=42)

# Evaluate with Coherence Score
coherence_model = CoherenceModel(model=lda_model, texts=tokenized_docs, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"\n🧪 Coherence Score: {coherence_score:.4f}\n")

# Assign dominant topic to each document
# - `corpus` already holds every document's bag-of-words in row order, so it
#   is reused instead of calling doc2bow a second time per document.
# - minimum_probability=0.0 keeps every topic in the returned list, so max()
#   never receives an empty sequence (which could otherwise happen when all
#   topic probabilities fall below the model's default threshold).
df['assigned_topic'] = [
    max(lda_model.get_document_topics(bow, minimum_probability=0.0), key=lambda x: x[1])[0]
    for bow in corpus
]

# Show first few labeled articles
print("📄 First few labeled articles:")
print(df[['text', 'assigned_topic']].head(), "\n")


🧪 Coherence Score: 0.4944

📄 First few labeled articles:
                                                text  assigned_topic
0  I was wondering if anyone out there could enli...               1
1  I recently posted an article asking what kind ...               0
2  \nIt depends on your priorities.  A lot of peo...               0
3  an excellent automatic can be found in the sub...               1
4  : Ford and his automobile.  I need information...               1 



In [9]:
# Show top terms per topic
# NOTE: topics are labelled 1-based here (idx + 1), whereas the topic ids
# returned by the model itself are 0-based.
print("Top terms per topic:")
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"\nTopic {idx + 1}:")
    # Each topic arrives as one string of 'weight*"word"' terms joined by '+'.
    for term in topic.split('+'):
        weight, word = term.split('*')
        # Strip the surrounding quotes outside the f-string: a backslash inside
        # an f-string replacement field is a SyntaxError before Python 3.12.
        word = word.strip().strip('"')
        print(f"- {word} (weight: {weight.strip()})")

Top terms per topic:

Topic 1:
- would (weight: 0.008)
- peopl (weight: 0.007)
- think (weight: 0.005)
- like (weight: 0.005)
- know (weight: 0.005)
- time (weight: 0.004)
- year (weight: 0.004)
- make (weight: 0.004)
- right (weight: 0.004)
- govern (weight: 0.004)

Topic 2:
- use (weight: 0.008)
- system (weight: 0.006)
- file (weight: 0.006)
- encrypt (weight: 0.006)
- program (weight: 0.005)
- chip (weight: 0.005)
- window (weight: 0.004)
- would (weight: 0.004)
- like (weight: 0.004)
- inform (weight: 0.004)


In [17]:
"""in Topic 2 above, words like use, system, file,
and encrypt appear together and suggest a theme 
around technology or cybersecurity. The 
top words in Topic 1 are unrelated or too generic, so the coherence score may be lower,
signaling that the topic might not represent a distinct, meaningful theme."""

'in Topic 2 above, words like use, system, file,\nand encrypt appear together and suggest a theme \naround technology or cybersecurity. The \ntop words in Topic 1 are unrelated or too generic, so the coherence score may be lower,\nsignaling that the topic might not represent a distinct, meaningful theme.'