In [56]:
# For text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# For topic modeling
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import pandas as pd
import re
import gensim

# Download NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Nur
[nltk_data]     Adilah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Nur
[nltk_data]     Adilah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Nur
[nltk_data]     Adilah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
# Read the data (use only the ‘text’ column)
data = pd.read_csv('news_dataset.csv', usecols=['text'])
data

Unnamed: 0,text
0,I was wondering if anyone out there could enli...
1,I recently posted an article asking what kind ...
2,\nIt depends on your priorities. A lot of peo...
3,an excellent automatic can be found in the sub...
4,: Ford and his automobile. I need information...
...,...
11309,Secrecy in Clipper Chip\n\nThe serial number o...
11310,Hi !\n\nI am interested in the source of FEAL ...
11311,"The actual algorithm is classified, however, t..."
11312,\n\tThis appears to be generic calling upon th...


In [33]:
# Remove null values
data.dropna(subset=['text'], inplace=True)
data.shape

(11096, 1)

In [50]:
# Initialize necessary tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Text preprocessing function
def preprocess(text):
    # Remove special characters, digits, and single letters, then make lowercase
    text = re.sub(r'\W', ' ', str(text))  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # Remove single letters
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)  # Remove single letters from start
    text = re.sub(r'\s+', ' ', text, flags=re.I)  # Replace multiple spaces with single space
    text = re.sub(r'\b[a-zA-Z]\b', '', text)  # Remove all single characters
    text = text.lower()  # Convert to lowercase
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # Remove any remaining tokens of only one character
    tokens = [word for word in tokens if len(word) > 1]
    
    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    
    return tokens


# Apply preprocessing to the text column
data['processed_text'] = data['text'].apply(preprocess)

In [51]:
# Create a dictionary and corpus for the LDA model
id2word = corpora.Dictionary(data['processed_text'])
texts = data['processed_text']
corpus = [id2word.doc2bow(text) for text in texts]

In [52]:
# Build LDA model
lda_model = gensim.models.LdaModel(corpus=corpus,
                                   id2word=id2word,
                                   num_topics=4,
                                   passes=10)

In [53]:
# Assign dominant topic to each document
article_labels = []

for doc in texts:
    # Convert document to bag-of-words representation
    bow = id2word.doc2bow(doc)
    # Get list of topic probabilities
    topics = lda_model.get_document_topics(bow)
    # Determine topic with highest probability
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    # Append to the list
    article_labels.append(dominant_topic)

# Create DataFrame
df_result = pd.DataFrame({"Article": data['text'], "Topic": article_labels})

# Print the DataFrame
print("Table with Articles and Topic:")
print(df_result)
print()

Table with Articles and Topic:
                                                 Article  Topic
0      I was wondering if anyone out there could enli...      0
1      I recently posted an article asking what kind ...      0
2      \nIt depends on your priorities.  A lot of peo...      0
3      an excellent automatic can be found in the sub...      0
4      : Ford and his automobile.  I need information...      0
...                                                  ...    ...
11309  Secrecy in Clipper Chip\n\nThe serial number o...      0
11310  Hi !\n\nI am interested in the source of FEAL ...      0
11311  The actual algorithm is classified, however, t...      0
11312  \n\tThis appears to be generic calling upon th...      0
11313  \nProbably keep quiet and take it, lest they g...      3

[11096 rows x 2 columns]



In [54]:
# Print top terms for each topic
for topic_id in range(lda_model.num_topics):
    print(f"Top terms for Topic #{topic_id}:")
    top_terms = lda_model.show_topic(topic_id, topn=10)
    print([term[0] for term in top_terms])
    print()

Top terms for Topic #0:
['use', 'key', 'would', 'one', 'encrypt', 'system', 'get', 'like', 'work', 'chip']

Top terms for Topic #1:
['ax', 'max', 'gv', 'bf', 'pl', 'di', 'bh', 'cx', 'tm', 'bhj']

Top terms for Topic #2:
['edu', 'file', 'db', 'window', 'anonym', 'use', 'program', 'com', 'mail', 'ftp']

Top terms for Topic #3:
['peopl', 'one', 'would', 'say', 'think', 'go', 'like', 'know', 'time', 'year']



In [58]:
# Evaluate the LDA model using Coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f"Coherence Score: {coherence_lda}")

Coherence Score: 0.7171447255423457


In [1]:
## Student name and id
# Aisya Batrisyia BInti Azley SW01081523
# Nur Adilah Binti Zainal Abidin SW01081031

# The coherence score (0.717) indicates that the level of semantic similarity and meaningfulness
# among the words thus the topics are well-formed and interpretable thus providing coherent and distinct 
# themes that are useful in understanding the structure of the text data. Hence, the model is a well-performing one. 

