In [12]:
# Group Members:
# SARVENTTHINI - SW01081411
# DANESH THEVAR - SN01081843


In [13]:
# Import necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import re


In [14]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
# Read the data (use only the ‘text’ column)
data = pd.read_csv('news_dataset.csv')
texts = data['text'].dropna().tolist()

In [16]:
# Text preprocessing
def preprocess(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation and non-alphabetic characters
    text = re.sub(r'\W+', ' ', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and single character words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and len(word) > 1]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

In [17]:
# Apply preprocessing to the dataset
processed_texts = [preprocess(text) for text in texts]

In [18]:
# Create dictionary and corpus for LDA
dictionary = corpora.Dictionary(processed_texts)
corpus = [dictionary.doc2bow(text) for text in processed_texts]


In [19]:
# Perform LDA using Gensim
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15)

In [20]:
# Evaluate the LDA model using Coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [23]:
# Print the topics and coherence score
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(f"Topic {topic[0]}: {topic[1]}")

print(f"\nCoherence Score: {coherence_lda:.4f}")

Topic 0: 0.010*"file" + 0.008*"edu" + 0.007*"db" + 0.007*"window" + 0.006*"system" + 0.005*"use" + 0.005*"program" + 0.005*"com" + 0.005*"mail" + 0.005*"available"
Topic 1: 0.006*"00" + 0.005*"10" + 0.004*"25" + 0.004*"11" + 0.004*"55" + 0.004*"17" + 0.003*"15" + 0.003*"16" + 0.003*"14" + 0.003*"12"
Topic 2: 0.007*"would" + 0.007*"one" + 0.005*"people" + 0.004*"know" + 0.004*"like" + 0.004*"time" + 0.004*"think" + 0.003*"get" + 0.003*"key" + 0.003*"year"
Topic 3: 0.435*"ax" + 0.032*"max" + 0.008*"g9v" + 0.007*"b8f" + 0.006*"a86" + 0.005*"pl" + 0.005*"145" + 0.004*"1d9" + 0.003*"0t" + 0.003*"1t"

Coherence Score: 0.7811


In [24]:
# Interpretation of the result
interpretation = """
Interpretation of Coherence Score:
The coherence score is a measure of how interpretable the topics are to humans. It
considers the degree of semantic similarity between high-scoring words in the topics. 
A higher coherence score typically indicates more meaningful and coherent topics. 
In this model, the coherence score is {:.4f}, which suggests that the topics identified
by the LDA model are reasonably coherent. However, this score is context-dependent 
and should be interpreted with consideration to the specific dataset and domain.
""".format(coherence_lda)

print(interpretation)


Interpretation of Coherence Score:
The coherence score is a measure of how interpretable the topics are to humans. It
considers the degree of semantic similarity between high-scoring words in the topics. 
A higher coherence score typically indicates more meaningful and coherent topics. 
In this model, the coherence score is 0.7811, which suggests that the topics identified
by the LDA model are reasonably coherent. However, this score is context-dependent 
and should be interpreted with consideration to the specific dataset and domain.

