In [29]:
# Import necessary libraries
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim import corpora, models
from gensim.test.utils import datapath
import gensim
import pyLDAvis
import pyLDAvis.gensim_models

# Preprocessing

In [11]:
# Function to load data from folder path
def load_data(folder_path):
    articles = []
    
	# Iterate through all files in the folder path
    for file in os.listdir(folder_path):
        with open(os.path.join(folder_path, file), "r", encoding="latin") as f:
            content = f.read()
            articles.append(content)
    return articles

In [12]:
# Function to preprocess the loaded data
def preprocess_data(articles):
    cleaned_articles = []
     # Iterate through all articles in the input list   

    for article in articles:
	# Remove excessive whitespace characters and convert to lowercase
        text = re.sub(r"\s+", " ", article)
        text = re.sub(r"[^a-zA-Z\s]", "", text).lower()
	# Tokenize text into words
        tokens = word_tokenize(text)
	# Remove stop words (common words with little meaning such as "the" or "a")
        stop_words = set(stopwords.words("english"))
        custom_stopwords = {"mr", "said", "u"}  # Add any additional stopwords here
        stop_words = stop_words.union(custom_stopwords)
	# Remove any remaining short words and lemmatize 
        tokens = [token for token in tokens if token not in stop_words]
        tokens = [token for token in tokens if len(token) > 1 or token in {"i", "a"}]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        tokens = [token for token in tokens if token != "u"]
        cleaned_articles.append(tokens)
    return cleaned_articles

In [13]:
# Function to create a corpus and dictionary from the preprocessed data
def create_corpus_dictionary(cleaned_articles):
    dictionary = corpora.Dictionary(cleaned_articles)
    corpus = [dictionary.doc2bow(article) for article in cleaned_articles]
    return corpus, dictionary

In [14]:
# Run preprocessing functions
folder_path = "Articles/"
articles = load_data(folder_path)
cleaned_articles = preprocess_data(articles)
corpus, dictionary = create_corpus_dictionary(cleaned_articles)

# Topic Models

In [15]:
# Function to train an LDA model
def train_lda(corpus, dictionary, num_topics, passes=15):
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)
    return lda_model

In [16]:
# Function to print and plot the topics from an LDA model
def plot_topics(lda_model):
    num_topics = lda_model.num_topics
    topic_words = lda_model.print_topics(num_words=10)
    
    for topic in range(num_topics):
        print(f"Topic {topic}:")
        print(topic_words[topic])

In [17]:
# Function to run the LDA training and save the results to a file
def run_and_save_lda(corpus, dictionary, num_topics, passes, model_number, output_file):
    lda_model = train_lda(corpus, dictionary, num_topics, passes)

    # save the 4 models to Models/ folder
    temp_file = 'Models/lda_model_' + str(model_number+1)
    lda_model.save(temp_file)

    # Save the topic words to output files
    topic_words = lda_model.print_topics(num_words=10)
    with open(output_file, "w") as f:
        for topic in range(num_topics):
            f.write(f"Topic {topic}:\n")
            f.write(str(topic_words[topic]) + "\n\n")

In [18]:
# Define different parameters to build the LDA model
# use different topic and pass count and discuss differences in report
params = [{"num_topics": 5, "passes": 10},{"num_topics": 10, "passes": 10},{"num_topics": 5, "passes": 20},{"num_topics": 10, "passes": 20}]


# Iterating through the parameters and building different LDA models
for i, p in enumerate(params):
    output_file = f"output_{i + 1}.txt"
    run_and_save_lda(corpus, dictionary,  p["num_topics"], p["passes"], i, output_file)

# Evaluation

In [19]:
# Load models from Models folder
model_1 = models.ldamodel.LdaModel.load("Models/lda_model_1")
model_2 = models.ldamodel.LdaModel.load("Models/lda_model_2")
model_3 = models.ldamodel.LdaModel.load("Models/lda_model_3")
model_4 = models.ldamodel.LdaModel.load("Models/lda_model_4")
models_list = [model_1, model_2, model_3, model_4]

In [20]:
# Compute Perplexity of all 4 models
for m in models_list:
    print('\nPerplexity: ', m.log_perplexity(corpus))


Perplexity:  -8.493143757989483

Perplexity:  -8.502332575847719

Perplexity:  -8.478354714100588

Perplexity:  -8.504284328414839


# Visualization

In [30]:
# Enable Python Notebook display of pyLDAvis
pyLDAvis.enable_notebook

# Model 1 visualization
vis = pyLDAvis.gensim_models.prepare(model_1, corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(


In [31]:
# Model 2 visualization
vis = pyLDAvis.gensim_models.prepare(model_2, corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(
