In [77]:
# Import necessary libraries
import os
import re
import pandas as pd
import numpy
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim_models
from gensim.models import CoherenceModel

# Preprocessing

In [78]:
# Function to load data from folder path
def load_data(folder_path):
    articles = []
    
	# Iterate through all files in the folder path
    for file in os.listdir(folder_path):
        with open(os.path.join(folder_path, file), "r", encoding="latin") as f:
            content = f.read()
            articles.append(content)
    return articles

In [80]:
# Function to preprocess the loaded data
def preprocess_data(article_files):
    cleaned_articles = []
     # Iterate through all articles in the input list   

    for article_file in article_files:
        # Split individual articles
        articles = article_file.split('')
        for article in articles:
	        # Remove new line characters
                text = re.sub(r"\s+", " ", article)
                # Remove numbers and punctuation
                text = re.sub(r"[^a-zA-Z\s]", "", text)
                # Make all characters lowercase
                text = text.lower()
                # Tokenize text into words
                tokens = word_tokenize(text)
	        # Remove stop words (common words with little meaning such as "the" or "a")
                stop_words = set(stopwords.words("english"))
                custom_stopwords = {"mr", "said", "u"}  # Add any additional stopwords here
                stop_words = stop_words.union(custom_stopwords)
	        # Remove any remaining short words and lemmatize 
                tokens = [token for token in tokens if token not in stop_words]
                tokens = [token for token in tokens if len(token) > 1 or token in {"i", "a"}]
                lemmatizer = WordNetLemmatizer()
                tokens = [lemmatizer.lemmatize(token) for token in tokens]
                tokens = [token for token in tokens if token != "u"]
                cleaned_articles.append(tokens)
    return cleaned_articles

In [81]:
# Function to create a corpus and dictionary from the preprocessed data
def create_corpus_dictionary(cleaned_articles):
    dictionary = corpora.Dictionary(cleaned_articles)
    corpus = [dictionary.doc2bow(article) for article in cleaned_articles]
    return corpus, dictionary

In [82]:
# Run preprocessing functions
folder_path = "Articles/"
article_files = load_data(folder_path)
cleaned_articles = preprocess_data(article_files)
corpus, dictionary = create_corpus_dictionary(cleaned_articles)



# Topic Models

In [64]:
# Function to train an LDA model
def train_lda(corpus, dictionary, num_topics, passes):
    lda_model = models.LdaModel(corpus=corpus,
                                num_topics=num_topics,
                                id2word=dictionary,
                                passes=passes,
                                random_state=13)
    return lda_model

In [65]:
# Function to print and plot the topics from an LDA model
def plot_topics(lda_model):
    num_topics = lda_model.num_topics
    topic_words = lda_model.print_topics(num_words=10)
    
    for topic in range(num_topics):
        print(f"Topic {topic}:")
        print(topic_words[topic])

In [66]:
# Function to run the LDA training and save the results to a file
def run_and_save_lda(corpus, dictionary, num_topics, passes, model_number, output_file):
    lda_model = train_lda(corpus, dictionary, num_topics, passes)

    # save the 4 models to Models/ folder
    temp_file = 'Models/lda_model_' + str(model_number+1)
    lda_model.save(temp_file)

    # Save the topic words to output files
    topic_words = lda_model.print_topics(num_words=10)
    with open(output_file, "w") as f:
        for topic in range(num_topics):
            f.write(f"Topic {topic}:\n")
            f.write(str(topic_words[topic]) + "\n\n")

In [67]:
# Define different parameters to build the LDA model
# use different topic and pass count and discuss differences in report
params = [{"num_topics": 5, "passes": 10},{"num_topics": 10, "passes": 10},{"num_topics": 5, "passes": 20},{"num_topics": 10, "passes": 20}]


# Iterating through the parameters and building different LDA models
for i, p in enumerate(params):
    output_file = f"output_{i + 1}.txt"
    run_and_save_lda(corpus, dictionary,  p["num_topics"], p["passes"], i, output_file)

# Evaluation

In [68]:
# Load models from Models folder
model_1 = models.ldamodel.LdaModel.load("Models/lda_model_1")
model_2 = models.ldamodel.LdaModel.load("Models/lda_model_2")
model_3 = models.ldamodel.LdaModel.load("Models/lda_model_3")
model_4 = models.ldamodel.LdaModel.load("Models/lda_model_4")
models_list = [model_1, model_2, model_3, model_4]

In [69]:
# Function to print Coherence Score of model
def compute_coherence(lda_model):
    coherence_lda_model = CoherenceModel(model=lda_model, texts=cleaned_articles, dictionary=dictionary, coherence='c_v')
    coherence_number = coherence_lda_model.get_coherence()
    print('Coherence:  ', coherence_number)

In [70]:
model_number = 0

# Compute Perplexity of all 4 models
for m in models_list:
    model_number+=1
    print('Model ', model_number)
    print('Perplexity: ', m.log_perplexity(corpus))
    compute_coherence(m)
    print('\n')

Model  1
Perplexity:  -8.492769328374036
Coherence:   0.3008167104770959


Model  2
Perplexity:  -8.498756589638418
Coherence:   0.3068762045774803


Model  3
Perplexity:  -8.483742503821002
Coherence:   0.2884899369530641


Model  4
Perplexity:  -8.49175200324849
Coherence:   0.30918349721905114




# Visualization

In [71]:
# Enable Python Notebook display of pyLDAvis
pyLDAvis.enable_notebook()


In [72]:

# Model 1 visualization
vis = pyLDAvis.gensim_models.prepare(model_1, corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(


In [73]:
# Model 2 visualization
vis = pyLDAvis.gensim_models.prepare(model_2, corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(


In [74]:
# Model 3 visualization
vis = pyLDAvis.gensim_models.prepare(model_3, corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(


In [75]:
# Model 4 visualization
vis = pyLDAvis.gensim_models.prepare(model_4, corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(
