In [21]:
import pandas as pd
import nltk
import re
import numpy as np
import gensim
import pyLDAvis.gensim_models
import pyLDAvis.gensim_models as gensimvis
import json

from collections import Counter
from nltk.corpus import stopwords
from gensim import corpora, models
from gensim.models import CoherenceModel
from collections import defaultdict


# Fetch the NLTK data this notebook relies on: the stop-word lists read by
# preprocess_text and the 'punkt' tokenizer data.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' rather than
# 'punkt' when word_tokenize is called — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_text(text, threshold=2):
    """Clean one document and keep only its recurring content words.

    The text is lower-cased, every run of non-letter characters is replaced
    by a single space, and the result is tokenized with NLTK's
    ``word_tokenize``.  English stop words, month names and tokens of one or
    two characters are discarded.  Of the remaining tokens, only those that
    occur at least ``threshold`` times in this document are kept (every
    occurrence, in their original order).

    Args:
        text: Raw document text.  Non-string values (for example the NaN
            pandas produces for an empty CSV cell) are treated as an empty
            document instead of raising.
        threshold: Minimum number of occurrences within the document for a
            token to be kept.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives.
    """
    if not isinstance(text, str):
        return ''

    months = ['january', 'february', 'march', 'april', 'may', 'june', 'july',
              'august', 'september', 'october', 'november', 'december']

    # A set makes each membership test O(1); the NLTK list would be scanned
    # linearly for every token.
    stop_words = set(stopwords.words('english'))
    stop_words.update(months)

    # Lowercase, then collapse anything that is not a letter into one space.
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]+', ' ', text)

    # Tokenize, dropping stop words (months included) and very short tokens.
    tokens = [
        token for token in word_tokenize(text)
        if len(token) > 2 and token not in stop_words
    ]

    # Count and filter the *cleaned* tokens.  Filtering the raw text instead
    # only removes stop words by accident (their count is zero) and lets them
    # back in whenever threshold <= 0.
    word_counts = Counter(tokens)
    filtered_words = [token for token in tokens if word_counts[token] >= threshold]

    return ' '.join(filtered_words)


def get_optimal_num_topics(articles, min_topics=2, max_topics=10, passes=10, random_state=None):
    """Pick the LDA topic count with the highest c_v coherence.

    One ``LdaModel`` is trained for every topic count in
    ``range(min_topics, max_topics)`` and scored with a ``CoherenceModel``
    using the ``'c_v'`` measure.  The models themselves are discarded; only
    the best-scoring topic count is returned.

    Args:
        articles: Iterable of whitespace-separated, already preprocessed
            documents (one string per document).
        min_topics: Smallest topic count to try (inclusive).
        max_topics: Upper bound of the search (exclusive, as in ``range``).
        passes: Number of training passes handed to each ``LdaModel``.
        random_state: Optional seed forwarded to ``LdaModel``.  With the
            default ``None`` the training is unseeded, so repeated calls may
            select different topic counts.

    Returns:
        The topic count whose model achieved the highest coherence; on a tie
        the smallest such count is returned.

    Raises:
        ValueError: If ``range(min_topics, max_topics)`` is empty.
    """
    texts = [doc.split() for doc in articles]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Exactly one score per candidate, so a plain dict is enough.
    coherence_by_num_topics = {}

    for num_topics in range(min_topics, max_topics):
        lda_model = models.LdaModel(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=passes,
            random_state=random_state,
        )
        coherence_model_lda = CoherenceModel(
            model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v'
        )
        coherence_by_num_topics[num_topics] = coherence_model_lda.get_coherence()

    if not coherence_by_num_topics:
        raise ValueError(
            f"No topic counts to evaluate: range({min_topics}, {max_topics}) is empty"
        )

    return max(coherence_by_num_topics, key=coherence_by_num_topics.get)


def start_lda(preprocessed_content, output_path="topics.json", num_topics_to_save=3, terms_per_topic=12):
    """Train an LDA model on the documents and write some of its topics to JSON.

    Steps:
      1. Ask ``get_optimal_num_topics`` for the topic count to use.
      2. Build a gensim dictionary and bag-of-words corpus from the documents.
      3. Train an ``LdaModel`` (10 passes) with that topic count.
      4. Print the model's c_v coherence score.
      5. Write the leading terms of the first ``num_topics_to_save`` topics to
         ``output_path`` as ``{"Topic 1": [...], "Topic 2": [...], ...}``.

    The saved topics are taken in the model's own topic-id order (ids 0, 1,
    2, ...); no ranking by prevalence or importance is applied.

    Args:
        preprocessed_content: Iterable of whitespace-separated, already
            preprocessed documents (one string per document).
        output_path: Path of the JSON file to write.
        num_topics_to_save: Maximum number of topics written to the file.
        terms_per_topic: Number of terms listed for each saved topic.

    Returns:
        None.  Results are printed and written to ``output_path``.
    """
    # Materialise once: the documents are traversed both by the topic-count
    # search and by the final model, which would exhaust a one-shot iterable.
    documents = list(preprocessed_content)

    # Get the topic count with the best coherence for these documents.
    optimal_num_topics = get_optimal_num_topics(documents)

    # Create the dictionary and corpus.
    texts = [doc.split() for doc in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Create the LDA model.
    lda_model = models.LdaModel(corpus, num_topics=optimal_num_topics, id2word=dictionary, passes=10)

    # Calculate and report the coherence score for the model.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    print(f"Coherence Score: {coherence_score}")

    # The model already knows how many topics it has, so there is no need to
    # build a full pyLDAvis visualisation just to count them.
    saved_topic_count = min(num_topics_to_save, lda_model.num_topics)
    topics = {
        f"Topic {topic_id + 1}": [
            term for term, _ in lda_model.show_topic(topic_id, topn=terms_per_topic)
        ]
        for topic_id in range(saved_topic_count)
    }

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(topics, f)

    print("\n")


def initiate_topic_modelling(csv_path="../csv/bmc.csv"):
    """Load the articles CSV, preprocess its text and run topic modelling.

    The CSV's first column is read as the index and then discarded in favour
    of a fresh 0..n-1 index.  Rows whose ``content`` cell is missing are
    dropped, because ``preprocess_text`` expects a string.  Each remaining
    ``content`` value is passed through ``preprocess_text`` and the resulting
    series is handed to ``start_lda``.

    Args:
        csv_path: Path of the CSV file to read; it must contain a
            ``content`` column.

    Returns:
        None.
    """
    articles = (
        pd.read_csv(csv_path, index_col=0)
        # Missing cells are read as NaN (a float) and cannot be lower-cased.
        .dropna(subset=['content'])
        .reset_index(drop=True)
    )

    # Extract text data from the content column and preprocess it.
    preprocessed_content = articles['content'].apply(preprocess_text)

    start_lda(preprocessed_content)


# Entry point: run the whole pipeline (load CSV -> preprocess -> LDA -> JSON)
# when this code is executed as the main module.
if __name__ == '__main__':
    initiate_topic_modelling()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jeremy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jeremy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Coherence Score: 0.4005180777392556




In [2]:
pip install pyLDAvis

Collecting pyLDAvisNote: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
twint 2.1.21 requires aiodns, which is not installed.
twint 2.1.21 requires aiohttp, which is not installed.
twint 2.1.21 requires aiohttp-socks, which is not installed.
twint 2.1.21 requires cchardet, which is not installed.
twint 2.1.21 requires elasticsearch, which is not installed.
twint 2.1.21 requires fake-useragent, which is not installed.
twint 2.1.21 requires geopy, which is not installed.
twint 2.1.21 requires googletransx, which is not installed.
twint 2.1.21 requires schedule, which is not installed.
daal4py 2021.6.0 requires daal==2021.4.0, which is not installed.
numba 0.55.1 requires numpy<1.22,>=1.18, but you have numpy 1.24.3 which is incompatible.



  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
     ---------------------------------------- 2.6/2.6 MB 10.4 MB/s eta 0:00:00
Collecting joblib>=1.2.0
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting funcy
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Collecting numpy>=1.24.2
  Using cached numpy-1.24.3-cp39-cp39-win_amd64.whl (14.9 MB)
Collecting pandas>=2.0.0
  Using cached pandas-2.0.1-cp39-cp39-win_amd64.whl (10.7 MB)
Collecting tzdata>=2022.1
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Installing collected packages: funcy, tzdata, numpy, joblib, pandas, pyLDAvis
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.5
    Uninstalling numpy-1.21.5:
      Successfully uninstalled numpy-1.21.5
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
  Attempting uninstall: pandas
    Found existing installation: pandas 1.4.4

In [7]:
pip install pandas


