# Topic Modeling

In [7]:
import pandas as pd
import numpy as np
import pyLDAvis.sklearn
import pyLDAvis
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
from nltk.corpus import stopwords
# Build the English stop-word lookup once; a set gives constant-time membership tests
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)

In [159]:
# Load the corpus and derive calendar columns from the parsed 'date' column
df = pd.read_csv("finall_data.csv")
df['date'] = pd.to_datetime(df['date'])
date_parts = df['date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.month

# Yearly Topic Modeling

In [10]:
# Number of documents per year, most frequent year first
year_counts = df['year'].value_counts()
year_counts

2023    12686
2021    10503
2022    10266
2019     9259
2020     8691
2017     6046
2018     5845
2016     3629
2015     1507
2014       94
2012        5
2013        4
2011        3
Name: year, dtype: int64

2023    12686
2021    10503
2022    10266
2019     9259
2020     8691
2017     6046
2018     5845
2016     3629
2015     1507
2014       94
2012        5
2013        4
2011        3
Name: year, dtype: int64

In [11]:
from scipy.sparse import csr_matrix
def preprocess_text(text, stop_words):
    """
    Tokenize one document and drop its stop words.

    The text is split with ``word_tokenize``; every token that is not a member
    of ``stop_words`` is kept, and the kept tokens are joined back together
    with single spaces. Membership is tested on the token exactly as the
    tokenizer produced it (no case folding is done here).

    :param text: raw document. Anything that is not a ``str`` (for example the
        NaN that pandas yields for a missing cell) is treated as an empty
        document.
    :param stop_words: container of tokens to discard.
    :return: the remaining tokens joined by spaces, or ``''`` when ``text`` is
        not a string.
    """
    # Non-string input is the expected failure mode, so handle it explicitly.
    # A blanket ``except:`` would also hide real problems (e.g. missing
    # tokenizer data) by silently turning every document into ''.
    if not isinstance(text, str):
        return ''

    return ' '.join(
        token for token in word_tokenize(text) if token not in stop_words
    )

# Process the raw text data with the preprocess_text function
def lad_year(data_info, stop_words, kind, time, n_topics):
    """
    Fit an LDA topic model on one slice of documents and export the result.

    Steps:

    1. Clean every document in ``data_info`` with ``preprocess_text``.
    2. Turn the cleaned documents into a TF-IDF matrix with a default
       ``TfidfVectorizer``.
    3. Fit a ``LatentDirichletAllocation`` model with ``n_topics`` components
       on that matrix.
    4. Prepare the pyLDAvis visualization data, then write the rows of
       ``topic_info`` whose ``Category`` is not ``'Default'`` to
       ``data/{kind}/{time}.xlsx`` and the interactive view to
       ``data/{kind}/{time}.html``.

    :param data_info: iterable of raw documents.
    :param stop_words: stop-word container forwarded to ``preprocess_text``.
    :param kind: name of the sub-directory under ``data/`` to write into.
    :param time: label used as the output file stem.
    :param n_topics: number of topics (LDA components) to fit.
    :return: None; the results are written to disk.
    """
    import os

    # Create the output directory up front so a fresh checkout does not fail
    # at the very end, after the model has already been trained.
    os.makedirs(f"data/{kind}", exist_ok=True)

    tf_idf_vectorizer = TfidfVectorizer()

    # Lazily clean each document; fit_transform consumes the generator once.
    processed_texts = (preprocess_text(v, stop_words) for v in data_info)
    tf_idf = tf_idf_vectorizer.fit_transform(processed_texts)

    # NOTE(review): LDA is usually fitted on raw term counts; the TF-IDF input
    # is kept here so existing results do not change - confirm this is intended.
    #
    # n_components:    number of topics to extract.
    # max_iter:        maximum number of passes over the training data.
    # learning_method: 'online' updates the model from mini-batches.
    # learning_offset: down-weights early iterations of online learning.
    # random_state:    fixed seed so repeated runs give the same topics.
    lda = LatentDirichletAllocation(
        n_components=n_topics, max_iter=50,
        learning_method='online',
        learning_offset=50.,
        random_state=0)
    lda.fit(tf_idf)

    # NOTE(review): newer pyLDAvis releases appear to expose this module as
    # ``pyLDAvis.lda_model`` - confirm against the installed version.
    data = pyLDAvis.sklearn.prepare(lda, tf_idf, tf_idf_vectorizer)
    data.topic_info[data.topic_info['Category']!='Default'].to_excel(f"data/{kind}/{time}.xlsx")
    pyLDAvis.save_html(data, f"data/{kind}/{time}.html")


In [12]:
# Fit one model per calendar year from 2011 through 2023.
# Years before 2014 get 2 topics; every later year gets 3.
for i in range(2011, 2024):
    in_year = df['year'] == i
    data_info = df.loc[in_year, 'body'].values.tolist()
    lad_year(data_info, stop_words, 'year', i, 2 if i < 2014 else 3)

  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_v

# Monthly Topic Modeling

In [13]:
from scipy.sparse import csr_matrix
def lad(data_info, kind, time, n_topics):
    """
    Fit an LDA topic model on one slice of documents and export the result.

    Each string in ``data_info`` is tokenized with ``word_tokenize``, stripped
    of the tokens found in the module-level ``stop_words``, and re-joined with
    spaces. The cleaned documents are vectorized with a default
    ``TfidfVectorizer`` and used to fit a ``LatentDirichletAllocation`` model.
    The pyLDAvis data is then written to ``data/{kind}/{time}.xlsx`` (rows of
    ``topic_info`` whose ``Category`` is not ``'Default'``) and
    ``data/{kind}/{time}.html``.

    :param data_info: iterable of raw documents; entries that are not ``str``
        (e.g. NaN from a missing cell) are skipped.
    :param kind: name of the sub-directory under ``data/`` to write into.
    :param time: label used as the output file stem.
    :param n_topics: number of topics (LDA components) to fit.
    :return: None; the results are written to disk.
    """
    import os

    # Create the output directory up front so the run does not fail only
    # after the model has been trained.
    os.makedirs(f"data/{kind}", exist_ok=True)

    wordslist = []
    for v in data_info:
        # Skip non-text rows explicitly rather than via a blanket ``except:``,
        # which would also hide genuine tokenizer errors.
        if not isinstance(v, str):
            continue
        # Build a fresh token list per document. Sharing one list across the
        # loop made every document also contain all earlier documents' tokens.
        kept_tokens = [seg for seg in word_tokenize(v) if seg not in stop_words]
        wordslist.append(' '.join(kept_tokens))

    tf_idf_vectorizer = TfidfVectorizer()
    # fit_transform already returns a sparse matrix, so no re-wrapping is needed.
    tf_idf = tf_idf_vectorizer.fit_transform(wordslist)

    lda = LatentDirichletAllocation(
        n_components=n_topics, max_iter=50,
        learning_method='online',
        learning_offset=50.,
        random_state=0)
    # Core step: fit LDA on the TF-IDF matrix.
    lda.fit(tf_idf)

    # NOTE(review): newer pyLDAvis releases appear to expose this module as
    # ``pyLDAvis.lda_model`` - confirm against the installed version.
    data = pyLDAvis.sklearn.prepare(lda, tf_idf, tf_idf_vectorizer)
    data.topic_info[data.topic_info['Category']!='Default'].to_excel(f"data/{kind}/{time}.xlsx")
    pyLDAvis.save_html(data, f"data/{kind}/{time}.html")


In [15]:
# Fit one model per calendar month from 2016 through 2023, skipping empty months.
for year in range(2016, 2024):
    for month in range(1, 13):
        time = f"{year}-{month}"
        in_month = (df['year'] == year) & (df['month'] == month)
        data_info = df.loc[in_month, 'body'].values.tolist()
        if not data_info:
            continue
        # Months with fewer than 100 documents get 2 topics, the rest get 3.
        lad(data_info, 'month', time, 2 if len(data_info) < 100 else 3)
        print(time)

  default_term_info = default_term_info.sort_values(


2016-1


  default_term_info = default_term_info.sort_values(


2016-2


  default_term_info = default_term_info.sort_values(


2016-3


  default_term_info = default_term_info.sort_values(


2016-4


  default_term_info = default_term_info.sort_values(


2016-5


  default_term_info = default_term_info.sort_values(


2016-6


  default_term_info = default_term_info.sort_values(


2016-7


  default_term_info = default_term_info.sort_values(


2016-8


  default_term_info = default_term_info.sort_values(


2016-9


  default_term_info = default_term_info.sort_values(


2016-10


  default_term_info = default_term_info.sort_values(


2016-11


  default_term_info = default_term_info.sort_values(


2016-12


  default_term_info = default_term_info.sort_values(


2017-1


  default_term_info = default_term_info.sort_values(


2017-2


  default_term_info = default_term_info.sort_values(


2017-3


  default_term_info = default_term_info.sort_values(


2017-4


  default_term_info = default_term_info.sort_values(


2017-5


  default_term_info = default_term_info.sort_values(


2017-6


  default_term_info = default_term_info.sort_values(


2017-7


  default_term_info = default_term_info.sort_values(


2017-8


  default_term_info = default_term_info.sort_values(


2017-9


  default_term_info = default_term_info.sort_values(


2017-10


  default_term_info = default_term_info.sort_values(


2017-11


  default_term_info = default_term_info.sort_values(


2017-12


  default_term_info = default_term_info.sort_values(


2018-1


  default_term_info = default_term_info.sort_values(


2018-2


  default_term_info = default_term_info.sort_values(


2018-3


  default_term_info = default_term_info.sort_values(


2018-4


  default_term_info = default_term_info.sort_values(


2018-5


  default_term_info = default_term_info.sort_values(


2018-6


  default_term_info = default_term_info.sort_values(


2018-7


  default_term_info = default_term_info.sort_values(


2018-8


  default_term_info = default_term_info.sort_values(


2018-9


  default_term_info = default_term_info.sort_values(


2018-10


  default_term_info = default_term_info.sort_values(


2018-11


  default_term_info = default_term_info.sort_values(


2018-12


  default_term_info = default_term_info.sort_values(


2019-1


  default_term_info = default_term_info.sort_values(


2019-2


  default_term_info = default_term_info.sort_values(


2019-3


  default_term_info = default_term_info.sort_values(


2019-4


  default_term_info = default_term_info.sort_values(


2019-5


  default_term_info = default_term_info.sort_values(


2019-6


  default_term_info = default_term_info.sort_values(


2019-7


  default_term_info = default_term_info.sort_values(


2019-8


  default_term_info = default_term_info.sort_values(


2019-9


  default_term_info = default_term_info.sort_values(


2019-10


  default_term_info = default_term_info.sort_values(


2019-11


  default_term_info = default_term_info.sort_values(


2019-12


  default_term_info = default_term_info.sort_values(


2020-1


  default_term_info = default_term_info.sort_values(


2020-2


  default_term_info = default_term_info.sort_values(


2020-3


  default_term_info = default_term_info.sort_values(


2020-4


  default_term_info = default_term_info.sort_values(


2020-5


  default_term_info = default_term_info.sort_values(


2020-6


  default_term_info = default_term_info.sort_values(


2020-7


  default_term_info = default_term_info.sort_values(


2020-8


  default_term_info = default_term_info.sort_values(


2020-9


  default_term_info = default_term_info.sort_values(


2020-10


  default_term_info = default_term_info.sort_values(


2020-11


  default_term_info = default_term_info.sort_values(


2020-12


  default_term_info = default_term_info.sort_values(


2021-1


  default_term_info = default_term_info.sort_values(


2021-2


  default_term_info = default_term_info.sort_values(


2021-3


  default_term_info = default_term_info.sort_values(


2021-4


  default_term_info = default_term_info.sort_values(


2021-5


  default_term_info = default_term_info.sort_values(


2021-6


  default_term_info = default_term_info.sort_values(


2021-7


  default_term_info = default_term_info.sort_values(


2021-8


  default_term_info = default_term_info.sort_values(


2021-9


  default_term_info = default_term_info.sort_values(


2021-10


  default_term_info = default_term_info.sort_values(


2021-11


  default_term_info = default_term_info.sort_values(


2021-12


  default_term_info = default_term_info.sort_values(


2022-1


  default_term_info = default_term_info.sort_values(


2022-2


  default_term_info = default_term_info.sort_values(


2022-3


  default_term_info = default_term_info.sort_values(


2022-4


  default_term_info = default_term_info.sort_values(


2022-5


  default_term_info = default_term_info.sort_values(


2022-6


  default_term_info = default_term_info.sort_values(


2022-7


  default_term_info = default_term_info.sort_values(


2022-8


  default_term_info = default_term_info.sort_values(


2022-9


  default_term_info = default_term_info.sort_values(


2022-10


  default_term_info = default_term_info.sort_values(


2022-11


  default_term_info = default_term_info.sort_values(


2022-12


  default_term_info = default_term_info.sort_values(


2023-1


  default_term_info = default_term_info.sort_values(


2023-2


  default_term_info = default_term_info.sort_values(


2023-3


  default_term_info = default_term_info.sort_values(


2023-4


  default_term_info = default_term_info.sort_values(


2023-5


  default_term_info = default_term_info.sort_values(


2023-6


  default_term_info = default_term_info.sort_values(


2023-7


  default_term_info = default_term_info.sort_values(


2023-8


  default_term_info = default_term_info.sort_values(


2023-9


  default_term_info = default_term_info.sort_values(


2023-10


  default_term_info = default_term_info.sort_values(


2023-11


  default_term_info = default_term_info.sort_values(


2023-12
