## Topic Modeling

### Importing the necessary libraries

In [None]:
pip install clean-text

In [None]:
import os, types
import numpy as np
import re
import csv
import json
import pandas as pd
from collections import Counter

In [None]:
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from cleantext import clean
import numpy as np
import matplotlib.pyplot as plt

### Load the data

In [None]:
def jsonToDF(name):
    """Load a JSON list of sentences into a one-column dataframe.

    Parameters
    ----------
    name : str
        Path of the JSON file without its ``.json`` extension. The same
        value is used as the index label of every row.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the JSON list, stored in a column named
        ``document``. An empty JSON list yields an empty dataframe that
        still has the ``document`` column.
    """
    # Decode explicitly as UTF-8 so emoji and accented text load the same
    # way regardless of the platform's default encoding.
    with open(f"{name}.json", encoding="utf-8") as fin:
        textList = json.load(fin)

    # every document is labelled with the name of the file it came from
    indexNames = [f"{name}"] * len(textList)

    # Naming the column at construction time (rather than renaming it
    # afterwards) keeps this working when the list is empty.
    return pd.DataFrame(data=textList, index=indexNames, columns=['document'])

In [None]:
# Load user1.json ... user10.json, in order, into one dataframe per user.
(user1, user2, user3, user4, user5,
 user6, user7, user8, user9, user10) = (
    jsonToDF(f"user{number}") for number in range(1, 11)
)
user3.shape

In [None]:
# remove duplicates and missing values
def cleanDf(df):
    """Drop rows with a missing ``document`` and duplicated rows, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with a ``document`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    # dropna returns a new frame unless inplace=True; without it the
    # rows with missing documents were never actually removed.
    df.dropna(subset=['document'], inplace=True)
    df.drop_duplicates(inplace=True)
    return df

In [None]:
# Run cleanDf over every user's dataframe, then check user3's size again.
allDocs = [
    user1, user2, user3, user4, user5,
    user6, user7, user8, user9, user10,
]
for df in allDocs:
    df = cleanDf(df)
user3.shape

In [None]:
# Display user3's dataframe for a visual check.
user3

In [None]:
# removes emojis from text
# Matching only code points above U+FFFF misses the emoji that live in the
# Basic Multilingual Plane and the invisible joiners/selectors that
# accompany them, so those ranges are listed explicitly.
RE_EMOJI = re.compile(
    '['
    '\U00010000-\U0010ffff'  # everything outside the BMP (most emoji)
    '\u2300-\u23ff'          # Miscellaneous Technical (watch, alarm clock, ...)
    '\u2600-\u27bf'          # Miscellaneous Symbols and Dingbats (sun, heart, ...)
    '\u2b00-\u2bff'          # Miscellaneous Symbols and Arrows (star, ...)
    '\u200d'                 # zero-width joiner used in emoji sequences
    '\u20e3'                 # combining enclosing keycap
    '\ufe0f'                 # variation selector-16 (emoji presentation)
    ']',
    flags=re.UNICODE,
)


def strip_emoji(text):
    """Return ``text`` with emoji characters removed.

    Parameters
    ----------
    text : object
        The value to clean. Anything that is not a ``str`` is converted
        with ``str()`` and returned without further processing.

    Returns
    -------
    str
        The input with every character matched by ``RE_EMOJI`` deleted;
        surrounding whitespace is left untouched.
    """
    if not isinstance(text, str):
        return str(text)
    return RE_EMOJI.sub(r'', text)

# Strip emoji from every document of user3.
# NOTE(review): only user3 is preprocessed, while the document-term matrix
# further down is built from user7 - confirm which user is intended.
user3['document'] = user3['document'].map(strip_emoji)

In [None]:
# splits hashtags
def splitHashtags(sentence):
    """Remove ``#`` markers and separate the hashtags with single spaces.

    Adjacent hashtags such as ``"#fyp#viral"`` become ``"fyp viral"``
    instead of being fused into one word.

    Parameters
    ----------
    sentence : object
        The text to process.

    Returns
    -------
    str or list
        For a ``str`` input, the text with every ``#`` removed; the pieces
        between ``#`` characters are trimmed and joined with one space.
        For any other input an empty list is returned (kept for backward
        compatibility with existing callers).
    """
    if isinstance(sentence, str):
        # Trim each piece so joining with a space never doubles whitespace
        # that was already present before a '#'.
        pieces = [piece.strip() for piece in sentence.split('#')]
        return ' '.join(piece for piece in pieces if piece)
    return []

# Remove the '#' markers from every document of user3.
user3['document'] = user3['document'].map(splitHashtags)

In [None]:
# Generic platform tags that carry no topical information.
toRemove = [
    'fyp', 'trending', 'foryou', 'viral', 'foryoupage', 'fy', 'fypage',
    'blowthisup', 'tiktok', 'video', 'videos', 'forypu', 'fup',
    'everyonefyp', 'reaction', 'fypviral', 'relatable',
]


def remove_words(sentence):
    """Drop every whitespace-separated word that appears in ``toRemove``.

    The comparison is case-insensitive. The surviving words are re-joined
    with single spaces. A non-string input is returned as ``str(sentence)``.
    """
    if isinstance(sentence, str):
        kept = [token for token in sentence.split()
                if token.lower() not in toRemove]
        return ' '.join(kept)
    return str(sentence)

# Filter the generic tags out of user3's documents, then show the frame
# with wide columns so long documents are not truncated.
user3['document'] = user3['document'].map(remove_words)
pd.options.display.max_colwidth = 1000
user3

### Convert to document-term matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Set up the bag-of-words vectorizer.
vectorizer = CountVectorizer(
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    # keep only purely alphabetic tokens of three or more letters
    token_pattern=r'\b[a-zA-Z]{3,}\b',
)

# Learn the vocabulary from user7's documents and build the document-term matrix.
# NOTE(review): the emoji/hashtag/tag clean-up above was applied to user3,
# not user7 - confirm which user this cell should use.
dtm = vectorizer.fit_transform(user7['document'])
dtm

In [None]:
# Vocabulary learned by the vectorizer; position i is the term of column i of dtm.
feature_names = vectorizer.get_feature_names_out()
feature_names

In [None]:
def matrix2Doc(dtMatrix, features, index):
    """Return the terms that occur in one document of a document-term matrix.

    ``dtMatrix`` is a sparse matrix, ``features`` maps a column position to
    its term, and ``index`` selects the row. The result lists the term of
    every non-zero column of that row, in column order.
    """
    dense_row = dtMatrix.getrow(index).toarray()
    # the dense row has shape (1, n_terms); only the column positions matter
    _, column_ids = dense_row.nonzero()
    return [features[col] for col in column_ids]

In [None]:
# Collect, for every row of the document-term matrix, the list of its terms.
user7AsTerms = []
for row_number in range(dtm.shape[0]):
    user7AsTerms.append(matrix2Doc(dtm, feature_names, row_number))

In [None]:
# Store each document's term list next to its text and preview the result
# with wide columns so long documents are not truncated.
user7['terms'] = user7AsTerms
pd.options.display.max_colwidth = 1000
user7.head(5)

### Fitting the LDA model

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Step 1: configure the model (five topics is an arbitrary starting point;
# the fixed seed makes runs repeatable)
lda = LatentDirichletAllocation(
    random_state=0,
    n_components=5,
)
# Step 2: estimate the topics from the document-term matrix
lda.fit(dtm)

In [None]:
# Shape of the topic-term weights: one row per topic.
lda.components_.shape

In [None]:
# Topic distribution of every document in dtm: one row per document, one column per topic.
doc_topic_dist = lda.transform(dtm)
doc_topic_dist 

In [None]:
# Sanity check of the document-topic matrix dimensions.
doc_topic_dist.shape

In [None]:
def display_topics(model, features, no_top_words):
    """Print, for every topic of ``model``, its highest-weighted words.

    For each row of ``model.components_`` a ``Topic <n>:`` line is printed,
    followed by a line with the ``no_top_words`` terms of largest weight,
    from heaviest to lightest, separated by spaces.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it before taking the leading entries
        best_first = weights.argsort()[::-1][:no_top_words]
        top_words = [features[position] for position in best_first]
        print(f"Topic {topic_idx}:")
        print(" ".join(top_words))

# Print the 30 highest-weighted terms of every fitted topic.
display_topics(lda, feature_names, 30)

In [None]:
def displayHeader(model, features, no_top_words):
    """Build one text label per topic of ``model``.

    Each label is ``"Topic <n>: "`` followed by the ``no_top_words``
    highest-weighted terms of that topic, heaviest first, joined with
    ``", "``. With ``no_top_words=0`` the label is just ``"Topic <n>: "``.
    The labels are returned as a list, in topic order.
    """
    def label(topic_idx, weights):
        # argsort is ascending, so reverse it before taking the leading entries
        best_first = weights.argsort()[::-1][:no_top_words]
        return f"Topic {topic_idx}: " + ", ".join(features[i] for i in best_first)

    return [label(topic_idx, weights)
            for topic_idx, weights in enumerate(model.components_)]

In [None]:
# column names ("Topic 0: ", "Topic 1: ", ... since no top words are requested)
topicnames = displayHeader(lda, feature_names, 0)

# index names
# doc_topic_dist has one row per document of dtm, and dtm was built from
# user7['document'], so the rows must be labelled with user7's index.
docnames = user7.index.tolist()

# Make the pandas dataframe (probabilities rounded for readability)
df_document_topic = pd.DataFrame(np.round(doc_topic_dist, 3),
                                 columns=topicnames,
                                 index=docnames)

# Get dominant topic for each document.
# Use the unrounded distribution so rounding cannot create artificial ties.
dominant_topic = np.argmax(doc_topic_dist, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

df_document_topic.tail()

In [None]:
# Count how many documents have each topic as their dominant one.
topic_counts = df_document_topic['dominant_topic'].value_counts()
df_topic_distribution = topic_counts.reset_index(name="Num Documents")
# final column labels used by the plot below
df_topic_distribution.columns = ['Topic Num', 'Num Sentences']
df_topic_distribution

In [None]:
# Extract data from DataFrame columns
x_values = df_topic_distribution['Topic Num']
y_values = df_topic_distribution['Num Sentences']

# Plot the bar chart
plt.style.use('classic')
plt.bar(x_values, y_values, color='darkred')
plt.grid(axis='y', linestyle='solid', linewidth=0.5, alpha=0.5)
# Leave half a bar of margin on each side of the topic range; derived from
# the model so the axis stays correct if the number of topics changes.
plt.xlim(-0.5, lda.n_components - 0.5)

# Add labels and title
plt.xlabel('Topics')
plt.ylabel('Sentence Counts')
# The plotted counts come from the model fitted on user7's documents.
plt.title('user7 Bar Chart of Topics vs. Sentences')

# Rotate x-axis labels for better readability (if needed)
plt.xticks(rotation=45)

# Show the plot
plt.show()