# Import Libraries

In [1]:
# Data Manipulation and Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm

# For progress bars and faster processing
from tqdm import tqdm
import swifter

# Natural Language Processing (NLP) Libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Topic Modeling and Embeddings
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT

# Transformers and Gensim for Language Models
from transformers import AutoTokenizer, AutoModel, pipeline
from gensim.models import LdaModel, CoherenceModel
from gensim.corpora import Dictionary

# Vectorization and Embedding-based Techniques
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

# For Dimensionality Reduction
from umap import UMAP
from sklearn.decomposition import PCA

# For Clustering
from hdbscan import HDBSCAN

# Machine Learning Models and Evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# PyTorch for Deep Learning
import torch
import torch.nn as nn
import torch.optim as optim

# WordCloud for Visualizing Word Frequencies
from wordcloud import WordCloud
from collections import Counter

# Data Loading

In [2]:
# Load the per-topic overview table from the local parquet file.
final_topics_overview = pd.read_parquet("./data/final_topics_overview.parquet")

In [3]:
# Display the topic overview table (pandas truncates the middle rows in the rendered output).
final_topics_overview

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,POS,KeyBERT_MMR,Representative_Docs
0,-1,131216,-1_chinese_covid_government_pay,"[chinese, covid, government, pay, country, pol...","[singaporeans, singaporean, covid, racist, chi...","[chinese, government, pay, country, police, fo...","[singaporeans, singaporean, covid, racist, chi...",[I thoroughly agree. Men have always had the a...
1,0,4914,0_singaporeans_foreigners_country_singaporean,"[singaporeans, foreigners, country, singaporea...","[singaporeans, singaporean, foreigners, migran...","[foreigners, country, singaporean, foreign, wo...","[singaporeans, singaporean, foreigners, migran...","[Wait, so you're asking for minimum wage only ..."
2,1,4703,1_gay_lgbt_lgbtq_religion,"[gay, lgbt, lgbtq, religion, religious, church...","[homosexual, homosexuality, homosexuals, lgbt,...","[gay, lgbtq, religion, religious, church, tran...","[homosexual, homosexuality, homosexuals, lgbt,...",[it is people like you who set gay rights back...
3,2,4177,2_drug_drugs_victim_court,"[drug, drugs, victim, court, jail, death, poli...","[sentencing, jailed, offender, crimes, convict...","[drug, drugs, victim, court, jail, death, poli...","[sentencing, jailed, offender, crimes, convict...",[&gt; # Man admits trespassing into NUS hall t...
4,3,3901,3_liao_sia_kena_siao,"[liao, sia, kena, siao, lao, jiak, wah, lang, ...","[liao, sia, lian, siu, siao, siong, sian, si, ...","[sia, siao, boi, nasi, kenna, sian, peng, lema...","[liao, sia, lian, siu, siao, siong, sian, si, ...","[die liao lor, die liao die liao this time rea..."
...,...,...,...,...,...,...,...,...
114,113,214,113_slavery_slave_slaves_modern,"[slavery, slave, slaves, modern, corporate, wa...","[slavery, slave, slaves, enslaved, slavers, sl...","[slavery, slave, slaves, modern, corporate, wa...","[slavery, slave, slaves, enslaved, slavers, sl...","[in after someone say its slavery, Wtf is this..."
115,114,212,114_delivery_parcel_delivered_singpost,"[delivery, parcel, delivered, singpost, order,...","[deliveries, delivery, deliver, shipping, ship...","[delivery, parcel, singpost, order, items, sho...","[deliveries, delivery, deliver, shipping, ship...",[Bought a fan from Shopee and they claim they ...
116,115,209,115_bot_bots_thy_user,"[bot, bots, thy, user, shakespeare, full_list_...","[bot, bots, chatbot, botrank, chatbots, robot,...","[bot, bots, user, bible, boop, optout, robot, ...","[bot, bots, chatbot, botrank, chatbots, robot,...","[good bot, Wait what? There's such a bot? Lol,..."
117,116,205,116_art_artist_logo_draw,"[art, artist, logo, draw, drawing, artists, pa...","[art, artwork, artworks, arts, artistic, paint...","[art, artist, logo, drawing, artists, painting...","[art, artwork, artworks, arts, artistic, paint...","[Live art, BuT iTs ArT!!, this is what i call ..."


In [4]:
# Load the comment-level table that carries topic assignments (topic_number, topic_words).
post_topic_data = pd.read_parquet("./data/post_topic_data.parquet")

In [5]:
# Summarise columns, non-null counts and dtypes of post_topic_data.
post_topic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1547105 entries, 0 to 1547104
Data columns (total 25 columns):
 #   Column               Non-Null Count    Dtype         
---  ------               --------------    -----         
 0   text                 1547105 non-null  object        
 1   timestamp            1547105 non-null  datetime64[ns]
 2   username             1547105 non-null  object        
 3   link                 1547105 non-null  object        
 4   link_id              1547105 non-null  object        
 5   parent_id            1547105 non-null  object        
 6   id                   1547105 non-null  object        
 7   subreddit_id         1547105 non-null  object        
 8   moderation           1547105 non-null  object        
 9   BERT_2_hate          1547105 non-null  bool          
 10  nltk_processed_text  1547105 non-null  object        
 11  topic_number         1547105 non-null  int64         
 12  topic_words          1547105 non-null  object        
 1

In [6]:
# Load the full post/comment table from the local parquet file.
post_data = pd.read_parquet("./data/post.parquet")

In [7]:
# Summarise columns and dtypes of post_data.
post_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4509972 entries, 0 to 4509971
Data columns (total 20 columns):
 #   Column              Dtype         
---  ------              -----         
 0   text                object        
 1   timestamp           datetime64[ns]
 2   username            object        
 3   link                object        
 4   link_id             object        
 5   parent_id           object        
 6   text_id             object        
 7   subreddit_id        object        
 8   moderation          object        
 9   BERT_2_hate         bool          
 10  subreddit           object        
 11  post_id             object        
 12  post_timestamp      datetime64[ns]
 13  post_title          object        
 14  author              object        
 15  author_id           object        
 16  comment_count       float64       
 17  vote_score          float64       
 18  post_title_cleaned  object        
 19  rake_keywords       object        
dtypes:

In [8]:
# Load the comments table used by the per-user analysis below.
# NOTE(review): presumably already filtered to hateful/toxic comments, going by the file name - confirm.
hate_toxic_topical_comments = pd.read_parquet("./data/hate_toxic_topical_comments.parquet")

## Super-spreader Behaviour Analysis

In [187]:
# Usernames ranked 3rd to 32nd by number of comments (positional slice of the
# descending value_counts result).
# NOTE(review): the two most frequent usernames are skipped - presumably
# placeholder accounts; confirm and document why.
hate_toxic_topical_comments['username'].value_counts().iloc[2:32]

username
deangsana              9051
FitCranberry           8697
blackwoodsix           8595
tom-slacker            7950
MangoDangoLango        7140
Boogie_p0p             6566
-_af_-                 6254
Jammy_buttons2         5771
HeavyArmsJin           4919
rowgw                  4865
AutoModerator          4741
rheinl                 4477
CastThatAccountAway    4115
jquin03                3946
MisoMesoMilo           3880
Scary_Cow              3863
oceanmountainlifer     3684
bilbolaggings          3416
brownriver12           3373
Bcpjw                  3146
ObsessedWCorgisNEXO    3135
potatetoe_tractor      2929
silentscope90210       2914
tictactorz             2735
bukitbukit             2695
bueytahanliao          2681
uranuscat              2609
prime5119              2543
SamBellFromSarang      2534
gmdotes                2529
Name: count, dtype: int64

In [10]:
# Define the function
def plot_user_activity_24hr_cycle(df, username):
    """Show a bar chart of one user's comment counts for each hour of the day.

    Parameters
    ----------
    df : pandas.DataFrame
        Comments table with a 'username' column and a datetime-typed
        'timestamp' column (the ``.dt`` accessor is used on it).
    username : str
        Username to select; compared for exact equality with df['username'].

    Returns
    -------
    None
        The Plotly figure is displayed via ``fig.show()`` and not returned.
    """
    # Take the hour directly from the selected rows' timestamps. Assigning a
    # new 'hour' column onto the filtered frame would trigger pandas'
    # SettingWithCopyWarning; here nothing is written to any frame.
    user_hours = df.loc[df['username'] == username, 'timestamp'].dt.hour

    # Comments per hour on a fixed 0-23 index, so hours with no comments still
    # appear (as zero) and the bars are in clock order.
    hourly_activity = user_hours.value_counts().reindex(range(24), fill_value=0)

    # 12-hour clock labels: 0 -> "12AM", 1 -> "1AM", 12 -> "12PM", 13 -> "1PM".
    hours_am_pm = [
        f"{hour % 12 or 12}{'AM' if hour < 12 else 'PM'}"
        for hour in hourly_activity.index
    ]

    # One bar per hour of the day.
    fig = go.Figure(data=[
        go.Bar(
            x=hours_am_pm,
            y=hourly_activity.values,
            marker_color='skyblue',
            opacity=0.7
        )
    ])

    fig.update_layout(
        title=f"{username}'s Hateful/Toxic Behaviour in 24-Hour Cycle",
        xaxis_title="Hour of the Day",
        yaxis_title="Number of Comments",
        xaxis=dict(
            tickmode='linear',       # label every bar instead of letting Plotly thin the ticks
            tickfont=dict(size=10)   # smaller font so all 24 labels fit
        ),
        yaxis=dict(showgrid=True, gridcolor='lightgrey'),
        plot_bgcolor='white'
    )

    fig.show()

In [11]:
# Hour-of-day profile for 'deangsana', the first username in the value_counts listing above.
plot_user_activity_24hr_cycle(hate_toxic_topical_comments, 'deangsana')

In [12]:
# Tag every comment with the hour of day (0-23) at which it was posted.
# NOTE: this adds an 'hour' column to hate_toxic_topical_comments in place.
hate_toxic_topical_comments['hour'] = hate_toxic_topical_comments['timestamp'].dt.hour

# Comments per hour across all users, on a fixed 0-23 index so that hours
# without any comments show up as zero-height bars.
hourly_activity = (
    hate_toxic_topical_comments['hour']
    .value_counts()
    .sort_index()
    .reindex(range(24), fill_value=0)
)

# Build the Plotly figure: a single bar trace, one bar per hour.
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=hourly_activity.index,
        y=hourly_activity.values,
        marker_color='skyblue',
        opacity=0.7,
    )
)

# Titles, a tick for every hour, light horizontal grid on a white background.
fig.update_layout(
    title="User Activity Levels by Hour of the Day (24-Hour Cycle)",
    xaxis_title="Hour of the Day",
    yaxis_title="Number of Comments",
    xaxis={'tickmode': 'linear', 'tick0': 0, 'dtick': 1},
    yaxis={'showgrid': True, 'gridcolor': 'lightgrey'},
    plot_bgcolor='white',
)

# Render the chart in the notebook.
fig.show()

In [13]:
# Define the function
def plot_user_activity_weekly_cycle(df, username):
    """Show a bar chart of one user's comment counts for each day of the week.

    Parameters
    ----------
    df : pandas.DataFrame
        Comments table with a 'username' column and a datetime-typed
        'timestamp' column (the ``.dt`` accessor is used on it).
    username : str
        Username to select; compared for exact equality with df['username'].

    Returns
    -------
    None
        The Plotly figure is displayed via ``fig.show()`` and not returned.
    """
    # Position in this list matches pandas' dayofweek numbering (0=Monday, 6=Sunday).
    days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    # Take the weekday directly from the selected rows' timestamps. Assigning a
    # new column onto the filtered frame would trigger pandas'
    # SettingWithCopyWarning; here nothing is written to any frame.
    user_days = df.loc[df['username'] == username, 'timestamp'].dt.dayofweek

    # Reindex onto all seven weekdays so a day with no comments is drawn as a
    # zero bar instead of silently disappearing from the x-axis.
    weekly_activity = user_days.value_counts().reindex(range(len(days_of_week)), fill_value=0)

    # One bar per weekday, Monday first.
    fig = go.Figure(data=[
        go.Bar(
            x=days_of_week,
            y=weekly_activity.values,
            marker_color='skyblue',
            opacity=0.7
        )
    ])

    fig.update_layout(
        title=f"{username}'s Hateful/Toxic Behaviour Across the Week",
        xaxis_title="Day of the Week",
        yaxis_title="Number of Comments",
        xaxis=dict(tickmode='linear'),
        yaxis=dict(showgrid=True, gridcolor='lightgrey'),
        plot_bgcolor='white'
    )

    fig.show()

In [14]:
# Day-of-week profile for 'bueytahanliao', one of the usernames in the value_counts listing above.
plot_user_activity_weekly_cycle(hate_toxic_topical_comments, 'bueytahanliao')

## Subreddit Behaviour Analysis

In [15]:
# Define the function
def plot_subreddit_activity_24hr_cycle(df, subreddit):
    """Show a bar chart of one subreddit's comment counts for each hour of the day.

    Parameters
    ----------
    df : pandas.DataFrame
        Comments table with a 'subreddit' column and a datetime-typed
        'timestamp' column (the ``.dt`` accessor is used on it).
    subreddit : str
        Subreddit to select; compared for exact equality with df['subreddit'].

    Returns
    -------
    None
        The Plotly figure is displayed via ``fig.show()`` and not returned.
    """
    # Take the hour directly from the selected subreddit's timestamps.
    # Assigning a new 'hour' column onto the filtered frame would trigger
    # pandas' SettingWithCopyWarning; here nothing is written to any frame.
    subreddit_hours = df.loc[df['subreddit'] == subreddit, 'timestamp'].dt.hour

    # Comments per hour on a fixed 0-23 index, so hours with no comments still
    # appear (as zero) and the bars are in clock order.
    hourly_activity = subreddit_hours.value_counts().reindex(range(24), fill_value=0)

    # 12-hour clock labels: 0 -> "12AM", 1 -> "1AM", 12 -> "12PM", 13 -> "1PM".
    hours_am_pm = [
        f"{hour % 12 or 12}{'AM' if hour < 12 else 'PM'}"
        for hour in hourly_activity.index
    ]

    # One bar per hour of the day.
    fig = go.Figure(data=[
        go.Bar(
            x=hours_am_pm,
            y=hourly_activity.values,
            marker_color='skyblue',
            opacity=0.7
        )
    ])

    fig.update_layout(
        title=f"{subreddit}'s Hateful/Toxic Behaviour in 24-Hour Cycle",
        xaxis_title="Hour of the Day",
        yaxis_title="Number of Comments",
        xaxis=dict(
            tickmode='linear',       # label every bar instead of letting Plotly thin the ticks
            tickfont=dict(size=10)   # smaller font so all 24 labels fit
        ),
        yaxis=dict(showgrid=True, gridcolor='lightgrey'),
        plot_bgcolor='white'
    )

    fig.show()

In [16]:
# Hour-of-day profile for r/Singapore.
# NOTE(review): the chart title says "Hateful/Toxic Behaviour", but post_topic_data
# (not hate_toxic_topical_comments) is passed here - confirm this frame is already
# restricted to hateful/toxic comments, otherwise the title is misleading.
plot_subreddit_activity_24hr_cycle(post_topic_data, 'r/Singapore')

In [17]:
# Hour-of-day profile for r/SingaporeRaw.
plot_subreddit_activity_24hr_cycle(post_topic_data, 'r/SingaporeRaw')

In [18]:
# Hour-of-day profile for r/SingaporeHappenings.
plot_subreddit_activity_24hr_cycle(post_topic_data, 'r/SingaporeHappenings')