### Topic Analysis using BERTtopic (Community Dataset #1: 01/09/2020 - 15/12/2020)

#### Import Core Libraries

In [2]:
# Import Core library
import sys
import re, numpy as np, pandas as pd, matplotlib.pyplot as plt, nltk
import emoji
import plotly
from pprint import pprint
from cleantext import clean
import little_mallet_wrapper
from bertopic import BERTopic
from textblob import TextBlob

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


#### Importing and Visualizing Dataset

In [3]:
# Import Dataset
df = pd.read_excel('../Community306_raw.xlsx')

In [4]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,id,source_id,created_at,geo_source,truncated,text,text_sentiment,text_toxicity,lang,...,retweet_id_str,is_quote_status,quoted_status_id_str,in_reply_to_status_id_str,in_reply_to_user_id_str,in_reply_to_screen_name,entities,possibly_sensitive,coordinates,place
0,1,1.34617e+18,18,2021-01-04 19:01:23,1.0,0,@HHere4trump @sbacon0410 #stopthesteal #stopth...,,,und,...,,0,,1346156649287987201,,,"{""entities"": {""hashtags"": [{""end"": 38, ""tag"": ...",0,,
1,3,1.345954e+18,18,2021-01-04 04:45:23,,0,@RudyGiuliani @CaliConserv1 #StoptheSteal #Sto...,,,und,...,,0,,1345924505483546624,,,"{""entities"": {""hashtags"": [{""end"": 41, ""tag"": ...",0,,


In [5]:
df.columns

Index(['Unnamed: 0', 'id', 'source_id', 'created_at', 'geo_source',
       'truncated', 'text', 'text_sentiment', 'text_toxicity', 'lang', 'user',
       'Community', 'retweet_count', 'favorite_count', 'quote_count',
       'reply_count', 'retweet_id_str', 'is_quote_status',
       'quoted_status_id_str', 'in_reply_to_status_id_str',
       'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'entities',
       'possibly_sensitive', 'coordinates', 'place'],
      dtype='object')

In [6]:
df['text'].head()

0    @HHere4trump @sbacon0410 #stopthesteal #stopth...
1    @RudyGiuliani @CaliConserv1 #StoptheSteal #Sto...
2    #StoptheSteal #StoptheSteal #StoptheSteal #Sto...
3    #StoptheSteal #StoptheSteal #StoptheSteal #Sto...
4    @realDonaldTrump @4Libertyinlaw #StoptheSteal ...
Name: text, dtype: object

In [7]:
df['create_year'] = pd.to_datetime(df.created_at, format='%m/%d/%Y', errors='coerce')
df['year'] = pd.to_datetime(df.created_at).dt.strftime('%Y')
df['month'] = pd.to_datetime(df.created_at).dt.strftime('%m')

In [8]:
start_date = "09/01/2020"
end_date = "12/15/2020"
after_start_date = df['create_year'] > start_date
before_end_date = df['create_year'] < end_date
between_two_dates = after_start_date & before_end_date

stream_one = df.loc[between_two_dates]

#### Cleaning and Pre-processing Dataset

In [9]:
stream_one.head()
# stream_one.create_year

Unnamed: 0.1,Unnamed: 0,id,source_id,created_at,geo_source,truncated,text,text_sentiment,text_toxicity,lang,...,in_reply_to_status_id_str,in_reply_to_user_id_str,in_reply_to_screen_name,entities,possibly_sensitive,coordinates,place,create_year,year,month
43814,124741,1.300607e+18,18,2020-09-01 01:32:22,,0,#StopTheSteal 2020 https://t.co/0eGlgNMmpS,,,und,...,,,,"{""entities"": {""urls"": [{""end"": 42, ""url"": ""htt...",0,,,2020-09-01 01:32:22,2020,9
43815,124742,1.300612e+18,18,2020-09-01 01:49:48,,0,#StopTheSteal https://t.co/1iBSanqHp7,,,und,...,,,,"{""entities"": {""urls"": [{""end"": 37, ""url"": ""htt...",0,,"{""place_id"": ""dc96a032c98a1ea9""}",2020-09-01 01:49:48,2020,9
43816,124744,1.300612e+18,18,2020-09-01 01:52:14,,0,RT @MeryRobins: #StopTheSteal,,,und,...,,,,"{""entities"": {""hashtags"": [{""end"": 29, ""tag"": ...",0,,,2020-09-01 01:52:14,2020,9
43817,124745,1.300796e+18,18,2020-09-01 14:03:22,,0,Time to end cashless bail and this stupid bail...,,,en,...,,,,"{""entities"": {""urls"": [{""end"": 200, ""url"": ""ht...",0,,,2020-09-01 14:03:22,2020,9
43818,124748,1.30083e+18,18,2020-09-01 16:18:51,,0,@RaheemKassam #StopTheSteal .. they did the sa...,,,en,...,1.300830058244866e+18,,,"{""entities"": {""hashtags"": [{""end"": 27, ""tag"": ...",0,,,2020-09-01 16:18:51,2020,9


In [10]:
stream_one.head(43814)

Unnamed: 0.1,Unnamed: 0,id,source_id,created_at,geo_source,truncated,text,text_sentiment,text_toxicity,lang,...,in_reply_to_status_id_str,in_reply_to_user_id_str,in_reply_to_screen_name,entities,possibly_sensitive,coordinates,place,create_year,year,month
43814,124741,1.300607e+18,18,2020-09-01 01:32:22,,0,#StopTheSteal 2020 https://t.co/0eGlgNMmpS,,,und,...,,,,"{""entities"": {""urls"": [{""end"": 42, ""url"": ""htt...",0,,,2020-09-01 01:32:22,2020,09
43815,124742,1.300612e+18,18,2020-09-01 01:49:48,,0,#StopTheSteal https://t.co/1iBSanqHp7,,,und,...,,,,"{""entities"": {""urls"": [{""end"": 37, ""url"": ""htt...",0,,"{""place_id"": ""dc96a032c98a1ea9""}",2020-09-01 01:49:48,2020,09
43816,124744,1.300612e+18,18,2020-09-01 01:52:14,,0,RT @MeryRobins: #StopTheSteal,,,und,...,,,,"{""entities"": {""hashtags"": [{""end"": 29, ""tag"": ...",0,,,2020-09-01 01:52:14,2020,09
43817,124745,1.300796e+18,18,2020-09-01 14:03:22,,0,Time to end cashless bail and this stupid bail...,,,en,...,,,,"{""entities"": {""urls"": [{""end"": 200, ""url"": ""ht...",0,,,2020-09-01 14:03:22,2020,09
43818,124748,1.300830e+18,18,2020-09-01 16:18:51,,0,@RaheemKassam #StopTheSteal .. they did the sa...,,,en,...,1300830058244866050,,,"{""entities"": {""hashtags"": [{""end"": 27, ""tag"": ...",0,,,2020-09-01 16:18:51,2020,09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87623,237611,1.324548e+18,18,2020-11-06 03:03:58,,0,#Stopthesteal https://t.co/XHGvw90qOe,,,und,...,,,,"{""entities"": {""urls"": [{""end"": 37, ""url"": ""htt...",0,,,2020-11-06 03:03:58,2020,11
87624,237612,1.324548e+18,18,2020-11-06 03:03:58,,0,#CountEveryLEGALVote 🇺🇸\n#GOPStepUpForUsNOW !!...,,,und,...,,,,"{""entities"": {""urls"": [{""end"": 120, ""url"": ""ht...",0,,,2020-11-06 03:03:58,2020,11
87625,237614,1.324548e+18,18,2020-11-06 03:04:07,,0,#StopTheSteal https://t.co/40HHCYtT8a,,,und,...,,,,"{""entities"": {""urls"": [{""end"": 37, ""url"": ""htt...",0,,,2020-11-06 03:04:07,2020,11
87626,237616,1.324548e+18,18,2020-11-06 03:04:10,,0,#StopTheSteal #VoterFraud 🇺🇸🇺🇸🇺🇸 https://t.co/...,,,und,...,,,,"{""entities"": {""urls"": [{""end"": 56, ""url"": ""htt...",0,,,2020-11-06 03:04:10,2020,11


In [11]:
# Function to remove twitter specific characters
def get_url_patern():
    return re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))'
        r'[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')

def get_hashtags_pattern():
    return re.compile(r'#\w*')

def get_single_letter_words_pattern():
    return re.compile(r'(?<![\w\-])\w(?![\w\-])')

def get_blank_spaces_pattern():
    return re.compile(r'\s{2,}|\t')

def get_twitter_reserved_words_pattern():
    return re.compile(r'(RT|rt|FAV|fav|VIA|via)')

def get_mentions_pattern():
    return re.compile(r'@\w*')

def process_text(word):
  word=re.sub(pattern=get_url_patern(), repl="", string=word)
  word=re.sub(pattern=get_mentions_pattern(), repl="", string=word)
  word=re.sub(pattern=get_hashtags_pattern(), repl="", string=word)
  word=re.sub(pattern=get_twitter_reserved_words_pattern(), repl='', string=word)
  word=re.sub(r'http\S+', "", word)  # remove http links
  word=re.sub(r'bit.ly/\S+', "", word)  # rempve bitly links
  word=word.strip('[link]')  # remove [links]
  word=re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', "", word)  # remove retweet
  word=re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', "", word)  # remove tweeted at
  word=clean(word, no_emoji=True)
  return word

In [12]:
# Remove twitter specific characters
stream_one.text = stream_one.apply(lambda row: process_text(row.text), 1)

In [13]:
# Remove punctuations, stop words, digits and turn text to lowercase
training_data = [little_mallet_wrapper.process_string(text, numbers='remove') for text in stream_one.text]

In [14]:
training_data

['',
 '',
 '',
 'time end cashless bail stupid bail reform turns justice system revolving door let criminals back offend',
 'thing test run',
 'steal election election tactic used test run',
 'true know vote primaries got congress votes democratic nyc district school boards universities county commissioners spending indoctrinating money',
 '',
 'test run worked like charm prevented broward county succeeding sounding alarm absolutely nothing done prevent future',
 'test run worked like charm prevented broward county succeeding sound',
 'test run worked like charm prevented broward county succeeding sound',
 'test run worked like charm prevented broward county succeeding sound',
 'test run worked like charm prevented broward county succeeding sound',
 'test run worked like charm prevented broward county succeeding sound',
 'damn ridiculous little baby son nasty ass bitch needs put big boy panties stop acting fucking fool damn zero respect stupidity',
 'test run worked like charm prevente

#### Building BERTTopic Model

In [15]:
# Get variables
tweets = training_data
dates = stream_one['create_year'].apply(lambda x: pd.Timestamp(x)).to_list()

In [16]:
topic_model = BERTopic(language='english', verbose=True)
topics, probabilities = topic_model.fit_transform(tweets)

Batches: 100%|██████████| 7882/7882 [13:45<00:00,  9.55it/s] 
2022-03-26 09:27:21,406 - BERTopic - Transformed documents to Embeddings
2022-03-26 10:28:45,751 - BERTopic - Reduced dimensionality with UMAP
2022-03-26 10:30:03,518 - BERTopic - Clustered UMAP embeddings with HDBSCAN


#### Visualize Results from BERTtopic Model

In [17]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,77556,0_sayin_tellin_xxxx_sponsored
1,-1,39825,-1_corrupt_audit_american_like
2,1,5188,1_woahhhhh___
3,2,3465,2_fewer_assembled_incredible_peacefully
4,3,2763,3_congressmen_joined_building_capitol
...,...,...,...
2535,2549,10,2549_powerful_stronger_effective_weak
2533,2547,10,2547_alleging_deceified_awarded_files
2532,2546,10,2546_globalist_holes_centralized_globalists
2531,2551,10,2551_looting_terrorism_rioting_vandalism


In [18]:
topic_model.get_topic(1)

[('woahhhhh', 6.2422232654551655),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05),
 ('', 1e-05)]

#### Visualizing Topics (Barchart, Topics over time)

In [19]:
topic_model.visualize_barchart(top_n_topics=11)

In [20]:
topics_over_time = topic_model.topics_over_time(tweets, topics, dates, nr_bins=20)

20it [00:35,  1.80s/it]


In [21]:
fig = topic_model.visualize_topics_over_time(topics_over_time, topics=[0, 2, 3, 4, 5, 6, 7, 8 , 9, 10])
fig

In [22]:
file = '1st_analysis (topic streams over time)'
plotly.io.write_html(fig, file=file, full_html=True)