### Topic Analysis using BERTtopic Model (Community Dataset #2: 15/12/2020 - 31/01/2021)

#### Import Core Libraries

In [1]:
# Import Core library
import sys
import re, numpy as np, pandas as pd, matplotlib.pyplot as plt, nltk
import emoji
import plotly
from pprint import pprint
from cleantext import clean
from bertopic import BERTopic

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


#### Importing Dataset

In [2]:
# Import Dataset
df = pd.read_excel('../Community306_raw.xlsx')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,source_id,created_at,geo_source,truncated,text,text_sentiment,text_toxicity,lang,...,retweet_id_str,is_quote_status,quoted_status_id_str,in_reply_to_status_id_str,in_reply_to_user_id_str,in_reply_to_screen_name,entities,possibly_sensitive,coordinates,place
0,1,1.34617e+18,18,2021-01-04 19:01:23,1.0,0,@HHere4trump @sbacon0410 #stopthesteal #stopth...,,,und,...,,0,,1.3461566492879872e+18,,,"{""entities"": {""hashtags"": [{""end"": 38, ""tag"": ...",0,,
1,3,1.345954e+18,18,2021-01-04 04:45:23,,0,@RudyGiuliani @CaliConserv1 #StoptheSteal #Sto...,,,und,...,,0,,1.3459245054835466e+18,,,"{""entities"": {""hashtags"": [{""end"": 41, ""tag"": ...",0,,
2,4,1.345954e+18,18,2021-01-04 04:45:13,,0,#StoptheSteal #StoptheSteal #StoptheSteal #Sto...,,,und,...,,0,1.3459245054835466e+18,,,,"{""entities"": {""urls"": [{""end"": 303, ""url"": ""ht...",0,,
3,5,1.345952e+18,18,2021-01-04 04:34:34,,0,#StoptheSteal #StoptheSteal #StoptheSteal #Sto...,,,und,...,,0,1.345933290654593e+18,,,,"{""entities"": {""urls"": [{""end"": 303, ""url"": ""ht...",0,,
4,6,1.345825e+18,18,2021-01-03 20:12:15,,0,@realDonaldTrump @4Libertyinlaw #StoptheSteal ...,,,und,...,,0,,1.34579820265046e+18,,,"{""entities"": {""hashtags"": [{""end"": 45, ""tag"": ...",0,,


In [4]:
df['text'].head()

0    @HHere4trump @sbacon0410 #stopthesteal #stopth...
1    @RudyGiuliani @CaliConserv1 #StoptheSteal #Sto...
2    #StoptheSteal #StoptheSteal #StoptheSteal #Sto...
3    #StoptheSteal #StoptheSteal #StoptheSteal #Sto...
4    @realDonaldTrump @4Libertyinlaw #StoptheSteal ...
Name: text, dtype: object

In [12]:
df['create_year'] = pd.to_datetime(df.created_at, format='%m/%d/%Y', errors='coerce')
df['year'] = pd.to_datetime(df.created_at).dt.strftime('%Y')
df['month'] = pd.to_datetime(df.created_at).dt.strftime('%m')

In [18]:
start_date = "12/15/2020"
end_date = "01/31/2021"
after_start_date = df['create_year'] > start_date
before_end_date = df['create_year'] < end_date
between_two_dates = after_start_date & before_end_date

stream_two = df.loc[between_two_dates]

In [21]:
stream_two.head(2)

Unnamed: 0.1,Unnamed: 0,id,source_id,created_at,geo_source,truncated,text,text_sentiment,text_toxicity,lang,...,in_reply_to_status_id_str,in_reply_to_user_id_str,in_reply_to_screen_name,entities,possibly_sensitive,coordinates,place,create_year,year,month
0,1,1.34617e+18,18,2021-01-04 19:01:23,1.0,0,@HHere4trump @sbacon0410 #stopthesteal #stopth...,,,und,...,1346156649287987201,,,"{""entities"": {""hashtags"": [{""end"": 38, ""tag"": ...",0,,,2021-01-04 19:01:23,2021,1
1,3,1.345954e+18,18,2021-01-04 04:45:23,,0,@RudyGiuliani @CaliConserv1 #StoptheSteal #Sto...,,,und,...,1345924505483546624,,,"{""entities"": {""hashtags"": [{""end"": 41, ""tag"": ...",0,,,2021-01-04 04:45:23,2021,1


In [None]:
stream_two.create_year.head(100)

In [None]:
stream_two.create_year

#### Clean and Pre-process Dataset

In [7]:
def get_url_patern():
    return re.compile(
        r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))'
        r'[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})')

def get_hashtags_pattern():
    return re.compile(r'#\w*')

def get_single_letter_words_pattern():
    return re.compile(r'(?<![\w\-])\w(?![\w\-])')

def get_blank_spaces_pattern():
    return re.compile(r'\s{2,}|\t')

def get_twitter_reserved_words_pattern():
    return re.compile(r'(RT|rt|FAV|fav|VIA|via)')

def get_mentions_pattern():
    return re.compile(r'@\w*')

def process_text(word):
  word=re.sub(pattern=get_url_patern(), repl="", string=word)
  word=re.sub(pattern=get_mentions_pattern(), repl="", string=word)
  word=re.sub(pattern=get_hashtags_pattern(), repl="", string=word)
  word=re.sub(pattern=get_twitter_reserved_words_pattern(), repl='', string=word)
  word=re.sub(pattern=get_single_letter_words_pattern(), repl='', string=word)
  word=re.sub(pattern=get_blank_spaces_pattern(), repl=' ', string=word)
  word=re.sub('\s+', " ", word)
  word=re.sub("\'", "", word)
  word=re.sub(r'http\S+', "", word)  # remove http links
  word=re.sub(r'bit.ly/\S+', "", word)  # rempve bitly links
  word=word.strip('[link]')  # remove [links]
  word=re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', "", word)  # remove retweet
  word=re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', "", word)  # remove tweeted at
  word=clean(word, no_emoji=True)
#   word =re.sub(r'\d+', '', word)
#   word = word.strip()
  return word

In [8]:
stream_two.text = stream_two.apply(lambda row: process_text(row.text), 1)

In [9]:
stream_two.text.head(3)

0    
1    
2    
Name: text, dtype: object

#### Building the BERTtopic Model

In [10]:
# Get variables
tweets = stream_two.text.to_list()
dates = stream_two['create_year'].apply(lambda x: pd.Timestamp(x)).to_list()

In [11]:
dates[:3]

[Timestamp('2021-01-04 19:01:23'),
 Timestamp('2021-01-04 04:45:23'),
 Timestamp('2021-01-04 04:45:13')]

In [12]:
topic_model = BERTopic(min_topic_size=70, verbose=True)
topics, probs = topic_model.fit_transform(tweets)

Batches: 100%|██████████| 2458/2458 [12:01<00:00,  3.40it/s]
2022-03-21 14:00:22,532 - BERTopic - Transformed documents to Embeddings
2022-03-21 14:03:33,234 - BERTopic - Reduced dimensionality with UMAP
2022-03-21 14:03:55,036 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [13]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,24737,-1_and_you_of_the
1,0,12511,0_exciting_hooah_gotta_period
2,1,1579,1_nailed_aka_note_please
3,2,1288,2_kicked_flight_wearing_joke
4,3,1261,3_location_inciter_cap_minutes
...,...,...,...
146,145,76,145_finished_fought_line_race
147,146,76,146_peaceful_track_largest_record
148,147,73,147_hoft_gateway_pundit_jim
149,148,72,148_speaking_am_here_capitol


#### Visualizing Topics (Barchart, Topics over time)

In [14]:
topic_model.visualize_barchart()

In [15]:
topics_over_time = topic_model.topics_over_time(tweets, topics, dates, nr_bins=20)

20it [00:12,  1.56it/s]


In [16]:
fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
fig

#### Export plotly as HTML

In [17]:
file = 'community_topic_model_2nd_analysis'
plotly.io.write_html(fig, file=file, full_html=True)