<a href="https://colab.research.google.com/github/DaisyXinyiHe/sentiment_analysis_topic_model/blob/main/tweet_BERTopic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install bertopic

!pip install hdbscan
!pip install flair
!pip install gensim

In [None]:
!pip install umap-learn

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download the NLTK data for the stop-word list and tokenizer imported above.
nltk.download('stopwords')
nltk.download('punkt')
import matplotlib.pyplot as plt

import emodict ## Local module: an emoji dictionary downloaded from GitHub and stored in the project folders
# from summarizer import Summarizer,TransformerSummarizer

## Emoticon and emoji lookup tables provided by emodict
EMOTICONS = emodict.EMOTICONS_EMO
UNICODE_EMO = emodict.UNICODE_EMOJI

In [None]:
# Load the first tweet file into a DataFrame and display it.
tweet_filename1 = 'tweets_2022-02-14.csv'
tweets = pd.read_csv(tweet_filename1)
tweets

In [None]:
# Load the second tweet file and stack it under the first one.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it both
# files contribute the labels 0, 1, 2, ..., so any later label-based drop()/loc
# would hit rows from both files at once.
tweet_filename2 = 'tweets_2022-02-12.csv'
tweets2 = pd.read_csv(tweet_filename2)
tweets = pd.concat([tweets, tweets2], ignore_index=True)
tweets

In [None]:
def get_date(date_time):
  """Return the text before the first space of a 'date time' string."""
  date_part, _, _ = date_time.partition(' ')
  return date_part

In [None]:
# Add a 'date' column holding the part of each 'datetime' value before the first space.
tweets['date'] = tweets.datetime.apply(get_date)

In [None]:
# Count how often each (tweet text, date) pair occurs and show the 10 most frequent pairs.
tweets[[ 'tweet', 'date']].groupby(['tweet','date']).size().nlargest(10)

In [None]:
## Tweet cleaning functions

## Change all tweets to lowercase
def lower_case(tweet):
  """Return the tweet text converted to lowercase."""
  lowered = tweet.lower()
  return lowered

## Remove punctuation
def remove_punctuation(tweet):
  """Delete ASCII punctuation and the characters ’, 「 and 」 from the tweet."""
  extra_marks = '’「」'
  drop_table = str.maketrans('', '', string.punctuation + extra_marks)
  return tweet.translate(drop_table)

## Filter stop words
def filter_stopwords(tweet):
  """Tokenize the tweet and rejoin with single spaces, dropping stop words.

  The stop list is NLTK's English list plus 'im'; tokens are compared
  case-insensitively but kept in their original case.
  """
  blocked = set(stopwords.words('english'))
  blocked.add('im')
  kept_tokens = []
  for token in word_tokenize(tweet):
    if token.lower() in blocked:
      continue
    kept_tokens.append(token)
  return ' '.join(kept_tokens)


## Remove unnecessary symbols
def clean(tweet):
  """Strip Twitter markup from a tweet and drop stop words.

  Removes the '@' and '#' characters (the mention / hashtag text itself is
  kept), http/https links and standalone retweet markers, then passes the
  result through filter_stopwords().
  """
  # tweet = " ".join(filter(lambda x:x[0]!='@', tweet.split())) # remove @mention
  tweet = re.sub(r'@', '', tweet) # remove @ symbol
  tweet = re.sub(r'#', '', tweet) # remove hashtag symbol
  tweet = re.sub(r'https?:\/\/\S+', '', tweet) # remove hyperlink
  # Remove the retweet marker only when it is a separate word. The word boundary
  # stops "rt " inside words such as "start now" from being deleted, and
  # IGNORECASE also catches the usual uppercase 'RT'.
  tweet = re.sub(r'\brt\s+', '', tweet, flags=re.IGNORECASE)
  # tweet = remove_punctuation(tweet)
  tweet = filter_stopwords(tweet)
  return tweet




## Replace emojis and emoticons with words
# def convert_emoticons(tweet):
#   for emot in EMOTICONS:
#     if emot in tweet:
#       tweet = tweet.replace(emot, EMOTICONS[emot])
#   return tweet

def convert_emoji(tweet):
  """Replace every emoji found in UNICODE_EMO with its mapped text.

  The result is lowercased and all ':' characters are removed.
  """
  for emo, description in UNICODE_EMO.items():
    if emo in tweet:
      tweet = tweet.replace(emo, description)
  # Normalise once after all substitutions instead of on every dictionary
  # entry; this also applies when the dictionary is empty.
  return tweet.lower().replace(':', '')

## Take away the keywords for search in tweets
def take_away_keyword(tweet, keywords=None):
  """Remove every occurrence of each search keyword from the tweet.

  Parameters:
    tweet: the text to strip keywords from.
    keywords: iterable of substrings to delete. When omitted, the global
      `search_word` is used, which must then be defined elsewhere in the
      notebook before this function is called.

  Returns the tweet with all keyword occurrences deleted.
  """
  if keywords is None:
    keywords = search_word
  for k in keywords:
    # str.replace leaves the text unchanged when k is absent, so no membership test is needed
    tweet = tweet.replace(k, '')
  return tweet

## Connect keywords as one word
def connect_keyword(tweet, keyword):
  """Return tweet with each occurrence of keyword rewritten so its spaces become underscores."""
  joined = '_'.join(keyword.split(' '))
  return tweet.replace(keyword, joined)
    



In [None]:
# Lowercase first, then clean the lowercased text. clean() must read from
# 'tweet_processed'; applying it to the raw 'tweet' column would overwrite and
# discard the lowercased result.
tweets['tweet_processed'] = tweets['tweet'].apply(lower_case)
tweets['tweet_processed'] = tweets['tweet_processed'].apply(clean)
# tweets['tweet'] = tweets['tweet'].apply(convert_emoji)

In [None]:
# Sort rows by the 'datetime' column before de-duplicating.
tweets = tweets.sort_values(by='datetime')

In [None]:
# Keep the first occurrence of every tweet text. drop_duplicates selects rows
# directly; dropping by index label would also remove unrelated rows whenever
# index labels repeat (as they do after pd.concat without ignore_index).
tweets_nodup = tweets.drop_duplicates(subset='tweet')

In [None]:
# Renumber rows, then drop tweets whose text is empty and renumber again.
# A vectorised mask replaces the row-by-row iloc loop.
tweets_nodup = tweets_nodup.reset_index(drop=True)
is_empty = tweets_nodup['tweet'].str.len() == 0
empty_tweet = tweets_nodup.index[is_empty].tolist()  # row labels of the dropped tweets
tweets_nodup = tweets_nodup.loc[~is_empty].reset_index(drop=True)
tweets_nodup

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Raw tweet texts and their dates (as pandas Timestamps), kept as parallel lists
all_tweets = tweets_nodup.tweet.to_list()
dates = [pd.Timestamp(get_date(stamp)) for stamp in tweets_nodup.datetime]

In [None]:
len(all_tweets), len(dates)

In [None]:
# Load the "all-MiniLM-L6-v2" sentence-transformer model used to embed the tweets
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")


In [None]:
# Create document embeddings for the raw tweets (all_tweets); these are passed
# to the first BERTopic model's fit_transform further down.
embeddings = sentence_model.encode(all_tweets, show_progress_bar=False)


In [None]:
import umap
# Define the UMAP model that reduces the embedding dimension
# (30 output components, cosine metric; random_state is fixed at 42).
umap_model = umap.UMAP(n_neighbors=30,
                       n_components=30,
                       min_dist=0.0,
                       metric='cosine',
                       low_memory=False, 
                       random_state=42)

In [None]:
import hdbscan
# Define the HDBSCAN model that clusters the documents
# (minimum cluster size 10, euclidean metric, 'eom' cluster selection).
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=10,
                                min_samples=1,
                                metric='euclidean',
                                cluster_selection_method='eom',
                                prediction_data=True)

In [None]:
from bertopic import BERTopic
# Create the first BERTopic model, wired to the UMAP and HDBSCAN models defined above.
# Topics are described with up to 30 terms drawn from 1- to 3-grams.
topic_model = BERTopic(top_n_words=30,
                       n_gram_range=(1,3), 
                       calculate_probabilities=True,
                       umap_model= umap_model,
                       hdbscan_model=hdbscan_model,
                       #similarity_threshold_merging=0.5,
                       verbose=True)



In [None]:
# Train the model on the raw tweets using the embeddings computed earlier;
# returns one topic id per document plus the topic probabilities.
topics, probabilities = topic_model.fit_transform(all_tweets, embeddings)

In [None]:
sum(topic_model.get_topic_freq().Count)

In [None]:
topic_model.get_topic_freq().head()

In [None]:
topic_model.get_topic_info()

In [None]:
topic_model.get_topic(3)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.get_representative_docs()

In [None]:
topic_over_time = topic_model.topics_over_time(all_tweets, topics, dates)

In [None]:
topic_model.visualize_topics_over_time(topic_over_time, topics=list(np.arange(0,20)))

# BERTopic 2: Change the minimum topic size (fitted on the cleaned tweets)

In [None]:
# NOTE(review): this redefines get_date, which is already defined earlier in the notebook.
def get_date(date_time):
  """Return the text before the first space of a 'date time' string."""
  date_part, _, _ = date_time.partition(' ')
  return date_part

In [None]:
# One pandas Timestamp per de-duplicated tweet, built from the date part of 'datetime'
dates = [pd.Timestamp(get_date(stamp)) for stamp in tweets_nodup.datetime]

In [None]:
# Cleaned tweet texts (the 'tweet_processed' column) as a plain list; the second model is fitted on these.
text = tweets_nodup.tweet_processed.to_list()

In [None]:
print(len(text), len(dates))

In [None]:
import umap
import hdbscan
from bertopic import BERTopic
# Second model: a 5-component UMAP projection and BERTopic with min_topic_size=60,
# fitted on the cleaned tweets in `text`. No precomputed embeddings are passed here.
# Note that this rebinds the name umap_model used when building the first model.
umap_model = umap.UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=40)
topic_model2 = BERTopic(min_topic_size=60, n_gram_range=(1,3), verbose=True, umap_model=umap_model)
topics2, probabilities2 = topic_model2.fit_transform(text)

In [None]:
topic_model2.get_topic_info()

In [None]:
topic_model2.visualize_topics()

In [None]:
topic2_over_time = topic_model2.topics_over_time(text, topics2, dates)

In [None]:
sum(topic2_over_time.Frequency)

In [None]:
topic_model2.visualize_topics_over_time(topic2_over_time, topics=list(np.arange(0,20)))

In [None]:
topic_model2.get_representative_docs()

# Visualize Topics per Sentiment Class

In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

In [None]:
# Load Flair's 'en-sentiment' text classifier.
classifier = TextClassifier.load('en-sentiment')

## Smoke-test the model on a single sentence
sentence = Sentence('Flair is pretty neat!')
classifier.predict(sentence)
# Print the labels predicted for the test sentence
print('Sentence above is: ', sentence.labels)

In [None]:
len(tweets_nodup)

In [None]:
sentences = [Sentence(s) for s in tweets_nodup['tweet']]

In [None]:
len(sentences)

In [None]:
classifier.predict(sentences)

In [None]:
# Inspect the first prediction. The label's fields are read directly instead of
# regex-parsing str(label): the string form may contain more than the class name
# and score, and the old "\d" patterns were non-raw strings with invalid escapes.
first_label = sentences[0].labels[0]
sent = str(first_label)
# Rounded to 4 decimals to stay close to what the earlier string parsing produced
# (assumption: adjust if full precision is wanted).
num = round(float(first_label.score), 4)
lab = first_label.value
print(num, lab)

In [None]:
# Collect the predicted class and confidence of every sentence.
# Label fields are read via .value / .score instead of regex-parsing str(label),
# which depends on the exact string format.
sent_labels = []
sent_conf = []
for s, sentence in enumerate(sentences):
  # Guard on the labels list itself so a sentence without a prediction is
  # reported instead of raising IndexError on labels[0].
  if sentence.labels:
    top_label = sentence.labels[0]
    # Rounded to 4 decimals to stay close to the values the earlier string parsing produced.
    sent_conf.append(round(float(top_label.score), 4))
    sent_labels.append(top_label.value)
  else:
    # Skipped sentences leave the lists shorter than `sentences`; any index
    # printed here means they no longer line up row-for-row with tweets_nodup.
    print(s)

In [None]:
tweets_nodup['sentiment'] = sent_labels
tweets_nodup['sentiment_confidence'] =sent_conf


In [None]:
# Sentiment label of every tweet as a plain list (parallel to the document lists)
classes = tweets_nodup.sentiment.to_list()

In [None]:
classes.count('POSITIVE'), classes.count('NEGATIVE')

In [None]:
print(len(classes), len(text), len(topics2))

In [None]:
# topic_model was fitted on the raw tweets (all_tweets), so the same documents are
# passed here; `text` holds the cleaned tweets that belong to topic_model2.
topics_per_class = topic_model.topics_per_class(all_tweets, topics, classes=classes)

In [None]:
topics_per_class

In [None]:
topic_model.visualize_topics_per_class(topics_per_class)

In [None]:
len(text), len(topics2)

In [None]:
topics_per_class2 = topic_model2.topics_per_class(text, topics2, classes=classes)

In [None]:
# topics2_red, topics2_red_prob = topic_model2.reduce_topics(text, topics2, nr_topics=7)
# topic_model2.get_topic_info()

In [None]:
topic_model2.visualize_topics_per_class(topics_per_class2)

In [None]:
non_processed_tweet = tweets_nodup.tweet

In [None]:
len(topics2), len(text), len(non_processed_tweet), len(classes)

In [None]:
# Group every document under its topic id as
# [row position, cleaned text, sentiment label, raw tweet].
topic_docs = {}
for topic_id in set(topics2):
  topic_docs[topic_id] = []
rows = zip(topics2, text, classes, non_processed_tweet)
for i, (topic, cleaned, sent, tweet) in enumerate(rows):
  topic_docs[topic].append([i, cleaned, sent, tweet])

In [None]:
import random

# Pick one example raw tweet per topic and sentiment.
# pos_tweet / neg_tweet get one entry per topic, in ascending topic-id order; each
# entry is a list holding the sampled tweet, or an empty list when the topic has no
# tweet of that sentiment.
EXAMPLE_SEED = 42
example_rng = random.Random(EXAMPLE_SEED)  # local generator: repeatable picks, global random state untouched

pos_tweet = []
neg_tweet = []
# Iterate over the topic ids that actually exist; range(-1, n-1) assumed contiguous
# ids starting at the outlier topic -1.
for t in sorted(topic_docs):
  # Each record is [row position, cleaned text, sentiment, raw tweet]; compare the
  # sentiment field itself rather than testing membership in the whole record.
  positive_doc = [s[3] for s in topic_docs[t] if s[2] == 'POSITIVE']
  negative_doc = [s[3] for s in topic_docs[t] if s[2] == 'NEGATIVE']
  # k drops to 0 for an empty list, where sample(..., k=1) would raise ValueError
  pos_tweet.append(example_rng.sample(positive_doc, k=min(1, len(positive_doc))))
  neg_tweet.append(example_rng.sample(negative_doc, k=min(1, len(negative_doc))))

In [None]:
# Split the per-class topic table by sentiment and place both sentiments side by
# side, one row per topic.
summary_columns = ['Topic', 'Words', 'Frequency']
topics_per_class2_pos = topics_per_class2.loc[topics_per_class2.Class == 'POSITIVE', summary_columns]
topics_per_class2_neg = topics_per_class2.loc[topics_per_class2.Class == 'NEGATIVE', summary_columns]
# Both frames are indexed by Topic, so join index-on-index. The outer join keeps
# topics that appear under only one sentiment.
all_topics_by_class = topics_per_class2_pos.set_index('Topic').join(
    topics_per_class2_neg.set_index('Topic'),
    lsuffix='_pos', rsuffix='_neg', how='outer')
# pos_tweet / neg_tweet hold one entry per topic in ascending topic-id order. Label
# them with those ids so the assignment aligns on Topic rather than on row position.
example_topics = sorted(topic_docs)
all_topics_by_class['tweet_example_pos'] = pd.Series(pos_tweet, index=example_topics, dtype=object)
all_topics_by_class['tweet_example_neg'] = pd.Series(neg_tweet, index=example_topics, dtype=object)
all_topics_by_class


In [None]:
all_topics_by_class.to_csv('all_topics_by_class.csv')