In [10]:
import os

import numpy as np
import pandas as pd
from bertopic import BERTopic

# Load the two tweet exports. df_2 carries the quoted-tweet text; df_1 is
# joined in further down for its per-tweet columns.
df_1 = pd.read_csv('../data/twitter/tweets_isTweet.csv')
df_2 = pd.read_csv('../data/twitter/quoted_tweets.csv')

# One text field per row: the quoted tweet first, then the quoting tweet.
# Missing parts are replaced by empty strings before concatenation.
quoted_part = df_2["quoted_tweet_text"].fillna("")
own_part = df_2["tweet_text"].fillna("")
df_2["combined_text"] = quoted_part + " " + own_part

# Drop very short texts: keep rows with more than 3 whitespace-separated words.
word_counts = df_2["combined_text"].str.split().str.len()
df_2 = df_2[word_counts > 3]

# Inner-join on the tweet id so each kept text also carries df_1's columns,
# then persist the joined table.
merged_docs = df_2.merge(df_1, left_on="tweet_id", right_on="id", how="inner")
merged_docs.to_csv('../data/twitter/merged_tweets.csv', index=False)

# Plain lists of texts: `filtered_docs` follows the row order of merged_docs,
# while `documents` is taken from df_2 (i.e. before the join).
filtered_docs = merged_docs["combined_text"].tolist()
documents = df_2["combined_text"].tolist()

In [11]:
# Narrow merged_docs down to the id, timestamp, text, count and sentiment
# columns that the later cells read.
KEPT_COLUMNS = [
    "tweet_id", "createdAt", "combined_text",
    "retweetCount", "replyCount", "likeCount", "quoteCount", "viewCount",
    "sentiment",
]
merged_docs = merged_docs[KEPT_COLUMNS]

In [6]:
# do some more preprocessing
import re
import spacy
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from nltk import pos_tag

from spacy.lang.en.stop_words import STOP_WORDS
# Load the small English pipeline. Only the dependency parser is switched off:
# "ner" has to stay enabled because preprocess_text() relies on entity
# annotations. With "ner" disabled no token ever carries an entity label, so
# the entity exemption in the token filter would silently never apply.
nlp = spacy.load("en_core_web_sm", disable=["parser"])

tokenizer = nltk.ToktokTokenizer()
# Stop words come from spaCy's English list only.
stopword_list = STOP_WORDS

# Matches URLs, hashtags and every character other than a-z / whitespace.
# Compiled once because preprocess_text() is called for every document.
_NOISE_PATTERN = re.compile(r"http\S+|#\w+|[^a-z\s]")


def preprocess_text(text):
    """Clean one tweet text and return its lemmas as a space-separated string.

    Steps:
      1. Lowercase the text and strip URLs, hashtags and all characters
         other than a-z and whitespace.
      2. Run the spaCy pipeline ``nlp`` on the cleaned text.
      3. Keep the lemma of every token that
           * is not pure whitespace,
           * has a lemma longer than 2 characters, and
           * is either part of a named entity or not in ``stopword_list``.

    Args:
        text: The raw tweet text (``str``).

    Returns:
        The kept lemmas joined by single spaces; an empty string when
        nothing survives the filters.
    """
    cleaned = _NOISE_PATTERN.sub("", text.lower())

    # NOTE(review): NER runs on lowercased, punctuation-free text here, which
    # likely lowers recognition quality compared with the raw tweet — confirm
    # whether entities should be detected before cleaning instead.
    doc = nlp(cleaned)

    kept_lemmas = [
        token.lemma_
        for token in doc
        # Removing URLs/hashtags leaves runs of spaces and newlines; spaCy
        # turns those into whitespace tokens that could pass the length test.
        if not token.is_space
        and len(token.lemma_) > 2
        # `ent_type_` is set on every token inside an entity span, so
        # multi-word entities are protected too (comparing a single lemma
        # against the full entity text could only match one-word entities).
        and (token.ent_type_ or token.lemma_ not in stopword_list)
    ]

    return " ".join(kept_lemmas)

# Clean every merged tweet text; the result keeps the order of filtered_docs.
preprocessed_docs = list(map(preprocess_text, filtered_docs))

In [None]:
# print(len(filtered_docs))
# timestamps = pd.to_datetime(merged_docs["createdAt"], errors="coerce").tolist()
# doc_id = merged_docs["tweet_id"].tolist()
# sentiment = merged_docs["sentiment"].tolist() 

7965


In [25]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model that is later handed to BERTopic as its
# embedding_model.
EMBEDDING_MODEL_NAME = 'peulsilva/phrase-bert-setfit-20shots-RAFT-TWITTER-EMOTION'
tweet_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
# Ignore vocabulary terms whose document frequency is below 2.
# NOTE(review): BERTopic appears to fit this vectorizer on the text joined per
# topic, so "document" here may mean a topic rather than a single tweet —
# confirm against the BERTopic documentation before relying on this cut-off.
MIN_DOCUMENT_FREQUENCY = 2
vectorizer = CountVectorizer(min_df=MIN_DOCUMENT_FREQUENCY)

In [27]:
# Build the topic model on top of the custom embedding model and vectorizer.
topic_model = BERTopic(embedding_model=tweet_model, vectorizer_model=vectorizer)

# Fit on the preprocessed texts and keep the two returned values.
# NOTE(review): no random seed is configured in this cell, so topic
# assignments may differ between runs — confirm whether that is acceptable.
fit_result = topic_model.fit_transform(preprocessed_docs)
topics, probs = fit_result

# Last expression of the cell, so the topic -> (word, weight) mapping is shown.
topic_model.get_topics()

{-1: [('trump', 0.008965432270971716),
  ('elon', 0.008468640220953257),
  ('people', 0.00820266011716307),
  ('government', 0.007270291132337693),
  ('vote', 0.007056529246746115),
  ('medium', 0.006605148843972895),
  ('musk', 0.006555579616334566),
  ('doge', 0.006485240297808597),
  ('like', 0.0064649460127462425),
  ('illegal', 0.006189226964974066)],
 0: [('grok', 0.07117912035350836),
  ('post', 0.027357916887910266),
  ('premium', 0.023650952285175095),
  ('image', 0.023413205402086433),
  ('video', 0.021891053200528168),
  ('xai', 0.02186441888680716),
  ('note', 0.02164110198389057),
  ('app', 0.020469034810277228),
  ('platform', 0.018911651652393363),
  ('update', 0.015558497877371125)],
 1: [('tesla', 0.06885072702743182),
  ('car', 0.041815902038145676),
  ('model', 0.038751469248395724),
  ('drive', 0.03255222122946662),
  ('fsd', 0.029531282130497934),
  ('vehicle', 0.029345143748196273),
  ('cybertruck', 0.02264009815025855),
  ('electric', 0.01582681371854336),
  ('mi

### Get topic frequencies 

In [30]:
# Rank topics by their Count, skipping the row whose Topic is -1, and print
# the largest `top_n` of them.
top_n = 10
freq_df = topic_model.get_topic_info()
is_real_topic = freq_df["Topic"] != -1
top_topics = freq_df.loc[is_real_topic].nlargest(top_n, 'Count')
print(top_topics)


    Topic  Count                                 Name  \
1       0    526            0_grok_post_premium_image   
2       1    272              1_tesla_car_model_drive   
3       2    188          2_rape_gang_britain_british   
4       3    184  3_debt_trillion_government_interest   
5       4    151           4_people_like_think_speech   
6       5    149  5_brazil_moraes_brazilian_alexandre   
7       6    139   6_judge_impeach_president_activist   
8       7    135          7_germany_afd_europe_german   
9       8    134      8_news_legacy_medium_journalism   
10      9    120                9_mind_kid_virus_wake   

                                       Representation  \
1   [grok, post, premium, image, video, xai, note,...   
2   [tesla, car, model, drive, fsd, vehicle, cyber...   
3   [rape, gang, britain, british, inquiry, girl, ...   
4   [debt, trillion, government, interest, inflati...   
5   [people, like, think, speech, medium, know, lo...   
6   [brazil, moraes, brazilian

In [31]:
# Directory that receives the saved model. The default is the original
# hard-coded location; set the COIN_MODELS_DIR environment variable to run
# this notebook on another machine without editing the cell.
MODELS_DIR = os.environ.get(
    "COIN_MODELS_DIR",
    "/Users/phuongkhanh/Documents/Master /coin seminar/COIN_Repo/models",
)
# Create the directory up front so the save does not fail on a fresh checkout.
os.makedirs(MODELS_DIR, exist_ok=True)
topic_model.save(os.path.join(MODELS_DIR, "elon_topic_model"))



In [12]:
# Metadata columns exported next to each preprocessed document.
DOC_COLUMNS = [
    "tweet_id", "createdAt",
    "retweetCount", "replyCount", "likeCount", "quoteCount", "viewCount",
    "sentiment",
]

# The texts are attached by position, so both sides must describe the same
# rows. A mismatch means preprocessed_docs was computed from a different
# (stale) merged_docs; fail loudly instead of writing a misaligned file.
if len(preprocessed_docs) != len(merged_docs):
    raise ValueError(
        f"preprocessed_docs has {len(preprocessed_docs)} entries but merged_docs "
        f"has {len(merged_docs)} rows; re-run the preprocessing cell."
    )

# reset_index(drop=True) returns a new frame with a 0..n-1 index, so the
# insert below neither touches merged_docs nor depends on index alignment.
docs = merged_docs[DOC_COLUMNS].reset_index(drop=True)
# Column order of the export: tweet_id, Document, createdAt, counts, sentiment.
docs.insert(1, "Document", preprocessed_docs)

# Output directory: defaults to the original hard-coded location and can be
# redirected with the COIN_MODELS_DIR environment variable.
MODELS_DIR = os.environ.get(
    "COIN_MODELS_DIR",
    "/Users/phuongkhanh/Documents/Master /coin seminar/COIN_Repo/models",
)
os.makedirs(MODELS_DIR, exist_ok=True)
docs.to_csv(os.path.join(MODELS_DIR, "docs.csv"), index=False)

In [None]:
# Print the column names of the per-document info table that the fitted
# model returns for the preprocessed texts.
document_info = topic_model.get_document_info(preprocessed_docs)
print(document_info.columns)

NameError: name 'topic_model' is not defined