In [None]:
!pip install pandas
!pip install bertopic
!pip install nltk
!pip install cohere
!pip install tiktoken
!pip install tweet-preprocessor


Collecting bertopic
  Downloading bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.33.tar.gz (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━

In [None]:
import nltk
# Fetch the NLTK corpora this notebook relies on. Each call downloads the
# named package into the local nltk_data directory and returns a bool; the
# value of the last call is what the cell displays.
# 'stopwords' backs the stopwords.words('english') list used for text cleaning.
nltk.download('stopwords')
# NOTE(review): 'wordnet' and 'omw-1.4' are not referenced by the cells visible
# here — presumably needed elsewhere (e.g. lemmatization); confirm or remove.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import os
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic import BERTopic
from nltk.corpus import stopwords
import preprocessor as p

# English stop-word list from NLTK. It is consulted by clean_text when
# filtering tokens and is also handed to the vectorizer as its stop_words.
standard_stop_words = stopwords.words('english')

# Configure tweet-preprocessor so that p.clean() only handles URLs.
p.set_options(p.OPT.URL)

def clean_text(text):
    """Normalise one document before it is passed to the topic model.

    Steps, in order:
      1. run tweet-preprocessor's ``p.clean`` with the module-level options;
      2. delete every run of digits;
      3. split on whitespace and drop each token whose lower-cased form
         appears in ``standard_stop_words``.

    Args:
        text: The raw document string.

    Returns:
        The surviving tokens, in their original case and order, joined with
        single spaces. The result is an empty string when no token survives.
    """
    text = p.clean(text)

    text = re.sub(r'\d+', '', text)

    # Build the lookup set once per call: membership tests on the list would
    # rescan it for every token. Reading the global here (rather than caching
    # at definition time) keeps later changes to the list visible.
    stop_lookup = set(standard_stop_words)

    return ' '.join(
        word for word in text.split() if word.lower() not in stop_lookup
    )

# Vectorizer used for the topic representations: unigrams through trigrams,
# with the English stop words removed.
# NOTE(review): BERTopic documents vectorizer_model as a CountVectorizer; a
# TfidfVectorizer is passed here, so c-TF-IDF would be computed on top of
# TF-IDF weights — confirm this is intended before changing it, since swapping
# the vectorizer alters the resulting topics.
vectorizer_model = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words=standard_stop_words,
)

# Topic model configuration: "all-MiniLM-L6-v2" embeddings, the vectorizer
# above, no per-document probability matrix, and nr_topics left on "auto".
model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    vectorizer_model=vectorizer_model,
    nr_topics="auto",
    calculate_probabilities=False,
)

# Directory that is scanned for input CSVs and that also receives the outputs.
path = "/content"

# Name of the column holding the documents, and the suffix given to every
# generated file.
TEXT_COLUMN = 'Simple Text'
OUTPUT_SUFFIX = '_with_topics.csv'

# Outputs are written next to the inputs, so files that already carry the
# output suffix are excluded; otherwise a second run would treat the previous
# run's results as new inputs. Sorting makes the processing order deterministic
# (os.listdir returns entries in arbitrary order).
all_files = sorted(
    os.path.join(path, file)
    for file in os.listdir(path)
    if file.endswith('.csv') and not file.endswith(OUTPUT_SUFFIX)
)

for file in all_files:
    df = pd.read_csv(file)

    # Skip unrelated CSVs instead of aborting the whole run with a KeyError.
    if TEXT_COLUMN not in df.columns:
        print(f"\nSkipping file without '{TEXT_COLUMN}' column: {file}")
        continue

    # Drop rows with no text, then replace the column with its cleaned form
    # (the saved CSV therefore contains the cleaned text, not the raw text).
    df = df.dropna(subset=[TEXT_COLUMN])
    df[TEXT_COLUMN] = df[TEXT_COLUMN].apply(clean_text)

    documents = df[TEXT_COLUMN].tolist()
    if not documents:
        print(f"\nSkipping file with no usable rows: {file}")
        continue

    # fit_transform is called on the same shared model object for every file;
    # the second return value (probabilities) is not used.
    topics, _ = model.fit_transform(documents)

    df['Topic'] = topics
    topic_counts = df['Topic'].value_counts()
    print(f"\nTopics in file: {file}")
    print(topic_counts)

    # Print the representative words of every topic except -1, which the
    # original code also leaves out of the listing.
    topic_info = model.get_topic_info()
    for topic_id in topic_info['Topic']:
        if topic_id == -1:
            continue
        words = model.get_topic(topic_id)
        print(f"Topic {topic_id}: {len(words)} words, representative words: {[word for word, _ in words]}")

    # Strip only the final extension, then append the output suffix.
    new_file_name = os.path.splitext(os.path.basename(file))[0] + OUTPUT_SUFFIX
    new_file_path = os.path.join(path, new_file_name)
    df.to_csv(new_file_path, index=False)
    print(f"Processed file saved: {new_file_path}")



Topics in file: /content/Cleaned_Facebook_Content_Aware_t=1_P=1.csv.csv
 0    585
-1    269
 1     97
 2     42
 3     18
 4     17
 5     13
 6     12
 7     12
Name: Topic, dtype: int64
Topic 0: 10 words, representative words: ['vote', 'election', 'make', 'voting', 'elections', 'early', 'midterm', 'let', 'sure', 'ballot']
Topic 1: 10 words, representative words: ['biden', 'election', 'trump', 'elections', 'midterm', 'misinformation', 'arizona', 'midterms', 'midterm elections', 'fraud']
Topic 2: 10 words, representative words: ['rights', 'women', 'abortion', 'reproductive rights', 'abortion rights', 'reproductive', 'let', 'lgbtq', 'abortionrights', 'women rights']
Topic 3: 10 words, representative words: ['crime', 'rates', 'problems', 'crime rates', 'republican', 'republicans', 'tuberville', 'party', 'order', 'violent']
Topic 4: 10 words, representative words: ['florida', 'desantis', 'ron desantis', 'ron', 'florida governor', 'election', 'governor', 'trump', 'governor ron', 'florida 