In [29]:
import pandas as pd

df = pd.read_csv("2010-05-csv/2010-05-07.csv")
print(df.head())
df.shape

                                                Text   Origin           id
0  @londonluvspell awesome. I'm on the other side...  Twitter  13570699500
1  @JimCarrey \nIt's never gonna be 106.5 mill ji...  Twitter  13570700102
2  Apple: QA Engineer - Summer Internship (Lincol...  Twitter  13570700803
3  @Cucuxenxo fue @keyzito , que es un amor e hiz...  Twitter  13570703600
4  Iced white mocha! FTW (@ Starbucks w/  @davese...  Twitter  13570706802


(1009, 3)

In [30]:
from stage1_subject_filtering.subject_keyword_filtering import filter_subject_keyword_only

subject = "awesome"
df_f = filter_subject_keyword_only(df, subject, text_col="Text")

print("IN:", len(df), "OUT:", len(df_f))


IN: 1009 OUT: 3


In [31]:
from stage1_subject_filtering.cooccurence_keyword_filtering import (
    filter_subject_cooccurrence_expansion
)

subject = "awesome"
TEXT_COL = "Text"

df_co, keywords = filter_subject_cooccurrence_expansion(
    df,
    subject,
    text_col=TEXT_COL,
    top_n=25,
    min_count=20,
)

print("Filtered docs:", len(df_co))
print("Expanded keywords:", keywords)

Filtered docs: 3
Expanded keywords: ['awesome']


In [35]:
import importlib
import stage1_subject_filtering.llm_expansion as llm_expansion

importlib.reload(llm_expansion)
synonyms = llm_expansion.get_synonyms("subject")

synonyms


['topic',
 'theme',
 'matter',
 'issue',
 'area',
 'field',
 'discipline',
 'study',
 'focus',
 'content']

In [None]:
import pandas as pd

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# 1) Load your CSV
df = pd.read_csv("2010-05-csv/2010-05-07.csv")

TEXT_COL = "Text"   

if TEXT_COL not in df.columns:
    raise KeyError(f"'{TEXT_COL}' not found. Columns: {list(df.columns)}")

texts = df[TEXT_COL].fillna("").astype(str).tolist()
texts = [t for t in texts if t.strip()]
print("Loaded docs:", len(texts))

# 2) Embedding model (fast/light)
encoder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = encoder.encode(
    texts,
    show_progress_bar=True,
    batch_size=64,
    normalize_embeddings=True,
)

# 3) UMAP (fixed hyperparams)
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)

# 4) HDBSCAN (fixed hyperparams)
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

# 5) Vectorizer (for c-TF-IDF keywords)
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=2,
)

# 6) BERTopic
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
)

topics, probs = topic_model.fit_transform(texts, embeddings)

# 7) Save outputs
topic_info = topic_model.get_topic_info()
topic_info.to_csv("topic_info_2010-05-07.csv", index=False)
print("Saved topic info -> topic_info_2010-05-07.csv")

doc_topics = pd.DataFrame({"text": texts, "topic": topics})
doc_topics.to_csv("doc_topics_2010-05-07.csv", index=False)
print("Saved doc->topic -> doc_topics_2010-05-07.csv")

# 8) Quick sanity stats
n_outliers = sum(1 for t in topics if t == -1)
print(f"Outliers: {n_outliers}/{len(texts)} = {n_outliers/len(texts):.3f}")
print("Topics (excluding -1):", (topic_info["Topic"] != -1).sum())

  from .autonotebook import tqdm as notebook_tqdm


Loaded docs: 1009


Batches: 100%|██████████| 16/16 [00:02<00:00,  6.79it/s]
2026-01-14 15:47:07,275 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2026-01-14 15:47:12,473 - BERTopic - Dimensionality - Completed ✓
2026-01-14 15:47:12,473 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-14 15:47:12,508 - BERTopic - Cluster - Completed ✓
2026-01-14 15:47:12,510 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-14 15:47:12,527 - BERTopic - Representation - Completed ✓


Saved topic info -> topic_info_2010-05-07.csv
Saved doc->topic -> doc_topics_2010-05-07.csv
Outliers: 6/1009 = 0.006
Topics (excluding -1): 7
