In [1]:
# --- Load full dataset and drop missing articles ---
import pandas as pd

csv_path = "../data/raw/newspapers/all-the-news-2-1.csv"
df = pd.read_csv(csv_path, nrows=100_000)
df = df.dropna(subset=["article"])
df = df[df["article"].str.strip().astype(bool)]

# --- Preprocess text ---
import re
import nltk
from nltk.corpus import stopwords
import spacy

# Download stopwords if not done already
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load spacy model for lemmatization
spacy.cli.download("en_core_web_sm")

nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove digits (optional)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    
    # Remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(tokens)

# Example: assume your DataFrame has a 'raw_text' column
df['clean_text'] = df['article'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /Users/rada/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
# Optional: sample for performance (adjust as needed)
df_sample = df.sample(n=5000, random_state=42)  # You can increase this

# --- Fit BERTopic ---
from bertopic import BERTopic

topic_model = BERTopic(language="english", verbose=True)
topics, probs = topic_model.fit_transform(df_sample["clean_text"].tolist())

# --- View top topics ---
topic_info = topic_model.get_topic_info()
display(topic_info.head())

# --- Visualize ---
topic_model.visualize_topics()

# --- Representative documents ---
reps = topic_model.get_representative_docs()


2025-05-27 21:48:09,851 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-05-27 21:48:30,047 - BERTopic - Embedding - Completed ✓
2025-05-27 21:48:30,047 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-27 21:48:39,342 - BERTopic - Dimensionality - Completed ✓
2025-05-27 21:48:39,342 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-27 21:48:39,433 - BERTopic - Cluster - Completed ✓
2025-05-27 21:48:39,434 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-27 21:48:40,297 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1617,-1_said_people_like_trump,"[said, people, like, trump, one, also, us, new...",[back november obama backlash gathering steam ...
1,0,271,0_art_artists_work_museum,"[art, artists, work, museum, artist, paintings...",[advertise hyperallergic nectar ads new york c...
2,1,181,1_music_album_band_song,"[music, album, band, song, songs, like, record...",[last time green day released album three exac...
3,2,122,2_brexit_eu_britain_minister,"[brexit, eu, britain, minister, european, brit...",[british prime minister theresa may asked parl...
4,3,117,3_chyna_kardashian_kylie_got,"[chyna, kardashian, kylie, got, baby, shes, ro...",[kim khloe kourtney kardashian sticking guns c...


In [3]:
topic_model.visualize_barchart(top_n_topics=10)

In [4]:
topic_model.visualize_heatmap()

In [5]:
topic_model.visualize_term_rank()