In [1]:
# --- Load full dataset and drop missing articles ---
import pandas as pd

csv_path = "../data/raw/newspapers/all-the-news-2-1.csv"
df = pd.read_csv(csv_path, nrows=100_000)
df = df.dropna(subset=["article"])
df = df[df["article"].str.strip().astype(bool)]

# --- Preprocess text ---
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords if not done already
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove digits (optional)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    
    # Remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(tokens)

# Example: assume your DataFrame has a 'raw_text' column
df['clean_text'] = df['article'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /Users/rada/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Optional: sample for performance (adjust as needed)
df_sample = df.sample(n=5000, random_state=42)  # You can increase this

# --- Fit BERTopic ---
from bertopic import BERTopic

topic_model = BERTopic(language="english", verbose=True)
topics, probs = topic_model.fit_transform(df_sample["clean_text"].tolist())

# --- View top topics ---
topic_info = topic_model.get_topic_info()
display(topic_info.head())

# --- Visualize ---
topic_model.visualize_topics()

# --- Representative documents ---
reps = topic_model.get_representative_docs()


Batches:   0%|          | 0/157 [00:00<?, ?it/s]

2025-05-29 11:01:40,656 - BERTopic - Transformed documents to Embeddings
2025-05-29 11:01:49,878 - BERTopic - Reduced dimensionality
2025-05-29 11:01:49,973 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,1631,-1_said_people_like_one
1,0,209,0_art_artists_work_artist
2,1,186,1_music_album_song_band
3,2,125,2_kardashian_chyna_kylie_got
4,3,119,3_brexit_eu_britain_minister


In [3]:
topic_model.visualize_barchart(top_n_topics=10)

In [4]:
topic_model.visualize_heatmap()

In [5]:
topic_model.visualize_term_rank()

In [6]:
# Get topic frequencies
topic_freq = topic_model.get_topic_info()

# Remove -1 topic as we don't want "outliers"/unclassified
topic_freq_filtered = topic_freq[topic_freq.Topic != -1]

# Calculate total number of documents assigned a topic
total_docs = topic_freq_filtered['Count'].sum()

# Compute topic shares
topic_freq_filtered['Share'] = topic_freq_filtered['Count'] / total_docs

# Format as a DataFrame with Topic, Count, and Share
topic_shares = topic_freq_filtered[['Topic', 'Count', 'Share']]
print(topic_shares)


    Topic  Count     Share
1       0    209  0.062036
2       1    186  0.055209
3       2    125  0.037103
4       3    119  0.035322
5       4    107  0.031760
..    ...    ...       ...
80     79     11  0.003265
81     80     10  0.002968
82     81     10  0.002968
83     82     10  0.002968
84     83     10  0.002968

[84 rows x 3 columns]
