# Topic Modeling

Outline:
- 1) LDA (full sample)
- 2) Supervised feature engineering
- 3) BERTopic

To be developed further.

## 1) LDA (full sample)

In [2]:
import re

import matplotlib.pyplot as plt
import pandas as pd
from gensim import corpora, models
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from wordcloud import WordCloud
# Read the publisher table; its 'publication' column drives the per-publisher sample loading below.
publishers = pd.read_csv(filepath_or_buffer="../data/processed/publishers.csv")

In [None]:
# Build the full LDA corpus by appending every publisher's article sample.
# Sample files are named after the sanitised publisher name
# (e.g. "The New York Times" -> sample_the_new_york_times.csv): lower-cased,
# every run of non-word characters collapsed into a single underscore, and
# leading/trailing underscores stripped.
# The regex is applied outside an f-string expression because a backslash
# inside one is a SyntaxError before Python 3.12.
df_all = pd.concat(
    [
        pd.read_csv(
            "../data/processed/newspapers/sample_"
            + re.sub(r'\W+', '_', pub.lower()).strip('_')
            + ".csv"
        )
        for pub in publishers['publication']
    ],
    ignore_index=True,
)

# Stop words for tokenization: scikit-learn's English list plus a few extra
# high-frequency words ('said', 'mr', 'also').
custom_stopwords = ENGLISH_STOP_WORDS.union({'said', 'mr', 'also'})

def tokenize_and_clean(text, stopwords=None):
    """Split an article into lower-case word tokens and drop stop words.

    Parameters
    ----------
    text : object
        Article text. Missing values (None / NaN / pd.NA) yield an empty
        token list instead of the spurious tokens "none" / "nan" that
        ``str()`` would otherwise produce.
    stopwords : collection of str, optional
        Words to discard. Defaults to the notebook-level ``custom_stopwords``.

    Returns
    -------
    list of str
        Purely alphabetic tokens of three or more characters, in document
        order, with stop words removed.
    """
    # Only probe non-strings with pd.isna so real text is never treated as missing.
    if text is None or (not isinstance(text, str) and pd.isna(text)):
        return []
    if stopwords is None:
        stopwords = custom_stopwords
    # Keep words with 3 or more alphabetic characters
    tokens = re.findall(r'\b[a-z]{3,}\b', str(text).lower())
    return [t for t in tokens if t not in stopwords]

# Tokenise every article; each row of 'tokens' holds that article's cleaned word list.
df_all['tokens'] = df_all['article'].apply(tokenize_and_clean)

# Vocabulary: keep terms present in at least 10 documents and in at most half of them.
dictionary = corpora.Dictionary(df_all['tokens'])
dictionary.filter_extremes(no_below=10, no_above=0.5)

# Bag-of-words representation of every article.
corpus = list(map(dictionary.doc2bow, df_all['tokens']))

# Fit the LDA model; random_state is fixed so repeated runs give the same topics.
num_topics = 10
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Draw one word cloud per topic on a 2 x 5 grid of axes.
fig, axes = plt.subplots(2, 5, figsize=(20, 10), constrained_layout=True)
axes = axes.flatten()

for idx in range(len(axes)):
    ax = axes[idx]
    # The 50 highest-weighted words of this topic as a word -> weight mapping.
    topic_words = dict(lda_model.show_topic(idx, 50))
    wc = WordCloud(
        width=500,
        height=300,
        background_color='white',
        max_words=50,
    ).generate_from_frequencies(topic_words)
    ax.imshow(wc, interpolation='bilinear')
    ax.set_title(f'Topic {idx + 1}', fontsize=14)  # titles are 1-based
    ax.axis('off')

plt.suptitle('LDA Topics – Word Clouds', fontsize=18)
plt.show()

In [None]:
# Persist the trained LDA model together with the dictionary it was built from.
for artifact, target in (
    (lda_model, "../models/topic_model/lda_model.gensim"),
    (dictionary, "../models/topic_model/lda_dictionary.dict"),
):
    artifact.save(target)

In [None]:
from gensim import corpora, models

# Restore the persisted LDA model and its dictionary from disk.
lda_model = models.LdaModel.load(
    fname="../models/topic_model/lda_model.gensim",
)
dictionary = corpora.Dictionary.load(
    fname="../models/topic_model/lda_dictionary.dict",
)

In [None]:
from gensim.models import CoherenceModel

# Score the fitted topics with the c_v coherence measure over the tokenised articles.
coherence_model = CoherenceModel(
    model=lda_model,
    texts=df_all['tokens'],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model.get_coherence()
print(f'Coherence Score (c_v): {coherence_score:.4f}')
# Rule of thumb used here: a score above 0.4 is generally considered good for topic coherence.

In [None]:
from datetime import datetime
import re

# --- Prerequisites (defined by earlier cells) ---
# - publishers         = DataFrame with a 'publication' column
# - tokenize_and_clean = tokenizer used when the model was trained
# - lda_model          = trained gensim LdaModel
# - dictionary         = gensim Dictionary used to train the model
# The sample files are expected to provide 'article', 'date' and 'publication' columns.

# Load all samples from the processed newspapers folder.
# File names use the sanitised publisher name (lower-case, runs of non-word
# characters -> '_', edge underscores stripped). The regex is applied outside
# an f-string expression because a backslash there is a SyntaxError before
# Python 3.12.
df_all = pd.concat(
    [
        pd.read_csv(
            "../data/processed/newspapers/sample_"
            + re.sub(r'\W+', '_', pub.lower()).strip('_')
            + ".csv"
        )
        for pub in publishers['publication']
    ],
    ignore_index=True,
)

# The freshly loaded frame carries no token lists, so rebuild them with the
# same tokenizer that produced the training corpus.
df_all['tokens'] = df_all['article'].apply(tokenize_and_clean)

# Step 1: Convert tokens to bag-of-words
corpus = [dictionary.doc2bow(text) for text in df_all['tokens']]

# Step 2: Get topic distribution for each article
def get_topic_dist(bow, model=None):
    """Return the dense topic distribution of one bag-of-words document.

    Parameters
    ----------
    bow : list of (int, int)
        Document in bag-of-words form, as produced by ``dictionary.doc2bow``.
    model : optional
        Object exposing ``num_topics`` and ``get_document_topics``.
        Defaults to the notebook-level ``lda_model``.

    Returns
    -------
    list
        One probability per topic, positioned by topic id; topics that the
        model does not report for this document are 0.0.
    """
    if model is None:
        model = lda_model
    # get_document_topics may leave out topics with a negligible probability
    # even with minimum_probability=0, so place each value by its topic id
    # instead of relying on the returned list being complete and ordered.
    dist = [0.0] * model.num_topics
    for topic_id, prob in model.get_document_topics(bow, minimum_probability=0):
        dist[topic_id] = prob
    return dist

# Topic distribution of every article, one vector per row.
df_all['topic_distribution'] = list(map(get_topic_dist, corpus))

# Step 3: Spread each distribution vector over one column per topic
num_topics = lda_model.num_topics
topic_cols = [f'topic_{i}' for i in range(num_topics)]
topic_rows = df_all['topic_distribution'].tolist()
df_topics = pd.DataFrame(topic_rows, columns=topic_cols)

# Step 4: Attach the article metadata and derive the calendar month
df_meta = df_all[['date', 'publication']].copy()
df_combined = pd.concat([df_meta, df_topics], axis=1)
parsed_dates = pd.to_datetime(df_combined['date'], format='mixed', errors='coerce')
df_combined['month'] = parsed_dates.dt.to_period('M')

# Step 5: Average the topic shares within each (month, publisher) group
by_month_publisher = df_combined.groupby(['month', 'publication'])
df_monthly_pub = by_month_publisher[topic_cols].mean().reset_index()

# Step 6: Write the aggregated shares to CSV
df_monthly_pub.to_csv('../data/processed/monthly_topic_shares_by_publisher.csv', index=False)

print("✅ Saved: 'monthly_topic_shares_by_publisher.csv'")

## 2) Supervised feature engineering

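Goal: Pre-define the topics relevant to our context through a keyword list, turn monthly keyword usage into TF-IDF features, and merge them with the industrial production index (INDPRO).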

In [3]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# --- 1. Load publisher article data ---
# Publisher table; its 'publication' column names the samples to load
publishers = pd.read_csv(filepath_or_buffer="../data/processed/publishers.csv")

# Define safe filename generator
def safe_filename(pub):
    """Turn a publisher name into the stem used by its sample file.

    The name is lower-cased, each run of non-word characters becomes a single
    underscore, and leading/trailing underscores are removed.
    """
    lowered = pub.lower()
    collapsed = re.sub(r'\W+', '_', lowered)
    return collapsed.strip('_')

# Stack every publisher's article sample into one frame
sample_frames = []
for pub in publishers['publication']:
    sample_frames.append(
        pd.read_csv(f"../data/processed/newspapers/sample_{safe_filename(pub)}.csv")
    )
df_all = pd.concat(sample_frames, ignore_index=True)

# --- 2. Define keyword filter ---
# Industry / supply-side vocabulary used to filter the article text
topic_keywords = [
    "manufacturing",
    "factory",
    "production",
    "industry",
    "output",
    "supply chain",
    "logistics",
    "transport",
    "shortage",
]

# --- 3. Clean and filter text ---
def clean_text(text_input):
    """Normalise raw text: lower-case it, remove digits and punctuation,
    and squeeze whitespace runs into single spaces without edge padding."""
    cleaned = str(text_input).lower()
    substitutions = (
        (r'\d+', ''),                                    # digits
        (f"[{re.escape(string.punctuation)}]", ''),      # ASCII punctuation
        (r'\s+', ' '),                                   # whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def filter_keywords(text, keywords):
    """Keep only the words of ``text`` that match one of ``keywords``.

    Matching is case-insensitive and substring-based: a word matches a
    single-word keyword when the keyword occurs inside it (so "transport"
    also keeps "transportation"). A multi-word keyword such as
    "supply chain" matches a run of consecutive words when each keyword part
    occurs inside the corresponding word; all words of the run are kept.
    Empty keywords are ignored.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.
    keywords : iterable of str
        Single- or multi-word keywords.

    Returns
    -------
    str
        The matching words in their original order, joined by single spaces.
    """
    tokens = text.lower().split()
    # Pre-split keywords so phrases can be compared word by word.
    phrases = [parts for parts in (kw.lower().split() for kw in keywords) if parts]
    keep = [False] * len(tokens)
    for parts in phrases:
        width = len(parts)
        for start in range(len(tokens) - width + 1):
            if all(part in tokens[start + offset] for offset, part in enumerate(parts)):
                keep[start:start + width] = [True] * width
    return ' '.join(tok for tok, wanted in zip(tokens, keep) if wanted)

# --- 4. Aggregate articles per month ---
# format='mixed' parses every date string on its own, matching the other
# date-parsing cells of this notebook; without it pandas infers one format
# from the first value and errors='coerce' silently turns every other format
# into NaT, dropping those articles from the monthly aggregation.
df_all['date'] = pd.to_datetime(df_all['date'], format='mixed', errors='coerce')
df_all['month'] = df_all['date'].dt.to_period('M').dt.to_timestamp()

# One document per month: all non-missing articles of that month joined by spaces
df_monthly = df_all.groupby('month')['article'].apply(lambda x: ' '.join(x.dropna())).reset_index()

# --- 5. Apply keyword filtering ---
df_monthly['filtered_content'] = df_monthly['article'].apply(lambda x: filter_keywords(x, topic_keywords))

# --- 6. Load IPI data and merge ---
# Industrial production index (FRED series INDPRO), downloaded at run time
df_indpro = pd.read_csv('https://fred.stlouisfed.org/graph/fredgraph.csv?id=INDPRO', parse_dates=['observation_date'])
df_indpro = df_indpro.rename(columns={'observation_date': 'date', 'INDPRO': 'ipi'})
df_indpro['month'] = df_indpro['date'].dt.to_period('M').dt.to_timestamp()

# Inner join: keep only months present in both the articles and the IPI series
df_keywords_monthly = pd.merge(df_monthly, df_indpro[['month', 'ipi']], on='month', how='inner')

# --- 7. TF-IDF Vectorization ---
# Each row (month) is one document; with min_df=10 a term must occur in at
# least 10 monthly documents to become a feature.
vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=10,
    stop_words='english',
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    preprocessor=clean_text
)

X_tfidf = vectorizer.fit_transform(df_keywords_monthly['filtered_content'].fillna(''))

# One 'kw_<term>' column per retained term, plus the month and the IPI value
df_features = pd.DataFrame(
    X_tfidf.toarray(),
    columns=[f"kw_{word}" for word in vectorizer.get_feature_names_out()]
)
df_features['month'] = df_keywords_monthly['month'].values
df_features['ipi'] = df_keywords_monthly['ipi'].values

# --- 8. Save final DataFrame ---
df_features.to_csv("../data/processed/df_industry_keywords_monthly.csv", index=False)


FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/newspapers/sample_reuters.csv'

## 3) BERTopic

Goal: Compute topic shares by month and publisher, using BERTopic for the topic modeling.

In [7]:
import pandas as pd
import re
import os
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from difflib import get_close_matches
from tqdm import tqdm

# Publisher table; its 'publication' column names the samples to load
publishers = pd.read_csv("../data/processed/publishers.csv")

# Directory holding the per-publisher sample files
data_dir = "../data/processed/newspapers/"
available_files = os.listdir(data_dir)

# Map each sample's basename (file name without 'sample_' prefix and '.csv'
# suffix) to its file name; anything not shaped like 'sample_*.csv' is skipped
available_basenames = {}
for fname in available_files:
    if not (fname.startswith("sample_") and fname.endswith(".csv")):
        continue
    available_basenames[re.sub(r'^sample_|\.csv$', '', fname)] = fname

# Helper to sanitize
def sanitize(pub):
    """Normalise a publisher name for comparison with sample-file basenames:
    lower-case, runs of non-word characters replaced by one underscore,
    and no leading or trailing underscores."""
    underscored = re.sub(r'\W+', '_', pub.lower())
    return underscored.strip('_')

# Load the sample of every publisher whose sanitised name is close enough
# (similarity cutoff 0.7) to one of the available basenames
dfs = []
for pub in publishers['publication']:
    candidates = get_close_matches(sanitize(pub), available_basenames.keys(), n=1, cutoff=0.7)
    if not candidates:
        print(f"❌ No file found for: {pub}")
        continue
    matched_filename = os.path.join(data_dir, available_basenames[candidates[0]])
    print(f"✅ Matched: {pub} → {matched_filename}")
    dfs.append(pd.read_csv(matched_filename))

df_all = pd.concat(dfs, ignore_index=True)

# Text cleaning
def clean_text(text):
    """Reduce text to its lower-case alphabetic words of three or more
    letters, joined by single spaces."""
    lowered = str(text).lower()
    words = (m.group(0) for m in re.finditer(r'\b[a-z]{3,}\b', lowered))
    return ' '.join(words)

# Clean every article. Missing articles become '' first, so the empty filter
# below removes them instead of keeping a literal "nan" document.
df_all['clean_article'] = df_all['article'].fillna('').astype(str).apply(clean_text)

# Drop empties + reset index
df_all = df_all[df_all['clean_article'].str.strip().astype(bool)].reset_index(drop=True)

# Sample subset: at most 10,000 articles (capped at the number available,
# because sampling more rows than exist without replacement raises)
sample_size = min(10000, len(df_all))
df_all = df_all.sample(sample_size, random_state=42).reset_index(drop=True)

# Text list
texts = df_all['clean_article'].tolist()

# Embedding model
embedding_model = SentenceTransformer("paraphrase-MiniLM-L3-v2")

# Generate embeddings with progress bar
print("🔍 Generating embeddings...")
embeddings = embedding_model.encode(texts, show_progress_bar=True, batch_size=64)

# Fit BERTopic with precomputed embeddings
print("🔧 Fitting BERTopic...")
topic_model = BERTopic(
    embedding_model=None,  # Avoid recomputing!
    language="english",
    calculate_probabilities=True,
    verbose=True
)

# `embeddings` is row-aligned with `texts`
topics, probs = topic_model.fit_transform(texts, embeddings)

# Plot bar chart (last expression of the cell, so the figure is displayed)
topic_model.visualize_barchart(top_n_topics=10)


✅ Matched: Reuters → ../data/processed/newspapers/sample_reuters.csv
✅ Matched: The New York Times → ../data/processed/newspapers/sample_the_new_york_times.csv
✅ Matched: CNBC → ../data/processed/newspapers/sample_cnbc.csv
✅ Matched: The Hill → ../data/processed/newspapers/sample_the_hill.csv
✅ Matched: People → ../data/processed/newspapers/sample_people.csv
✅ Matched: CNN → ../data/processed/newspapers/sample_cnn.csv
✅ Matched: Refinery 29 → ../data/processed/newspapers/sample_refinery_29.csv
✅ Matched: Vice → ../data/processed/newspapers/sample_vice.csv
✅ Matched: Mashable → ../data/processed/newspapers/sample_mashable.csv
✅ Matched: Business Insider → ../data/processed/newspapers/sample_business_insider.csv
✅ Matched: The Verge → ../data/processed/newspapers/sample_the_verge.csv
✅ Matched: TechCrunch → ../data/processed/newspapers/sample_techcrunch.csv
✅ Matched: TMZ → ../data/processed/newspapers/sample_tmz.csv
✅ Matched: Axios → ../data/processed/newspapers/sample_axios.csv
✅ Matc

Batches:   0%|          | 0/157 [00:00<?, ?it/s]

🔧 Fitting BERTopic...


2025-05-30 00:53:25,356 - BERTopic - Reduced dimensionality
2025-05-30 00:53:29,915 - BERTopic - Clustered reduced embeddings


In [9]:
# Persist the fitted BERTopic model
topic_model.save("../models/topic_model/bertopic_model")

# Re-score the training articles; the embeddings computed for training are
# passed in so they are not regenerated
topics, probs = topic_model.transform(df_all['clean_article'], embeddings=embeddings)

# One column per topic, not counting the -1 topic when the model has one
topic_info = topic_model.get_topics()
num_topics = len(topic_info) - int(-1 in topic_info)
topic_cols = [f"topic_{i}" for i in range(num_topics)]
df_probs = pd.DataFrame(probs, columns=topic_cols)

# Attach the article metadata and derive the calendar month
df_meta = df_all[['date', 'publication']].copy()
df_combined = pd.concat([df_meta.reset_index(drop=True), df_probs], axis=1)
parsed_dates = pd.to_datetime(df_combined['date'], format='mixed', errors='coerce')
df_combined['month'] = parsed_dates.dt.to_period('M')

# Average the topic probabilities within each (month, publisher) group
by_month_publisher = df_combined.groupby(['month', 'publication'])
df_monthly_pub = by_month_publisher[topic_cols].mean().reset_index()

df_monthly_pub.to_csv('../data/processed/monthly_topic_shares_by_publisher_bertopic.csv', index=False)
print("✅ Saved: 'monthly_topic_shares_by_publisher_bertopic.csv'")


2025-05-30 00:55:25,256 - BERTopic - Reduced dimensionality
2025-05-30 00:55:30,080 - BERTopic - Calculated probabilities with HDBSCAN
2025-05-30 00:55:30,081 - BERTopic - Predicted clusters


✅ Saved: 'monthly_topic_shares_by_publisher_bertopic.csv'
