In [None]:
import pandas as pd
from src.topic_modeler import TopicModeler
import matplotlib.pyplot as plt

# Load the raw dataset; the "date" column is parsed into datetimes on read.
df = pd.read_csv("../data/raw.csv", parse_dates=["date"])

# Work on a reproducible random subset for speed. DataFrame.sample raises
# ValueError when asked for more rows than exist (sampling is without
# replacement), so cap the request at the number of rows available.
sample_size = min(3000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)

# Run Topic Modeler on the headlines (preprocessing is now internal)
tm = TopicModeler(num_topics=5, max_features=500)
topics = tm.fit(df_sample['headline'])

# Print each topic label followed by its comma-separated words
for topic, words in topics:
    print(f"{topic}: {', '.join(words)}")

# Plot topics
def plot_topics(topics):
    """Draw one horizontal bar chart per topic, stacked vertically.

    Args:
        topics: Sized sequence of ``(topic_name, words)`` pairs. ``words``
            must support ``len`` and slicing. Words are plotted in reverse,
            so the first word gets the longest bar (presumably the most
            important word -- confirm against the topic model's ordering).

    Returns:
        None. The figure is shown via ``plt.show()``. When ``topics`` is
        empty nothing is drawn, since a figure cannot have zero rows.
    """
    n_topics = len(topics)
    if n_topics == 0:
        # plt.subplots(0, 1) raises ValueError; there is nothing to plot.
        return

    # squeeze=False always yields a 2-D array of Axes, so the single-topic
    # case needs no special handling.
    fig, axs = plt.subplots(
        n_topics, 1, figsize=(10, 4 * n_topics), squeeze=False
    )
    for ax, (topic_name, words) in zip(axs[:, 0], topics):
        # Bar length is the position in the reversed list: 1 for the last
        # word, len(words) for the first.
        ax.barh(words[::-1], range(1, len(words) + 1))
        ax.set_title(topic_name)
        ax.set_xlabel("Importance Rank")
    plt.tight_layout()
    plt.show()

# Render the fitted topics, one bar chart per topic.
plot_topics(topics)
