## 4) LLM Topic Modeling

In [1]:
import re
from pathlib import Path

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Step 1: Clean text ---
def tokenize_and_clean(text, stopwords=None):
    """Lower-case ``text`` and keep only alphabetic tokens of 3+ letters.

    Parameters
    ----------
    text : object
        Raw document text. ``None`` and float NaN (the usual missing-value
        markers in a pandas object column) yield an empty string instead of
        being stringified into the spurious tokens "none" / "nan".
        Any other value is converted with ``str()`` before tokenizing.
    stopwords : iterable of str, optional
        Lower-case tokens to remove. Defaults to ``{'said', 'mr', 'also'}``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    # `text != text` is only true for NaN; checked this way to avoid a new import.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if stopwords is None:
        # 'mr' is kept for parity with the original list, although the regex
        # below already discards tokens shorter than three letters.
        stopwords = {'said', 'mr', 'also'}  # Add more if needed
    else:
        stopwords = set(stopwords)

    tokens = re.findall(r'\b[a-z]{3,}\b', str(text).lower())
    return ' '.join(t for t in tokens if t not in stopwords)

# Store a normalised copy of every article body next to the raw text.
df_all['clean_text'] = df_all['article'].map(tokenize_and_clean)



NameError: name 'df_all' is not defined

In [3]:
# --- Step 2: Generate embeddings ---
# Encode each cleaned article into a dense vector with the MiniLM
# sentence-transformer checkpoint; one embedding row per entry of `clean_text`.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(
    list(df_all['clean_text']),
    show_progress_bar=True,
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


NameError: name 'df_all' is not defined

In [4]:
# --- Step 3: Cluster into topics ---
num_topics = 10
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it implicit makes the cluster assignments
# depend on the installed version even with a fixed random_state.
kmeans = KMeans(n_clusters=num_topics, n_init=10, random_state=42)
df_all['topic_cluster'] = kmeans.fit_predict(embeddings)

# --- Load LLaMA 3.2 1B model locally for topic labeling ---
print("\n--- Loading LLaMA 3.2 1B-Instruct locally ---")
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
llama_model = AutoModelForCausalLM.from_pretrained(model_id)
# do_sample=False asks for greedy decoding so that re-running this cell yields
# the same labels (the checkpoint's own generation config may enable sampling
# by default — TODO confirm; transformers may warn about unused sampling flags).
llama_pipe = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=tokenizer,
    max_new_tokens=20,
    do_sample=False,
    return_full_text=False,
)

# --- Generate topic labels ---
print("\n--- LLaMA Topic Labeling (Local Inference) ---")
llama_topic_labels = {}
n_sample_docs_for_llama = 3
# Upper bound on the characters taken from each sampled document, so that a
# few very long articles cannot blow up the prompt length and inference time.
max_chars_per_doc_for_llama = 2000

for i in range(num_topics):
    cluster_docs_df = df_all[df_all['topic_cluster'] == i]
    if cluster_docs_df.empty:
        llama_topic_labels[i] = f"No documents found for Topic {i}"
        continue

    # Fixed random_state keeps the sampled documents stable across runs.
    sample_texts = cluster_docs_df['clean_text'].sample(
        min(n_sample_docs_for_llama, len(cluster_docs_df)), random_state=42
    ).tolist()
    combined_text = "\n\n".join(doc[:max_chars_per_doc_for_llama] for doc in sample_texts)
    prompt = f"Analyze the following documents and identify their main shared topic in a short phrase (2-5 words). Documents:\n\n{combined_text}\n\nMain topic:"

    try:
        outputs = llama_pipe(prompt)
        generated = outputs[0]['generated_text'].strip()
        # The model is free to keep writing after the label; only the first
        # line of the continuation is the short phrase that was asked for.
        llama_topic_labels[i] = generated.splitlines()[0].strip() if generated else generated
    except Exception as e:
        # Deliberate best-effort: one failed generation should not abort the
        # remaining topics, so the failure is recorded in place of the label.
        llama_topic_labels[i] = f"Error for topic {i}: {e}"

# Display the results
print("\nGenerated LLaMA Topic Labels (via Local Inference):")
for topic_id, label in llama_topic_labels.items():
    print(f"Topic {topic_id}: {label}")

NameError: name 'embeddings' is not defined

In [None]:
# --- Step 4: Topic distribution per document ---
# Cosine similarity between every document embedding and every KMeans centroid.
# Note: these are raw similarities, not probabilities — the rows are not
# normalised to sum to 1.
similarities = cosine_similarity(embeddings, kmeans.cluster_centers_)
assert len(similarities) == len(df_all), "embeddings and df_all are out of sync; re-run the embedding step"

topic_cols = [f'topic_{i}' for i in range(num_topics)]
# Reuse df_all's own index so the column-wise concat lines up row-for-row even
# when df_all does not carry a default 0..n-1 RangeIndex (otherwise concat
# aligns on labels and pads mismatches with NaN).
df_topic_dist = pd.DataFrame(similarities, columns=topic_cols, index=df_all.index)
# Drop topic columns left over from a previous run of this cell so that
# re-running it replaces them instead of appending duplicate column names.
df_all = pd.concat([df_all.drop(columns=topic_cols, errors='ignore'), df_topic_dist], axis=1)

In [None]:
# --- Step 5: Aggregate over time and publisher ---
# Unparseable dates are coerced to NaT; each article is then bucketed by
# calendar month.
df_all['date'] = pd.to_datetime(df_all['date'], errors='coerce')
df_all['month'] = df_all['date'].dt.to_period(freq='M')

# Keep only the grouping keys and the per-topic similarity columns, then
# average every topic column within each (month, publication) pair.
df_combined = df_all[['month', 'publication'] + topic_cols]
df_monthly_pub = (
    df_combined
    .groupby(['month', 'publication'])[topic_cols]
    .mean()
    .reset_index()
)


  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [None]:
# --- Step 6: Save ---
output_path = Path('../data/processed/monthly_topic_shares_by_publisher_llm.csv')
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing (a no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)
df_monthly_pub.to_csv(output_path, index=False)
print("✅ Saved: 'monthly_topic_shares_by_publisher_llm.csv'")