# 🧪 Project Trend Hunter: Analysis Playground

Welcome to the interactive test bench! Here you can run the entire trend detection pipeline step-by-step, toggle different methods, and visualize the results immediately.

### 🎯 Objectives:
1.  **Inspect Data**: EDA on sources, timing, and content length.
2.  **Compare Methods**: Semantic (Google Trends) vs. Hybrid (Cluster-First).
3.  **Verify Reranking**: See the difference Cross-Encoder makes.
4.  **Visualize**: Run t-SNE to see the clusters in 2D space.

---

In [None]:
# Install wordcloud if missing
!pip install wordcloud --quiet

In [None]:
# 1. Setup & Imports
import sys
import os
import glob
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from rich.console import Console
from sklearn.manifold import TSNE
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure project root is in path
sys.path.append(os.path.abspath('..'))

from sklearn.metrics.pairwise import cosine_similarity
from crawlers.analyze_trends import find_matches, find_matches_hybrid, load_social_data, load_news_data, load_google_trends, refine_trends_preprocessing
from crawlers.clustering import cluster_data, extract_cluster_labels
from crawlers.alias_normalizer import build_alias_dictionary, normalize_with_aliases
from crawlers.vectorizers import get_embeddings

# Shared rich Console instance.
console = Console()
# Let pandas display up to 100 characters per column value.
pd.set_option('display.max_colwidth', 100)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## ⚙️ Configuration
Adjust these parameters to control the experiment.

In [None]:
# --- Run size and matching -------------------------------------------------
LIMIT_POSTS = 500  # Set to None for full run (~4600 posts), 500 for testing
USE_PHOBERT = True # Use PhoBERT for sentiment
THRESHOLD = 0.5    # Similarity threshold

# --- Pipeline phase toggles ------------------------------------------------
REFINE_TRENDS = True # [NEW] Phase 6: Use LLM to clean Google Trends before matching
NO_DEDUP = False      # [NEW] Phase 4: Skip semantic deduplication if too aggressive
USE_KEYWORDS = True   # [NEW] Phase 8: Extract high-signal keywords before clustering

# --- Models ----------------------------------------------------------------
# Recommendations for Vietnamese:
# Bi-Encoder: 'keepitreal/vietnamese-sbert' or 'dangvantuan/vietnamese-embedding'
MODEL_NAME = "paraphrase-multilingual-mpnet-base-v2"

# Cross-Encoder: 'DiTy/cross-encoder-vietnamese-mobilebert'
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# NOTE(review): EMBEDDING_MODEL is not passed to any call in the cells below
# (they pass MODEL_NAME) -- confirm whether it is still needed.
EMBEDDING_MODEL = 'dangvantuan/vietnamese-embedding' # Or 'keepitreal/vietnamese-sbert'
EMBEDDING_METHOD = 'sentence-transformer' # 'tfidf', 'bow', 'sentence-transformer'
LABELING_METHOD = 'semantic'              # 'tfidf', 'semantic'
RERANK = True                             # Use Cross-Encoder for precision
# Must be a plain int: a trailing comma here would silently create the tuple (5,).
MIN_CLUSTER_SIZE = 5                      # Min posts to form a trend

# --- LLM Refinement --------------------------------------------------------
USE_LLM = False                           # Set to True to enable Refinement
LLM_PROVIDER = 'gemini'                   # 'gemini' or 'kaggle'
GEMINI_API_KEY = ""                       # For Gemini
LLM_MODEL_PATH = "google/gemma-2-2b-it"   # For Kaggle
USE_CACHE = True                          # Save/Load embeddings to disk
DEBUG_LLM = False                         # Print raw LLM responses on error
SUMMARIZE_ALL = False                     # Set True to summarize ALL posts with ViT5 (slow!)

# Custom Prompt for Cluster Refinement
LLM_CUSTOM_INSTRUCTION = """For each cluster ID, provide a professional title, category, and reasoning.
Categories:
- A: Critical (Accidents, Disasters, Safety)
- B: Social (Policy, controversy, public sentiment)
- C: Market (Commerce, Tech, Entertainment)"""


## 📂 1. Load Data

In [None]:
# Load Trends
# glob.glob() returns paths in arbitrary (filesystem-dependent) order; sort so
# that the loaded data -- and the LIMIT_POSTS subset below -- is reproducible.
trend_files = sorted(glob.glob("../crawlers/trendings/*.csv"))
trends = load_google_trends(trend_files)
print(f"Loaded {len(trends)} trends.")

# Load Social & News
fb_files = sorted(glob.glob("../crawlers/facebook/*.json"))
news_files = sorted(glob.glob("../crawlers/news/**/*.csv", recursive=True))
posts = load_social_data(fb_files) + load_news_data(news_files)

if LIMIT_POSTS:
    # Keeps the first LIMIT_POSTS items without shuffling. Social posts are
    # concatenated ahead of news above, so a small limit may contain few or
    # no news items.
    posts = posts[:LIMIT_POSTS]

# Helper: Extract contents
post_contents = [p.get('content', '') for p in posts]
print(f"Loaded {len(posts)} posts for analysis.")

## 🧹 1.1 Phase 6: Google Trends Refinement (Optional)
Clean and merge trends before analysis using instructions defined in Configuration.

In [None]:
# Optional step: run the loaded trends through refine_trends_preprocessing
# when REFINE_TRENDS is set; otherwise keep them exactly as loaded.
if not REFINE_TRENDS:
    print("Skipping Trend Refinement (using raw trends).")
else:
    refine_options = dict(
        llm_provider=LLM_PROVIDER,
        gemini_api_key=GEMINI_API_KEY,
        llm_model_path=LLM_MODEL_PATH,
        debug_llm=DEBUG_LLM,
        source_files=trend_files,  # Enables caching
    )
    trends = refine_trends_preprocessing(trends, **refine_options)

## 📊 1.2 General Stats
Let's understand our dataset volume.

In [None]:
def _source_type(src):
    """Return 'Facebook' for string sources containing 'Face:', else 'News'."""
    if isinstance(src, str) and 'Face:' in src:
        return 'Facebook'
    return 'News'


def _content_length(value):
    """Return the length of str(value); None counts as 0."""
    if value is None:
        return 0
    return len(str(value))


# Tabular view of the posts for exploratory analysis
df_raw = pd.DataFrame(posts)

# Unparseable timestamps become NaT instead of raising
df_raw['time'] = pd.to_datetime(df_raw['time'], errors='coerce')

# Derived columns: coarse source category and content size
df_raw['source_type'] = df_raw['source'].apply(_source_type)
df_raw['content_length'] = df_raw['content'].apply(_content_length)

fig, ax = plt.subplots(1, 2, figsize=(14, 5))
ax_sources, ax_volume = ax

# Left panel: number of posts per source type
sns.countplot(data=df_raw, x='source_type', ax=ax_sources, palette='pastel')
ax_sources.set_title("Distribution of Data Types")

# Right panel: posts per day, using only rows with a valid timestamp
has_time = df_raw['time'].notnull()
if not has_time.any():
    ax_volume.text(0.5, 0.5, "No Valid Time Data", ha='center')
else:
    daily_counts = df_raw[has_time].set_index('time').resample('D').size()
    daily_counts.plot(ax=ax_volume, color='teal', marker='o')
    ax_volume.set_title("Daily Post Volume")
    ax_volume.set_ylabel("Number of Posts")

plt.tight_layout()
plt.show()

## ☁️ 1.2 Deep Dive: Sources and Content
Which specific pages are most active? What are they talking about?

In [None]:
# A. Top 20 specific sources
def clean_source_name(s):
    """Return the source label with every 'Face: ' marker removed.

    Any non-string value (None, NaN, numbers, ...) is reported as 'Unknown'.
    """
    return s.replace('Face: ', '') if isinstance(s, str) else 'Unknown'

# Count posts per cleaned source name and keep the 20 most frequent
df_raw['clean_source'] = df_raw['source'].apply(clean_source_name)
source_counts = df_raw['clean_source'].value_counts()
top_sources = source_counts.head(20)

# Horizontal bars: one row per source, bar length = number of posts
plt.figure(figsize=(12, 6))
bar_ax = sns.barplot(x=top_sources.values, y=top_sources.index, palette='viridis')
bar_ax.set_title("Top 20 Active Sources")
bar_ax.set_xlabel("Number of Posts")
plt.show()

# B. Word Cloud
# Simple stopwords list for Vietnamese (basic).
# The literals must be real Unicode text: mis-encoded stopwords never match
# the words in the corpus and would silently filter nothing.
stops = {'và', 'của', 'là', 'có', 'trong', 'đã', 'ngày', 'theo', 'với', 'cho', 'người', 'những', 'tại', 'về', 'các', 'được'}

# Cast to str so a stray non-string content value cannot break str.join
text_corpus = " ".join(df_raw['content'].dropna().astype(str).tolist())

if not text_corpus.strip():
    # WordCloud.generate() raises when there are no words to draw
    print("No text content available for the word cloud.")
else:
    wc = WordCloud(width=800, height=400, background_color='white', stopwords=stops, max_words=100).generate(text_corpus)

    plt.figure(figsize=(14, 7))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title("Most Common Words (Word Cloud)")
    plt.show()

## 🛠️ 1.3 Preprocessing Demo (Before vs After)
See how our **Alias Normalization** and **TF-IDF Tokenizer** process the raw text.

In [None]:
# 1. Initialize Alias Dictionary (Custom Layer)
build_alias_dictionary(trends)

# 2. Initialize TF-IDF (Scikit-Learn Layer): unigrams + bigrams, top 20 features
tfidf_demo = TfidfVectorizer(ngram_range=(1, 2), max_features=20)

# 3. Pick a sample text (Try to find one with a potential alias)
# Let's look for a post mentioning "số 3" (common alias for storm).
# The literals below must be real Vietnamese text; mis-encoded bytes would
# never match any post, so the fallback sentence would always be used.
sample_text = "Cơn bão số 3 đang gây mưa lớn tại Hà Nội."
candidates = df_raw[df_raw['content'].str.contains("số 3", case=False, na=False)]
if not candidates.empty:
    # Use the first matching post, truncated to 100 characters
    sample_text = candidates.iloc[0]['content'][:100] + "..."

print("--- STEP 1: RAW INPUT ---")
print(f"Original: '{sample_text}'")

print("\n--- STEP 2: OUR ALIAS NORMALIZATION (Augmentation) ---")
normalized_text = normalize_with_aliases(sample_text)
print(f"Processed: '{normalized_text}'")
print("(Notice how relevant trend names are PREPENDED to the text)")

print("\n--- STEP 3: TF-IDF TOKENIZATION (Cleaning) ---")
# Fit on the single normalized sentence only to expose the resulting vocabulary
tfidf_demo.fit([normalized_text])
tokens = tfidf_demo.get_feature_names_out()
print(f"Final Tokens: {list(tokens)}")
print("(Lowercase, Punctuation Removed, Bigrams Created)")

## 🔬 2. Run Semantic Analysis (Baseline)
Standard Bi-Encoder matching (fast, fuzzy).

In [None]:
print("Running Semantic Matching...")
# Use the bi-encoder selected in the Configuration cell instead of a second
# hard-coded copy of its name, so changing MODEL_NAME affects this baseline too.
matches_semantic = find_matches(
    posts, trends,
    threshold=THRESHOLD,
    model_name=MODEL_NAME,
    save_all=True  # Include unmatched
)
df_sem = pd.DataFrame(matches_semantic)
print("Semantic Match Count:", len(df_sem[df_sem['is_matched'] == True]))
# Last expression of the cell: shows the first rows in the notebook output
df_sem.head(3)

### 🎨 Visualize Semantic Matches
How do the posts group when assigned directly to trends?

In [None]:
# Filter for matched posts only
sem_plot_df = df_sem[df_sem['is_matched'] == True].copy()

if len(sem_plot_df) < 5:
    print("Not enough semantic matches to plot.")
else:
    print(f"Visualizing {len(sem_plot_df)} Semantic Matches...")
    sem_texts = sem_plot_df['post_content'].tolist()
    sem_labels = sem_plot_df['trend'].tolist()

    # Embeddings (Always use Sentence Transformer for visualization quality).
    # The model comes from the Configuration cell (MODEL_NAME) rather than a
    # duplicated hard-coded name.
    sem_embeddings = get_embeddings(sem_texts, method="sentence-transformer",
                                    model_name=MODEL_NAME)

    # t-SNE: perplexity must stay below the number of samples, hence the cap
    tsne_sem = TSNE(n_components=2, random_state=42, perplexity=min(30, len(sem_texts)-1))
    coords_sem = tsne_sem.fit_transform(sem_embeddings)

    # Create DataFrame for Plotly
    df_vis_sem = pd.DataFrame({
        'x': coords_sem[:, 0],
        'y': coords_sem[:, 1],
        'Label': sem_labels,
        'Snippet': [t[:100] + '...' for t in sem_texts]
    })

    # Interactive Plot: one color per matched trend, snippet shown on hover
    fig = px.scatter(df_vis_sem, x='x', y='y', color='Label',
                     hover_data=['Snippet'],
                     title="Interactive t-SNE: Semantic Matches (Baseline)")
    fig.show()

## 🚀 3. Run Hybrid Analysis (Cluster-First)
This uses HDBSCAN + Cross-Encoder (if enabled).
Note: This automatically filters noise and finds 'Discovery' topics.

In [None]:
print(f"Running Hybrid Analysis (Embedding={EMBEDDING_METHOD}, Labeling={LABELING_METHOD}, Rerank={RERANK})...")
# Every option is forwarded from the Configuration cell
matches_hybrid = find_matches_hybrid(
    posts, trends,
    threshold=THRESHOLD,
    model_name=MODEL_NAME,
    reranker_model_name=CROSS_ENCODER_MODEL,
    embedding_method=EMBEDDING_METHOD,
    labeling_method=LABELING_METHOD,
    rerank=RERANK,
    use_llm=USE_LLM,
    gemini_api_key=GEMINI_API_KEY,
    llm_provider=LLM_PROVIDER,
    llm_model_path=LLM_MODEL_PATH,
    llm_custom_instruction=LLM_CUSTOM_INSTRUCTION,
    use_cache=USE_CACHE,
    debug_llm=DEBUG_LLM,
    summarize_all=SUMMARIZE_ALL,
    no_dedup=NO_DEDUP,
    use_keywords=USE_KEYWORDS,
    save_all=True
)
df_hyb = pd.DataFrame(matches_hybrid)
print("Hybrid Topics Found:", df_hyb['final_topic'].nunique())

# Improved display with new metrics.
# These statements must sit at top level: indenting them under nothing raises
# IndentationError and prevents the whole cell from running.
cols = ['final_topic', 'category', 'topic_type', 'trend_score', 'sentiment', 'llm_reasoning', 'post_content']
# Check if columns exist (graceful fallback)
available_cols = [c for c in cols if c in df_hyb.columns]
df_result = df_hyb[available_cols].copy()

# Sort by Score if available
if 'trend_score' in df_result.columns:
    df_result = df_result.sort_values('trend_score', ascending=False)

# Last expression of the cell: shows the top rows in the notebook output
df_result.head(10)

## 🤝 3.1 Cross-Source Integration Analysis
Evaluate how **News articles** and **Facebook posts** are blended together in the same clusters.

In [None]:
# 1. Define Source Type ('Face:' in the source string marks a Facebook post)
df_hyb['source_type'] = df_hyb['source'].apply(lambda x: 'Facebook' if isinstance(x, str) and 'Face:' in x else 'News')

# 2. Calculate Mixing Statistics
# unstack() only creates columns for source types that actually occur, so
# reindex guarantees both columns exist (all zeros when a type is absent).
topic_sources = (
    df_hyb.groupby(['final_topic', 'source_type']).size()
    .unstack(fill_value=0)
    .reindex(columns=['Facebook', 'News'], fill_value=0)
)

# Topics that are not noise; used for both the numerator and the denominator
meaningful = df_hyb[df_hyb['topic_type'] != 'Noise']
total_topics = meaningful['final_topic'].nunique()

has_both_sources = (topic_sources['Facebook'] > 0) & (topic_sources['News'] > 0)
is_meaningful = topic_sources.index.isin(meaningful['final_topic'])
mixed_topics = topic_sources[has_both_sources & is_meaningful]

# Avoid ZeroDivisionError when no meaningful topic was found
mixed_pct = len(mixed_topics) / total_topics * 100 if total_topics else 0.0

print(f"📊 Total Meaningful Trends: {total_topics}")
print(f"🤝 Mixed-Source Trends (FB + News): {len(mixed_topics)} ({mixed_pct:.1f}%)")

# 3. Visualize Top 10 Trends (Source Distribution)
top_10 = meaningful['final_topic'].value_counts().head(10).index
df_top10 = df_hyb[df_hyb['final_topic'].isin(top_10)]

if df_top10.empty:
    print("No trends available to plot.")
else:
    plt.figure(figsize=(12, 6))
    sns.countplot(data=df_top10, y='final_topic', hue='source_type', palette='Set2')
    plt.title("Source Distribution in Top 10 Trends")
    plt.xlabel("Article/Post Count")
    plt.ylabel("Trend Topic")
    plt.legend(title="Source Type")
    plt.show()

### 🎨 Cross-Source t-SNE
Visually confirm that news and social media occupy the same semantic cluster.

In [None]:
# Project the posts of the top-10 trends into 2D; skipped for tiny samples
if len(df_top10) >= 5:
    # Embed top 10 trends articles
    top_texts = df_top10['post_content'].tolist()
    top_embs = get_embeddings(top_texts, method=EMBEDDING_METHOD, model_name=MODEL_NAME)

    # Perplexity is capped at n - 1 so it stays below the sample count
    n_points = len(top_texts)
    tsne_x = TSNE(n_components=2, perplexity=min(30, n_points - 1), random_state=42)
    coords_x = tsne_x.fit_transform(top_embs)

    # First 80 characters of each post, shown on hover
    snippets = [text[:80] + '...' for text in top_texts]
    df_vis_x = pd.DataFrame({
        'x': coords_x[:, 0],
        'y': coords_x[:, 1],
        'Source': df_top10['source_type'].tolist(),
        'Trend': df_top10['final_topic'].tolist(),
        'Snippet': snippets,
    })

    # Color encodes the source type, marker symbol encodes the trend
    fig = px.scatter(
        df_vis_x,
        x='x',
        y='y',
        color='Source',
        symbol='Trend',
        hover_data=['Snippet', 'Trend'],
        title="Cross-Source Clusters: News vs Facebook Overlap",
    )
    fig.update_traces(marker=dict(size=10, opacity=0.8))
    fig.show()
else:
    print("Not enough data for cross-source t-SNE.")

## 🔬 3.2 SAHC Deep Dive: News-First Comparative Clustering
Visualize the **Source-Aware Hierarchical Clustering (SAHC)** architecture in action.
1. **Before**: News articles form stable event clusters.
2. **After**: Social media posts gravitate toward these News seeds (Attachment).

In [None]:
# 1. Extract News vs Social indices from the subset used above (df_top10)
news_data = df_top10[df_top10['source_type'] == 'News'].copy()
fb_data = df_top10[df_top10['source_type'] == 'Facebook'].copy()

print(f"Original News Seeds: {len(news_data)} | Attaching Facebook Posts: {len(fb_data)}")

# t-SNE needs perplexity >= 1 and below the sample count; with fewer than two
# news articles min(30, n - 1) would be <= 0 and TSNE would raise.
if len(news_data) < 2:
    print("Not enough News articles in the top trends for the SAHC comparison (need at least 2).")
else:
    news_texts = news_data['post_content'].tolist()
    fb_texts = fb_data['post_content'].tolist()

    # 2. Get embeddings for just News
    news_embs = get_embeddings(news_texts, method=EMBEDDING_METHOD, model_name=MODEL_NAME)
    tsne_sub = TSNE(n_components=2, perplexity=min(30, len(news_data)-1), random_state=42)
    coords_news = tsne_sub.fit_transform(news_embs)

    # 3. Plot 1: News Clusters Only
    df_news_vis = pd.DataFrame({
        'x': coords_news[:, 0], 'y': coords_news[:, 1],
        'Trend': news_data['final_topic'].tolist(),
        'Type': ['News (Seed)'] * len(news_data)
    })

    fig1 = px.scatter(df_news_vis, x='x', y='y', color='Trend',
                      title="[Step 1] News-Only Foundational Clusters",
                      hover_data=['Trend'])
    fig1.update_traces(marker=dict(size=12))
    fig1.show()

    # 4. Plot 2: Integrated View (Show Displacement/Inclusion)
    # Re-run t-SNE on joint set to show shared space. The joint set is embedded
    # in one call (news first, then Facebook) so rows line up with the labels below.
    joint_embs = get_embeddings(news_texts + fb_texts, method=EMBEDDING_METHOD, model_name=MODEL_NAME)
    tsne_joint = TSNE(n_components=2, perplexity=min(30, len(joint_embs)-1), random_state=42)
    coords_joint = tsne_joint.fit_transform(joint_embs)

    df_joint_vis = pd.DataFrame({
        'x': coords_joint[:, 0], 'y': coords_joint[:, 1],
        'Trend': news_data['final_topic'].tolist() + fb_data['final_topic'].tolist(),
        'Source': ['News article'] * len(news_data) + ['Facebook post'] * len(fb_data)
    })

    fig2 = px.scatter(df_joint_vis, x='x', y='y', color='Trend', symbol='Source',
                      title="[Step 2] Integrated Clusters (Social Posts attached to News Seeds)",
                      hover_data=['Trend', 'Source'])
    fig2.update_traces(marker=dict(size=10, opacity=0.7))
    print("✅ Visual similarity between News dots and FB symbols within same color confirms successful SAHC attachment.")
    fig2.show()

## 📊 4. Comparison Stats
Let's see the metrics side-by-side.

In [None]:
# Rows that count as "covered" by each method
sem_matched = df_sem[df_sem['is_matched'] == True]
# NOTE(review): other cells exclude rows via topic_type != 'Noise'; this one
# filters on final_topic != 'Unassigned' -- confirm both describe the same rows.
hyb_assigned = df_hyb[df_hyb['final_topic'] != 'Unassigned']

# One row per method: coverage and topic diversity
stats = {
    'Method': ['Semantic', 'Hybrid'],
    'Total Matched/Clustered': [len(sem_matched), len(hyb_assigned)],
    'Unique Topics': [
        sem_matched['trend'].nunique(),
        hyb_assigned['final_topic'].nunique(),
    ],
}
df_stats = pd.DataFrame(stats)

# (metric column, palette, panel title) for the two side-by-side bar charts
panels = [
    ('Total Matched/Clustered', 'viridis', "Coverage (Total Matched Posts)"),
    ('Unique Topics', 'magma', "Diversity (Unique Topics)"),
]

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
for axis, (metric, palette_name, panel_title) in zip(ax, panels):
    sns.barplot(data=df_stats, x='Method', y=metric, ax=axis, palette=palette_name)
    axis.set_title(panel_title)
plt.tight_layout()
plt.show()

## 🎨 5. t-SNE Visualization with Plotly (Hybrid)
Let's visualize the clusters found by the **Hybrid Method** in 2D space.
**Hover over the blue dots** to discover what those small clusters are!

In [None]:
# 1. Filter data (remove 'Noise' rows for a clearer plot)
plot_df = df_hyb[df_hyb['topic_type'] != 'Noise'].copy()

if len(plot_df) < 5:
    print("Not enough data points for t-SNE.")
else:
    print(f"Visualizing {len(plot_df)} clustered posts...")
    texts = plot_df['processed_content'].tolist()
    labels = plot_df['final_topic'].tolist()
    types = plot_df['topic_type'].tolist()
    # NOTE(review): the results table in the hybrid cell refers to
    # 'trend_score'; confirm that 'score' is the intended column here.
    scores = plot_df['score'].tolist()

    # 2. Get Embeddings (Use SAME method AND model as configured).
    # MODEL_NAME replaces a hard-coded copy of the model name so this plot
    # follows the Configuration cell like the other hybrid visualizations.
    print(f"Generating embeddings using {EMBEDDING_METHOD}...")
    embeddings = get_embeddings(texts, method=EMBEDDING_METHOD,
                                model_name=MODEL_NAME,
                                max_features=2000) # For TF-IDF/BoW speed

    # 3. Running t-SNE (perplexity capped below the sample count)
    print("Running t-SNE...")
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(texts)-1))
    coords = tsne.fit_transform(embeddings)

    # 4. Interactive Plot with Plotly
    df_vis = pd.DataFrame({
        'x': coords[:, 0],
        'y': coords[:, 1],
        'Topic': labels,
        'Type': types,
        'Score': np.round(scores, 2),
        'Snippet': [t[:100] + '...' for t in texts]
    })

    # Only show Top 20 topics in legend, others grouped as 'Other' to avoid palette exhaustion
    top_n_topics = df_vis['Topic'].value_counts().head(20).index.tolist()
    df_vis['Legend_Group'] = df_vis['Topic'].apply(lambda x: x if x in top_n_topics else 'Other (Blue Clusters)')

    fig = px.scatter(df_vis, x='x', y='y',
                     color='Legend_Group',
                     symbol='Type',
                     hover_data=['Topic', 'Type', 'Score', 'Snippet'],
                     title=f"Interactive t-SNE: Hybrid Clusters ({EMBEDDING_METHOD})")
    fig.show()

## 🌟 6. Discovery Viewer
Top 'Discovery' topics (New trends not in Google Trends).

In [None]:
# Rows whose topic type is 'Discovery', ranked by number of posts per topic
is_discovery = df_hyb['topic_type'] == 'Discovery'
discoveries = df_hyb[is_discovery]
top_discoveries = discoveries['final_topic'].value_counts().head(10)

print("Top 10 New Discoveries:")
print(top_discoveries)

# Show up to three sample posts for the most frequent discovery topic
if len(top_discoveries) > 0:
    top_topic = top_discoveries.index[0]
    print(f"\nSample posts for top discovery '{top_topic}':")
    sample_posts = discoveries.loc[discoveries['final_topic'] == top_topic, 'post_content']
    print(sample_posts.head(3).values)