# 🔥 Multi-Source Trend Detection (Improved Pipeline)

**Key improvements:**
- ✅ Alias-based text normalization (uses Google Trends keywords)
- ✅ Multilingual sentence embeddings (`paraphrase-multilingual-mpnet-base-v2`)
- ✅ Direct trend assignment (no HDBSCAN needed)
- ✅ Trend coverage analysis (filter to real trends only)

> 💡 **Enable GPU**: Settings → Accelerator → GPU T4 x2

## 1. Setup

In [None]:
!pip install -q sentence-transformers rich

In [None]:
import json
import csv
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from typing import Dict, List

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings('ignore')

import torch
# Use CUDA when PyTorch reports an available GPU, otherwise fall back to CPU.
# `device` is later passed to SentenceTransformer when the model is loaded.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"üñ•Ô∏è Device: {device}")
if device == 'cuda':
    # Only the first CUDA device (index 0) is reported.
    print(f"üöÄ GPU: {torch.cuda.get_device_name(0)}")

## 2. Configuration

In [None]:
# === PATHS (Update these for your Kaggle dataset) ===
# JSON file with Facebook posts, read by load_facebook_data().
FB_DATA_PATH = "/kaggle/input/your-dataset/fb_data.json"
# Root directory searched recursively for `articles.csv` files by load_news_articles().
NEWS_DATA_DIR = "/kaggle/input/your-dataset/data"
# Trend CSV files; load_trends() merges every file in this list that exists.
TRENDS_CSV_FILES = [
    "/kaggle/input/your-dataset/trending_VN_7d.csv"
]

# === MODEL CONFIG ===
MODEL_NAME = "paraphrase-multilingual-mpnet-base-v2"  # Best for Vietnamese similarity
# Cosine-similarity cutoff: a post is matched to its best trend only when the
# score is strictly greater than this value.
SIMILARITY_THRESHOLD = 0.55
# Trends with fewer matched posts than this are excluded from the analysis cells.
MIN_POSTS_FOR_VALID_TREND = 3

print(f"ü§ñ Model: {MODEL_NAME}")

## 3. Alias Normalizer (Replaces NER)

In [None]:
# Notebook-wide alias table: canonical trend name -> list of lowercase aliases.
TREND_ALIASES: Dict[str, List[str]] = {}

def build_alias_dictionary(trends: dict) -> Dict[str, List[str]]:
    """Rebuild the global alias table from a ``{trend: keywords}`` mapping.

    Each trend name is lowercased and stripped to form the canonical key.
    Keywords are lowercased and stripped as well, and blank keywords are
    dropped. The canonical name is placed first in the alias list unless it
    is already one of the aliases.

    Args:
        trends: Mapping of trend name to an iterable of keyword strings.

    Returns:
        The freshly built table, which is also stored in ``TREND_ALIASES``.
    """
    global TREND_ALIASES
    # Rebind the global first so any previous table is discarded up front.
    TREND_ALIASES = table = {}

    for trend_name, keyword_list in trends.items():
        key = trend_name.lower().strip()
        cleaned = [kw.lower().strip() for kw in keyword_list if kw.strip()]
        table[key] = cleaned if key in cleaned else [key, *cleaned]

    print(f"üìö Built {len(TREND_ALIASES)} alias groups")
    return TREND_ALIASES


def normalize_with_aliases(text: str, max_additions: int = 10) -> str:
    """Prefix ``text`` with the aliases of every trend it mentions.

    A trend is considered mentioned when its canonical name occurs in the
    lowercased text (its first five aliases are added), or otherwise when one
    of its first five aliases longer than three characters occurs (the
    canonical name and the first three aliases are added). Matching is plain
    substring search.

    Additions are collected in an insertion-ordered dict rather than a set,
    so the terms kept under ``max_additions`` and their order are the same on
    every run: trends in ``TREND_ALIASES`` order, canonical name first.

    Args:
        text: Text to enrich.
        max_additions: Maximum number of alias terms to prepend.

    Returns:
        ``"<terms> <text>"`` when at least one trend matched, otherwise
        ``text`` unchanged (also when ``TREND_ALIASES`` is empty).
    """
    if not TREND_ALIASES:
        return text

    text_lower = text.lower()
    # Dict keys give de-duplication while preserving first-seen order.
    additions: dict = {}

    for canonical, aliases in TREND_ALIASES.items():
        if canonical in text_lower:
            additions[canonical] = None
            additions.update(dict.fromkeys(aliases[:5]))
        elif any(len(alias) > 3 and alias in text_lower for alias in aliases[:5]):
            # Short aliases are skipped to limit accidental substring hits.
            additions[canonical] = None
            additions.update(dict.fromkeys(aliases[:3]))

    if additions:
        return " ".join(list(additions)[:max_additions]) + " " + text
    return text


def batch_normalize(texts: List[str]) -> List[str]:
    """Apply ``normalize_with_aliases`` to every text, in order.

    A progress line is printed each time another 500 texts have been reached.

    Args:
        texts: Texts to normalize.

    Returns:
        A new list with one normalized string per input text.
    """
    normalized: List[str] = []
    for position, raw in enumerate(texts, start=1):
        if position % 500 == 0:
            print(f"  Normalizing: {position}/{len(texts)}")
        normalized.append(normalize_with_aliases(raw))
    return normalized

## 4. Data Loading

In [None]:
def load_facebook_data(filepath):
    """Load Facebook posts from a JSON file into the unified record format.

    The file is expected to hold a JSON array of objects. Each object is
    mapped to a dict with the keys ``source`` ("Face: <page_name>"),
    ``content``, ``stats`` and ``time`` (taken from ``time_label``).

    A missing or ``null`` ``content`` becomes ``''`` and a missing, ``null``
    or empty ``stats`` becomes a zeroed counter dict, so later slicing and
    scoring never receive ``None``.

    Args:
        filepath: Path to the JSON file.

    Returns:
        A list of unified post dicts; an empty list when the file is missing.
    """
    if not os.path.exists(filepath):
        print(f"‚ö†Ô∏è FB file not found: {filepath}")
        return []

    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    return [
        {
            "source": f"Face: {item.get('page_name', 'Unknown')}",
            # `or` also covers an explicit JSON null, which .get(key, default) lets through.
            "content": item.get('content') or '',
            "stats": item.get('stats') or {'likes': 0, 'comments': 0, 'shares': 0},
            "time": item.get('time_label', ''),
        }
        for item in data
    ]


def load_news_articles(data_dir):
    """Load news articles from every ``articles.csv`` found under ``data_dir``.

    The directory tree is searched recursively. The upper-cased name of the
    folder that directly contains each CSV is used as the article's
    ``source``. Files are processed in sorted path order so the result order
    is reproducible.

    Each row becomes a unified record whose ``content`` is
    ``"<title>\\n<content>"`` and whose ``time`` comes from ``published_at``.
    Missing cells (absent column or short row) are treated as empty strings.
    News records always carry zeroed engagement stats.

    Args:
        data_dir: Root directory to search.

    Returns:
        A list of unified article dicts; an empty list when the directory is
        missing.
    """
    if not os.path.exists(data_dir):
        print(f"‚ö†Ô∏è News dir not found: {data_dir}")
        return []

    unified = []
    pattern = os.path.join(data_dir, "**", "articles.csv")

    # glob returns paths in arbitrary order; sort for deterministic output.
    for filepath in sorted(glob.glob(pattern, recursive=True)):
        source_name = os.path.basename(os.path.dirname(filepath)).upper()
        # newline='' lets the csv module handle newlines inside quoted fields itself.
        with open(filepath, 'r', encoding='utf-8', newline='') as f:
            for row in csv.DictReader(f):
                # DictReader fills missing trailing cells with None; avoid the text "None".
                title = row.get('title') or ''
                body = row.get('content') or ''
                unified.append({
                    "source": source_name,
                    "content": f"{title}\n{body}",
                    "stats": {'likes': 0, 'comments': 0, 'shares': 0},
                    "time": row.get('published_at') or ''
                })
    return unified


def load_trends(csv_files):
    """Load trends and their keywords from one or more CSV files.

    The first row of each file is treated as a header and skipped. For every
    later row with at least five columns, column 0 is the trend name and
    column 4 is a comma-separated keyword list. The trend name is put first
    in the keyword list when it is not already there. Rows with a blank trend
    name are ignored, and empty files are tolerated.

    When the same trend appears more than once, the last occurrence wins.

    Args:
        csv_files: Iterable of CSV paths; missing files are reported and
            skipped.

    Returns:
        Dict mapping trend name to its list of keywords.
    """
    trends = {}
    for filepath in csv_files:
        if not os.path.exists(filepath):
            print(f"‚ö†Ô∏è Trends file not found: {filepath}")
            continue

        # newline='' is the csv-module recommended way to open CSV files.
        with open(filepath, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            # Skip header; the default avoids StopIteration on an empty file.
            next(reader, None)
            for row in reader:
                if len(row) < 5:
                    continue
                main_trend = row[0].strip()
                if not main_trend:
                    # An empty key would later substring-match every text.
                    continue
                keywords = [k.strip() for k in row[4].split(',') if k.strip()]
                if main_trend not in keywords:
                    keywords.insert(0, main_trend)
                trends[main_trend] = keywords
    return trends

In [None]:
# Load all data
print("üìÇ Loading data...")
fb_data = load_facebook_data(FB_DATA_PATH)
news_data = load_news_articles(NEWS_DATA_DIR)
# Facebook posts come first, then news articles; both use the same record keys.
all_data = fb_data + news_data

# Mapping of trend name -> keyword list, used for alias building and matching.
trends = load_trends(TRENDS_CSV_FILES)

print(f"‚úÖ Loaded: {len(fb_data)} FB + {len(news_data)} News = {len(all_data)} total")
print(f"‚úÖ Loaded: {len(trends)} Google Trends")

## 5. Semantic Matching

In [None]:
def find_matches(posts, trends, model_name, threshold=0.55):
    """Assign each post to its most similar trend using sentence embeddings.

    Steps: rebuild the global alias table from ``trends``; build one text per
    trend (name plus its first five keywords) and one per post (first 500
    characters of ``content``); enrich both with aliases; embed them with the
    given SentenceTransformer model; then keep, for each post, the trend with
    the highest cosine similarity if that score is strictly above
    ``threshold``.

    Posts whose ``content`` is missing or ``None`` are treated as empty text.
    When there are no posts or no trends, the model is still loaded and
    returned, but no encoding is attempted and the match list is empty.

    Args:
        posts: List of unified record dicts (``content``, ``source``,
            ``time``, ``stats``).
        trends: Dict mapping trend name to its keyword list.
        model_name: SentenceTransformer model identifier.
        threshold: Minimum (exclusive) cosine similarity for a match.

    Returns:
        ``(matches, model)`` where ``matches`` is a list of dicts with the
        keys ``post_content``, ``source``, ``time``, ``stats``, ``trend``,
        ``score`` and ``is_matched``, and ``model`` is the loaded
        SentenceTransformer so callers can reuse it.
    """
    # Build aliases from trends
    build_alias_dictionary(trends)

    # Prepare texts
    trend_keys = list(trends.keys())
    trend_texts = [f"{t} " + " ".join(trends[t][:5]) for t in trend_keys]
    # `or ''` also covers an explicit None content, which cannot be sliced.
    post_contents = [(p.get('content') or '')[:500] for p in posts]

    # Normalize with aliases
    print("üîÑ Normalizing texts with aliases...")
    trend_texts = batch_normalize(trend_texts)
    post_contents = batch_normalize(post_contents)
    print("‚úÖ Done!")

    # Encode
    print(f"üß† Loading model: {model_name}...")
    model = SentenceTransformer(model_name, device=device)

    matches = []
    if not post_contents or not trend_keys:
        # cosine_similarity rejects empty inputs; nothing can match anyway.
        print(f"‚úÖ Found {len(matches)} matches")
        return matches, model

    print(f"üß† Encoding {len(trend_texts)} trends...")
    trend_embeddings = model.encode(trend_texts, show_progress_bar=True, batch_size=32)

    print(f"üß† Encoding {len(posts)} posts...")
    post_embeddings = model.encode(post_contents, show_progress_bar=True, batch_size=64)

    print("üìê Calculating similarity...")
    # Rows are posts, columns are trends.
    similarity_matrix = cosine_similarity(post_embeddings, trend_embeddings)

    # Best trend per post, computed for all rows at once.
    best_indices = similarity_matrix.argmax(axis=1)
    best_scores = similarity_matrix.max(axis=1)

    for post, best_idx, best_score in zip(posts, best_indices, best_scores):
        if best_score > threshold:
            matches.append({
                "post_content": post.get('content') or '',
                "source": post.get('source', 'Unknown'),
                "time": post.get('time', ''),
                "stats": post.get('stats') or {},
                "trend": trend_keys[best_idx],
                "score": float(best_score),
                "is_matched": True
            })

    print(f"‚úÖ Found {len(matches)} matches")
    return matches, model  # Return model for reuse

In [None]:
# Run matching
# `model` is kept so the t-SNE cell can reuse it without reloading.
matches, model = find_matches(all_data, trends, MODEL_NAME, SIMILARITY_THRESHOLD)
print(f"\nüéâ Total matches: {len(matches)}")

## 6. Trend Coverage Analysis

In [None]:
# Count how many matched posts were assigned to each trend.
trend_coverage = Counter(m['trend'] for m in matches)

separator = "=" * 60
print(separator)
print("üìä TREND COVERAGE ANALYSIS")
print(separator)
n_covered = len(trend_coverage)
print(f"   Total Google Trends: {len(trends)}")
print(f"   Trends with matches: {n_covered}")
print(f"   Trends with NO data: {len(trends) - n_covered}")

# Keep only trends backed by enough posts to be considered real.
valid_trends = {}
for name, n_posts in trend_coverage.items():
    if n_posts >= MIN_POSTS_FOR_VALID_TREND:
        valid_trends[name] = n_posts
print(f"\n‚úÖ Valid trends (>= {MIN_POSTS_FOR_VALID_TREND} posts): {len(valid_trends)}")

# Show the twenty best-covered trends, largest first.
print("\nüî• TOP 20 REAL TRENDS:")
ranked = sorted(valid_trends.items(), key=lambda kv: -kv[1])
for rank, (name, n_posts) in enumerate(ranked[:20], start=1):
    print(f"   {rank}. {name}: {n_posts} posts")

## 7. Visualization

In [None]:
# Bar chart of top trends
# Take the 20 trends with the most matched posts, largest first.
top_20 = sorted(valid_trends.items(), key=lambda x: -x[1])[:20]
# Labels longer than 25 characters are truncated; both lists are reversed so
# the largest trend ends up at the top of the horizontal bar chart.
labels = [t[:25] + "..." if len(t) > 25 else t for t, _ in top_20][::-1]
counts = [c for _, c in top_20][::-1]

plt.figure(figsize=(12, 10))
bars = plt.barh(labels, counts, color='steelblue')
plt.xlabel('Number of Posts')
plt.title('Top 20 Real Trends (with data coverage)')

# Write each bar's post count just to the right of the bar.
for bar in bars:
    plt.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height()/2,
             str(int(bar.get_width())), va='center')

plt.tight_layout()
# The figure is written to the current working directory before being shown.
plt.savefig('top_trends.png', dpi=150)
plt.show()

In [None]:
# t-SNE visualization by trend
# Only posts assigned to a valid trend are projected.
valid_trend_names = set(valid_trends.keys())
filtered_matches = [m for m in matches if m['trend'] in valid_trend_names]

print(f"Visualizing {len(filtered_matches)} posts from {len(valid_trend_names)} trends")

# `or ''` guards against a match whose stored content is None.
texts = [(m['post_content'] or '')[:500] for m in filtered_matches]
trend_labels = [m['trend'] for m in filtered_matches]

if len(texts) < 2:
    # t-SNE needs 0 < perplexity < n_samples, which is impossible with
    # fewer than two points, so skip the plot instead of raising.
    print("Not enough matched posts for t-SNE (need at least 2); skipping plot.")
else:
    # Encode (reuse model)
    print("üß† Generating embeddings...")
    embeddings = model.encode(texts, show_progress_bar=True, batch_size=64)

    # t-SNE
    print("üìâ Running t-SNE...")
    # Cap perplexity below the sample count for small datasets.
    perplexity = min(30, len(texts) - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    coords = tsne.fit_transform(embeddings)

    # Color by top 10 trends; every other trend shares the index -1.
    top_10 = [t for t, _ in sorted(valid_trends.items(), key=lambda x: -x[1])[:10]]
    rank_of = {t: i for i, t in enumerate(top_10)}
    colors = np.array([rank_of.get(t, -1) for t in trend_labels])

    plt.figure(figsize=(14, 10))
    scatter = plt.scatter(coords[:, 0], coords[:, 1], c=colors, cmap='tab10', alpha=0.7, s=40)
    plt.colorbar(scatter, label='Trend Index')
    plt.title(f'Real Trends Visualization ({len(valid_trend_names)} trends)')

    # Add labels for top 10, placed at the mean position of each trend's points.
    for i, trend in enumerate(top_10):
        mask = colors == i
        if mask.any():
            centroid = coords[mask].mean(axis=0)
            plt.annotate(trend[:20], centroid, fontsize=9, weight='bold',
                        bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    plt.tight_layout()
    plt.savefig('trend_tsne.png', dpi=150)
    plt.show()

## 8. Scoring & Classification

In [None]:
import math

def compute_scores(trend_name, items, google_volume=0):
    """Compute G, F, N scores, a composite score and a class for one trend.

    * G (Google): log-scaled ``google_volume`` relative to 1,000,000; it is
      0 when ``google_volume`` is not positive.
    * F (Facebook): log-scaled weighted interactions
      (likes + 2 * comments + 3 * shares) relative to 20,000, summed over
      items whose ``source`` contains ``'Face'``.
    * N (News): number of remaining items relative to 50 articles.

    The composite is ``0.4*G + 0.35*F + 0.25*N`` with each score capped at
    100 for the composite only; the reported G/F/N values are not capped.
    Classification uses a threshold of 40 on the uncapped scores.

    Items with a missing or ``None`` ``stats`` dict, or with ``None``
    counters, contribute zero interactions instead of raising.

    Args:
        trend_name: Name of the trend (not used in the computation).
        items: Iterable of match dicts with ``source`` and ``stats``.
        google_volume: Search volume for the trend, if known.

    Returns:
        Dict with the keys ``G``, ``F``, ``N``, ``Composite`` (all rounded to
        one decimal), ``Class``, ``Interactions`` and ``NewsCount``.
    """

    def _count(stats, key):
        # Missing keys and explicit None both count as zero.
        return stats.get(key) or 0

    # G Score (Google search volume)
    MAX_VOL = 1000000
    g_score = (math.log10(google_volume + 1) / math.log10(MAX_VOL + 1)) * 100 if google_volume > 0 else 0

    # One pass: Facebook items feed the interaction total, everything else is news.
    total_interactions = 0
    news_count = 0
    for item in items:
        if 'Face' in (item.get('source') or ''):
            stats = item.get('stats') or {}
            total_interactions += (
                _count(stats, 'likes')
                + _count(stats, 'comments') * 2
                + _count(stats, 'shares') * 3
            )
        else:
            news_count += 1

    # F Score (Facebook engagement)
    MAX_INTERACTIONS = 20000
    f_score = (math.log10(total_interactions + 1) / math.log10(MAX_INTERACTIONS + 1)) * 100

    # N Score (News coverage)
    MAX_ARTICLES = 50
    n_score = (news_count / MAX_ARTICLES) * 100

    # Composite
    composite = 0.4 * min(g_score, 100) + 0.35 * min(f_score, 100) + 0.25 * min(n_score, 100)

    # Classification
    HIGH = 40
    if g_score > HIGH and f_score > HIGH and n_score > HIGH:
        classification = "Strong Multi-source"
    elif f_score > HIGH and n_score > HIGH:
        classification = "Social & News"
    elif f_score > HIGH:
        classification = "Social-Driven"
    elif n_score > HIGH:
        classification = "News-Driven"
    else:
        classification = "Emerging"

    return {
        "G": round(g_score, 1),
        "F": round(f_score, 1),
        "N": round(n_score, 1),
        "Composite": round(composite, 1),
        "Class": classification,
        "Interactions": total_interactions,
        "NewsCount": news_count
    }

In [None]:
# Group matched posts under their trend, keeping only trends that passed the coverage filter.
trend_clusters = defaultdict(list)
for match in (m for m in matches if m['trend'] in valid_trends):
    trend_clusters[match['trend']].append(match)

# One result row per trend: its name, post count and the computed scores.
results = [
    {"trend": name, "posts": len(members), **compute_scores(name, members)}
    for name, members in trend_clusters.items()
]

# Highest composite score first (stable for ties).
results.sort(key=lambda row: row['Composite'], reverse=True)

# Print the top 30 as a fixed-width table.
rule = "=" * 80
print("\n" + rule)
print("üìä FINAL TREND RANKINGS")
print(rule)
print(f"{'Rank':<5} {'Trend':<35} {'Class':<18} {'Score':>6} {'G/F/N':>12} {'Posts':>6}")
print("-" * 80)

for rank, row in enumerate(results[:30], start=1):
    gfn = f"{row['G']:.0f}/{row['F']:.0f}/{row['N']:.0f}"
    print(f"{rank:<5} {row['trend'][:35]:<35} {row['Class']:<18} {row['Composite']:>6.1f} {gfn:>12} {row['posts']:>6}")

## 9. Batch Summarization (Optional)

Pre-compute summaries for all posts/articles and cache for later use.

In [None]:
# ==========================================
# BATCH SUMMARIZE FACEBOOK POSTS
# ==========================================
import sys
# Make the cloned project importable so `scripts.batch_summarize` resolves.
sys.path.insert(0, '/kaggle/working/Real-time-Event-Detection-on-Social-Media-Data')

from scripts.batch_summarize import batch_summarize, merge_summaries_into_posts, load_posts

# --- CONFIG ---
FB_SUMMARY_OUTPUT = '/kaggle/working/fb_summaries.json'
SUMMARY_MODEL = 'vit5-base'  # 'vit5-large' for best quality

# --- RUN (only once!) ---
# NOTE(review): `resume=True` presumably continues from an existing output
# file instead of starting over — confirm against scripts/batch_summarize.py.
fb_summaries = batch_summarize(
    input_path=FB_DATA_PATH,
    output_path=FB_SUMMARY_OUTPUT,
    model_name=SUMMARY_MODEL,
    max_length=200,
    resume=True
)

# --- MERGE INTO DATA ---
# Posts are reloaded from the raw file and then combined with the saved summaries.
fb_posts_with_summary = load_posts(FB_DATA_PATH)
fb_posts_with_summary = merge_summaries_into_posts(fb_posts_with_summary, FB_SUMMARY_OUTPUT)

# Preview
# Show the first post that has a non-empty summary, if any.
sample = [p for p in fb_posts_with_summary if p.get('summary')][:1]
if sample:
    print("\nüìã Sample FB post with summary:")
    print(f"Original: {str(sample[0].get('content', ''))[:200]}...")
    print(f"Summary:  {sample[0]['summary']}")

In [None]:
# ==========================================
# BATCH SUMMARIZE NEWS ARTICLES (ALL SOURCES)
# ==========================================
import os
import pandas as pd
from scripts.batch_summarize import batch_summarize, load_summaries_for_use

# --- CONFIG ---
# Sub-directory names under NEWS_DATA_DIR, each expected to hold an articles.csv.
NEWS_SOURCES = ['vnexpress', 'tuoitre', 'thanhnien', 'vietnamnet', 'nld']
NEWS_SUMMARY_DIR = '/kaggle/working/news_summaries'

os.makedirs(NEWS_SUMMARY_DIR, exist_ok=True)

# --- RUN FOR EACH SOURCE ---
# Maps source name -> whatever batch_summarize returns for that source.
all_news_summaries = {}

for source in NEWS_SOURCES:
    input_path = f'{NEWS_DATA_DIR}/{source}/articles.csv'
    output_path = f'{NEWS_SUMMARY_DIR}/{source}_summaries.json'
    
    # Sources without an articles.csv are skipped rather than treated as errors.
    if not os.path.exists(input_path):
        print(f"‚ö†Ô∏è Skipping {source}: {input_path} not found")
        continue
    
    print("\n" + "="*50)
    print(f"üì∞ Processing: {source.upper()}")
    print("="*50)
    
    # Uses SUMMARY_MODEL, which is defined in the Facebook summarization cell.
    summaries = batch_summarize(
        input_path=input_path,
        output_path=output_path,
        model_name=SUMMARY_MODEL,
        max_length=200,
        resume=True
    )
    all_news_summaries[source] = summaries

# The total relies on each returned summaries object supporting len().
print(f"\n‚úÖ Total articles summarized: {sum(len(s) for s in all_news_summaries.values())}")

In [None]:
# ==========================================
# LOAD ALL SUMMARIES INTO DATAFRAMES
# ==========================================
# Maps source name -> DataFrame of that source's articles with a `summary` column.
all_news_dfs = {}

for source in NEWS_SOURCES:
    csv_path = f'{NEWS_DATA_DIR}/{source}/articles.csv'
    summary_path = f'{NEWS_SUMMARY_DIR}/{source}_summaries.json'
    
    # Only sources that have both the articles and a summary file are loaded.
    if not os.path.exists(csv_path) or not os.path.exists(summary_path):
        continue
    
    df = pd.read_csv(csv_path)
    summaries = load_summaries_for_use(summary_path)
    
    # Tag rows with their outlet when the CSV has no such column, so the
    # combined frame keeps the origin and the preview below can select it.
    if 'source' not in df.columns:
        df['source'] = source
    
    # Merge using URL as key; articles without a summary get an empty string.
    df['summary'] = df['url'].apply(lambda u: summaries.get(str(u), ''))
    all_news_dfs[source] = df
    print(f"‚úÖ {source}: {len(df)} articles, {(df['summary'] != '').sum()} with summaries")

# Combine all
if all_news_dfs:
    combined_news_df = pd.concat(all_news_dfs.values(), ignore_index=True)
    print(f"\nüìä Total: {len(combined_news_df)} articles")
    # Preview only the columns that actually exist to avoid a KeyError.
    preview_cols = [c for c in ("source", "title", "summary") if c in combined_news_df.columns]
    display(combined_news_df[preview_cols].head())

## 10. Save Results

In [None]:
# Save matches
# ensure_ascii=False keeps non-ASCII text readable in the output file.
with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(matches, f, ensure_ascii=False, indent=2)
print(f"üíæ Saved {len(matches)} matches to results.json")

# Save rankings
# `results` is the list of per-trend score rows, sorted by composite score.
with open('trend_rankings.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print(f"üíæ Saved {len(results)} trend rankings to trend_rankings.json")