# 🚀 Final Trend Detection Pipeline

This notebook implements the complete pipeline for detecting real-time events from social media data, incorporating best practices for data processing and retrieval.

### ✨ Key improvements included:
1.  **Title-based Embedding**: Solves length mismatch for News articles.
2.  **Data Cleaning**: Removes scraping noise (auto-generated image alt-text, like/comment counters) from Facebook posts.
3.  **Hybrid Search**: Combines BM25 (Keyword) + Dense Retrieval (Semantic).
4.  **Batch Summarization**: Optional step to summarize long content.

---

In [None]:
# 1. Setup & Clone Repo
# Shell/magic commands below run in order: remove any previous clone, clone the
# project, change into it, then install its dependencies.
# NOTE(review): every path here is relative to the current working directory, so
# re-running this cell after the %cd has taken effect operates inside the
# existing clone — confirm that is acceptable or restart the kernel first.
!rm -rf Real-time-Event-Detection-on-Social-Media-Data  # Clean start
!git clone https://github.com/GadGadGad/Real-time-Event-Detection-on-Social-Media-Data
# Make the repo root the working directory; the relative paths used by the
# later cells (requirements.txt, crawlers/..., src/...) resolve against it.
%cd Real-time-Event-Detection-on-Social-Media-Data
!pip install -r requirements.txt -q
# Extra packages installed on top of requirements.txt: BM25 scoring and the
# VnCoreNLP Python wrapper.
!pip install -q rank_bm25 py_vncorenlp

# VNCoreNLP Setup
# Downloads the VnCoreNLP model files into ./vncorenlp_models.
# NOTE(review): py_vncorenlp is not referenced by any of the cells shown in this
# notebook — presumably it is used inside the project modules; confirm.
!mkdir -p vncorenlp_models
!python3 -c "import py_vncorenlp; py_vncorenlp.download_model(save_dir='vncorenlp_models')"

import sys
import os
import glob
import json
import pandas as pd
import numpy as np
import re
from rich.console import Console
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi

# Add project to path
# The working directory is appended to sys.path so the `src` package imports
# below can be resolved; this relies on the kernel's cwd being the repo root.
sys.path.append(os.getcwd())
from src.pipeline.main_pipeline import load_social_data, load_news_data, load_google_trends
# NOTE(review): get_embeddings is imported but not called in the cells shown here.
from src.utils.text_processing.vectorizers import get_embeddings

# Rich console handle; the cells shown here print with plain print() instead.
console = Console()
print("‚úÖ Setup Complete")

In [None]:
# 2. Load Data
# Paths are relative to the cloned repo root
# glob.glob yields matches in arbitrary, filesystem-dependent order. Sorting
# makes the order of `posts` (and therefore every corpus index derived from it)
# and the choice of fb_files[0] reproducible across runs and machines.
fb_files = sorted(glob.glob("crawlers/facebook/*.json"))
news_files = sorted(glob.glob("crawlers/news/**/*.csv", recursive=True))
trend_files = sorted(glob.glob("crawlers/trendings/*.csv"))

print("üìÇ Loading Data...")
# Social posts come first, then news articles, in one combined list.
posts = load_social_data(fb_files) + load_news_data(news_files)
trends = load_google_trends(trend_files)

print(f"‚úÖ Loaded {len(posts)} posts and {len(trends)} trends.")

## 🧹 3. Data Cleaning & Preprocessing

In [None]:
# Boilerplate fragments stripped from Facebook post text, applied in this order.
# The line-oriented patterns end with (?:\n|$) so that a boilerplate line is
# removed whether it is followed by a newline or is the last line of the post.
# Compiled once at definition time instead of on every call.
_FB_NOISE_PATTERNS = tuple(
    re.compile(pattern, flags=re.IGNORECASE)
    for pattern in (
        r"May be an image of.*?(?:\n|$)",
        r"No photo description available.*?(?:\n|$)",
        r"\+?\d+ others",
        r"Theanh28.*?(?:\n|$)",
        r"\d+K likes",
        r"\d+ comments",
    )
)


def clean_facebook_content(text):
    """Strip known boilerplate fragments from a Facebook post body.

    Args:
        text: Raw post content. Any non-string value (e.g. None) is treated
            as missing content.

    Returns:
        The text with every noise pattern removed (case-insensitively) and
        leading/trailing whitespace stripped; "" when `text` is not a string.
    """
    if not isinstance(text, str):
        return ""
    cleaned = text
    for noise in _FB_NOISE_PATTERNS:
        cleaned = noise.sub("", cleaned)
    return cleaned.strip()

# CONFIG
# True  -> embed the title when a post has a usable one, otherwise its content.
# False -> always embed the content.
USE_TITLE_EMBEDDING = True
# Maximum number of characters of the selected text passed on for embedding.
EMBEDDING_CHAR_LIMIT = 300

print("üßπ Cleaning and Preprocessing...")
processed_texts = []
cleaned_count = 0

for post in posts:
    # Step 1: Facebook posts get their content cleaned in place.
    if post.get('source') == 'Facebook':
        post['content'] = clean_facebook_content(post.get('content', ''))
        cleaned_count += 1

    # Step 2: pick the text to embed. A title only qualifies when it is
    # non-empty and longer than 5 characters; anything else falls back to
    # the (possibly just-cleaned) content.
    candidate = post.get('title', '') if USE_TITLE_EMBEDDING else ''
    if not (candidate and len(str(candidate)) > 5):
        candidate = post.get('content', '')

    processed_texts.append(str(candidate)[:EMBEDDING_CHAR_LIMIT])

embedding_mode = 'TITLES' if USE_TITLE_EMBEDDING else 'CONTENT'
print(f"‚úÖ Cleaned {cleaned_count} FB posts.")
print(f"‚ÑπÔ∏è Using {embedding_mode} for embedding.")

## 🧠 4. Hybrid Search Setup (BM25 + Dense)

In [None]:
# Hugging Face identifier of the sentence-embedding model used for dense retrieval.
MODEL_NAME = "keepitreal/vietnamese-sbert"
print("‚öôÔ∏è Initializing Models...")

# Dense side: sentence-transformer encoder.
embedder = SentenceTransformer(MODEL_NAME)

# Sparse side: BM25 index over the corpus, tokenized by splitting each
# document on single space characters.
tokenized_corpus = [text.split(" ") for text in processed_texts]
bm25 = BM25Okapi(tokenized_corpus)

# Hybrid Search Function
def hybrid_search(query, top_k=5, alpha=0.5, corpus_embs=None):
    """Rank corpus documents by a weighted blend of dense and BM25 scores.

    Args:
        query: Query string.
        top_k: Maximum number of results to return.
        alpha: Weight of the dense (cosine-similarity) score; the BM25 score
            is weighted by ``1 - alpha``.
        corpus_embs: Optional pre-computed embeddings of ``processed_texts``
            (same order). When omitted, the whole corpus is encoded on this
            call, which is the original behaviour but costs a full corpus
            pass per query. Encode once and pass the result in to avoid that.

    Returns:
        A list of ``(corpus_index, combined_score)`` tuples, highest score
        first, with at most ``top_k`` entries. Empty when the corpus is empty.
    """
    # Nothing to rank; also avoids calling max() on an empty score array.
    if not processed_texts:
        return []

    # Dense: cosine similarity between the query and every corpus document.
    query_emb = embedder.encode(query, convert_to_tensor=True)
    if corpus_embs is None:
        corpus_embs = embedder.encode(
            processed_texts, convert_to_tensor=True, show_progress_bar=False
        )
    dense_scores = util.cos_sim(query_emb, corpus_embs)[0].cpu().numpy()

    # Sparse: the query is split on single spaces, mirroring how the corpus
    # was tokenized when the BM25 index was built.
    tokenized_query = query.split(" ")
    sparse_scores = np.asarray(bm25.get_scores(tokenized_query), dtype=float)
    # Scale by the maximum so the best match scores 1; skipped when no
    # document has a positive BM25 score.
    max_sparse = sparse_scores.max()
    if max_sparse > 0:
        sparse_scores = sparse_scores / max_sparse

    # Combine
    final_scores = alpha * dense_scores + (1 - alpha) * sparse_scores

    # Get Top K (argsort is ascending, so reverse before slicing)
    top_indices = np.argsort(final_scores)[::-1][:top_k]
    return [(i, final_scores[i]) for i in top_indices]

print("‚úÖ Hybrid Search Ready.")

In [None]:
# TEST IT
# Quick smoke test: run one sample query and show the three best matches.
query = "SEA Games 33"
results = hybrid_search(query, top_k=3)

print(f"üîç Top results for '{query}':")
for doc_idx, relevance in results:
    # Each line shows the combined score followed by the text that was indexed.
    print(f"[{relevance:.4f}] {processed_texts[doc_idx]}")

## 📝 5. Batch Summarization (Optional)

In [None]:
from scripts.batch_summarize import batch_summarize

# Opt-in switch: leave False unless the summaries are needed later.
RUN_SUMMARIZATION = False

if RUN_SUMMARIZATION:
    print("Running batch summarization for FB posts...")
    # Only the first matched Facebook file (if any) is summarized.
    fb_path = next(iter(fb_files), None)
    if fb_path:
        summarize_options = {
            'input_path': fb_path,
            'output_path': 'fb_summaries.json',
            'model_name': 'vit5-base',
            'resume': True,
        }
        batch_summarize(**summarize_options)