In [None]:
import os
import pandas as pd
import numpy as np
import torch
from datetime import datetime
import sys

# Add project root to path
sys.path.append(os.path.abspath('..'))

from src.pipeline.main_pipeline import find_matches_hybrid, load_json, load_trends
from src.utils.demo_state import save_demo_state

# Configuration
# The Gemini key is read from the environment so no secret lives in the notebook.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # os.getenv returns None when the variable is unset. The pipeline cell passes
    # this key together with use_llm=True, so flag the problem here instead of
    # letting it surface somewhere inside the pipeline run.
    print("⚠️ Warning: GEMINI_API_KEY is not set; LLM-backed pipeline steps may fail.")

# Tunables forwarded as keyword arguments to find_matches_hybrid in the pipeline
# cell; see that function for the exact semantics of each option.
THRESHOLD = 0.45
USE_RRF = True
RRF_K = 60
USE_PRF = True
USE_CACHE = True
USE_NER = True
# NOTE(review): presumably the relative weights of the dense and sparse scores;
# confirm against find_matches_hybrid.
MATCH_WEIGHTS = {'dense': 0.6, 'sparse': 0.4}
EMBEDDING_CHAR_LIMIT = 500
COHERENCE_THRESHOLD = 0.3
DEBUG_LLM = False


In [None]:
# Input locations: crawled social posts (one JSON file per platform) and the
# Google Trends snapshot to match them against.
# NOTE(review): these paths are relative to the current working directory, while
# the imports resolve against the parent directory; confirm the notebook is run
# from a directory that actually contains data/.
trend_file = 'data/google_trends/news_trends_20251221.json'
post_files = [
    'data/crawler/threads/threads_posts_20251221_194726.json',
    'data/crawler/facebook/facebook_posts_20251221_194726.json',
]

# Merge every post file that is present into one list; a missing file is
# reported and skipped rather than aborting the cell.
posts = []
for f in post_files:
    if not os.path.exists(f):
        print(f"⚠️ Warning: {f} not found.")
        continue
    posts += load_json(f)

# The trends file is loaded unconditionally (no existence check is made here).
trends = load_trends(trend_file)

print(f"✅ Loaded {len(posts)} posts and {len(trends)} trends.")


In [None]:
# Run the full pipeline.
# All options are gathered in one mapping so the call itself stays short; the
# values come from the configuration cell plus a few literals fixed for this run.
pipeline_options = dict(
    # LLM settings
    use_llm=True,
    gemini_api_key=GEMINI_API_KEY,
    llm_provider='gemini',
    debug_llm=DEBUG_LLM,
    # Literals fixed for this run
    min_cluster_size=10,
    no_dedup=False,
    save_all=True,
    # Values taken from the configuration cell
    embedding_char_limit=EMBEDDING_CHAR_LIMIT,
    threshold=THRESHOLD,
    use_rrf=USE_RRF,
    rrf_k=RRF_K,
    use_prf=USE_PRF,
    use_cache=USE_CACHE,
    use_ner=USE_NER,
    coherence_threshold=COHERENCE_THRESHOLD,
    match_weights=MATCH_WEIGHTS,
    # Ask for the intermediate artefacts as a second return value
    return_components=True,
)
matches, components = find_matches_hybrid(posts=posts, trends=trends, **pipeline_options)

# Pull out the intermediate artefacts that the save-state cell needs.
(
    trend_embeddings,
    post_embeddings,
    cluster_labels,
    cluster_mapping,
    MODEL_NAME,
) = (
    components[key]
    for key in (
        'trend_embeddings',
        'post_embeddings',
        'cluster_labels',
        'cluster_mapping',
        'model_name',
    )
)

# Tabulate the matches and preview the first rows as the cell output.
df_results = pd.DataFrame(matches)
print(f"✅ Pipeline completed. Found {len(df_results)} matches.")
df_results.head(10)


In [None]:
# Save state for the Streamlit demo.
# Only a subset of the run settings is recorded with the artefacts: the match
# threshold, the RRF/PRF switches and the wall-clock time of this save.
run_metadata = {
    'threshold': THRESHOLD,
    'run_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'use_rrf': USE_RRF,
    'use_prf': USE_PRF,
}

# Hand the results table, trends, embeddings and clustering outputs to the
# project helper, targeting the 'demo_data' directory.
save_demo_state(
    save_dir='demo_data',
    df_results=df_results,
    trends=trends,
    trend_embeddings=trend_embeddings,
    post_embeddings=post_embeddings,
    cluster_labels=cluster_labels,
    cluster_mapping=cluster_mapping,
    model_name=MODEL_NAME,
    metadata=run_metadata,
)
print("✅ Demo state saved to 'demo_data/'")
