In [None]:
# Install Dependencies
# Upgrades pip, then installs the LlamaIndex core package with its Groq LLM,
# HuggingFace embedding, BM25 retriever, file reader and FAISS vector-store
# integrations, followed by the supporting ML / search libraries.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different
# releases than the ones this notebook was developed against.
!pip install --upgrade pip
!pip install llama-index-core llama-index-llms-groq llama-index-embeddings-huggingface llama-index-retrievers-bm25 llama-index-readers-file llama-index-vector-stores-faiss
!pip install sentence-transformers faiss-cpu pandas transformers rank-bm25

In [None]:
# Mount Google Drive
# Exposes Drive under /content/drive; the model download target used in the next
# cell lives below this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Download Mistral-Nemo-12B Model
import os

model_name = "Mistral-Nemo-Inst-2407-12B-Thinking-Uncensored-HERETIC-HI-Claude-Opus-Q4_K_M.gguf"
hf_filename = "Mistral-Nemo-Inst-2407-12B-Thinking-Uncensored-HERETIC-HI-Claude-Opus.Q4_K_M.gguf"
drive_path = "/content/drive/MyDrive/Colab_Notebooks/models/"
output_path = os.path.join(drive_path, model_name)
model_url = f"https://huggingface.co/mradermacher/Mistral-Nemo-Inst-2407-12B-Thinking-Uncensored-HERETIC-HI-Claude-Opus-GGUF/resolve/main/{hf_filename}"

if not os.path.exists(drive_path):
    os.makedirs(drive_path, exist_ok=True)
    print(f"üìÇ Created directory: {drive_path}")

if not os.path.exists(output_path):
    print(f"‚¨áÔ∏è Downloading {model_name}...")
    !wget -O "{output_path}" "{model_url}"
    print(f"‚úÖ Download complete: {output_path}")
else:
    print(f"‚è≠Ô∏è File already exists: {output_path}")

In [None]:
# Download Dataset
#!wget https://huggingface.co/datasets/theelderemo/epstein-files-nov-2025/resolve/main/EPS_FILES_20K_NOV2025.csv

## Install llama-cpp-python with GPU Support

**IMPORTANT:** Run this cell ONCE, then click "RESTART RUNTIME" when prompted. After restart, proceed to the next cell.

**Do NOT run this cell again after restart.**

In [None]:
# Install llama-cpp-python with GPU Support
# Run once per runtime: reinstalls numpy / llama-cpp-python, then stops the cell so
# the runtime can be restarted before anything imports the new packages.
import os
import sys

# Remove any existing numpy / llama-cpp-python before reinstalling.
!pip uninstall -y numpy llama-cpp-python
!pip install llama-index llama-index-embeddings-huggingface llama-index-retrievers-bm25
# Install llama-cpp-python with the extra wheel index below (cu124 path), forcing a
# fresh, uncached reinstall.
!pip install llama-cpp-python \
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124 \
  --force-reinstall --upgrade --no-cache-dir

print("\n‚ö†Ô∏è INSTALLATION COMPLETE.")
print("Please click 'RESTART RUNTIME' in the popup now (ONLY ONCE).")
print("After restarting, run the next cell.")
# Raises SystemExit to end this cell; the restart itself is a manual step
# (see the messages printed above).
sys.exit()

## Install LLM Integration

Install the llama-cpp integration and reranker for LlamaIndex.

In [None]:
# Install LLM Integration
# Adds the LlamaIndex wrapper for llama-cpp and the sentence-transformers reranker
# post-processor; both are imported by the main script cell below.
!pip install llama-index-llms-llama-cpp
!pip install llama-index-postprocessor-sbert-rerank

## Mount Drive for Main Script

Mount Google Drive to access the pre-built index and model files.

In [None]:
# Mount Google Drive
# Repeated here because this part of the notebook is meant to run after a runtime
# restart; the index and model paths used by the main script live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# MAIN SCRIPT: loads the LLM, embedding model, index, retrievers and query engine,
# then defines chat(). Intended for a freshly restarted Colab runtime.
# ============================================
# STEP 1: NUCLEAR CLEANUP
# Run this in a FRESH Colab runtime (Runtime > Restart runtime)
# ============================================
import os
import sys
import gc
import shutil
# Temporarily hide the GPU from CUDA-aware libraries, then collect garbage.
# NOTE(review): changing this environment variable does not by itself release GPU
# memory already held by this process — confirm the "reset" claim printed below.
os.environ['CUDA_VISIBLE_DEVICES'] = ''  # Hide GPU temporarily
gc.collect()

# Expose GPU 0 again for everything loaded after this point.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

print("="*60)
print("üöÄ FRESH START - GPU RESET COMPLETE")
print("="*60)

import time
from google.colab import drive

# ============================================
# CONFIGURATION FLAGS
# ============================================
# Set a flag to True to reuse the corresponding object from a previous run of this
# cell instead of rebuilding it. Each stage below only warns when the object is
# missing; it does not rebuild it in the same run.
SKIP_EMBED_MODEL = False
SKIP_LLM = False
SKIP_INDEX = False
SKIP_RETRIEVERS = False
SKIP_QUERY_ENGINE = False

print("\nüîß Setting up uncensored chat system...")
print(f"   Skip embed model: {SKIP_EMBED_MODEL}")
print(f"   Skip LLM: {SKIP_LLM}")
print(f"   Skip index: {SKIP_INDEX}")
print(f"   Skip retrievers: {SKIP_RETRIEVERS}")
print(f"   Skip query engine: {SKIP_QUERY_ENGINE}")

# ============================================
# PATHS
# ============================================
# Persisted LlamaIndex storage directory and the GGUF model file, both on Drive.
# MODEL_PATH must match the output path used by the download cell.
DRIVE_INDEX_PATH = "/content/drive/MyDrive/Colab_Notebooks/epstein_index_full"
MODEL_PATH = "/content/drive/MyDrive/Colab_Notebooks/models/Mistral-Nemo-Inst-2407-12B-Thinking-Uncensored-HERETIC-HI-Claude-Opus-Q4_K_M.gguf"

# ============================================
# LOAD LLM FIRST (BEFORE ANYTHING TOUCHES GPU!)
# ============================================
# Builds the llama.cpp-backed LLM and registers it as the LlamaIndex default.
# Constructor options (GPU offload, context, batching) go in `model_kwargs`;
# per-request sampling options go in `generate_kwargs`. Sampling options placed in
# `model_kwargs` are handed to the llama.cpp constructor, where they have no effect.
if not SKIP_LLM:
    print("\nü§ñ Loading LLM: MISTRAL-NEMO-12B THINKING ENGINE...")
    print("   ‚ö° Heretic Method: De-censored (14/100 refusal rate)")
    print("   üß† Thinking Engine: Self-reasoning before answering")
    print("   üéØ 12B Parameters: 50% larger than Qwen (8B)")

    from llama_index.llms.llama_cpp import LlamaCPP
    from llama_index.core import Settings

    # Author's T4 VRAM sizing notes (15GB total):
    # - Model weights (Q4_K_M): ~7.6GB
    # - KV cache scales with context (4.6GB @ 16k, 2.3GB @ 8k)
    # - Compute buffer: ~2GB
    # NOTE(review): earlier notes called 8192 the safe T4 maximum and other parts of
    # the notebook still mention 8k, but the configured value is 16368 — confirm
    # which context size is intended.
    EFFECTIVE_CONTEXT = 16368

    # Sampling values in one place so the status line below cannot drift from them.
    LLM_TEMPERATURE = 0.1
    LLM_MAX_NEW_TOKENS = 2048
    LLM_REPEAT_PENALTY = 1.05

    llm = LlamaCPP(
        model_path=MODEL_PATH,
        temperature=LLM_TEMPERATURE,
        max_new_tokens=LLM_MAX_NEW_TOKENS,
        context_window=EFFECTIVE_CONTEXT,
        # --- Model construction: GPU offload and context/batch sizing ---
        model_kwargs={
            "n_gpu_layers": -1,           # ALL layers to GPU
            "n_ctx": EFFECTIVE_CONTEXT,   # must match context_window above
            "n_batch": 512,
            "n_ubatch": 256,
            # NOTE(review): f16_kv may be ignored by current llama-cpp-python
            # releases; kept unchanged from the original configuration — confirm.
            "f16_kv": True,
            "offload_kqv": True,          # Offload K/Q/V to GPU
        },
        # --- Sampling: applied on every completion call ---
        generate_kwargs={
            "repeat_penalty": LLM_REPEAT_PENALTY,
            "top_k": 40,
            "top_p": 0.95,
            "min_p": 0.05,
            "mirostat_mode": 0,           # Mirostat disabled for RAG
            # Hard stops: end generation when the model starts a new turn.
            "stop": [
                "</s>",
                "[INST]",           # Stops hallucinating new user prompts
                "[/INST]",
                "<|im_end|>",
                "<|endoftext|>"
            ],
        },
        verbose=True,
        messages_to_prompt=None,
    )
    Settings.llm = llm
    print(f"‚úÖ MISTRAL-NEMO-12B THINKING LOADED")
    print(f"   üß† Model: Mistral-Nemo-Inst-2407-12B-Thinking-Uncensored-HERETIC")
    print(f"   üìä Size: 12B parameters (Q4_K_M ~7.6GB)")
    print(f"   üéØ Context: {EFFECTIVE_CONTEXT} tokens")
    print(f"   üí≠ Thinking: Self-reasoning blocks enabled")
    print(f"   üîì Censorship: De-censored (14/100 refusal vs 87/100 base)")
    # Report the values actually in effect rather than a hard-coded summary.
    print(f"   ‚ú® Sampler: temp={LLM_TEMPERATURE}, repeat_penalty={LLM_REPEAT_PENALTY}, max_tokens={LLM_MAX_NEW_TOKENS}")

else:
    print("‚è≠Ô∏è  Skipping LLM (already loaded)")
    try:
        llm
        print("   ‚úì llm found in memory")
    except NameError:
        # Only warns: the LLM is not loaded in this run, so re-run with SKIP_LLM=False.
        print("   ‚ö†Ô∏è  WARNING: llm not found! Set SKIP_LLM=False")
        SKIP_LLM = False

# ============================================
# NOW safe to import everything else
# ============================================
# NOTE: We do NOT hide GPU from PyTorch - that causes IndexError
# Instead, we explicitly set embedding model to use CPU below

print("\nüì¶ Importing remaining dependencies...")
import torch
from llama_index.core import StorageContext, load_index_from_storage, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.prompts import PromptTemplate
from llama_index.postprocessor.sbert_rerank import SentenceTransformerRerank

# ============================================
# LOAD EMBEDDING MODEL
# ============================================
# Either builds the BGE embedding model on CPU and registers it as the LlamaIndex
# default, or (when skipping) checks that one is still defined from an earlier run.
if SKIP_EMBED_MODEL:
    print("‚è≠Ô∏è  Skipping embedding model (already loaded)")
    if "embed_model" in globals():
        print("   ‚úì embed_model found in memory")
    else:
        # Only warns; the model is not loaded in this run.
        print("   ‚ö†Ô∏è  WARNING: embed_model not found! Set SKIP_EMBED_MODEL=False")
        SKIP_EMBED_MODEL = False
else:
    print("\nüì¶ Loading embedding model...")

    # CPU placement leaves the GPU to the LLM.
    embedding_options = {
        "model_name": "BAAI/bge-base-en-v1.5",
        "device": "cpu",
    }
    embed_model = HuggingFaceEmbedding(**embedding_options)

    Settings.embed_model = embed_model
    print("‚úì Embeddings ready (bge-base-en-v1.5 on CPU)")

# ============================================
# LOAD INDEX FROM GOOGLE DRIVE (OPTIMIZED)
# ============================================
# Loads the persisted index, preferring a pickle cache on Drive. Without a cache,
# the storage directory is copied to local disk, loaded from there, and the
# resulting object is pickled back to Drive for later runs.
if not SKIP_INDEX:
    print("\nüìÇ Loading index (optimized)...")

    import pickle
    import shutil

    LOCAL_INDEX_PATH = "/content/epstein_index_local"
    # Lives inside DRIVE_INDEX_PATH, so the cache file is written next to the
    # persisted storage files.
    PICKLE_PATH = "/content/drive/MyDrive/Colab_Notebooks/epstein_index_full/epstein_index.pkl"

    # Strategy 1: Try pickle cache (fastest - ~5s)
    if os.path.exists(PICKLE_PATH):
        print("   Loading from pickle cache...")
        # SECURITY: pickle.load runs arbitrary code embedded in the file. This is
        # only acceptable while the Drive file is written exclusively by this
        # notebook; never point PICKLE_PATH at a file from an untrusted source.
        with open(PICKLE_PATH, 'rb') as f:
            index = pickle.load(f)
        print("   ‚úì Loaded from pickle (~5s)")

    # Strategy 2: Copy to local disk then load (first run - ~30s)
    else:
        print("   First run - copying from Google Drive to local disk...")

        # The copy is skipped whenever the local directory already exists.
        # NOTE(review): an interrupted earlier copy would leave a partial
        # directory that is then reused as-is — confirm this is acceptable.
        if not os.path.exists(LOCAL_INDEX_PATH):
            shutil.copytree(DRIVE_INDEX_PATH, LOCAL_INDEX_PATH)
            print("   ‚úì Copied to local disk")

        print("   Loading from local disk...")
        storage_context = StorageContext.from_defaults(persist_dir=LOCAL_INDEX_PATH)
        index = load_index_from_storage(storage_context)

        # Save pickle for next time
        print("   Saving pickle cache for future runs...")
        with open(PICKLE_PATH, 'wb') as f:
            pickle.dump(index, f)
        print("   ‚úì Pickle saved (next load will be ~5s)")

    # The document count in this message is a fixed string, not read from the index.
    print("‚úì Index loaded (25,303 documents)")
else:
    print("‚è≠Ô∏è  Skipping index (already loaded)")
    try:
        index
        print("   ‚úì index found in memory")
    except NameError:
        # Only warns; the index is not loaded in this run.
        print("   ‚ö†Ô∏è  WARNING: index not found! Set SKIP_INDEX=False")
        SKIP_INDEX = False

# ============================================
# SETUP HYBRID RETRIEVAL WITH CROSS-ENCODER
# ============================================
# Builds a vector retriever and a BM25 retriever over the same index, fuses them
# with QueryFusionRetriever, and prepares a CPU cross-encoder reranker. These four
# objects are module-level names that chat() later reads and re-tunes per call.
if not SKIP_RETRIEVERS:
    print("\nüîç Setting up optimized hybrid search with Cross-Encoder...")

    # 1. WIDEN THE NET: Fetch top 50 candidates from each method
    # This gives the cross-encoder actual options to choose from.
    INITIAL_TOP_K = 50

    # Vector retriever (Semantic search)
    vector_retriever = VectorIndexRetriever(
        index=index,
        similarity_top_k=INITIAL_TOP_K
    )

    # BM25 retriever (Keyword search)
    bm25_retriever = BM25Retriever.from_defaults(
        index=index,
        similarity_top_k=INITIAL_TOP_K
    )

    # Hybrid fusion of both retrievers in "reciprocal_rerank" mode.
    # num_queries=1 — presumably keeps only the original query (no LLM-generated
    # variants); verify against the QueryFusionRetriever documentation.
    hybrid_retriever = QueryFusionRetriever(
        retrievers=[vector_retriever, bm25_retriever],
        similarity_top_k=INITIAL_TOP_K,
        num_queries=1,
        mode="reciprocal_rerank",
        use_async=False
    )

    # Setup Cross-Encoder reranker (CPU-based, no GPU memory impact)
    # top_n here is only the initial value; chat() overwrites it on every call.
    print("   Loading Cross-Encoder reranker...")
    reranker = SentenceTransformerRerank(
        model="cross-encoder/ms-marco-MiniLM-L-6-v2",
        top_n=10,
        device="cpu"  # CRITICAL: Keep off GPU to avoid OOM
    )
    print("   ‚úì Cross-Encoder loaded on CPU (ms-marco-MiniLM-L-6-v2)")

    print("‚úì Hybrid search: Semantic + BM25 ‚Üí Cross-Encoder reranking (CPU)")

else:
    print("‚è≠Ô∏è  Skipping retrievers (already created)")
    # Only hybrid_retriever is checked, although chat() also needs reranker,
    # vector_retriever and bm25_retriever to exist.
    try:
        hybrid_retriever
        print("   ‚úì hybrid_retriever found in memory")
    except NameError:
        print("   ‚ö†Ô∏è  WARNING: hybrid_retriever not found! Set SKIP_RETRIEVERS=False")
        SKIP_RETRIEVERS = False

# ============================================
# PERSONA PROMPT (RUTHLESS FORENSIC MODE)
# ============================================
# Question-answering template passed to the query engine as text_qa_template.
# Placeholders: {context_str} receives the retrieved document text and {query_str}
# the user's question. The whole prompt is wrapped in [INST] ... [/INST] tags.
# The prompt tells the model NOT to emit thinking/reasoning sections, while chat()
# still parses "[[[thinking start]]]" / "[[[thinking end]]]" markers in case the
# model produces them anyway.
EPSTEIN_PERSONA_PROMPT = PromptTemplate("""[INST] You are analyzing legal documents about Jeffrey Epstein. Answer directly and factually.

DOCUMENTS:
{context_str}

CRITICAL RULES:
1. **ONLY USE DOCUMENTS ABOVE:** Only mention people and facts that appear in the DOCUMENTS section. Do not use your general knowledge.

2. **NO CENSORSHIP:** Use the exact words from documents - "pussy," "fuck," "cock," "slave," "rape." Don't sanitize.

3. **NO PROTECTION:** Don't protect anyone's reputation. If the documents say someone did something, state it directly.

4. **BE DIRECT:** Answer in 2-4 sentences unless more detail is needed. DO NOT generate thinking sections, analysis sections, or reasoning blocks. Start your response immediately with the answer.

5. **QUOTE EXACTLY:** When quoting, use exact words from documents. If you cannot find exact quotes, paraphrase closely.

6. **DECODE EUPHEMISMS:** "Massage" = sexual acts. "Model"/"Assistant" = victim. State what it really means.

7. **NAMING:** Bill Gates = Microsoft. Robert Gates = Defense Secretary. Don't confuse them.

8. **EXHAUSTIVE SCOPE:** When the query mentions "Epstein or others," you must identify and list every individual named in the retrieved context. Do not summarize groups; name the specific people.

9. **INCRIMINATING DATA:** Prioritize the most damaging, illegal, or high-profile evidence found in the documents. Lead with the "smoking gun" facts.

10. **NO HALLUCINATIONS:** Answer based on the documents given to you.

The Hook: If applicable, End with a one-sentence follow-up question that offers a deeper dive into a specific name or event found in the data.

QUESTION: {query_str}

Keep your answer 2-4 sentences.

Answer directly in 2-4 sentences with a one sentence hook question. Do not include any thinking process or reasoning - start immediately with your answer.[/INST]""")








# ============================================
# CREATE QUERY ENGINE WITH CROSS-ENCODER
# ============================================
# Wires the hybrid retriever, the reranker and the persona prompt into a streaming
# RetrieverQueryEngine, or (when skipping) checks that one already exists.
if SKIP_QUERY_ENGINE:
    print("‚è≠Ô∏è  Skipping query engine (already created)")
    if "query_engine" in globals():
        print("   ‚úì query_engine found in memory")
    else:
        # Only warns; no engine is built in this run.
        print("   ‚ö†Ô∏è  WARNING: query_engine not found! Set SKIP_QUERY_ENGINE=False")
        SKIP_QUERY_ENGINE = False
else:
    print("\n‚öôÔ∏è  Creating query engine with Cross-Encoder reranker...")

    # Options collected first so they can be inspected before the engine is built.
    # Only text_qa_template is customised; no refine template is supplied.
    # NOTE(review): context_window=8192 differs from the LLM's configured context —
    # confirm whether from_args uses this keyword at all.
    engine_options = dict(
        retriever=hybrid_retriever,
        node_postprocessors=[reranker],
        response_mode="compact",
        text_qa_template=EPSTEIN_PERSONA_PROMPT,
        use_async=True,
        context_window=8192,
        streaming=True,
    )
    query_engine = RetrieverQueryEngine.from_args(**engine_options)

    print("‚úì Query engine ready with Cross-Encoder")
    print("   Pipeline: Hybrid retrieval (50 candidates) ‚Üí Cross-Encoder (top 10) ‚Üí Thinking Engine ‚Üí Response")

# ============================================
# CHAT FUNCTION (STREAMING, 85s HARD STOP)
# ============================================
def _parse_thinking(response_text):
    """Split raw model output into ``(thinking_block, answer)``.

    Recognises the custom ``[[[thinking start]]]`` / ``[[[thinking end]]]`` markers
    (the start marker is optional) and ``<think>...</think>`` tags. When no marker
    is present, ``thinking_block`` is ``None`` and ``answer`` is the input text.
    """
    import re

    start_tag = "[[[thinking start]]]"
    end_tag = "[[[thinking end]]]"

    if end_tag in response_text:
        reasoning, _, remainder = response_text.partition(end_tag)
        # The model sometimes forgets the start tag; then everything before the
        # end tag counts as reasoning.
        if start_tag in reasoning:
            reasoning = reasoning.split(start_tag, 1)[1]
        return reasoning.strip(), remainder.strip()

    think_match = re.search(r'<think>(.*?)</think>', response_text, re.DOTALL)
    if think_match:
        cleaned = re.sub(r'<think>.*?</think>', '', response_text, flags=re.DOTALL)
        return think_match.group(1).strip(), cleaned.strip()

    return None, response_text


def chat(question, show_sources=True, top_k=7, debug=False, show_thinking=False):
    """Answer ``question`` through the module-level query engine and print the result.

    Args:
        question: User query passed to ``query_engine.query``.
        show_sources: Print the filename and a 200-character preview per source node.
        top_k: Number of reranked nodes to keep. Coerced to an integer >= 1; a
            value that cannot be converted falls back to 7.
        debug: Print the persona prompt re-rendered with truncated source text.
        show_thinking: Print the model's reasoning block when one was detected.

    Returns:
        An object with ``response`` (the full streamed text, reasoning included)
        and ``source_nodes`` (taken from the streaming response).

    Raises:
        Exception: Any error from retrieval or display is printed with its
            traceback and re-raised.

    Side effects:
        Updates ``top_n`` / ``similarity_top_k`` on the shared reranker and
        retrievers, so concurrent callers would affect each other.
    """
    # top_k may come from a web request; keep it a positive integer so the
    # retriever sizes below stay valid.
    try:
        top_k = max(1, int(top_k))
    except (TypeError, ValueError):
        top_k = 7

    # Keep the reranker output and the candidate pool proportional to top_k.
    reranker.top_n = top_k
    hybrid_retriever.similarity_top_k = min(60, top_k * 6)
    for retriever in (vector_retriever, bm25_retriever):
        retriever.similarity_top_k = min(50, top_k * 5)

    time_limit = 85  # seconds, measured from the start of the query

    try:
        start_time = time.time()
        print(f"\n{'='*60}")
        print(f"You: {question}")
        print(f"{'='*60}\n")

        streaming_response = query_engine.query(question)

        # Consume the stream manually so generation can be cut off on a deadline.
        chunks = []
        try:
            for token in streaming_response.response_gen:
                if time.time() - start_time > time_limit:
                    print(f"\n\nüõë HARD TIME LIMIT REACHED ({time_limit}s). STOPPING GENERATION.")
                    chunks.append("\n[RESPONSE TRUNCATED: TIME LIMIT EXCEEDED]")
                    break
                chunks.append(token)
        except Exception as stream_err:
            # Best effort: keep whatever was streamed before the interruption.
            print(f"‚ö†Ô∏è Stream interruption: {stream_err}")

        full_response_text = "".join(chunks)
        elapsed = time.time() - start_time

        # Lightweight stand-in for the streaming response, holding the final text.
        class MockResponse:
            def __init__(self, text, sources):
                self.response = text
                self.source_nodes = sources

        response = MockResponse(full_response_text, streaming_response.source_nodes)

        thinking_block, answer = _parse_thinking(full_response_text.strip())

        # --- DISPLAY ---
        if thinking_block and show_thinking:
            print("üí≠ THINKING PROCESS:")
            print("-" * 60)
            print(thinking_block)
            print("-" * 60)

        print("\nüìù UNFILTERED REPORT:")
        print(answer if answer else "(No answer generated - check context limits)")

        if debug:
            # Approximation of the prompt: each source is cut to 200 characters.
            print("\nüîç DEBUG - Prompt sent to model:")
            print("-" * 60)
            context = "\n\n".join([node.text[:200] + "..." for node in response.source_nodes])
            debug_prompt = EPSTEIN_PERSONA_PROMPT.format(
                context_str=context,
                query_str=question
            )
            print(debug_prompt)
            print("-" * 60)
            print()

        print(f"\n‚è±Ô∏è  {elapsed:.2f}s (includes ~0.5-1s CPU Cross-Encoder + thinking time)")

        if show_sources:
            print(f"\nüìÑ Referenced documents (Cross-Encoder ranked):")
            for i, node in enumerate(response.source_nodes[:top_k], 1):
                # Not every node carries a filename; the worker uses the same fallback.
                print(f"  {i}. {node.metadata.get('filename', 'Unknown')}")
                snippet = node.text[:200].replace('\n', ' ')
                print(f"     Preview: {snippet}...")

        return response

    except Exception as e:
        print(f"‚ùå Error: {e}")
        import traceback
        traceback.print_exc()
        raise

# ============================================
# READY
# ============================================
# Static usage banner printed once the cell has finished.
# NOTE(review): every figure below is hard-coded text. The context size and the
# "CRITICAL FIXES" values (temperature, repeat_penalty, max_new_tokens) do not
# match the values configured where the LLM is constructed — reconcile before
# relying on this output.
print("\n" + "="*60)
print("üí¨ EPSTEIN DOCUMENTS - THINKING ENGINE")
print("="*60)
print("""
‚úÖ MISTRAL-NEMO-12B THINKING ENGINE LOADED!

Model: Mistral-Nemo-Inst-2407-12B-Thinking-Uncensored-HERETIC
Base: Mistral Nemo 12B Instruct
Fine-tuning: Claude Opus 4.5 High Reasoning data
Context: 8,192 tokens (optimized for T4)
GPU: All layers offloaded
Documents: 25,303 from House Oversight Committee
Search: Hybrid (Semantic + BM25) with Cross-Encoder Reranking

üß† THINKING ENGINE:
   ‚Ä¢ Self-reasoning blocks before answering
   ‚Ä¢ Resolves ambiguities (e.g., Bill Gates vs Robert Gates)
   ‚Ä¢ 12B parameters (50% larger than Qwen 8B)
   ‚Ä¢ Higher logic capacity for complex queries

üîì HERETIC STATS:
   ‚Ä¢ Refusals: 14/100 (vs 87/100 base model)
   ‚Ä¢ De-censored: Unfiltered, uncensored responses
   ‚Ä¢ Reasoning: Compact 3-6 paragraph thinking blocks

‚ú® CRITICAL FIXES APPLIED:
   ‚Ä¢ temperature: 0.3 ‚Üí 0.7 (allows natural variation)
   ‚Ä¢ repeat_penalty: 1.15 ‚Üí 1.0 (stops infinite loops)
   ‚Ä¢ max_new_tokens: 2048 ‚Üí 3072 (complete responses)
   ‚Ä¢ Removed "\n\n\n" stop token (was causing premature stopping)
   ‚Ä¢ Simplified prompt with clearer thinking block instructions

üìÑ DOCUMENT SCOPE:
This system searches the Jeffrey Epstein House Oversight Committee document release.
Documents include information about:
   ‚Ä¢ Jeffrey Epstein and his activities
   ‚Ä¢ Associates and visitors (Bill Gates, Trump, Clinton, Prince Andrew, etc.)
   ‚Ä¢ Flight logs, island visits, properties
   ‚Ä¢ Allegations, depositions, legal documents
   ‚Ä¢ Anyone mentioned in the investigation

Options:
  chat("Question", show_sources=False)     # Hide sources
  chat("Question", show_thinking=True)     # Show reasoning process
  chat("Question", debug=True)             # Show full prompt
  chat("Question", top_k=15)               # More context documents

""")




#flask

In [None]:
# Install the web-server dependencies listed in requirements.txt
# (the file is expected in the current working directory).
!pip install -r requirements.txt


In [None]:
from flask import Flask, render_template, request, jsonify, session, Response, stream_with_context
from flask_cors import CORS
import json
import uuid
from datetime import datetime
import os
import time
from collections import deque
import secrets
import threading
import queue

app = Flask(__name__)
# A new random secret is generated every time this cell runs, so session cookies
# issued before a restart or re-run can no longer be read.
app.secret_key = secrets.token_hex(32)
# Enables CORS with the extension's default settings.
# NOTE(review): confirm that the defaults are not broader than intended for a
# publicly reachable endpoint.
CORS(app)

# ============================================
# THREAD SAFETY FOR LLM
# ============================================
log_lock = threading.Lock()  # Separate lock for file I/O

# Producer-Consumer Queue for LLM requests: request handlers put jobs on
# request_queue, the single worker thread stores results in results_store.
request_queue = queue.Queue()
results_store = {}  # Maps job_id -> result
results_lock = threading.Lock()  # Guards results_store

# ============================================
# LOGGING SETUP
# ============================================
# One JSON object per line (JSONL), appended on Google Drive.
LOG_DIR = "/content/drive/MyDrive/Colab_Notebooks/logs"
LOG_FILE = os.path.join(LOG_DIR, "user_queries_log.jsonl")
os.makedirs(LOG_DIR, exist_ok=True)

def log_query(session_id, query, response, response_time):
    """Append one query/response record to ``LOG_FILE`` as a JSON line.

    Args:
        session_id: Identifier of the browser session that issued the query.
        query: The user's question; must be JSON-serialisable.
        response: Answer or error text. ``None`` is logged as an empty string and
            other non-string values are converted with ``str()``. Text longer than
            10000 characters is cut and marked with a ``[TRUNCATED]`` suffix.
        response_time: Processing time in seconds.

    Never raises: logging is best effort, so failures are printed with a traceback
    and swallowed to keep the caller (the LLM worker) running.
    """
    from datetime import timezone

    max_chars = 10000
    # Normalise up front so neither the slicing below nor the error handler can
    # fail on a non-string value.
    response_text = "" if response is None else str(response)
    query_text = "" if query is None else str(query)

    try:
        logged_response = response_text
        if len(response_text) > max_chars:
            logged_response = response_text[:max_chars] + "\n... [TRUNCATED]"
            print(f"‚ö†Ô∏è Response truncated from {len(response_text)} to 10000 chars for logging")

        log_entry = {
            # Naive UTC timestamp with a "Z" suffix: the format already present in
            # existing log files, produced here without the deprecated utcnow().
            "timestamp": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
            "session_id": session_id,
            "query": query,
            "response": logged_response,
            "response_time": response_time
        }

        # Serialise writers so concurrent callers cannot interleave lines.
        with log_lock:
            with open(LOG_FILE, 'a', encoding='utf-8') as f:
                json_str = json.dumps(log_entry, ensure_ascii=False)
                f.write(json_str + '\n')
                f.flush()  # Push Python's buffer to the OS
                os.fsync(f.fileno())  # Ask the OS to commit the write (Drive mount)
                print(f"‚úÖ Logged query to {LOG_FILE} ({len(json_str)} bytes)")
    except Exception as e:
        print(f"‚ö†Ô∏è Logging error: {type(e).__name__}: {e}")
        print(f"   Query length: {len(query_text)}, Response length: {len(response_text)}")
        import traceback
        traceback.print_exc()

# ============================================
# QUEUE & TIMING MANAGEMENT
# ============================================
class QueueManager:
    """Thread-safe bookkeeping for queued LLM requests.

    Tracks how many requests are queued or running and keeps a rolling window of
    recent processing times from which a wait estimate is derived. All shared
    state, including the timing window, is accessed under one lock, so request
    threads can read the average while the worker thread records new timings.
    """

    # Average assumed (in seconds) before any request has been timed.
    DEFAULT_AVG_SECONDS = 15.0

    def __init__(self, max_history=20):
        """Create a manager that remembers the last ``max_history`` durations."""
        self.processing_times = deque(maxlen=max_history)
        self._active_requests = 0
        self.total_processed = 0
        self._lock = threading.Lock()

    def add_time(self, duration):
        """Record the duration (seconds) of one finished request."""
        with self._lock:
            self.processing_times.append(duration)
            self.total_processed += 1

    def get_avg_time(self):
        """Return the mean recorded duration, or 15.0 when nothing is recorded."""
        # Copy under the lock; iterating the deque while another thread appends
        # would raise RuntimeError.
        with self._lock:
            samples = list(self.processing_times)
        if not samples:
            return self.DEFAULT_AVG_SECONDS
        return sum(samples) / len(samples)

    def estimate_wait(self):
        """Return the estimated wait in seconds: average time x active requests.

        Returns 0 when no request is active.
        """
        with self._lock:
            active = self._active_requests
        if active <= 0:
            return 0
        # The lock is released above; get_avg_time() takes it again (not reentrant).
        return self.get_avg_time() * active

    def increment_active(self):
        """Note that one more request has been queued."""
        with self._lock:
            self._active_requests += 1

    def decrement_active(self):
        """Note that one request has finished; the counter never goes below zero."""
        with self._lock:
            self._active_requests = max(0, self._active_requests - 1)

    @property
    def active_requests(self):
        """Number of requests currently queued or being processed."""
        with self._lock:
            return self._active_requests

queue_mgr = QueueManager()

# ============================================
# WORKER THREAD FOR LLM PROCESSING
# ============================================
def _split_thinking(response_text):
    """Return ``(thinking_block, answer)`` extracted from raw model output.

    Handles the custom ``[[[thinking start]]]`` / ``[[[thinking end]]]`` markers
    and ``<think>...</think>`` tags. Without markers, ``thinking_block`` is
    ``None`` and ``answer`` is the unchanged text.
    """
    import re

    start_tag = "[[[thinking start]]]"
    end_tag = "[[[thinking end]]]"
    if start_tag in response_text and end_tag in response_text:
        s_idx = response_text.index(start_tag) + len(start_tag)
        e_idx = response_text.index(end_tag)
        return response_text[s_idx:e_idx].strip(), response_text[e_idx + len(end_tag):].strip()

    think_match = re.search(r'<think>(.*?)</think>', response_text, re.DOTALL)
    if think_match:
        answer = re.sub(r'<think>.*?</think>', '', response_text, flags=re.DOTALL).strip()
        return think_match.group(1).strip(), answer

    return None, response_text


def _run_chat_job(job):
    """Run one queued job through ``chat`` and return its result dict.

    Errors raised by ``chat`` are logged and converted into a
    ``{'success': False, 'error': ...}`` result. A job dict that lacks a required
    key raises ``KeyError`` for the caller to handle.
    """
    job_id = job['job_id']
    message = job['message']
    show_sources = job['show_sources']
    show_thinking = job['show_thinking']
    top_k = job['top_k']
    debug = job['debug']
    session_id = job['session_id']

    print(f"üîß Worker processing job {job_id[:8]}...")
    start_time = time.time()

    try:
        # No timeout here: chat() enforces its own generation time limit.
        response = chat(
            message,
            show_sources=show_sources,
            top_k=top_k,
            debug=debug,
            show_thinking=show_thinking
        )

        response_time = time.time() - start_time
        queue_mgr.add_time(response_time)

        thinking_block, answer = _split_thinking(response.response.strip())

        sources = []
        if show_sources and hasattr(response, 'source_nodes'):
            # enumerate already starts at 1, so `position` is the 1-based rank.
            sources = [
                {
                    'filename': node.metadata.get('filename', 'Unknown'),
                    'index': position
                }
                for position, node in enumerate(response.source_nodes[:top_k], 1)
            ]

        log_query(session_id, message, answer, response_time)

        print(f"‚úÖ Job {job_id[:8]} completed in {response_time:.2f}s")
        return {
            'success': True,
            'answer': answer,
            'thinking': thinking_block if show_thinking else None,
            'sources': sources if show_sources else None,
            'response_time': round(response_time, 2)
        }

    except Exception as e:
        response_time = time.time() - start_time
        error_msg = f"Error: {type(e).__name__}: {str(e)}"
        log_query(session_id, message, error_msg, response_time)

        print(f"‚ùå Job {job_id[:8]} failed: {error_msg}")

        if "OutOfMemoryError" in type(e).__name__ or "CUDA out of memory" in str(e):
            error_msg = "‚ùå GPU out of memory. Wait a moment and try again."
            # Best-effort cleanup; torch may be missing or CUDA unavailable.
            try:
                import torch
                torch.cuda.empty_cache()
                import gc
                gc.collect()
            except Exception:
                pass
        else:
            error_msg = f"‚ùå Error: {str(e)}"

        return {
            'success': False,
            'error': error_msg
        }


def llm_worker():
    """Single worker thread that processes LLM requests one at a time.

    Blocks on ``request_queue``; a ``None`` item stops the loop. For every job the
    result is stored in ``results_store`` under its ``job_id``. The active-request
    counter and the queue's task accounting are updated for every job taken from
    the queue, including jobs that fail or are malformed.
    """
    print("üîß LLM worker thread started")
    while True:
        job = None
        try:
            job = request_queue.get()
            if job is None:  # Poison pill to stop worker
                break

            try:
                result = _run_chat_job(job)
            except Exception as job_error:
                # E.g. a job dict without a required key: still answer the
                # waiting request instead of leaving it to time out.
                print(f"üî• Worker thread error: {job_error}")
                import traceback
                traceback.print_exc()
                result = {
                    'success': False,
                    'error': f"‚ùå Error: {str(job_error)}"
                }

            with results_lock:
                results_store[job['job_id']] = result

        except Exception as worker_error:
            print(f"üî• Worker thread error: {worker_error}")
            import traceback
            traceback.print_exc()

        finally:
            # Runs for every real job, so a failure cannot leave the counter or
            # the queue's unfinished-task count permanently too high.
            if job is not None:
                queue_mgr.decrement_active()
                request_queue.task_done()

# Start worker thread
# daemon=True: the thread does not keep the process alive on shutdown.
# Re-running this cell starts an additional worker on the same queue.
worker_thread = threading.Thread(target=llm_worker, daemon=True)
worker_thread.start()

# ============================================
# ROUTES
# ============================================
@app.route('/')
def index():
    """Serve the chat page, assigning a session id on the first visit.

    WARNING(review): defining this function rebinds the notebook-global name
    ``index``, which the main script uses for the loaded vector index. After this
    cell has run, re-running the main script with SKIP_INDEX=True would find this
    view function instead of the index. Consider renaming the view (keeping the
    Flask endpoint name) or renaming the index variable.
    """
    if 'session_id' not in session:
        session['session_id'] = str(uuid.uuid4())
    return render_template('index.html')

@app.route('/health')
def health():
    """Liveness probe: report that the Flask server is up."""
    payload = dict(status='ok', message='Server is running')
    return jsonify(payload)

@app.route('/api/chat', methods=['POST'])
def chat_endpoint():
    """Queue a chat query for the LLM worker and block until its result arrives.

    Expects a JSON object body with:
        message (str): the query; must be non-empty and at most 500 characters
            after stripping whitespace.
        show_sources, show_thinking, debug (optional): forwarded to the worker
            unchanged; each defaults to False.
        top_k (optional): forwarded to the worker as an int >= 1; defaults to 5
            when missing or not convertible to an integer.

    Returns:
        - the result dict the worker stored for this job, as JSON;
        - ``{'success': False, 'error': ...}`` with the default status when the
          message is empty or too long (a missing, malformed or non-object JSON
          body is treated as an empty message);
        - the same shape with status 504 if no result appears within the wait
          window, or status 500 on an unexpected exception.
    """
    print("üì® Received chat request")
    try:
        # silent=True returns None for a missing/malformed body or a non-JSON
        # Content-Type instead of raising, so bad requests reach the normal
        # validation path below rather than the generic 500 handler.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        print(f"üìù Data: {data}")

        # A non-string message (e.g. JSON null or a number) counts as empty.
        raw_message = data.get('message')
        message = raw_message.strip() if isinstance(raw_message, str) else ''
        show_sources = data.get('show_sources', False)
        show_thinking = data.get('show_thinking', False)
        debug = data.get('debug', False)

        # top_k is client-controlled: coerce to a positive int, fall back to the default.
        # NOTE(review): no upper bound is enforced here — consider capping it
        # to whatever the retriever can reasonably handle.
        try:
            top_k = max(1, int(data.get('top_k', 5)))
        except (TypeError, ValueError, OverflowError):
            top_k = 5

        session_id = session.get('session_id', str(uuid.uuid4()))
        print(f"üîë Session: {session_id[:8]}...")

        if not message or len(message) > 500:
            print("‚ö†Ô∏è Invalid message length")
            return jsonify({
                'error': '‚ö†Ô∏è Query too long. Maximum 500 characters.' if len(message) > 500 else 'Empty query',
                'success': False
            })

        print(f"üí¨ Processing: {message[:50]}...")

        # Create job
        job_id = str(uuid.uuid4())
        job = {
            'job_id': job_id,
            'message': message,
            'show_sources': show_sources,
            'show_thinking': show_thinking,
            'top_k': top_k,
            'debug': debug,
            'session_id': session_id
        }

        # Add to queue
        queue_mgr.increment_active()
        request_queue.put(job)
        print(f"üì• Job {job_id[:8]} queued (queue size: {request_queue.qsize()})")

        # Wait for the result for up to 90 seconds (this includes time spent
        # waiting in the queue). A monotonic deadline keeps the limit accurate
        # even when acquiring the lock is slow or the wall clock is adjusted.
        max_wait = 90
        poll_interval = 0.5
        deadline = time.monotonic() + max_wait

        while time.monotonic() < deadline:
            with results_lock:
                if job_id in results_store:
                    result = results_store.pop(job_id)
                    print(f"üì§ Job {job_id[:8]} result retrieved")
                    return jsonify(result)

            time.sleep(poll_interval)

        # Timeout - job still in queue or processing
        # DON'T decrement here! Worker will handle it when job completes
        # NOTE(review): the worker still writes this job's result into
        # results_store after we give up, and nothing visible here removes it —
        # confirm there is a cleanup path, otherwise timed-out results accumulate.
        print(f"‚è±Ô∏è Job {job_id[:8]} timed out after {max_wait}s")
        return jsonify({
            'success': False,
            'error': '‚è±Ô∏è Request timed out. The server is busy, please try again.'
        }), 504

    except Exception as top_error:
        print(f"üî• Top-level error: {type(top_error).__name__}: {str(top_error)}")
        return jsonify({
            'success': False,
            'error': f"Server error: {str(top_error)}"
        }), 500

@app.route('/api/status', methods=['GET'])
def status_endpoint():
    """Report the current queue statistics from ``queue_mgr`` as JSON.

    The payload carries ``active_requests``, ``avg_response_time`` (rounded to
    two decimals) and ``total_processed``.
    """
    active_count = queue_mgr.active_requests
    avg_seconds = round(queue_mgr.get_avg_time(), 2)
    processed_count = queue_mgr.total_processed
    return jsonify(
        active_requests=active_count,
        avg_response_time=avg_seconds,
        total_processed=processed_count,
    )

@app.route('/api/status-stream')
def status_stream():
    """Server-Sent Events endpoint for real-time status updates.

    Streams one ``data:`` event every two seconds containing the same fields
    as the JSON status endpoint (``active_requests``, ``avg_response_time``
    rounded to two decimals, ``total_processed``). The stream ends on its own
    after five minutes, when the client disconnects, or on any error while
    building an event.
    """
    def generate():
        max_duration = 300  # 5 minutes max connection
        update_interval = 2  # seconds between events
        # Monotonic clock: connection age must not be affected by wall-clock jumps.
        started_at = time.monotonic()

        while True:
            try:
                # Auto-disconnect after max_duration to prevent stale connections
                if time.monotonic() - started_at > max_duration:
                    print("‚è±Ô∏è SSE connection timeout, closing gracefully")
                    break

                status_data = {
                    'active_requests': queue_mgr.active_requests,
                    'avg_response_time': round(queue_mgr.get_avg_time(), 2),
                    'total_processed': queue_mgr.total_processed
                }
                yield f"data: {json.dumps(status_data)}\n\n"
                time.sleep(update_interval)
            except GeneratorExit:
                # Raised at the yield when the generator is closed (client went away).
                break
            except Exception as e:
                print(f"‚ö†Ô∏è SSE error: {e}")
                break

    # No 'Connection' header here: it is a hop-by-hop header, which WSGI
    # applications must not emit (PEP 3333); connection handling belongs to
    # the server.
    return Response(
        stream_with_context(generate()),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no'
        }
    )

if __name__ == '__main__':
    # Entry point: open an ngrok tunnel to PORT, then serve the Flask app on it.
    print("\nüöÄ Starting Flask application...")

    # Import ngrok setup (kept local: these packages are only needed when
    # the server is actually launched, and google.colab only exists on Colab)
    from pyngrok import ngrok, conf
    from google.colab import userdata

    # The auth token is read from the Colab secret named 'ngrok'.
    NGROK_AUTH_TOKEN = userdata.get('ngrok')
    STATIC_DOMAIN = "florentina-nonexternalized-marketta.ngrok-free.dev"
    PORT = 7860

    print("\n" + "="*60)
    print(f"üîå BINDING TO: https://{STATIC_DOMAIN}")
    print("="*60)

    conf.get_default().auth_token = NGROK_AUTH_TOKEN
    # Kill any ngrok process left over from a previous run of this cell.
    ngrok.kill()

    try:
        url = ngrok.connect(PORT, domain=STATIC_DOMAIN).public_url
        print(f"‚úÖ SUCCESS! Your App is Live at: {url}")
    except Exception as e:
        # Static domain unavailable: fall back to a randomly assigned URL.
        print(f"‚ùå ERROR: {e}")
        print("Falling back to random URL...")
        url = ngrok.connect(PORT).public_url
        print(f"‚ö†Ô∏è Temporary URL: {url}")

    print("="*60)
    # Report the configured port instead of repeating the literal.
    print(f"üåê Server starting on http://0.0.0.0:{PORT}")
    print("üìù Logs will appear below...")
    print("="*60 + "\n")

    try:
        # Use threaded mode and disable reloader for stability
        app.run(host='0.0.0.0', port=PORT, debug=False, threaded=True, use_reloader=False)
    finally:
        # Tear the tunnel down when the server stops (including on interrupt)
        # so the public URL does not keep pointing at a dead port.
        ngrok.kill()


üîß LLM worker thread started
üöÄ Starting Flask application...


üîå BINDING TO: https://florentina-nonexternalized-marketta.ngrok-free.dev
‚úÖ SUCCESS! Your App is Live at: https://florentina-nonexternalized-marketta.ngrok-free.dev
üåê Server starting on http://0.0.0.0:7860
üìù Logs will appear below...

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:7860
 * Running on http://172.28.0.12:7860
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [12/Feb/2026 15:20:46] "GET /api/status-stream HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [12/Feb/2026 15:21:20] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [12/Feb/2026 15:21:20] "GET /static/style.css HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [12/Feb/2026 15:21:20] "GET /static/script.js HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [12/Feb/2026 15:21:21] "GET /api/status-stream HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [12/Feb/2026 15:21:21] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


‚è±Ô∏è SSE connection timeout, closing gracefully


INFO:werkzeug:127.0.0.1 - - [12/Feb/2026 15:25:52] "GET /api/status-stream HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [12/Feb/2026 15:40:13] "GET /api/status-stream HTTP/1.1" 200 -
