In [None]:
# @title ⚙️ CELL 0 — Configuration & Environment Setup (The Golden Copy)
# English-only comments as requested.

import os
import sys
import logging
import warnings
import torch
import shutil
from pathlib import Path

# --- 1. Suppress Non-Critical Warnings ---
# Hide the two warning categories that only add noise to the notebook output.
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)
# Mute warnings raised from modules whose name matches "langchain".
warnings.filterwarnings("ignore", module="langchain")

print("üöÄ Initializing Golden Copy Environment...")

# --- 2. GPU Validation ---
# Every later cell places its models on CUDA, so stop here when no GPU exists.
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
else:
    raise RuntimeError("‚ùå No GPU detected! This notebook requires an A100/V100 GPU.")

print(f"‚úÖ GPU Detected: {gpu_name}")

# --- 3. Central Configuration Class ---
class Config:
    """Central settings shared by every cell of the notebook."""

    # ----- Paths (Ibex cluster layout) -----
    # Project root; the data and artifact folders below are derived from it.
    PROJECT_DIR = Path("/ibex/user/rashidah/projects/MOI_ChatBot/chatbot_project")

    # Merged SFT checkpoint loaded as the chat / translation model.
    LLM_MODEL_PATH = Path("/ibex/user/rashidah/projects/MOI_ChatBot/al-lam_bilingual_sft/artifacts/ALLaM-7B-MOI-Bilingual-Merged")

    # Input CSV folders: service masters and raw text chunks.
    DATA_MASTER_DIR = PROJECT_DIR.joinpath("1_data", "Data_Master")
    DATA_CHUNKS_DIR = PROJECT_DIR.joinpath("1_data", "Data_chunks")

    # Output folders: persisted vector index and application logs.
    VECTOR_DB_DIR = PROJECT_DIR.joinpath("vector_db_hybrid")
    LOGS_DIR = PROJECT_DIR.joinpath("4_outputs", "logs")

    # ----- Model settings -----
    # Sentence-embedding model used for the vector index.
    EMBEDDING_MODEL_NAME = "BAAI/bge-m3"

    # Speech-to-text model used for voice input.
    ASR_MODEL_NAME = "openai/whisper-large-v3"

    # Retrieval: each retriever fetches RETRIEVAL_K candidates, and the merged
    # list is cut down to RERANK_TOP_K documents.
    RETRIEVAL_K = 8
    RERANK_TOP_K = 5

    # Generation parameters passed to the chat pipeline.
    GEN_MAX_TOKENS = 1024
    GEN_TEMP = 0.3
    GEN_REP_PENALTY = 1.15

    # ----- Logging -----
    LOG_FILE = LOGS_DIR.joinpath("golden_app.log")

# --- 4. Directory Initialization ---
# Create the vector-store and log folders (with any missing parents) up front.
for artifact_dir in (Config.VECTOR_DB_DIR, Config.LOGS_DIR):
    artifact_dir.mkdir(parents=True, exist_ok=True)

# --- 5. Logger Setup ---
# force=True replaces handlers left over from an earlier run of this cell.
logging.basicConfig(
    force=True,
    level=logging.INFO,
    filename=Config.LOG_FILE,
    format="%(asctime)s - [HybridBot] - %(levelname)s - %(message)s",
)

# Echo the effective configuration so the run is easy to audit.
for status_line in (
    f"‚úÖ Configuration Loaded.",
    f"üß† Using SFT Model: {Config.LLM_MODEL_PATH.name}",
    f"üé§ Using ASR Model: {Config.ASR_MODEL_NAME}",
    f"üìÇ Logs path: {Config.LOG_FILE}",
):
    print(status_line)

In [None]:
# @title 📊 CELL 1 — Data Processing & Vector DB (The Ultimate Memory: CSV + Master + SFT) 🧠
# English-only comments as requested.

import os
import re
import glob
import json
import unicodedata
import pandas as pd
from langchain.docstore.document import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

print("üìä Starting Ultimate Data Pipeline...")

# Absolute path of the SFT dataset (JSON Lines). Phase C below reads it line
# by line and turns each instruction/response pair into a Q&A document.
SFT_DATA_PATH = "/ibex/user/rashidah/projects/MOI_ChatBot/chatbot_project/2_processed/bilingual_moi_absher_sFT_v2.jsonl"

# =====================================================
# 1. Text Normalization
# =====================================================
def normalize_ar(text):
    """Normalize Arabic text so that spelling variants compare equal.

    Steps, in order: NFC composition, removal of diacritics / annotation
    marks, folding of hamza and wasla alef forms to bare alef, folding of
    alef maqsura to yaa, and whitespace collapsing.

    Args:
        text: The string to normalize. Any non-string value yields "".

    Returns:
        The normalized, stripped string.
    """
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize("NFC", text)
    # Drop combining marks (tashkeel, superscript alef, Quranic annotations).
    text = re.sub(r"[\u0617-\u061A\u064B-\u0652\u0670\u06D6-\u06ED]", "", text)
    # Code points are written as escapes so the rule cannot be corrupted by a
    # file re-encoding: U+0623, U+0625, U+0622, U+0671 -> U+0627 (bare alef).
    text = re.sub(r"[\u0623\u0625\u0622\u0671]", "\u0627", text)
    # U+0649 (alef maqsura) -> U+064A (yaa).
    text = text.replace("\u0649", "\u064A")
    return re.sub(r"\s+", " ", text).strip()

# Every document that ends up in the knowledge base (CSV chunks + SFT Q&A).
processed_docs = []

# =====================================================
# 2. Phase A: Master Mapping (To fix generic titles)
# =====================================================
print("üîπ [Phase A] Building Service Name Map from Masters...")
master_files = sorted(glob.glob(os.path.join(Config.DATA_MASTER_DIR, "*.csv")))
if not master_files:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise FileNotFoundError(f"No master CSV files found in: {Config.DATA_MASTER_DIR}")

# dtype=str keeps service_id textual in every file, so an id never turns into
# "12.0" in one frame and "12" in another. fillna("") stops missing cells from
# becoming the literal string "nan" and slipping into the map.
df_master = pd.concat(
    [pd.read_csv(f, dtype=str) for f in master_files], ignore_index=True
).fillna("")

# service_id -> normalized Arabic service title
service_map = {}
for _, row in df_master.iterrows():
    sid = str(row.get("service_id", "")).strip()
    title = normalize_ar(str(row.get("service_title_ar", "")))
    if sid and title:
        service_map[sid] = title

print(f"   ‚úÖ Mapped {len(service_map)} services.")

# =====================================================
# 3. Phase B: CSV Chunks (The Raw Info)
# =====================================================
print("üîπ [Phase B] Processing Raw CSV Chunks...")
chunk_files = sorted(glob.glob(os.path.join(Config.DATA_CHUNKS_DIR, "*.csv")))
if not chunk_files:
    raise FileNotFoundError(f"No chunk CSV files found in: {Config.DATA_CHUNKS_DIR}")

# Same dtype=str rule as Phase A so service_id keys match the map above.
df_chunks = pd.concat(
    [pd.read_csv(f, dtype=str) for f in chunk_files], ignore_index=True
).fillna("")

csv_count = 0
dropped_csv = 0

for _, row in df_chunks.iterrows():
    raw_text = str(row.get("chunk_text", "")).strip()
    clean_text = normalize_ar(raw_text)

    # Drop only noise (< 3 chars)
    if len(clean_text) < 3:
        dropped_csv += 1
        continue

    # Context injection: prefix the chunk with its service and section names.
    sid = str(row.get("service_id", "")).strip()
    section_name = normalize_ar(str(row.get("chunk_title", "")))
    # Prefer the real name from Phase A; fall back to the chunk's own title.
    real_name = service_map.get(sid, section_name)

    # Format: "Service: [Name] | Section: [Type] | Content: [Text]"
    enriched_text = f"ÿßŸÑÿÆÿØŸÖÿ©: {real_name} | ÿßŸÑŸÇÿ≥ŸÖ: {section_name} | ÿßŸÑŸÖÿ≠ÿ™ŸàŸâ: {clean_text}"

    meta = {
        "source": "csv",
        "service_id": sid,
        "type": "raw_info"
    }
    processed_docs.append(Document(page_content=enriched_text, metadata=meta))
    csv_count += 1

print(f"   ‚úÖ Processed {csv_count} chunks (Dropped {dropped_csv}).")

# =====================================================
# 4. Phase C: SFT Data Injection (The Golden QA Pairs)
# =====================================================
print(f"üîπ [Phase C] Injecting SFT Q&A Pairs from: {os.path.basename(SFT_DATA_PATH)}")
sft_count = 0
sft_skipped = 0  # lines that are not a valid JSON object

if os.path.exists(SFT_DATA_PATH):
    with open(SFT_DATA_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # blank separator line, not an error

            # Only malformed records are skipped; any other exception
            # (including KeyboardInterrupt) now propagates.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                sft_skipped += 1
                continue
            if not isinstance(data, dict):
                sft_skipped += 1
                continue

            q = normalize_ar(data.get("instruction", ""))
            a = normalize_ar(data.get("response", ""))

            # Too short to be a useful question/answer pair.
            if len(q) < 5 or len(a) < 5:
                continue

            # Q&A format: stores the question next to its answer so a similar
            # user question can retrieve the pair directly.
            qa_text = f"ÿ≥ÿ§ÿßŸÑ: {q} | ÿßŸÑÿ¨Ÿàÿßÿ® ÿßŸÑÿ±ÿ≥ŸÖŸä: {a}"

            meta = {
                "source": "sft_golden_data",
                "type": "qa_pair"
            }
            processed_docs.append(Document(page_content=qa_text, metadata=meta))
            sft_count += 1
    print(f"   ‚úÖ Injected {sft_count} Golden Q&A pairs.")
    if sft_skipped:
        print(f"   WARNING: skipped {sft_skipped} malformed SFT line(s).")
else:
    print(f"   ‚ö†Ô∏è WARNING: SFT file not found at path!")

# =====================================================
# 5. Build & Save Vector DB
# =====================================================
total_docs = len(processed_docs)
print(f"üìä Total Knowledge Base Size: {total_docs} Documents.")

if not processed_docs:
    # Fail before loading the embedding model: there is nothing to index.
    raise RuntimeError("Knowledge base is empty: no CSV chunks or SFT pairs were loaded.")

print("üîπ Initializing Embedding Model (BGE-M3)...")
embedding_model = HuggingFaceEmbeddings(
    model_name=Config.EMBEDDING_MODEL_NAME,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True}
)

print("‚ö° Building Ultimate Vector DB...")
vector_store = FAISS.from_documents(processed_docs, embedding=embedding_model)
vector_store.save_local(Config.VECTOR_DB_DIR)
print(f"‚úÖ Saved to: {Config.VECTOR_DB_DIR}")

print("üéâ CELL 1 COMPLETE.")

In [None]:
# @title 🧠 CELL 2 — Load AI Engines (SFT + RAG + Whisper)
# English-only comments as requested.

import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

print("üîß Loading AI Engines (Golden Setup)...")

# =====================================================
# 1. Load Embedding Model & Vector Store
# =====================================================
print("üîπ [1/4] Loading BGE-M3 Embeddings...")
embedding_model = HuggingFaceEmbeddings(
    model_name=Config.EMBEDDING_MODEL_NAME,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True}
)

print(f"üîπ [2/4] Loading Vector Database from: {Config.VECTOR_DB_DIR}")
try:
    # NOTE(security): allow_dangerous_deserialization=True unpickles the index
    # metadata. Only load an index directory that this project wrote itself.
    vector_store = FAISS.load_local(
        Config.VECTOR_DB_DIR,
        embedding_model,
        allow_dangerous_deserialization=True
    )
    print("‚úÖ Vector Store Loaded Successfully.")
except Exception as e:
    # Chain the cause explicitly so the underlying loader error stays visible.
    raise RuntimeError(f"‚ùå Failed to load Vector DB! Run CELL 1 first. Error: {e}") from e

# =====================================================
# 2. Load Whisper (ASR) Model (ADDED FOR AUDIO SUPPORT)
# =====================================================
print(f"üîπ [3/4] Loading Whisper ASR ({Config.ASR_MODEL_NAME})...")
try:
    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model=Config.ASR_MODEL_NAME,
        device="cuda:0",
        torch_dtype=torch.float16,
        model_kwargs={"attn_implementation": "sdpa"} # Fast attention
    )
    print("‚úÖ Whisper Model Loaded.")
except Exception as e:
    # Audio is optional: keep going with asr_pipe = None (best effort).
    print(f"‚ö†Ô∏è Warning: Failed to load Whisper. Audio features will be disabled. Error: {e}")
    asr_pipe = None

# =====================================================
# 3. Load SFT Merged Model (The Brain)
# =====================================================
print(f"üîπ [4/4] Loading SFT Model: {Config.LLM_MODEL_PATH.name}...")

# Check availability
if not Config.LLM_MODEL_PATH.exists():
    raise FileNotFoundError(f"‚ùå SFT Model not found at: {Config.LLM_MODEL_PATH}")

tokenizer = AutoTokenizer.from_pretrained(Config.LLM_MODEL_PATH, use_fast=False)

model = AutoModelForCausalLM.from_pretrained(
    Config.LLM_MODEL_PATH,
    torch_dtype=torch.bfloat16,  # Best for A100
    device_map="auto",
    low_cpu_mem_usage=True
)

# =====================================================
# 4. Create Specialized Pipelines
# =====================================================
# Both pipelines wrap the same model and tokenizer objects; only the decoding
# settings differ.

# A) Chat Pipeline (sampling, controlled by the Config generation settings)
chat_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=Config.GEN_MAX_TOKENS,
    do_sample=True,
    temperature=Config.GEN_TEMP,
    top_p=0.9,
    repetition_penalty=Config.GEN_REP_PENALTY,
    pad_token_id=tokenizer.eos_token_id
)

# B) Translation Pipeline (Strict/Greedy)
trans_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=False, # Greedy decoding for accuracy
    pad_token_id=tokenizer.eos_token_id
)

# Report the real ASR state instead of always claiming Whisper is available.
ears_status = "Whisper ASR" if asr_pipe is not None else "Whisper ASR (disabled: failed to load)"

print("\n‚úÖ All Engines Ready:")
print("   - Memory: Vector Store (BGE-M3)")
print(f"   - Ears:   {ears_status}")
print("   - Brain:  SFT Model (Chat + Translation)")
print("üéâ CELL 2 COMPLETE.")

In [None]:
# @title 🔗 CELL 3 — Hybrid RAG Chain (The Brain with Query Translation)
# English-only comments as requested.

import re
from langchain_community.retrievers import BM25Retriever

print("üß† Building the Logic Chain...")

# =====================================================
# 1. Setup Retrievers (Hybrid)
# =====================================================
# The BM25 index is built from the in-memory document list created by CELL 1,
# so that cell must have run in this session.
try:
    processed_docs
except NameError:
    raise RuntimeError("‚ùå 'processed_docs' missing. Please run CELL 1 first.") from None

# Keyword retriever over the same documents.
bm25_retriever = BM25Retriever.from_documents(processed_docs)
bm25_retriever.k = Config.RETRIEVAL_K

# Dense retriever backed by the FAISS vector store.
faiss_retriever = vector_store.as_retriever(
    search_kwargs=dict(k=Config.RETRIEVAL_K)
)

# =====================================================
# 2. Helper Functions (The Secret Sauce)
# =====================================================
def detect_language(text):
    """Classify *text* as "en" or "ar" by its dominant script.

    Latin letters and Arabic letters are counted separately and the larger
    count wins. This keeps an Arabic sentence that merely mentions a Latin
    brand name (for example "Absher") classified as Arabic, instead of letting
    a single Latin letter force "en".

    Args:
        text: The string to classify. Non-string values are treated as empty.

    Returns:
        "en" when Latin letters outnumber Arabic letters; otherwise "ar"
        (this includes ties, empty text, and text with no letters at all).
    """
    if not isinstance(text, str):
        return "ar"
    latin_count = len(re.findall(r"[A-Za-z]", text))
    # Arabic letters, including presentation forms; digits and marks excluded.
    arabic_count = len(
        re.findall(r"[\u0621-\u064A\u0671-\u06D3\uFB50-\uFDFF\uFE70-\uFEFC]", text)
    )
    return "en" if latin_count > arabic_count else "ar"

def translate_text(text, target_lang):
    """Translate *text* with the greedy translation pipeline.

    A *target_lang* of "ar" translates English to Arabic; any other value
    translates Arabic to English. The original *text* is returned unchanged
    when the pipeline raises, or when the output is shorter than two
    characters or identical to the input.
    """
    if target_lang != "ar":
        split_token = "English:"
        prompt = f"Translate the following Arabic text to English. Provide *only* the translated text.\n\nArabic: {text}\n\nEnglish:"
    else:
        split_token = "Arabic:"
        prompt = f"Translate the following English text to Arabic. Provide *only* the translated text.\n\nEnglish: {text}\n\nArabic:"

    try:
        generated = trans_pipe(prompt)[0]['generated_text']
        # Keep only what follows the last answer label.
        candidate = generated.split(split_token)[-1].strip()

        # Reject empty output and a plain echo of the input.
        usable = len(candidate) >= 2 and candidate != text
        return candidate if usable else text
    except Exception as e:
        print(f"‚ö†Ô∏è Translation Failed: {e}")
        return text

# =====================================================
# 3. The Hybrid Chain Class
# =====================================================
class HybridChain:
    """Question-answering chain: hybrid retrieval plus SFT-model generation."""

    def __init__(self):
        # Prompt layout sent to the chat pipeline. The original note says it
        # MUST match the SFT training format, so the text is kept verbatim.
        self.system_template = """<s>[INST] <<SYS>>
You are an expert assistant for Absher and MOI services.
- Answer in the SAME language as the user's question.
- Use the provided [Context] to answer accurately.
- If the info is missing in the context, say "Information is not available in the documents."
<</SYS>>

[Context]
{context}

[User Question]
{question} [/INST]"""

    def retrieve_hybrid(self, query):
        """Return up to RERANK_TOP_K unique documents from BM25 and FAISS.

        Results are interleaved (dense first, then keyword, rank by rank) and
        de-duplicated on their page content.
        """
        import itertools

        sparse_hits = bm25_retriever.invoke(query)
        dense_hits = faiss_retriever.invoke(query)

        interleaved = itertools.chain.from_iterable(
            itertools.zip_longest(dense_hits, sparse_hits)
        )

        unique_docs = []
        seen_texts = set()
        for doc in interleaved:
            # zip_longest pads the shorter list with None.
            if not doc or doc.page_content in seen_texts:
                continue
            seen_texts.add(doc.page_content)
            unique_docs.append(doc)

        return unique_docs[:Config.RERANK_TOP_K]

    def answer(self, user_query):
        """Answer *user_query* from retrieved context and return the text."""
        lang = detect_language(user_query)
        asked_in_english = lang == "en"

        # English questions are translated first so the search runs in Arabic.
        search_query = user_query
        if asked_in_english:
            print(f"üîÑ Auto-Translating Query: '{user_query}' -> Arabic...")
            translated_query = translate_text(user_query, "ar")
            # Ignore a translation that is empty or just echoes the input.
            if translated_query and translated_query != user_query:
                search_query = translated_query
            print(f"   ‚Ü≥ Search Query: '{search_query}'")

        docs = self.retrieve_hybrid(normalize_ar(search_query))
        if not docs:
            return "Information is not available in the documents. / ÿßŸÑŸÖÿπŸÑŸàŸÖÿ© ÿ∫Ÿäÿ± ŸÖÿ™ŸàŸÅÿ±ÿ©."

        # One bullet per retrieved document.
        context_text = "\n\n".join(f"- {d.page_content}" for d in docs)

        # The prompt carries the user's original (untranslated) question.
        full_prompt = self.system_template.format(
            context=context_text, question=user_query
        )
        generated = chat_pipe(full_prompt)[0]['generated_text']
        response_clean = generated.split("[/INST]")[-1].strip()

        # Safety net: an Arabic reply to an English question is translated back.
        if asked_in_english and detect_language(response_clean) == "ar":
            print("‚ö†Ô∏è Context Leakage Detected (AR response to EN query). Auto-Fixing...")
            response_clean = translate_text(response_clean, "en")

        return response_clean

# Initialize the single chain instance. The chat handler in the UI cell looks
# this name up in globals() before calling hybrid_chain.answer().
hybrid_chain = HybridChain()
print("‚úÖ Hybrid RAG Chain is Ready (With Auto-Translation Logic).")

In [None]:
# @title 🤖 CELL 5 — Pro Chat UI (MOI Edition with Language Selection) 🇸🇦
# Multi-stage UI: Language Selection -> Chat Interface

import gradio as gr
import time
import os

# Disable tokenizers parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"

print("üöÄ Launching MOI Branded Interface...")

# =====================================================
# 1. Custom CSS
# =====================================================
# Stylesheet handed to gr.Blocks(css=...):
#   .moi-header -> the banner <div> rendered by the header gr.HTML block
#   .lang-btn   -> the two language buttons (set through elem_classes)
moi_css = """
.moi-header {
    text-align: center;
    padding: 30px;
    background: linear-gradient(90deg, #006C35 0%, #004D26 100%);
    border-radius: 10px;
    color: white;
    margin-bottom: 20px;
}
.moi-header h1 { color: white !important; font-size: 2.5em; }
.lang-btn { font-size: 1.2em; height: 60px; }
"""

# =====================================================
# 2. Logic Handlers
# =====================================================
def start_chat(lang):
    """Switch from the welcome screen to the chat screen for *lang*.

    "Arabic" selects the Arabic greeting and right-to-left widgets; any other
    value selects the English, left-to-right variant.

    Returns:
        A 7-tuple matching the outputs wired to the language buttons: welcome
        screen update, chat screen update, initial chat history, language
        state, radio update, chatbot update, and text-input update.
    """
    if lang != "Arabic":
        rtl = False
        greeting = [(None, "üëã Hello! I am your MOI Smart Assistant. How can I help you today?")]
        label, placeholder = "Live Chat", "Type your question here..."
    else:
        rtl = True
        greeting = [(None, "üëã ÿ≠ŸäÿßŸÉ ÿßŸÑŸÑŸá! ÿ£ŸÜÿß ŸÖÿ≥ÿßÿπÿØŸÉ ÿßŸÑÿ∞ŸÉŸä ŸÑÿÆÿØŸÖÿßÿ™ Ÿàÿ≤ÿßÿ±ÿ© ÿßŸÑÿØÿßÿÆŸÑŸäÿ©. ÿ™ŸÅÿ∂ŸÑ ÿ®ÿ∑ÿ±ÿ≠ ÿ≥ÿ§ÿßŸÑŸÉ.")]
        label, placeholder = "ÿßŸÑŸÖÿ≠ÿßÿØÿ´ÿ© ÿßŸÑŸÅŸàÿ±Ÿäÿ©", "ÿßŸÉÿ™ÿ® ÿ≥ÿ§ÿßŸÑŸÉ ŸáŸÜÿß..."

    hide_welcome = gr.update(visible=False)
    show_chat = gr.update(visible=True)
    radio_update = gr.update(value=lang)
    chatbot_update = gr.update(label=label, rtl=rtl)
    input_update = gr.update(placeholder=placeholder, rtl=rtl)

    return (
        hide_welcome,
        show_chat,
        greeting,
        lang,
        radio_update,
        chatbot_update,
        input_update,
    )

def chat_response(message, history, audio_file, lang_val):
    """Handle one chat turn and return the updated chat history.

    When *audio_file* is given it is transcribed with the Whisper pipeline and
    the transcript replaces *message*. The question is then answered by the
    global ``hybrid_chain``.

    Args:
        message: Text typed by the user (may be empty or None).
        history: Current chat history as a list of (user, bot) pairs, or None.
        audio_file: Path of a recorded audio clip, or a falsy value.
        lang_val: Selected UI language; "Arabic" transcribes as "ar",
            anything else as "en".

    Returns:
        A new history list. The caller's list is never modified in place.
        Errors are reported as bot messages rather than raised.
    """
    # Work on a copy so every return path has the same "new list" semantics,
    # and tolerate a missing history.
    history = list(history or [])

    if audio_file:
        try:
            target_lang = "ar" if lang_val == "Arabic" else "en"
            if 'asr_pipe' in globals() and asr_pipe:
                text = asr_pipe(audio_file, generate_kwargs={"language": target_lang})["text"].strip()
                message = text
                user_display = f"üé§ {text}"
            else:
                return history + [[None, "‚ö†Ô∏è Whisper not loaded"]]
        except Exception as e:
            return history + [[None, f"‚ùå Error: {e}"]]
    else:
        # Whitespace-only input is treated as empty instead of being sent to
        # the model.
        message = message.strip() if isinstance(message, str) else ""
        user_display = message

    if not message:
        return history

    try:
        if 'hybrid_chain' in globals():
            response = hybrid_chain.answer(message)
        else:
            response = "‚ö†Ô∏è System Warning: Chain not loaded."
    except Exception as e:
        # UI boundary: show the failure in the chat instead of crashing.
        response = f"‚ùå Error: {e}"

    return history + [(user_display, response)]

def reset_app():
    """Go back to the language-selection screen.

    Returns:
        A 4-tuple: welcome screen update (shown), chat screen update (hidden),
        None for the language state, and an empty chat history.
    """
    show_welcome = gr.update(visible=True)
    hide_chat = gr.update(visible=False)
    return show_welcome, hide_chat, None, []

def clear_inputs():
    """Return the values that blank the text box ("") and the audio input (None)."""
    return ("", None)

# =====================================================
# 3. Layout
# =====================================================
# Two-screen app: a welcome screen with the language buttons and a chat screen
# that stays hidden until a language is chosen.
with gr.Blocks(theme=gr.themes.Glass(), css=moi_css, title="MOI Assistant") as demo:
    
    # State to hold language preference
    lang_state = gr.State(value="Arabic")

    # --- Header ---
    gr.HTML("""
    <div class='moi-header'>
        <h1>ÿßŸÑŸÖÿ≥ÿßÿπÿØ ÿßŸÑÿ∞ŸÉŸä ŸÑŸàÿ≤ÿßÿ±ÿ© ÿßŸÑÿØÿßÿÆŸÑŸäÿ©</h1>
        <p>MOI Smart Assistant | Powered by ALLaM-7B</p>
    </div>
    """)
    
    # --- SCREEN 1: Welcome & Language Selection ---
    with gr.Group(visible=True) as welcome_screen:
        gr.Markdown("### üåç Please select your preferred language / ÿßŸÑÿ±ÿ¨ÿßÿ° ÿßÿÆÿ™Ÿäÿßÿ± ÿßŸÑŸÑÿ∫ÿ© ÿßŸÑŸÖŸÅÿ∂ŸÑÿ©", elem_id="lang-text")
        with gr.Row():
            btn_ar = gr.Button("ÿßŸÑÿπÿ±ÿ®Ÿäÿ© üá∏üá¶", variant="primary", elem_classes=["lang-btn"])
            btn_en = gr.Button("English üá¨üáß", variant="secondary", elem_classes=["lang-btn"])

    # --- SCREEN 2: Chat Interface ---
    with gr.Group(visible=False) as chat_screen:
        with gr.Row():
            # Left: Chat
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(label="Chat", height=500)
                with gr.Row():
                    msg = gr.Textbox(show_label=False, container=False, scale=4)
                    submit_btn = gr.Button("üöÄ", variant="primary", scale=1)

            # Right: Settings
            with gr.Column(scale=1):
                gr.Markdown("### ‚öôÔ∏è Settings")
                # NOTE(review): `source=` looks like the Gradio 3.x keyword
                # (newer releases use `sources=[...]`) - confirm against the
                # installed Gradio version.
                audio_input = gr.Audio(source="microphone", type="filepath", label="Voice Input")
                # Hidden radio just to store state visually if needed
                lang_display = gr.Radio(["Arabic", "English"], label="Language", interactive=False)
                
                gr.Markdown("---")
                restart_btn = gr.Button("üîÑ Change Language / ÿ™ÿ∫ŸäŸäÿ± ÿßŸÑŸÑÿ∫ÿ©", variant="secondary")

    # =====================================================
    # 4. Events
    # =====================================================
    
    # Language Selection
    # The outputs follow the order of start_chat()'s 7-tuple. `chatbot` is
    # listed twice on purpose: slot 3 receives the greeting history and slot 6
    # receives the label / rtl update.
    btn_ar.click(
        fn=lambda: start_chat("Arabic"),
        outputs=[welcome_screen, chat_screen, chatbot, lang_state, lang_display, chatbot, msg]
    )
    btn_en.click(
        fn=lambda: start_chat("English"),
        outputs=[welcome_screen, chat_screen, chatbot, lang_state, lang_display, chatbot, msg]
    )

    # Chat Actions
    # Enter in the text box and the send button run the same handler; the
    # chained .then() step clears the text box and the audio input afterwards.
    msg.submit(chat_response, [msg, chatbot, audio_input, lang_state], [chatbot]) \
       .then(clear_inputs, None, [msg, audio_input])
    
    submit_btn.click(chat_response, [msg, chatbot, audio_input, lang_state], [chatbot]) \
              .then(clear_inputs, None, [msg, audio_input])

    # Restart
    # The outputs follow the order of reset_app()'s 4-tuple.
    restart_btn.click(reset_app, outputs=[welcome_screen, chat_screen, lang_state, chatbot])

# =====================================================
# 5. Launch
# =====================================================
print("‚úÖ Launching MOI App...")
# NOTE(review): share=True asks Gradio for a public share link - confirm that
# exposing this assistant outside the cluster is intended.
demo.queue().launch(share=True, inline=False, show_api=False)