# Step 0 – Install required libraries (run once per runtime)
!pip install -q faiss-cpu sentence-transformers groq


In [None]:
!pip install -q sentence-transformers faiss-cpu numpy pandas
!pip install groq


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting groq
  Downloading groq-0.37.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.37.0-py3-none-any.whl (137 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 kB[0m [31m457.7 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.37.0


In [None]:
# Step 1 – Import libraries and mount Google Drive

import json
import os
from getpass import getpass

import faiss
from groq import Groq
from sentence_transformers import SentenceTransformer

# Mount Google Drive at /content/drive; the dataset and FAISS index paths used
# later in this notebook are located under that mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Step 2 – Load the academic advisor dataset (same used in RAG)

data_path = "/content/drive/MyDrive/Classroom/DAB_RAG_ZakyProject/data/processed/academic_advisor_rag_dataset.json"

with open(data_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# The notebook indexes `data` by position (data[0] here, data[int(idx)] in the
# search function), so it must be a non-empty list of records. Fail fast with a
# clear message instead of a bare IndexError/KeyError further down.
if not isinstance(data, list) or not data:
    raise ValueError(f"Expected a non-empty JSON array of records in {data_path}")

print("Dataset loaded ✅")
print("Total records:", len(data))
print("First record keys:", data[0].keys())


Dataset loaded ✅
Total records: 5069
First record keys: dict_keys(['id', 'source_file', 'university', 'catalog_label', 'section', 'section_chunk_index', 'category', 'program', 'college', 'degree', 'level', 'text'])


In [None]:
# Step 3 – Load the same embedding model and FAISS index used in the main RAG

# Load embedding model (must match the main RAG notebook)
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print("Embedding model loaded ✅")

# Load FAISS index from Drive
index_path = "/content/drive/MyDrive/Classroom/DAB_RAG_ZakyProject/rag_artifacts/academic_faiss.index"
index = faiss.read_index(index_path)

print("FAISS index loaded ✅")
print("Total vectors in index:", index.ntotal)

# The search function looks records up as data[idx] using the ids returned by
# the index, so a size mismatch between the two would return the wrong records
# (or raise IndexError). Check the invariant once, right after loading.
assert index.ntotal == len(data), (
    f"FAISS index has {index.ntotal} vectors but the dataset has {len(data)} records; "
    "rebuild the index from the same dataset file."
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model loaded ✅
FAISS index loaded ✅
Total vectors in index: 5069


In [None]:
def translate_text(text, target_lang):
    """
    Translate any text using Groq LLaMA.

    Parameters
    ----------
    text : str
        Text to translate. Empty or whitespace-only input (or None) returns ""
        without calling the API.
    target_lang : str
        Target language code: 'en' or 'ar'. Known codes are expanded to the
        full language name in the prompt; any other value is passed through
        unchanged.

    Returns
    -------
    str
        The model's reply with surrounding whitespace removed ("" when the
        reply has no content).
    """
    # Nothing to translate: skip the round-trip instead of letting the model
    # invent output for an empty prompt.
    if not text or not text.strip():
        return ""

    # A bare code such as "ar" is ambiguous in a prompt; spell the language out.
    language_names = {"en": "English", "ar": "Arabic"}
    target_name = language_names.get(target_lang, target_lang)

    # The result is fed straight into retrieval / shown to the user, so the
    # model must not wrap the translation in commentary.
    system_msg = (
        "You are a translation assistant. Translate the text accurately. "
        "Reply with the translation only, without explanations or notes."
    )
    user_msg = f"Translate this text to {target_name}:\n{text}"

    response = client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg}
        ],
        # Translation should be repeatable, not creative.
        temperature=0,
    )

    # `content` can be None on an empty completion; avoid AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()


In [None]:
def detect_arabic(text):
    """
    Returns True if the text contains Arabic characters.

    Besides the basic Arabic block this also recognises the Arabic Supplement,
    Arabic Extended-A and the two Presentation Forms blocks, so text that uses
    pre-shaped glyphs or ligatures is still detected. Empty or None input
    returns False.
    """
    if not text:
        return False

    arabic_ranges = (
        ('\u0600', '\u06FF'),  # Arabic
        ('\u0750', '\u077F'),  # Arabic Supplement
        ('\u08A0', '\u08FF'),  # Arabic Extended-A
        ('\uFB50', '\uFDFF'),  # Arabic Presentation Forms-A
        # Arabic Presentation Forms-B; stops at U+FEFC so that U+FEFF
        # (byte-order mark) is never mistaken for Arabic.
        ('\uFE70', '\uFEFC'),
    )
    return any(low <= ch <= high for ch in text for low, high in arabic_ranges)


In [None]:
# Step 4 – Define the semantic search function (same behavior as in RAG)

def search(query, top_k=5, filters=None):
    """
    Semantic search over the academic dataset using FAISS.
    Optionally filter by metadata such as program, level, or category.

    Parameters
    ----------
    query : str
        Query text; embedded with the module-level `model`.
    top_k : int
        Number of nearest neighbours requested from the module-level `index`.
    filters : dict or None
        Optional {field: value} pairs; a record is kept only when every field
        equals the given value. Filtering is applied after the FAISS lookup,
        so fewer than `top_k` results may be returned.

    Returns
    -------
    list of dict
        One dict per kept hit with the keys "score", "text", "program",
        "level" and "category", in the order returned by the index.
    """
    # 1) Embed the query
    query_embedding = model.encode([query], normalize_embeddings=True)

    # 2) Search in FAISS
    scores, indices = index.search(query_embedding, top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        idx = int(idx)

        # FAISS pads the result with id -1 when it finds fewer than top_k
        # neighbours; data[-1] would silently return the last record as a hit.
        if idx < 0:
            continue

        record = data[idx]

        # 3) Optional metadata filtering
        if filters and any(record.get(k) != v for k, v in filters.items()):
            continue

        results.append({
            "score": float(score),
            "text": record.get("text", ""),
            "program": record.get("program"),
            "level": record.get("level"),
            "category": record.get("category"),
        })

    return results


In [None]:
# Step 5 – Initialize Groq client

# Never hardcode the key in the notebook: reuse GROQ_API_KEY from the
# environment when it is already set, otherwise prompt for it without echoing.
groq_api_key = os.environ.get("GROQ_API_KEY", "").strip()
if not groq_api_key:
    groq_api_key = getpass("Enter your Groq API key: ").strip()
if not groq_api_key:
    raise ValueError("A Groq API key is required to initialize the client.")

# Keep the key available through the environment for the rest of the session.
os.environ["GROQ_API_KEY"] = groq_api_key

client = Groq(
    api_key=groq_api_key,
)

print("Groq client initialized ✅")


Groq client initialized ✅


In [None]:
# Debug check: print the client's repr to confirm that `client` is defined.
print(client)


<groq.Groq object at 0x7a56b0a43500>


In [None]:
# Step 6.1 – Build context text from retrieved chunks

def build_context_from_results(results):
    """
    Render the retrieved chunks as a single context string for the LLM.

    Every chunk is written as a "[Source N] ..." metadata line (program,
    category, level) followed by the chunk text on the next line. Chunks are
    numbered from 1 and separated from each other by one blank line.
    """
    return "\n\n".join(
        f"[Source {n}] Program: {hit['program']} | Category: {hit['category']} | Level: {hit['level']}"
        + "\n"
        + hit["text"]
        for n, hit in enumerate(results, start=1)
    )


In [None]:
# Step 6.2 – RAG answer using Groq + LLaMA

def rag_answer_with_llm(question, top_k=3, filters=None):
    """
    Answer a question with retrieval-augmented generation (FAISS + Groq LLaMA).

    Parameters
    ----------
    question : str
        The student's question; used both as the retrieval query and in the
        prompt.
    top_k : int
        Number of chunks requested from `search`.
    filters : dict or None
        Optional metadata filters forwarded to `search` (e.g. program, level,
        category). None keeps the unfiltered behaviour.

    Returns
    -------
    tuple
        (answer, results): the LLM's reply and the list of retrieved chunks
        the reply was grounded on.
    """
    # 1) Retrieve relevant chunks from FAISS
    results = search(question, top_k=top_k, filters=filters)
    context_text = build_context_from_results(results)

    # 2) Build the prompt for the LLM
    system_prompt = (
        "You are a Smart Academic Advisor. "
        "You answer questions ONLY based on the provided university catalog context. "
        "If the answer is not clearly stated in the context, say that you don't know "
        "or that the information is not available."
    )

    user_prompt = (
        f"Question:\n{question}\n\n"
        f"Context (university catalog snippets):\n{context_text}\n\n"
        "Based only on the context above, provide a clear and concise answer for the student."
    )

    # 3) Call Groq chat completion
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": user_prompt},
        ],
        model="llama-3.1-8b-instant",
    )

    answer = chat_completion.choices[0].message.content

    return answer, results


In [None]:
def smart_rag_answer(question, top_k=3):
    """
    Full pipeline:
    1. Detect Arabic question
    2. Translate Arabic → English
    3. Pass through RAG pipeline
    4. Translate answer back to Arabic

    Parameters
    ----------
    question : str
        The user's question, in English or Arabic.
    top_k : int
        Number of chunks to retrieve; forwarded to `rag_answer_with_llm`.

    Returns
    -------
    tuple
        (answer, retrieved): the answer in the language of the question
        (Arabic when the question contained Arabic, otherwise the English
        answer as produced by the RAG step) and the retrieved chunks.
    """
    # Step A: Detect language
    is_arabic = detect_arabic(question)

    # Step B: Translate to English if needed
    if is_arabic:
        question = translate_text(question, "en")

    # Step C: Get RAG answer
    answer_en, retrieved = rag_answer_with_llm(question, top_k=top_k)

    # Step D: Translate back to Arabic if input was Arabic
    if is_arabic:
        return translate_text(answer_en, "ar"), retrieved

    # Otherwise return English answer
    return answer_en, retrieved


In [None]:
# Step 7 – Test the full RAG + LLM pipeline
# This question is covered by the dataset.
question = " What are the admission requirements for the Accountancy bachelor program?"

answer, results = smart_rag_answer(question)

separator = "=" * 80
print("QUESTION:", question, separator, sep="\n")
print("ANSWER (LLM based on RAG context):", answer, sep="\n")
print("\n" + separator)
print("Top retrieved sources (for debugging):\n")

# Show score, metadata and the first 500 characters of every retrieved chunk.
for rank, hit in enumerate(results, start=1):
    print(f"--- Source {rank} ---")
    print(f"Score: {round(hit['score'], 3)}")
    print(f"Program: {hit['program']} | Category: {hit['category']} | Level: {hit['level']}")
    print(f"{hit['text'][:500]} ...")
    print("-" * 80)


QUESTION:
 What are the admission requirements for the Accountancy bachelor program?
ANSWER (LLM based on RAG context):
To be admitted to the Accountancy bachelor program, a student must meet the following requirements:

1. Be a continuing CSUN student with an overall and CSUN GPA of 3.2 or higher, or a first-semester transfer student with an overall GPA of 3.2 or higher. If openings are available, a minimum GPA of 3.0 can also satisfy this requirement.
2. Be declared as a Pre-Accountancy major.
3. Have completed a minimum of 60 units of college work (junior class standing). For transfer students, the units must be CSU transferable.
4. Complete ACCT 350 with a grade of "C" or higher.
5. Have successfully completed the Pre-Accountancy portion of the major.
6. Maintain an overall and CSUN GPA of 3.0 or higher.

Note that the information provided does not mention any requirement for a GMAT or GRE test.

Top retrieved sources (for debugging):

--- Source 1 ---
Score: 0.73
Program: Accounti

In [None]:
# This question is NOT covered by the dataset; the advisor is expected to say
# that the information is not available.
question = "What is the dress code policy for accounting students?"
answer, results = rag_answer_with_llm(question, top_k=3)

separator = "=" * 80
print("QUESTION:", question, separator, sep="\n")
print("ANSWER (LLM based on RAG context):", answer, sep="\n")

print("\n" + separator)
print("Top retrieved sources (for debugging):\n")

# Show score, metadata and the first 400 characters of every retrieved chunk.
for rank, hit in enumerate(results, start=1):
    print(f"--- Source {rank} ---")
    print(f"Score: {round(hit['score'], 3)}")
    print(f"Program: {hit['program']} | Category: {hit['category']} | Level: {hit['level']}")
    print(f"{hit['text'][:400]} ...")
    print("-" * 80)


QUESTION:
What is the dress code policy for accounting students?
ANSWER (LLM based on RAG context):
I don't know the dress code policy for accounting students. The provided university catalog context does not include any information regarding dress code policies.

Top retrieved sources (for debugging):

--- Source 1 ---
Score: 0.501
Program: Accounting | Category: course_description | Level: undergraduate
Students will develop critical skills in researching and critically interpreting tax authority. They also will acquire and develop the specialized oral and written communication skills needed to interact internally and externally with clients and government agencies. This course also will introduce students to the professional responsibilities and ethical dilemmas that face tax practitioners in to ...
--------------------------------------------------------------------------------
--- Source 2 ---
Score: 0.498
Program: Accounting | Category: course_description | Level: undergraduate
P