In [1]:
import os
import json
import hashlib
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm
from langdetect import detect, DetectorFactory

# Make language detection deterministic
DetectorFactory.seed = 42

# ================== PATH CONFIG ==================
# The default is the original author's notebook folder. Set the
# LEGAL_ASSISTANT_BASE_DIR environment variable to run the notebook from any
# other location without editing this cell.
BASE_DIR = Path(
    os.environ.get(
        "LEGAL_ASSISTANT_BASE_DIR",
        r"C:\Users\VICTUS-H\Desktop\Faris\Projects\Legal Assistant\Notebooks",
    )
)

# Input: the pre-chunked law text (a JSON file read by the next cell).
CHUNKS_PATH = BASE_DIR / "labour_law_chunks.json"

# Output: every generated artifact (metadata, embeddings) is written here.
ARTIFACTS_DIR = BASE_DIR / "rag_artifacts"
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

print("BASE_DIR:", BASE_DIR)
print("CHUNKS_PATH exists:", CHUNKS_PATH.exists())
print("ARTIFACTS_DIR:", ARTIFACTS_DIR)


BASE_DIR: C:\Users\VICTUS-H\Desktop\Faris\Projects\Legal Assistant\Notebooks
CHUNKS_PATH exists: True
ARTIFACTS_DIR: C:\Users\VICTUS-H\Desktop\Faris\Projects\Legal Assistant\Notebooks\rag_artifacts


In [2]:
# Fail early with a clear message when the chunk file is missing.
if not CHUNKS_PATH.exists():
    raise FileNotFoundError(f"Cannot find: {CHUNKS_PATH}")

with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
    chunks = json.load(f)

# This cell and the following ones index chunks[0] and treat every element as
# a record, so reject anything that is not a non-empty list of dicts here
# rather than failing later with an opaque IndexError/KeyError.
if not isinstance(chunks, list) or not chunks:
    raise ValueError(f"Expected a non-empty JSON list of chunks in: {CHUNKS_PATH}")
if not isinstance(chunks[0], dict):
    raise ValueError(f"Expected each chunk to be a JSON object in: {CHUNKS_PATH}")

print(f"Loaded {len(chunks)} chunks")
print("Type:", type(chunks))
print("Sample keys:", list(chunks[0].keys()))


Loaded 165 chunks
Type: <class 'list'>
Sample keys: ['chunk_id', 'law_name', 'article_number', 'chunk_index', 'text']


In [3]:
def detect_key(d, candidates):
    """Return the first entry of ``candidates`` that is a key of ``d``.

    Candidates are tried in the order given; ``None`` is returned when none
    of them is present.
    """
    return next((name for name in candidates if name in d), None)

# The schema is inferred from the first chunk only.
first = chunks[0]

# For each logical field, try the accepted spellings in order.
chunk_id_key, text_key, article_key = (
    detect_key(first, names)
    for names in (
        ["chunk_id", "id"],
        ["text", "chunk_text", "content"],
        ["article_number", "article", "article_no"],
    )
)

print(
    "Detected schema:",
    f" - chunk_id_key: {chunk_id_key}",
    f" - text_key    : {text_key}",
    f" - article_key : {article_key}",
    sep="\n",
)

# The id and text fields are mandatory; the article field is optional.
assert chunk_id_key, "Missing chunk id field"
assert text_key, "Missing text field"


Detected schema:
 - chunk_id_key: chunk_id
 - text_key    : text
 - article_key : article_number


In [4]:
df = pd.DataFrame(chunks)

# Project the raw records onto the three canonical columns used by the rest
# of the notebook; ids and text are coerced to strings, and the article column
# is filled with None when no article field was detected.
df_rag = pd.DataFrame(
    dict(
        chunk_id=df[chunk_id_key].astype(str),
        text=df[text_key].astype(str),
        article_number=(None if not article_key else df[article_key]),
    )
)

print(df_rag.head())
print(f"Total rows: {len(df_rag)}")


          chunk_id                                               text  \
0   JL8-1996-A1-C0  Article (1): Title and Effective Date This Law...   
1   JL8-1996-A2-C0  Article (2): Definitions Wherever used in this...   
2   JL8-1996-A2-C1  age No (3487) and Amending Law No. (26) Of 201...   
3   JL8-1996-A2-C2  : The physician or com mittee of physicians, a...   
4  JL8-1996-A43-C0  composed under the provisions of article (43) ...   

  article_number  
0              1  
1              2  
2              2  
3              2  
4             43  
Total rows: 165


In [5]:
def dataset_fingerprint(df_: pd.DataFrame) -> str:
    """Return a cheap SHA-256 fingerprint of a chunk table.

    The digest covers the row count plus the ``chunk_id``, ``article_number``
    and first 200 characters of ``text`` of three sampled rows (first, middle,
    last). It is a quick change detector, not a full content hash: edits that
    touch only other rows, or text beyond the first 200 characters, do not
    change the result.

    Args:
        df_: frame with ``chunk_id``, ``article_number`` and ``text`` columns.

    Returns:
        Hex digest string. For an empty frame only the row count is hashed.
    """
    h = hashlib.sha256()
    n_rows = len(df_)
    h.update(str(n_rows).encode())

    if n_rows == 0:
        # There is no row to sample; .iloc[0] would raise IndexError.
        return h.hexdigest()

    for i in (0, n_rows // 2, n_rows - 1):
        row = df_.iloc[i]
        # str() leaves string values untouched and keeps non-string ids/text
        # (e.g. integer ids) from failing on .encode().
        h.update(str(row["chunk_id"]).encode())
        h.update(str(row["article_number"]).encode())
        h.update(str(row["text"])[:200].encode())
    return h.hexdigest()

fingerprint = dataset_fingerprint(df_rag)
print(f"Dataset fingerprint: {fingerprint}")

# Provenance record for this run: where the data came from, how many chunks
# it has, its fingerprint, and which source fields were mapped to each column.
meta = dict(
    base_dir=str(BASE_DIR),
    chunks_path=str(CHUNKS_PATH),
    num_chunks=len(df_rag),
    fingerprint=fingerprint,
    schema=dict(
        chunk_id=chunk_id_key,
        text=text_key,
        article=article_key,
    ),
)

with (ARTIFACTS_DIR / "dataset_meta.json").open("w", encoding="utf-8") as f:
    f.write(json.dumps(meta, indent=2, ensure_ascii=False))

print("Saved dataset_meta.json")


Dataset fingerprint: 6d3b8096220e7ef3248a36ec7029ca2ae7115dd64b4d244b74cc7d9a87e23444
Saved dataset_meta.json


In [6]:
def detect_language(user_query: str) -> str:
    """
    Returns 'ar' or 'en'.
    Defaults to English if uncertain.

    The statistical detector is tried first. Because such detectors can be
    unreliable on short or mixed-language queries, a query whose letters are
    mostly Arabic-script is still classified as 'ar' when the detector fails
    or reports some other language.

    Non-string input (e.g. None) returns 'en'.
    """
    if not isinstance(user_query, str):
        return "en"

    try:
        lang = detect(user_query)
    except Exception:
        # Best-effort: any detector failure falls through to the script check.
        lang = ""

    if isinstance(lang, str) and lang.startswith("ar"):
        return "ar"

    def _is_arabic_script(ch: str) -> bool:
        # Arabic, Arabic Supplement, and Arabic Presentation Forms A/B.
        return (
            "\u0600" <= ch <= "\u06FF"
            or "\u0750" <= ch <= "\u077F"
            or "\uFB50" <= ch <= "\uFDFF"
            or "\uFE70" <= ch <= "\uFEFF"
        )

    # Only letters are counted, so digits and punctuation do not skew the ratio.
    letters = [ch for ch in user_query if ch.isalpha()]
    arabic_letters = sum(1 for ch in letters if _is_arabic_script(ch))
    if letters and arabic_letters * 2 > len(letters):
        return "ar"

    return "en"


# Quick sanity test: one English and one Arabic query. The loop prints each
# query next to the language code that detect_language() returns for it.
tests = [
    "What are the employee rights?",
    "ما هي حقوق العامل عند إنهاء العقد؟"
]

for q in tests:
    print(q, "→", detect_language(q))


What are the employee rights? → en
ما هي حقوق العامل عند إنهاء العقد؟ → ar


In [7]:
from openai import OpenAI

# Single API client reused by the embedding and chat-completion calls below.
# NOTE(review): no credentials are passed here, so the client presumably reads
# them from the environment — confirm before running on a new machine.
client = OpenAI()

# Model used for both the stored chunk embeddings and the query embeddings.
EMBEDDING_MODEL = "text-embedding-3-small"

# On-disk cache: the embedding matrix, the chunk ids saved alongside it, and a
# small JSON record of how the cache was produced.
EMBEDDINGS_PATH = ARTIFACTS_DIR / "labour_law_embeddings.npy"
IDS_PATH = ARTIFACTS_DIR / "labour_law_chunk_ids.npy"
EMBED_META_PATH = ARTIFACTS_DIR / "embedding_meta.json"

print("Embedding model:", EMBEDDING_MODEL)


Embedding model: text-embedding-3-small


In [8]:
def embeddings_exist():
    """Tell whether the embeddings, chunk-id and metadata files are all on disk.

    Only presence is checked; the file contents are not inspected.
    """
    required_files = (EMBEDDINGS_PATH, IDS_PATH, EMBED_META_PATH)
    return all(path.exists() for path in required_files)

# Report whether all three cache artifacts are already present on disk.
print("Embeddings already exist:", embeddings_exist())


Embeddings already exist: True


In [9]:
def embed_texts(texts, batch_size=32):
    """Embed ``texts`` with EMBEDDING_MODEL, one API request per batch.

    Args:
        texts: sequence of strings to embed.
        batch_size: number of texts sent in each request.

    Returns:
        float32 array built from the returned embeddings, in the order the
        API responses list them.
    """
    vectors = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts, desc="Embedding"):
        reply = client.embeddings.create(
            model=EMBEDDING_MODEL,
            input=texts[start:start + batch_size],
        )
        vectors += [item.embedding for item in reply.data]

    return np.array(vectors, dtype="float32")


In [10]:
# Reuse the cache only when its metadata shows it was built with the current
# embedding model from the current dataset. Retrieval pairs embedding rows
# with df_rag rows by position, so a stale cache would silently return the
# wrong chunks.
cache_is_current = False
if embeddings_exist():
    try:
        with open(EMBED_META_PATH, "r", encoding="utf-8") as f:
            cached_meta = json.load(f)
    except (OSError, ValueError):
        # Unreadable or corrupt metadata: treat the cache as unusable.
        cached_meta = None

    cache_is_current = (
        isinstance(cached_meta, dict)
        and cached_meta.get("model") == EMBEDDING_MODEL
        and cached_meta.get("dataset_fingerprint") == fingerprint
        and cached_meta.get("num_chunks") == len(df_rag)
    )
    if not cache_is_current:
        print("Cached embeddings are stale (model or dataset changed); regenerating...")

if cache_is_current:
    print("Loading cached embeddings...")

    embeddings = np.load(EMBEDDINGS_PATH)
    chunk_ids = np.load(IDS_PATH)

else:
    print("Generating embeddings from scratch...")

    texts = df_rag["text"].tolist()
    chunk_ids = df_rag["chunk_id"].tolist()

    embeddings = embed_texts(texts)

    np.save(EMBEDDINGS_PATH, embeddings)
    np.save(IDS_PATH, np.array(chunk_ids))

    # Record what produced the cache so later runs can validate it.
    meta = {
        "model": EMBEDDING_MODEL,
        "num_chunks": len(chunk_ids),
        "dataset_fingerprint": fingerprint
    }

    with open(EMBED_META_PATH, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    print("Embeddings saved to disk")

print("Embeddings shape:", embeddings.shape)


Loading cached embeddings...
Embeddings shape: (165, 1536)


In [11]:
assert embeddings.ndim == 2, "Embeddings must be a 2-D (chunks x dimensions) array"
assert embeddings.shape[0] == len(df_rag), "Mismatch between embeddings and chunks"
assert embeddings.shape[1] > 100, "Embedding dimension looks wrong"
# Retrieval looks chunks up by embedding row position, so the ids stored with
# the embeddings must appear in exactly the same order as the rows of df_rag.
assert [str(cid) for cid in chunk_ids] == df_rag["chunk_id"].tolist(), (
    "Chunk id order differs between embeddings and chunks"
)

print("✅ Embeddings sanity check passed")


✅ Embeddings sanity check passed


In [12]:
def normalize_vectors(vectors: np.ndarray) -> np.ndarray:
    """Scale every row of ``vectors`` to unit Euclidean length.

    Args:
        vectors: 2-D array with one vector per row.

    Returns:
        Array of the same shape whose non-zero rows have norm 1. All-zero rows
        are returned unchanged instead of becoming NaN. A NaN row would produce
        NaN similarity scores, which argsort places last, i.e. among the
        "top" hits of a descending ranking.
    """
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Divide zero-norm rows by 1 so they stay zero. The replacement is done in
    # place on a copy so the dtype of `norms` is preserved.
    safe_norms = norms.copy()
    safe_norms[safe_norms == 0] = 1
    return vectors / safe_norms

# Normalize stored embeddings once, so that a plain dot product with a
# unit-length query vector (see embed_query) gives the cosine similarity.
embeddings_norm = normalize_vectors(embeddings)

print("Normalized embeddings shape:", embeddings_norm.shape)


Normalized embeddings shape: (165, 1536)


In [13]:
def embed_query(query: str) -> np.ndarray:
    """Embed a single query string and return it as a unit-length float32 vector."""
    reply = client.embeddings.create(model=EMBEDDING_MODEL, input=[query])
    raw_vector = np.array(reply.data[0].embedding, dtype="float32")
    length = np.linalg.norm(raw_vector)
    return raw_vector / length


In [14]:
def retrieve_top_k(
    query: str,
    df_chunks: pd.DataFrame,
    embeddings_norm: np.ndarray,
    k: int = 5
):
    """Return the ``k`` chunks most similar to ``query``, best match first.

    Args:
        query: the user question; it is embedded with ``embed_query``.
        df_chunks: chunk table whose row order matches ``embeddings_norm``.
        embeddings_norm: row-normalized chunk embeddings.
        k: number of chunks to return. Values larger than the number of
            chunks return all of them; ``k <= 0`` returns an empty frame.

    Returns:
        A copy of the selected rows of ``df_chunks`` with an added
        ``similarity_score`` column, sorted by descending score.
    """
    if k <= 0:
        # scores[-0:] would select *every* row and a negative k an arbitrary
        # slice, so return an empty result instead. This also skips the
        # embedding API call.
        empty = df_chunks.iloc[0:0].copy()
        empty["similarity_score"] = np.zeros(0, dtype=embeddings_norm.dtype)
        return empty

    query_vec = embed_query(query)

    # Both sides are unit length, so the dot product is the cosine similarity.
    scores = embeddings_norm @ query_vec

    # argsort is ascending: take the last k entries and reverse them.
    top_indices = np.argsort(scores)[-k:][::-1]

    results = df_chunks.iloc[top_indices].copy()
    results["similarity_score"] = scores[top_indices]

    return results


In [15]:
# Retrieval smoke test with an English question.
query_en = "What are the employee’s rights if an employer terminates a limited contract early?"

results_en = retrieve_top_k(
    query=query_en,
    df_chunks=df_rag,
    embeddings_norm=embeddings_norm,
    k=5
)

# Last expression of the cell: displays the ranked hits with their scores.
results_en[["article_number", "chunk_id", "similarity_score", "text"]]


Unnamed: 0,article_number,chunk_id,similarity_score,text
35,26,JL8-1996-A26-C0,0.637791,Article (26): Termination of a Limited Period ...
36,29,JL8-1996-A29-C0,0.609433,reasons stipulated in Article (29) of this Law...
31,23,JL8-1996-A23-C0,0.60278,Article (23): Employment Termination a) Shall ...
38,29,JL8-1996-A29-C0,0.562221,Employee in cases other than those stipulated ...
29,21,JL8-1996-A21-C0,0.558104,Article (21): Employment Contract Termination ...


In [16]:
# Retrieval smoke test with an Arabic question, run against the same chunk
# embeddings as the English test.
query_ar = "ما هي حقوق العامل إذا أنهى صاحب العمل عقداً محدد المدة قبل انتهائه؟"

results_ar = retrieve_top_k(
    query=query_ar,
    df_chunks=df_rag,
    embeddings_norm=embeddings_norm,
    k=5
)

# Last expression of the cell: displays the ranked hits with their scores.
results_ar[["article_number", "chunk_id", "similarity_score", "text"]]


Unnamed: 0,article_number,chunk_id,similarity_score,text
36,29,JL8-1996-A29-C0,0.53658,reasons stipulated in Article (29) of this Law...
22,15,JL8-1996-A15-C0,0.535973,Article (15): Employment Contract Preparation ...
58,40,JL8-1996-A40-C0,0.529999,its termination by either party in accordance ...
29,21,JL8-1996-A21-C0,0.521553,Article (21): Employment Contract Termination ...
38,29,JL8-1996-A29-C0,0.504697,Employee in cases other than those stipulated ...


In [17]:
def build_context(chunks_df: pd.DataFrame) -> str:
    """Render retrieved chunks as one prompt-ready string.

    Each row becomes a ``[Article N]`` header line followed by the chunk text.
    Blocks appear in row order and are separated by one blank line. An empty
    frame yields an empty string.
    """
    return "\n\n".join(
        f"[Article {row['article_number']}]\n{row['text']}"
        for _, row in chunks_df.iterrows()
    )


In [18]:
# System prompt for English queries: restricts the model to the supplied legal
# text, requires explicit article citations, and mandates a closing disclaimer.
SYSTEM_PROMPT_EN = """
You are a legal information assistant specialized in Jordanian Labour Law (Law No. 8 of 1996).

Rules:
- Answer ONLY using the provided legal text.
- Do NOT add information not present in the text.
- Cite article numbers explicitly.
- If the answer is not found in the text, say so clearly.
- This is NOT legal advice.

Always include this disclaimer at the end:
"This answer is for informational purposes only and does not constitute legal advice. Consult a licensed lawyer in Jordan for legal advice."
"""

# Arabic counterpart of the prompt above, selected by generate_answer() when
# the query language is detected as "ar".
SYSTEM_PROMPT_AR = """
أنت مساعد معلومات قانونية متخصص في قانون العمل الأردني رقم (8) لسنة 1996.

القواعد:
- أجب فقط بناءً على النصوص القانونية المقدمة.
- لا تضف أي معلومات غير موجودة في النص.
- اذكر أرقام المواد القانونية صراحة.
- إذا لم توجد إجابة واضحة في النص، صرّح بذلك.
- هذا ليس استشارة قانونية.

يجب دائمًا إضافة التنويه التالي في نهاية الإجابة:
"هذه الإجابة لأغراض معلوماتية فقط ولا تشكل استشارة قانونية. يُنصح بمراجعة محامٍ مرخص في الأردن للحصول على استشارة قانونية."
"""


In [19]:
def generate_answer(
    query: str,
    retrieved_chunks: pd.DataFrame,
    model: str = "gpt-4o-mini",
) -> str:
    """Answer ``query`` using only the text of ``retrieved_chunks``.

    The system prompt is chosen by the detected query language (Arabic or
    English). The retrieved chunks are rendered with ``build_context`` and
    sent together with the question in a single user message.

    Args:
        query: the user's question.
        retrieved_chunks: rows to ground the answer on; must have the
            ``article_number`` and ``text`` columns ``build_context`` reads.
        model: chat model name. Defaults to the model this notebook has
            always used.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the API returns no text content.
    """
    language = detect_language(query)

    context = build_context(retrieved_chunks)

    system_prompt = SYSTEM_PROMPT_AR if language == "ar" else SYSTEM_PROMPT_EN

    user_prompt = f"""
Legal Text:
{context}

Question:
{query}

Answer:
"""

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        # Temperature 0 keeps answers as repeatable as the API allows.
        temperature=0
    )

    # The message content is optional in the API response, so guard against
    # None before stripping instead of raising AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()


In [38]:
# End-to-end demo with a mixed English/Arabic query: retrieve the top 5 chunks,
# then generate an answer grounded on them.
# NOTE: this reassigns query_ar, which an earlier retrieval cell also sets.
query_ar = "summarize قانون العمل المادة 9"

retrieved = retrieve_top_k(
    query=query_ar,
    df_chunks=df_rag,
    embeddings_norm=embeddings_norm,
    k=5
)

answer_ar = generate_answer(query_ar, retrieved)

print(answer_ar)


المادة (9) من قانون العمل تتعلق بصلاحيات مفتش العمل. تنص على ما يلي:

أ) يتمتع مفتش العمل بصلاحيات أعضاء الشرطة القضائية وفقاً لقانون الإجراءات الجنائية الساري، ويقوم بتنفيذ الضبط الذي ينظمه ضمن إطار عمله حتى يثبت العكس.

ب) يمكن للمفتش أن يطلب من صاحب العمل إزالة المخالفة خلال فترة لا تتجاوز سبعة أيام من تاريخ إخطاره كتابياً بذلك. في حال عدم قيام صاحب العمل بذلك، يمكن للوزير أو من يفوضه أن يقرر إغلاق المنشأة حتى يتم إزالة المخالفة أو تصدر المحكمة قراراً بشأنها.

ج) تلتزم المحكمة بإلزام المخالف بتصحيح المخالفة، مع فرض غرامة تتراوح بين خمسين ديناراً ومئتي دينار، ولا يمكن تخفيض الغرامة عن حدها الأدنى لأي سبب من الأسباب.

هذه الإجابة لأغراض معلوماتية فقط ولا تشكل استشارة قانونية. يُنصح بمراجعة محامٍ مرخص في الأردن للحصول على استشارة قانونية.


In [39]:
# End-to-end demo with an English query that refers to an article by number.
# NOTE(review): the recorded output says the provided text contains no
# Article 8. Retrieval ranks chunks purely by embedding similarity, so asking
# for an article by number is not guaranteed to fetch that article's chunk.
# NOTE: this reassigns query_en, which an earlier retrieval cell also sets.
query_en = "summarize article 8"

retrieved = retrieve_top_k(
    query=query_en,
    df_chunks=df_rag,
    embeddings_norm=embeddings_norm,
    k=5
)

answer_en = generate_answer(query_en, retrieved)

print(answer_en)


The text provided does not contain an Article 8, and therefore I cannot summarize it.

"This answer is for informational purposes only and does not constitute legal advice. Consult a licensed lawyer in Jordan for legal advice."
