In [None]:
#install dependencies
!pip install torch transformers sentence-transformers faiss-cpu accelerate pdfplumber

In [None]:
#GPU
!pip install faiss-gpu

# 1. DATA PREPROCESSING + CLAUSE-AWARE CHUNKING

In [None]:
import re
from typing import List, Dict

def clean_text(text: str) -> str:
    """
    Flatten raw document text into one whitespace-normalised string.

    Stamps of the form "<number> DEC 2025" are removed first; afterwards every
    run of whitespace (newlines included) is collapsed to a single space, so
    no double space is left where a stamp used to be. Clause numbering such as
    "1." or "(i)" is not touched.

    Args:
        text: Raw text of the document.

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    # Remove page stamps such as "12 DEC 2025" (customise if needed).
    # \s* also spans line breaks, so this works on the un-flattened text.
    text = re.sub(r'\b\d+\s*DEC\s*2025\b', '', text)

    # Collapse all whitespace runs, newlines included, into single spaces.
    # Doing this last also cleans up the gap left by a removed stamp.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


def clause_aware_chunking(text: str, min_chunk_chars: int = 50) -> List[Dict]:
    """
    Split a document into clause-level chunks tagged with their regulation.

    The text is first cut at every regulation marker (a number followed by
    ". ", e.g. "1. Scope"), then each regulation is cut at every parenthesised
    word such as "(i)" or "(a)". Text before the first regulation marker is
    discarded, as are fragments shorter than ``min_chunk_chars``.

    Args:
        text: Document text, typically the output of ``clean_text``.
        min_chunk_chars: Fragments with fewer characters than this (after
            stripping) are ignored. Defaults to 50.

    Returns:
        A list of dicts with the keys ``"regulation"`` (the regulation number
        as a string), ``"clause"`` (the word inside the leading parentheses,
        or ``"main"`` when the fragment does not start with one) and
        ``"text"`` (the stripped fragment).
    """
    chunks = []

    # Split by main Regulation numbers (1. Scope, 2. Entitlement, etc.).
    # The lookbehind pins the split to the FIRST digit of the number: without
    # it the zero-width split also fires between the digits of "10. Fines",
    # and the regulation would be recorded as "0" instead of "10".
    regulation_pattern = r'(?<!\S)(?=\d+\.\s)'

    # Split clauses (i), (ii), (iii); any parenthesised word opens a clause.
    clause_pattern = r'(?=\(\w+\))'

    for reg in re.split(regulation_pattern, text):
        reg = reg.strip()

        # Skips empty pieces and any preamble before the first regulation.
        reg_match = re.match(r'(\d+)\.', reg)
        if not reg_match:
            continue

        reg_number = reg_match.group(1)

        for clause in re.split(clause_pattern, reg):
            clause = clause.strip()
            if len(clause) < min_chunk_chars:  # ignore tiny fragments
                continue

            clause_id_match = re.match(r'\((\w+)\)', clause)
            clause_id = clause_id_match.group(1) if clause_id_match else "main"

            chunks.append({
                "regulation": reg_number,
                "clause": clause_id,
                "text": clause
            })

    return chunks


# 2. EMBEDDING + VECTOR DATABASE (FAISS)

In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model once at module level. Both
# create_vector_index() and retrieve() read this global, so chunks and
# queries are embedded by the same model.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def create_vector_index(chunks):
    """
    Embed every chunk and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of dicts that each carry a ``"text"`` entry.
            Row ``i`` of the index corresponds to ``chunks[i]``.

    Returns:
        A tuple ``(index, embeddings)``: the populated ``faiss.IndexFlatL2``
        and the embedding matrix that was added to it.

    Raises:
        ValueError: If ``chunks`` is empty; there is nothing to embed and the
            vector dimension cannot be determined.
    """
    if not chunks:
        raise ValueError("create_vector_index() needs at least one chunk to index")

    texts = [chunk["text"] for chunk in chunks]

    embeddings = embedding_model.encode(texts, convert_to_numpy=True)

    # The index dimension is taken from the embedding matrix itself.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    return index, embeddings


# 3. RETRIEVAL FUNCTION

In [None]:
def retrieve(query, index, chunks, top_k=3):
    """
    Return the chunks whose embeddings are closest to the query.

    Args:
        query: Question text to embed and search for.
        index: Vector index exposing ``search(vectors, k)``; row ``i`` of the
            index is expected to correspond to ``chunks[i]``.
        chunks: The chunk list the index was built from.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks in the order the index reports them. The list
        is shorter when fewer chunks are available, and empty when ``chunks``
        is empty or ``top_k`` is not positive.
    """
    if not chunks or top_k <= 0:
        return []

    # Never ask for more neighbours than there are chunks.
    k = min(top_k, len(chunks))

    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, k)

    # FAISS fills missing neighbours with -1; used as a list index that would
    # silently return the LAST chunk, so out-of-range ids are dropped instead.
    return [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]


# 4. GENERATIVE MODEL (RAG GENERATION)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Identifier of the generator model; passed to both from_pretrained() calls below.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# Tokenizer and model are loaded once at module level; generate_answer()
# reads both as globals.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # request half-precision weights
    device_map="auto"  # device placement is left to the library
)

def generate_answer(query, retrieved_chunks):
    """
    Generate an answer to ``query`` grounded in the retrieved regulations.

    Args:
        query: The user's question.
        retrieved_chunks: Iterable of dicts with the keys ``"regulation"``,
            ``"clause"`` and ``"text"``; they are rendered into the prompt as
            the only context the model is told to use.

    Returns:
        The generated answer text only. The prompt is not echoed back: the
        tokens of the prompt are cut off before decoding.
    """
    context = "\n\n".join(
        f"Regulation {c['regulation']}({c['clause']}): {c['text']}"
        for c in retrieved_chunks
    )

    # NOTE(review): the prompt is sent as plain text rather than through the
    # tokenizer's chat template — confirm this is intended for an
    # instruct-tuned model.
    prompt = f"""
You are a regulatory assistant for the UTM Resource Centre.
Answer strictly using the provided regulations.
Cite the regulation number.

Regulations:
{context}

Question:
{query}

Answer:
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # generate() returns prompt tokens followed by the new tokens; remember
    # where the prompt ends so only the answer is decoded.
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        # temperature/top_p only take effect when sampling is switched on;
        # without do_sample=True decoding is greedy and both are ignored.
        do_sample=True,
        temperature=0.2,
        top_p=0.9
    )

    answer_ids = outputs[0][prompt_length:]
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()


# 5. FULL PIPELINE

In [None]:
# End-to-end run: read the regulations, chunk them, index the chunks, then
# answer one example question. Every name assigned here is a notebook global.

# Load your regulation text ("RCentre.txt" is resolved against the current
# working directory).
with open("RCentre.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Normalise the text, then split it into regulation/clause chunks.
cleaned_text = clean_text(raw_text)
chunks = clause_aware_chunking(cleaned_text)

# Embed the chunks and build the vector index used for retrieval.
index, embeddings = create_vector_index(chunks)

# Example query
query = "What happens if a member fails to return materials by the due date?"

# Fetch the 3 closest chunks and let the generator answer from them.
retrieved = retrieve(query, index, chunks, top_k=3)
answer = generate_answer(query, retrieved)

print(answer)
