# Legal Document Question Answering using RAG (DistilBERT + FAISS)

In [None]:
# Install required packages
!pip install transformers datasets sentence-transformers faiss-cpu PyMuPDF gradio -q

[31mERROR: Invalid requirement: '!pip': Expected package name at the start of dependency specifier
    !pip
    ^[0m[31m
[0m

In [4]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import fitz  # PyMuPDF
import numpy as np

ModuleNotFoundError: No module named 'faiss'

In [None]:
# Replace with your law PDF path
pdf_path = "google_terms_of_service_en.pdf"

doc = fitz.open(pdf_path)
full_text = ""
for page in doc:
    full_text += page.get_text("text")

# Try semantic chunking instead
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50,
    separators=["\n\n", "\n", ". ", " ", ""]
)
chunks = splitter.split_text(full_text)

print("\n\n\n\n\n\n".join(chunks[:5]))

NameError: name 'fitz' is not defined

In [6]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedder.encode(chunks, convert_to_tensor=False)

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

NameError: name 'chunks' is not defined

In [None]:
from transformers import pipeline

# Better generative model that can synthesize answers
qa_model = pipeline("text2text-generation", model="google/flan-t5-large")  # Larger model

def answer_question(question, num_chunks=16):
    # Find relevant chunks
    q_embedding = embedder.encode([question])
    distances, chunk_ids = index.search(np.array(q_embedding), num_chunks)

    # Just grab the top chunks - keep it simple
    context = ""
    for chunk_id in chunk_ids[0]:
        context += chunks[chunk_id] + " "

    # Create a clear prompt for the generative model
    prompt = f"""Based on the context below, provide a complete and helpful answer. If you need to synthesize information from multiple parts, do so. If the answer isn't clearly covered, say 'not mentioned'.

Context: {context}

Question: {question}

Provide a clear, comprehensive answer:"""

    # Generate answer
    result = qa_model(prompt, max_length=200, do_sample=False)
    return result[0]['generated_text']

# Test it
print("#######################")
print(answer_question("How much does Google cost?"))

In [None]:
import json
from datetime import datetime

# Define all test questions
test_questions = {
   "basic_factual": [
       "What is Google's address?",
       "When did these terms become effective?",
       "What company provides Google services?"
   ],
   "content_and_usage": [
       "What are the rules about using AI-generated content from Google's services?",
       "Can I create fake accounts on Google?",
       "What happens if I abuse Google's services?",
       "Am I allowed to reverse engineer Google's services?"
   ],
   "user_content": [
       "Who owns the content I upload to Google?",
       "What rights does Google have to my content?",
       "Can I share someone else's content on Google services?"
   ],
   "age_requirements": [
       "What are the age requirements for Google accounts?",
       "Do I need parental permission to use Google?"
   ],
   "tricky_questions": [
       "How much does Google cost?",
       "Can Google delete my account?",
       "What should I do if someone violates these terms?"
   ],
   "complex_synthesis": [
       "What are all the things I'm not allowed to do on Google?",
       "What happens in case of disagreements with Google?"
   ]
}

# Run all questions and collect answers
results = {
   "timestamp": datetime.now().isoformat(),
   "model_used": "google/flan-t5-base",
   "categories": {}
}

print("Running all test questions...")
print("=" * 50)

for category, questions in test_questions.items():
   print(f"\n{category.upper().replace('_', ' ')}:")
   results["categories"][category] = []

   for question in questions:
       print(f"\nQ: {question}")
       try:
           answer = answer_question(question)
           print(f"A: {answer}")

           results["categories"][category].append({
               "question": question,
               "answer": answer,
               "status": "success"
           })
       except Exception as e:
           error_msg = f"Error: {str(e)}"
           print(f"A: {error_msg}")

           results["categories"][category].append({
               "question": question,
               "answer": error_msg,
               "status": "error"
           })

       print("-" * 30)

# Save to JSON file
filename = "google_tos_qa_results.json"
with open(filename, 'w', encoding='utf-8') as f:
   json.dump(results, f, indent=2, ensure_ascii=False)

print(f"\n\nResults saved to {filename}")
print(f"Total questions asked: {sum(len(q) for q in test_questions.values())}")

# Quick summary
total_questions = 0
successful_answers = 0
for category in results["categories"].values():
   for qa in category:
       total_questions += 1
       if qa["status"] == "success":
           successful_answers += 1

print(f"Successful answers: {successful_answers}/{total_questions}")

In [None]:
import gradio as gr

def qa_interface(question):
    if question.strip():
        return answer_question(question)
    return "Please ask a question."

# Create Gradio interface
interface = gr.Interface(
    fn=qa_interface,
    inputs=gr.Textbox(label="Ask about Google's Terms of Service", placeholder="What are the age requirements?"),
    outputs=gr.Textbox(label="Answer"),
    title="Legal Document QA (RAG)",
    description="Ask questions about Google's Terms of Service using RAG (Retrieval Augmented Generation)"
)

# Launch with public link
interface.launch(share=True)