<a href="https://colab.research.google.com/github/ChonghaoSu/legal_opensource/blob/main/legal_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Environment Setup & Authentication
This step initializes the workspace. It performs three critical actions:
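1.  **Mounts Google Drive (optional):** Lets us save models and data permanently. The mount lines are commented out by default; uncomment them if you want the Drive backups in later cells to work.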
2.  **Installs Dependencies:** Downloads the necessary libraries for RAG (`langchain`, `chromadb`) and model handling (`transformers`, `bitsandbytes`).
3.  **Interactive Login:** Securely authenticates with Hugging Face to access gated models (like Llama 3 or Qwen) without exposing your API key in the code.

In [1]:
# 1. Mount Google Drive (To save your models/data permanently)
# NOTE: the mount is disabled by default; uncomment the two lines below to enable it.
#from google.colab import drive
#drive.mount('/content/drive')

# 2. Clear old/corrupt tokens (Fixes the 401 Unauthorized error)
# Removes the cached token file and any HF_TOKEN environment variable so the
# interactive login further down starts from a clean state.
!rm -rf /root/.cache/huggingface/token
import os
if 'HF_TOKEN' in os.environ:
    del os.environ['HF_TOKEN']

# 3. Install ALL Dependencies
# (We install huggingface_hub first to ensure login works)
# NOTE(review): `-U` pulls the newest huggingface_hub. The recorded output of this
# cell shows pip reporting a conflict with transformers' `huggingface-hub<1.0`
# requirement -- consider pinning the version; confirm against the current releases.
!pip install -qU huggingface_hub
!pip install -qU langchain langchain-community langchain-text-splitters chromadb pypdf sentence-transformers
!pip install -qU torch torchvision torchaudio transformers datasets peft bitsandbytes trl accelerate unstructured

# 4. Interactive Login (The Reliable Way)
# The token is typed into a widget at run time, so it never appears in the notebook source.
from huggingface_hub import notebook_login
print("\n👇 -----------------------------------------------------------------")
print("👉 ACTION REQUIRED: A login box will appear below.")
print("   1. Paste your token.")
print("   2. Click 'Login'.")
print("-----------------------------------------------------------------\n")
notebook_login()

# 5. Verify GPU Status
# Report whether CUDA is usable and, if so, which device and how much VRAM it has.
import torch

has_gpu = torch.cuda.is_available()
print("\n---------------------------------------")
print(f"GPU Available: {has_gpu}")
if not has_gpu:
    print("WARNING: No GPU detected. Go to Runtime > Change runtime type > T4 GPU")
else:
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    # total_memory is in bytes; convert to GiB for display.
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"VRAM: {vram_gb:.2f} GB")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/520.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m512.0/520.9 kB[0m [31m20.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m520.9/520.9 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.57.3 requires huggingface-hub<1.0,>=0.34.0, but you have huggingface-hub 1.2.1 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…


---------------------------------------
GPU Available: True
GPU Name: Tesla T4
VRAM: 14.74 GB


In [2]:
# 1. Clone your repository
# (This downloads the scripts needed for SFT and DPO training)
# Delete any previous checkout first so the clone starts from an empty path.
!rm -rf legal_opensource
!git clone https://github.com/ChonghaoSu/legal_opensource.git
# Make the repo the working directory; the following cells use paths relative to it
# (e.g. data/, configs/, train_finetune.py).
%cd legal_opensource

# Note: We removed the login code because Cell 1 already handled it!
print("✅ Repository cloned successfully.")

Cloning into 'legal_opensource'...
remote: Enumerating objects: 66, done.[K
remote: Counting objects: 100% (66/66), done.[K
remote: Compressing objects: 100% (60/60), done.[K
Receiving objects: 100% (66/66), 40.86 KiB | 973.00 KiB/s, done.
remote: Total 66 (delta 21), reused 27 (delta 2), pack-reused 0 (from 0)[K
Resolving deltas: 100% (21/21), done.
/content/legal_opensource
✅ Repository cloned successfully.


In [3]:
import yaml
import json
import os

# --- A. Create Dummy Data (So you can test the pipeline NOW) ---
os.makedirs("data", exist_ok=True)

# Sample SFT Data (Instruction Tuning): three instruction/response pairs,
# repeated ten times to simulate a larger dataset.
_sft_pairs = [
    (
        "Draft a confidentiality clause.",
        "The Receiving Party shall hold the Confidential Information in strict confidence and shall not disclose it to any third party.",
    ),
    (
        "What is a tort?",
        "A tort is a civil wrong that causes a claimant to suffer loss or harm, resulting in legal liability for the person who commits the tortious act.",
    ),
    (
        "Summarize the concept of Force Majeure.",
        "Force Majeure allows a party to suspend or terminate the performance of its obligations when certain circumstances beyond their control arise, making performance inadvisable, commercially impracticable, illegal, or impossible.",
    ),
]
sft_data = [
    {"instruction": question, "response": answer} for question, answer in _sft_pairs
] * 10

# Sample DPO Data (Preference Optimization): one prompt/chosen/rejected triple, repeated.
_dpo_triple = {
    "prompt": "Explain 'Breach of Contract'",
    "chosen": "A breach of contract occurs when one party fails to fulfill their obligations under the terms of a binding agreement.",
    "rejected": "It's when someone messes up the deal and doesn't do what they said.",
}
dpo_data = [_dpo_triple] * 10

# Write to JSONL: one JSON object per line.
for jsonl_path, rows in (
    ("data/train.jsonl", sft_data),
    ("data/val.jsonl", sft_data[:5]),  # Small val set
    ("data/dpo_train.jsonl", dpo_data),
):
    with open(jsonl_path, "w") as out:
        out.writelines(json.dumps(row) + "\n" for row in rows)

print("✅ Dummy data created in /data folder.")

# --- B. Create T4-Optimized Config (Prevents Out of Memory) ---
# Flat key/value settings, written out as YAML for the fine-tuning script.
t4_config = dict(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",  # Or "mistralai/Mistral-7B-v0.3"
    data_path="data/train.jsonl",
    output_dir="checkpoints/sft",
    max_seq_length=512,             # Reduced from 2048 for T4
    load_in_4bit=True,              # CRITICAL for T4
    use_peft=True,
    lora_r=8,                       # Low Rank
    lora_alpha=16,
    lora_dropout=0.05,
    per_device_train_batch_size=1,  # Smallest batch size
    gradient_accumulation_steps=4,  # Compensate for small batch
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_steps=1,
    save_steps=10,
)

sft_config_path = "configs/colab_t4_finetune.yaml"
os.makedirs("configs", exist_ok=True)
with open(sft_config_path, "w") as config_file:
    yaml.dump(t4_config, stream=config_file)

print(f"✅ T4-Optimized config created at {sft_config_path}")

✅ Dummy data created in /data folder.
✅ T4-Optimized config created at configs/colab_t4_finetune.yaml


In [4]:
# 1. Run Supervised Fine-Tuning (SFT)
# We use the T4 config we just created
!python train_finetune.py --config configs/colab_t4_finetune.yaml

# 2. (Optional) Backup to Drive immediately
!cp -r checkpoints/sft /content/drive/MyDrive/legal_project_backups/
print("Backup complete.")

2025-12-10 08:37:05.264260: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765355825.603022    2506 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765355825.696072    2506 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765355826.452999    2506 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765355826.453045    2506 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765355826.453053    2506 computation_placer.cc:177] computation placer alr

DPO TRAINING

In [5]:
import yaml
import os

# Create the DPO-specific configuration
# NOTE: We use the base model again here to save memory.
# In a full production run, you would merge your SFT adapter into the base model first,
# but on a T4, merging often crashes RAM. This "Hello World" proves the DPO pipeline works.

# The settings are declared per concern and then merged into one flat mapping.
_dpo_model_and_data = {
    "model_name": "meta-llama/Meta-Llama-3-8B-Instruct",  # Base model
    "data_path": "data/dpo_train.jsonl",  # Path to the triples we created earlier
    "output_dir": "checkpoints/dpo",
}

# Memory Optimizations
_dpo_memory = {
    "max_seq_length": 512,     # Keep short for T4 (Legal docs usually need 2048+)
    "max_prompt_length": 256,  # Length of the input prompt
    "load_in_4bit": True,      # Mandatory for T4
}

# DPO Specifics
_dpo_loss = {
    "beta": 0.1,  # The temperature of the DPO loss (0.1 is standard)
}

# LoRA (Adapter) Config
_dpo_lora = {
    "use_peft": True,
    "lora_r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "target_modules": ["q_proj", "v_proj"],  # Target specific attention layers
}

# Training Loop
_dpo_training = {
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,  # High accumulation to simulate larger batches
    "learning_rate": 5e-5,             # DPO usually needs lower LR than SFT
    "num_train_epochs": 1,
    "logging_steps": 1,
    "save_steps": 10,
    "optim": "paged_adamw_32bit",      # Memory efficient optimizer
}

dpo_t4_config = {
    **_dpo_model_and_data,
    **_dpo_memory,
    **_dpo_loss,
    **_dpo_lora,
    **_dpo_training,
}

dpo_config_path = "configs/colab_t4_dpo.yaml"
os.makedirs("configs", exist_ok=True)
with open(dpo_config_path, "w") as config_file:
    yaml.dump(dpo_t4_config, stream=config_file)

print(f"✅ T4-Optimized DPO config created at {dpo_config_path}")

✅ T4-Optimized DPO config created at configs/colab_t4_dpo.yaml


In [6]:
# 1. Run Direct Preference Optimization (DPO)
!python train_dpo.py --config configs/colab_t4_dpo.yaml

# 2. (Optional) Backup DPO adapter to Drive
!cp -r checkpoints/dpo /content/drive/MyDrive/legal_project_backups/
print("✅ DPO Training complete and backed up.")

2025-12-10 08:37:35.508980: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765355855.529639    2732 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765355855.536895    2732 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765355855.554384    2732 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765355855.554407    2732 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765355855.554411    2732 computation_placer.cc:177] computation placer alr

# **RAG Step**

# 2. Data Ingestion (The "Fail-Safe" Engine)
This cell builds the legal knowledge base. It uses a robust **fail-safe logic**:
* **Real Data:** It first checks the `rag_documents/` folder. If you have uploaded PDFs, it ingests them.
* **Synthetic Data:** If the folder is empty, it automatically generates synthetic contracts (NDA, MSA) so the pipeline works immediately for testing.

It then splits the text into overlapping chunks (up to 1,000 characters each, with a 200-character overlap) and indexes them into a local **ChromaDB** vector database.

In [7]:
# 1. Install RAG & Vector DB libraries
# NOTE(review): the setup cell at the top of the notebook already installs these
# packages; presumably repeated so the RAG section can be run on its own -- confirm.
!pip install -q langchain langchain-community chromadb pypdf sentence-transformers unstructured

# 2. Install LLM libraries (if not already present from previous steps)
# `--upgrade` moves these packages to their latest release even when already installed.
!pip install -q --upgrade torch transformers bitsandbytes accelerate

import os

# 3. Create the document folder
# exist_ok=True makes this a no-op when the folder is already there.
DOC_DIR = "rag_documents"
os.makedirs(DOC_DIR, exist_ok=True)

print("✅ Environment Ready.")
print(f"👉 ACTION: Drag your PDFs into the '{DOC_DIR}' folder on the left sidebar.")
print("   (If you don't, the system will auto-generate fake contracts in the next step.)")

✅ Environment Ready.
👉 ACTION: Drag your PDFs into the 'rag_documents' folder on the left sidebar.
   (If you don't, the system will auto-generate fake contracts in the next step.)


In [8]:
import os
import shutil
import time
import warnings

# Suppress warnings
# Only the SyntaxWarning category is silenced; every other warning still surfaces.
# NOTE(review): presumably aimed at noise raised while importing third-party packages -- confirm.
warnings.filterwarnings("ignore", category=SyntaxWarning)

# 1. Imports
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# --- CONFIGURATION ---
# Source-document folder and on-disk location of the Chroma database.
DOC_DIR, DB_DIR = "rag_documents", "chroma_db"

# --- 2. CLEANUP (Crucial Fix for InternalError) ---
# Runs before anything else so a stale database directory is gone before the rebuild.
stale_db_present = os.path.exists(DB_DIR)
if stale_db_present:
    print("🧹 Cleaning up old database to prevent locks...")
    try:
        shutil.rmtree(DB_DIR)
        # Give Colab one second to release the file lock.
        time.sleep(1)
        print("   Cleanup complete.")
    except Exception as cleanup_error:
        # Best effort: report the failure and carry on.
        print(f"⚠️ Warning: Could not delete old DB (might be in use): {cleanup_error}")

# --- 3. FAIL-SAFE GENERATOR ---
def generate_synthetic_legal_data(directory):
    """Write two small placeholder contracts (an NDA and an MSA) into *directory*.

    The files are named ``synthetic_nda.txt`` and ``synthetic_msa.txt``.
    *directory* must already exist; it is not created here.
    """
    print("⚠️ No user files found. Generating synthetic legal data for testing...")

    synthetic_contracts = {
        # Fake Contract 1
        "synthetic_nda.txt": """NON-DISCLOSURE AGREEMENT (NDA)
    1. Confidentiality. The Receiving Party agrees to not disclose any Confidential Information to third parties for a period of 5 years.
    2. Exclusions. Confidential Information does not include information that is public knowledge or independently developed.
    3. Jurisdiction. This agreement shall be governed by the laws of the State of Texas.""",
        # Fake Contract 2
        "synthetic_msa.txt": """MASTER SERVICES AGREEMENT
    1. Scope of Work. Provider shall deliver the software deliverables as outlined in Exhibit A.
    2. Payment Terms. Client shall pay invoices within 30 days of receipt (Net 30).
    3. Termination. Either party may terminate this agreement with 14 days written notice for material breach.""",
    }

    for filename, contract_text in synthetic_contracts.items():
        with open(f"{directory}/{filename}", "w") as out:
            out.write(contract_text)

    print(f"✅ Created 2 synthetic contracts in {directory}/")

# --- 4. MAIN LOGIC ---
# Seed the folder with synthetic contracts when the user has not supplied any files.
if not os.path.exists(DOC_DIR) or not os.listdir(DOC_DIR):
    os.makedirs(DOC_DIR, exist_ok=True)
    generate_synthetic_legal_data(DOC_DIR)
else:
    print(f"✅ Found {len(os.listdir(DOC_DIR))} file(s) in {DOC_DIR}.")

# Load Documents
# Each loader is best-effort: a failure is reported (not silently swallowed) and
# treated as "no documents of that type". `except Exception` also lets
# KeyboardInterrupt through, which a bare `except:` would not.
print("Loading documents...")
try:
    pdf_loader = DirectoryLoader(DOC_DIR, glob="./*.pdf", loader_cls=PyPDFLoader)
    pdf_docs = pdf_loader.load()
except Exception as e:
    print(f"⚠️ Could not load PDF files: {e}")
    pdf_docs = []

try:
    txt_loader = DirectoryLoader(DOC_DIR, glob="./*.txt", loader_cls=TextLoader)
    txt_docs = txt_loader.load()
except Exception as e:
    print(f"⚠️ Could not load text files: {e}")
    txt_docs = []

documents = pdf_docs + txt_docs

if not documents:
    # If both loaders fail, force synthetic data and try again.
    # An error here is deliberately not caught: without documents there is nothing to index.
    print("⚠️ Load failed. Forcing synthetic data...")
    generate_synthetic_legal_data(DOC_DIR)
    txt_loader = DirectoryLoader(DOC_DIR, glob="./*.txt", loader_cls=TextLoader)
    documents = txt_loader.load()

# Split Text
# `is_separator_regex=True` is required for the sentence-boundary lookbehind below:
# by default the splitter escapes every separator and matches it literally, so the
# pattern would never fire. The other separators contain no regex metacharacters
# and behave the same either way.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    is_separator_regex=True,
)
texts = text_splitter.split_documents(documents)

# Create Database
# Embeds every chunk with the MiniLM sentence-transformer and stores the vectors
# in a Chroma collection under DB_DIR.
print("Embedding and indexing...")
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vector_db = Chroma.from_documents(
    documents=texts,
    embedding=embedding_model,
    persist_directory=DB_DIR
)

print(f"✅ Database successfully rebuilt with {len(texts)} chunks!")

⚠️ No user files found. Generating synthetic legal data for testing...
✅ Created 2 synthetic contracts in rag_documents/
Loading documents...
Embedding and indexing...


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Database successfully rebuilt with 2 chunks!


# 3. Define Retrieval Logic
Here we define the search mechanism. The `legal_search` function performs a **semantic similarity search** against our ChromaDB database.

* It retrieves the top `k` most relevant chunks based on the user's query.
* It prints each hit's `source` metadata (the file path) and a 200-character preview, so every answer can be traced back to a document.

In [9]:
def legal_search(query, k=3):
    """
    Run a similarity search on the module-level `vector_db` and echo each hit.

    For every result, the `source` metadata (or 'Unknown' when absent) and the
    first 200 characters of the chunk are printed. Returns the list of results.
    """
    print(f"\n🔎 Searching for: '{query}'...")
    hits = vector_db.similarity_search(query, k=k)

    # Print source citations for verification
    divider = "-" * 50
    for rank, hit in enumerate(hits, start=1):
        origin = hit.metadata.get('source', 'Unknown')
        preview = hit.page_content[:200]  # Preview first 200 chars
        print(f"\n[CITATION {rank}] Source: {origin}")
        print(f"Content: {preview}...")
        print(divider)

    return hits

# Quick Test
_ = legal_search("What are the payment terms?")


🔎 Searching for: 'What are the payment terms?'...

[CITATION 1] Source: rag_documents/synthetic_msa.txt
Content: MASTER SERVICES AGREEMENT
    1. Scope of Work. Provider shall deliver the software deliverables as outlined in Exhibit A.
    2. Payment Terms. Client shall pay invoices within 30 days of receipt (Ne...
--------------------------------------------------

[CITATION 2] Source: rag_documents/synthetic_nda.txt
Content: NON-DISCLOSURE AGREEMENT (NDA)
    1. Confidentiality. The Receiving Party agrees to not disclose any Confidential Information to third parties for a period of 5 years.
    2. Exclusions. Confidential...
--------------------------------------------------


# 4. The "Transient" AI Lawyer (Memory Optimized)
This is the core RAG application. To prevent **CUDA Out of Memory** errors on the free T4 GPU, we use a **"Load-and-Release"** architecture:

1.  **Load:** The model (Qwen 2.5) is loaded into VRAM only when needed.
2.  **Generate:** It reads the retrieved legal context and drafts an answer.
3.  **Flush:** The model is immediately deleted from memory, and the GPU cache is cleared.

This allows you to run heavy queries repeatedly without crashing the runtime.

In [20]:
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

def ask_lawyer_bot_transient(user_query):
    """
    Answer a legal question with a model that stays in VRAM only for this call.

    Loads the 4-bit quantized Qwen2.5-7B-Instruct model, retrieves context with
    `legal_search`, generates an answer, then deletes the model and empties the
    CUDA cache.

    Args:
        user_query: The question to answer.

    Returns:
        The decoded model response, without the prompt tokens.

    Raises:
        Any exception raised while loading the model or generating. Cleanup runs
        in a `finally` block, so the model is released before the error propagates.
        A retrieval failure is not fatal: it is reported and the model answers
        with "No context available." as its context.
    """
    print("🔄 1. Loading model into memory... (This takes ~60 seconds)")

    # A. Load Model
    model_id = "Qwen/Qwen2.5-7B-Instruct"
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    # Bind everything the cleanup step deletes up front, so `finally` can run
    # safely no matter where a failure happens.
    model = tokenizer = inputs = outputs = None
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
            low_cpu_mem_usage=True
        )

        # B. Retrieve & Prompt
        try:
            context_docs = legal_search(user_query)
            context_text = "\n\n".join(d.page_content for d in context_docs)
        except Exception as e:
            # Best-effort retrieval: report the problem instead of hiding it.
            print(f"⚠️ Retrieval failed, answering without context: {e}")
            context_text = "No context available."

        prompt = f"""<|im_start|>system
You are a legal assistant. Answer strictly based on the context provided.
<|im_end|>
<|im_start|>user
Context: {context_text}

Question: {user_query}
<|im_end|>
<|im_start|>assistant
"""

        # C. Generate
        print("🔄 2. Generating answer...")
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.1)
        # Slice off the prompt tokens so only the newly generated text is decoded.
        response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    finally:
        # D. THE CLEANUP (The most important part)
        print("🔄 3. Cleaning up memory...")
        del model, tokenizer, inputs, outputs

        # Force Python to release memory immediately
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

            # Verify memory is cleared
            free_mem = torch.cuda.mem_get_info()[0] / 1024**3
            print(f"✅ Memory cleared! Free VRAM: {free_mem:.2f} GB")

    return response

# --- Test it ---
# This will load the model, answer, and then delete the model automatically.
answer = ask_lawyer_bot_transient("What is the penalty for late payments?")
print(f"\n📝 Final Answer:\n{answer}")

🔄 1. Loading model into memory... (This takes ~60 seconds)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


🔎 Searching for: 'What is the penalty for late payments?'...
🔄 2. Generating answer...
🔄 3. Cleaning up memory...
✅ Memory cleared! Free VRAM: 4.23 GB

📝 Final Answer:
The context provided does not specify a penalty for late payments. According to the Payment Terms in the MASTER SERVICES AGREEMENT, the Client is required to pay invoices within 30 days of receipt (Net 30). There is no mention of penalties for late payments in the given context.
