In [None]:
# Absolute Windows path to the CUAD v1 JSON file; the next cell opens it via `filepath`.
filepath = r'D:\Legal Clause Extraction\cuad\data\CUADv1.json'

In [None]:
import json

# Parse the source JSON file referenced by `filepath`.
with open(filepath, "r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Write it back out with 2-space indentation so it is easy to inspect by eye.
with open("cuad_data_pretty.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data, indent=2))



In [None]:
import json

# Parse the test-split JSON file.
with open(r'D:\Legal Clause Extraction\cuad\data\test.json', "r", encoding="utf-8") as f:
    data = json.loads(f.read())

# Write it back out with 2-space indentation so it is easy to inspect by eye.
with open("cuad_test_pretty.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data, indent=2))


In [None]:
# Clause category names of interest.
# NOTE(review): a later cell rebinds TARGET_CATEGORIES to a set holding the same
# seven names, so this list is superseded once that cell runs.
TARGET_CATEGORIES = [
    "Agreement Date",
    "Effective Date",
    "Expiration Date",
    "Renewal Term",
    "Notice Period to Terminate Renewal",
    "Parties",
    "Governing Law"
]


In [None]:
import json

# Categories to keep. filter_cuad_file() retains a QA entry when its id ends
# with "__<category>" for any name in this set.
TARGET_CATEGORIES = {
    "Agreement Date",
    "Effective Date",
    "Expiration Date",
    "Renewal Term",
    "Notice Period to Terminate Renewal",
    "Parties",
    "Governing Law"
}

def filter_cuad_file(input_path, output_path, categories_to_keep):
    """Write a category-filtered copy of a SQuAD-style CUAD JSON file.

    Args:
        input_path: JSON file shaped as
            {"data": [{"title", "paragraphs": [{"context", "qas": [...]}]}]}.
        output_path: Destination file; written with 2-space indentation.
        categories_to_keep: Category names. A QA entry is kept when its "id"
            ends with "__" followed by one of these names.

    Paragraphs left without any QA entry are dropped, and so are documents
    left without any paragraph. Only "title", "context" and "qas" are copied.
    """
    with open(input_path, "r", encoding="utf-8") as src:
        source = json.load(src)

    # str.endswith accepts a tuple, so every category is tested in one call.
    wanted_suffixes = tuple("__" + cat for cat in categories_to_keep)

    kept_docs = []
    for doc in source["data"]:
        doc_title = doc["title"]
        kept_paragraphs = []
        for para in doc["paragraphs"]:
            matching_qas = [qa for qa in para["qas"] if qa["id"].endswith(wanted_suffixes)]
            if not matching_qas:
                continue
            kept_paragraphs.append({
                "context": para.get("context", ""),
                "qas": matching_qas,
            })
        if kept_paragraphs:
            kept_docs.append({"title": doc_title, "paragraphs": kept_paragraphs})

    with open(output_path, "w", encoding="utf-8") as dst:
        json.dump({"data": kept_docs}, dst, indent=2)

# Example usage.
# Raw strings keep the Windows backslashes literal: "\L" and "\c" are not valid
# escape sequences, and newer Python versions warn about them in plain literals.
# NOTE(review): the earlier cells write the *_pretty.json files with relative
# paths; these absolute paths only refer to the same files when the notebook's
# working directory is D:\Legal Clause Extraction - confirm.
filter_cuad_file(r'D:\Legal Clause Extraction\cuad_data_pretty.json', "train_filtered.json", TARGET_CATEGORIES)
filter_cuad_file(r'D:\Legal Clause Extraction\cuad_test_pretty.json', "test_filtered.json", TARGET_CATEGORIES)


In [None]:
import json
from pathlib import Path

import re

def clean_context(text):
    """Normalise line endings and whitespace in a block of text.

    Steps, applied in this order:
      1. CRLF and lone CR become "\n".
      2. Unicode line/paragraph separators (U+2028, U+2029) become "\n".
      3. NUL characters are removed.
      4. Runs of two or more spaces/tabs collapse to a single space.
      5. Runs of three or more newlines collapse to exactly two.
      6. Leading and trailing whitespace is stripped.

    Returns the cleaned string.
    """
    cleaned = text

    # Literal replacements; CRLF must go first so it yields one newline, not two.
    for old, new in (('\r\n', '\n'), ('\r', '\n')):
        cleaned = cleaned.replace(old, new)

    # Regex passes; NULs are removed before whitespace is collapsed so that
    # "a \x00 b" ends up with a single space.
    regex_steps = (
        (r'\u2028|\u2029', '\n'),
        (r'\x00', ''),
        (r'[ \t]{2,}', ' '),
        (r'\n{3,}', '\n\n'),
    )
    for pattern, replacement in regex_steps:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()


# CONFIG: input/output file paths for the flattening step.
# The *_filtered.json inputs are the files written by the filter_cuad_file()
# calls; the *_flat.json outputs are what flatten_cuad() writes below.
train_input_path = "train_filtered.json"
test_input_path = "test_filtered.json"
train_output_path = "train_flat.json"
test_output_path = "test_flat.json"

def _realign_answers(raw_context, cleaned_context, answers):
    """Re-anchor answer spans so they index into the cleaned context.

    clean_context() removes and collapses characters, so an "answer_start"
    measured on the raw context no longer points at the answer afterwards.

    Args:
        raw_context: The context string the offsets were measured on.
        cleaned_context: clean_context(raw_context).
        answers: List of {"text", "answer_start", ...} dicts.

    Returns:
        (realigned, unresolved): shallow copies of the answers with "text"
        cleaned and "answer_start" remapped, plus the number of answers that
        could not be located in the cleaned context (those are left unchanged).
    """
    realigned = []
    unresolved = 0
    for ans in answers:
        fixed = dict(ans)
        raw_start = ans.get("answer_start")
        cleaned_text = clean_context(ans.get("text") or "")
        if raw_start is None or not cleaned_text:
            realigned.append(fixed)
            continue

        # Length of the cleaned prefix = expected new offset. The "|" sentinel
        # stops clean_context's final strip() from eating whitespace that sits
        # directly in front of the answer.
        expected = len(clean_context(raw_context[:raw_start] + "|")) - 1

        if cleaned_context.startswith(cleaned_text, expected):
            new_start = expected
        else:
            # Fall back to the occurrence closest to where we expected it.
            hits = [m.start() for m in re.finditer(re.escape(cleaned_text), cleaned_context)]
            new_start = min(hits, key=lambda pos: abs(pos - expected)) if hits else None

        if new_start is None:
            unresolved += 1
        else:
            fixed["text"] = cleaned_text
            fixed["answer_start"] = new_start
        realigned.append(fixed)
    return realigned, unresolved


def flatten_cuad(input_path, output_path, drop_impossible=False):
    """Flatten a SQuAD-style CUAD file into one record per question.

    Args:
        input_path: JSON file shaped as
            {"data": [{"paragraphs": [{"context", "qas": [...]}]}]}.
        output_path: Destination for the flat JSON list (2-space indent,
            non-ASCII characters written as-is).
        drop_impossible: When True, skip QA entries whose "is_impossible"
            flag is truthy.

    Each output record has "id", "context" (cleaned with clean_context),
    "question", "answers" and "is_impossible". Answer offsets and texts are
    remapped so they stay valid against the cleaned context.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    flat = []
    unresolved_total = 0

    for doc in data["data"]:
        for para in doc.get("paragraphs", []):
            # A missing context is treated as an empty string.
            raw_context = para.get("context") or ""
            # Clean once per paragraph instead of once per question.
            cleaned_context = clean_context(raw_context)
            for qa in para.get("qas", []):
                if drop_impossible and qa.get("is_impossible", False):
                    continue
                answers, unresolved = _realign_answers(raw_context, cleaned_context, qa["answers"])
                unresolved_total += unresolved
                flat.append({
                    "id": qa["id"],
                    "context": cleaned_context,
                    "question": qa["question"],
                    "answers": answers,
                    "is_impossible": qa.get("is_impossible", False)
                })

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(flat, f, indent=2,ensure_ascii=False)

    print(f" Flattened {len(flat)} QA pairs -> {output_path}")
    if unresolved_total:
        print(f" Warning: {unresolved_total} answer span(s) could not be re-aligned after cleaning")

# Flatten both splits; unanswerable questions are kept in each.
for _src_path, _dst_path in (
    (train_input_path, train_output_path),
    (test_input_path, test_output_path),
):
    flatten_cuad(_src_path, _dst_path, drop_impossible=False)


In [None]:
# Sanity check: reload the flattened training file and show the first context.
# Relies on `json` having been imported by an earlier cell.
with open("train_flat.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(data[0]["context"])  # Line breaks should render as real newlines if the file was written correctly


In [None]:
import json
import tiktoken
from openai import OpenAI

# 1. Load API key from a local text file (surrounding whitespace is stripped)
with open("api.txt", "r") as f:
    api_key = f.read().strip()
client = OpenAI(api_key=api_key)

# 2. Load flattened QA data (lists of records written by flatten_cuad)
with open("train_flat.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open("test_flat.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# 3. Configuration
# NUM_FEW_SHOT: how many leading train records become few-shot examples.
# CONTRACT_ID: id prefix used to select the test questions to run.
# CHUNK_TOKENS: chunk size used when a contract does not fit in one prompt.
# MAX_TOTAL_TOKENS: token budget each request is checked against.
# MAX_OUTPUT_TOKENS / TEMPERATURE: passed to the chat completion call.
NUM_FEW_SHOT     = 4
CONTRACT_ID      = "CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEB SITE HOSTING AGREEMENT"
CHUNK_TOKENS     = 1500
MAX_TOTAL_TOKENS = 4000
MAX_OUTPUT_TOKENS= 100
TEMPERATURE      = 0.2

# 4. Tokenizer for GPT-3.5-turbo
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")


def num_tokens(text: str) -> int:
    """Return the number of tokens `enc` produces for `text`."""
    return len(enc.encode(text))


def chunk_text_by_tokens(text: str, max_tokens: int, overlap: int = 0):
    """Split `text` into pieces of at most `max_tokens` tokens each.

    Args:
        text: Text to split.
        max_tokens: Maximum tokens per chunk; must be positive.
        overlap: Tokens shared between consecutive chunks. The default 0
            gives back-to-back chunks; a positive value lets a span that
            straddles a chunk boundary appear whole in one of the chunks.

    Returns:
        List of decoded chunk strings (empty list for empty text).

    Raises:
        ValueError: If `max_tokens` <= 0 or `overlap` is not in [0, max_tokens).
    """
    if max_tokens <= 0:
        # A negative step would otherwise yield an empty list silently.
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(f"overlap must satisfy 0 <= overlap < max_tokens, got {overlap}")

    toks = enc.encode(text)
    step = max_tokens - overlap
    chunks = []
    for start in range(0, len(toks), step):
        chunks.append(enc.decode(toks[start : start + max_tokens]))
        # Once a chunk reaches the end, later windows would only repeat its tail.
        if start + max_tokens >= len(toks):
            break
    return chunks

# 5. System prompt to enforce span-only responses.
# Sent as the "system" message of every chat completion request below.
SYSTEM = (
    "You are a legal assistant. When I give you a contract and question, "
    "extract and return only the exact text span from the contract that answers the question. "
    "Do not explain, rephrase, or add any extra text."
)

# 6. Prepare few-shot examples **with** answers
def format_fs_example(e):
    """Render one QA record as a Contract / Question / Answer prompt block.

    Args:
        e: Record with "context", "question" and "answers" (a list of
            {"text": ...} dicts, possibly empty).

    Returns:
        A string with three headed sections separated by blank lines. The
        answer is the stripped text of the first answer, or "[No answer]"
        when the record has none.
    """
    answers = e['answers']
    if answers:
        ans = answers[0]['text'].strip()
    else:
        ans = '[No answer]'

    sections = (
        ("Contract:", e['context'].strip()),
        ("Question:", e['question'].strip()),
        ("Answer:", ans),
    )
    return "\n\n".join(f"{header}\n{body}" for header, body in sections)

few_shot = "\n\n---\n\n".join(format_fs_example(e) for e in train_data[:NUM_FEW_SHOT])
fs_tokens = num_tokens(few_shot)
print(f"🔍 Few-shot uses {fs_tokens} tokens")
# The loop below skips any chunk that does not fit next to the few-shot block,
# so say so up front instead of silently producing empty predictions.
if fs_tokens + MAX_OUTPUT_TOKENS + 50 >= MAX_TOTAL_TOKENS:
    print(
        f"⚠️ Few-shot block alone ({fs_tokens} tokens) leaves no room for contract text "
        f"within MAX_TOTAL_TOKENS={MAX_TOTAL_TOKENS}"
    )

# 7. Verify available contract prefixes (the part of each id before the final "__")
all_prefixes = sorted({e["id"].rsplit("__",1)[0] for e in test_data})
print("🗂 Available prefixes:")
for p in all_prefixes:
    print(" ", p)

# 8. Filter test entries for our CONTRACT_ID.
# Compare the whole prefix rather than using startswith(), so a contract whose
# name merely begins with CONTRACT_ID is not pulled in as well.
target = [e for e in test_data if e["id"].rsplit("__", 1)[0] == CONTRACT_ID]
if not target:
    raise ValueError(f"No entries found for CONTRACT_ID={CONTRACT_ID!r}")
print(f"\n🧪 Found {len(target)} questions for:\n  {CONTRACT_ID}\n")

# 9. Loop through each clause question
# Replies that mean "nothing found in this chunk". "[no answer]" is included
# because the few-shot examples use "[No answer]" for records without an answer.
NO_ANSWER_REPLIES = {"no", "none", "n/a", "[no answer]"}
system_tokens = num_tokens(SYSTEM)

for idx, e in enumerate(target, 1):
    question = e["question"].strip()
    true_ans = e["answers"][0]["text"].strip() if e["answers"] else "[No answer]"
    context = e["context"]

    # Everything in the request except the contract text: few-shot block,
    # system prompt, question, reserved output, plus a margin of 50 tokens for
    # the prompt scaffolding and chat message framing.
    fixed_cost = fs_tokens + system_tokens + num_tokens(question) + MAX_OUTPUT_TOKENS + 50
    room = MAX_TOTAL_TOKENS - fixed_cost

    # If whole context fits, use it; otherwise chunk to whatever still fits
    if room <= 0:
        chunks = []
    elif num_tokens(context) <= room:
        chunks = [context]
    else:
        chunks = chunk_text_by_tokens(context, min(CHUNK_TOKENS, room))

    print(f"---\nClause {idx}/{len(target)} → {e['id'].split('__')[-1]}")
    print(f"Q: {question}")
    if not chunks:
        print(
            f"  Skipped: prompt overhead ({fixed_cost} tokens) leaves no room for "
            f"contract text within {MAX_TOTAL_TOKENS} tokens"
        )

    prediction = ""
    for c_i, chunk in enumerate(chunks, 1):
        prompt = f"{few_shot}\n\n---\n\nContract Chunk:\n{chunk.strip()}\n\nQuestion:\n{question}\n\nAnswer:"
        resp = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role":"system","content":SYSTEM},
                {"role":"user","content":prompt}
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_OUTPUT_TOKENS
        )
        # The message content can be None; treat that as an empty reply.
        ans = (resp.choices[0].message.content or "").strip()
        print(f"  Chunk {c_i}: {ans}")

        # Stop at the first chunk that yields a real span.
        if ans and ans.lower() not in NO_ANSWER_REPLIES:
            prediction = ans
            break

    print(f"\n GPT:  {prediction}")
    print(f" True: {true_ans}\n")


In [None]:
import json
import tiktoken
from openai import OpenAI

# === Load API key from a local text file (surrounding whitespace is stripped) ===
with open("api.txt", "r") as f:
    api_key = f.read().strip()
client = OpenAI(api_key=api_key)

# === Load train and test (flat lists of QA records) ===
with open("train_flat.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open("test_flat.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)


# MODEL: chat model name, also used to pick the tiktoken encoding.
# MAX_PROMPT_TOKENS: budget each request is checked against in the loop below.
# MAX_OUTPUT_TOKENS / TEMPERATURE: passed to the chat completion call.
MODEL             = "gpt-3.5-turbo"
MAX_PROMPT_TOKENS = 4000
MAX_OUTPUT_TOKENS = 100
TEMPERATURE       = 0.2

# ===  Token ===
enc = tiktoken.encoding_for_model(MODEL)
def num_tokens(text: str) -> int:
    """Return the number of tokens `enc` produces for `text`."""
    return len(enc.encode(text))


def chunk_text_by_tokens(text: str, max_toks: int, overlap: int = 0):
    """Split `text` into pieces of at most `max_toks` tokens each.

    Args:
        text: Text to split.
        max_toks: Maximum tokens per chunk; must be positive.
        overlap: Tokens shared between consecutive chunks. The default 0
            gives back-to-back chunks; a positive value lets a span that
            straddles a chunk boundary appear whole in one of the chunks.

    Returns:
        List of decoded chunk strings (empty list for empty text).

    Raises:
        ValueError: If `max_toks` <= 0 or `overlap` is not in [0, max_toks).
    """
    if max_toks <= 0:
        # A negative step would otherwise yield an empty list silently.
        raise ValueError(f"max_toks must be positive, got {max_toks}")
    if not 0 <= overlap < max_toks:
        raise ValueError(f"overlap must satisfy 0 <= overlap < max_toks, got {overlap}")

    toks = enc.encode(text)
    step = max_toks - overlap
    chunks = []
    for start in range(0, len(toks), step):
        chunks.append(enc.decode(toks[start:start + max_toks]))
        # Once a chunk reaches the end, later windows would only repeat its tail.
        if start + max_toks >= len(toks):
            break
    return chunks

# === System prompt ===
# Sent as the "system" message of every chat completion request below.
SYSTEM = (
    "You are a legal assistant. Extract and return only the exact text span "
    "from the contract that answers the question. Do not explain, rephrase, or add extra text."
)


def build_fewshot(train_data, clause_types, max_tokens=1000, window=300):
    """Assemble a token-bounded few-shot block from answered training records.

    Args:
        train_data: Flat QA records with "id", "context", "question" and
            "answers" ([{"text", "answer_start"}, ...]).
        clause_types: Clause names to draw examples from; a record qualifies
            when the part of its id after the last "__" is one of them.
        max_tokens: Token budget for the examples. Collection stops at the
            first example that would exceed it.
        window: Number of context characters kept on each side of the answer.

    Returns:
        (text, total_tokens): the examples joined with "\n\n---\n\n", and the
        sum of their individual token counts.

    Records without answers are skipped, as are records whose answer text
    cannot be found in the context.
    """
    wanted = set(clause_types)
    fewshot = []
    total_tokens = 0

    for entry in train_data:
        clause = entry["id"].split("__")[-1]
        if clause not in wanted:
            continue
        answers = entry["answers"]
        if not answers:
            continue
        context = entry["context"].strip()
        question = entry["question"].strip()
        answer_text = answers[0]["text"].strip()

        # Trust the stored offset only if the answer really sits there;
        # otherwise look the answer up so the snippet is guaranteed to contain it.
        ans_start = answers[0]["answer_start"]
        if not context.startswith(answer_text, ans_start):
            ans_start = context.find(answer_text)
            if ans_start == -1:
                continue

        # Keep `window` characters on each side of the *whole* answer, so long
        # answers are not cut off inside the snippet.
        ans_end = ans_start + len(answer_text)
        snippet = context[max(0, ans_start - window): ans_end + window].replace("\n", " ")

        example = f"""Contract:
{snippet}

Question:
{question}

Answer:
{answer_text}"""
        tokens = num_tokens(example)
        if total_tokens + tokens > max_tokens:
            break
        fewshot.append(example)
        total_tokens += tokens

    return "\n\n---\n\n".join(fewshot), total_tokens

# ===  Pick a test case ===
CONTRACT_ID = "CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEB SITE HOSTING AGREEMENT"
# Compare the whole id prefix (everything before the final "__") rather than
# using startswith(), so a contract whose name merely begins with CONTRACT_ID
# is not pulled in as well.
target = [e for e in test_data if e["id"].rsplit("__", 1)[0] == CONTRACT_ID]
if not target:
    raise ValueError(f"No test entries found for {CONTRACT_ID}")
print(f" Loaded {len(target)} questions from test set")

# ===  Build few-shot from train only (no leakage) ===
# Only clause types that occur among the selected test questions are used.
CLAUSE_TYPES = list({e['id'].split("__")[-1] for e in target})
few_shot, fs_tokens = build_fewshot(train_data, CLAUSE_TYPES, max_tokens=1000)
print(f" Few-shot block uses {fs_tokens} tokens across {few_shot.count('Contract:')} examples\n")


# Ask the model each selected question, chunking the contract when it does not
# fit in a single prompt, and print the prediction next to the labelled answer.
system_tokens = num_tokens(SYSTEM)
PROMPT_MARGIN_TOKENS = 50   # headroom for prompt scaffolding and chat message framing
DEFAULT_CHUNK_TOKENS = 1500

for idx, entry in enumerate(target, 1):
    question = entry["question"].strip()
    true_ans = entry["answers"][0]["text"].strip() if entry["answers"] else "[No answer]"
    context = entry["context"]

    # Tokens consumed by everything except the contract text.
    fixed_cost = (
        fs_tokens + system_tokens + num_tokens(question)
        + MAX_OUTPUT_TOKENS + PROMPT_MARGIN_TOKENS
    )
    room = MAX_PROMPT_TOKENS - fixed_cost

    if room <= 0:
        chunks = []
    elif num_tokens(context) <= room:
        chunks = [context]
    else:
        # Never cut chunks larger than what still fits in the prompt.
        chunks = chunk_text_by_tokens(context, min(DEFAULT_CHUNK_TOKENS, room))

    print(f"---\nClause {idx}/{len(target)} → {entry['id'].split('__')[-1]}")
    print(f"Q: {question}")
    if not chunks:
        print(
            f" Skipped: prompt overhead ({fixed_cost} tokens) leaves no room for "
            f"contract text within {MAX_PROMPT_TOKENS} tokens"
        )

    best = ""
    for c_i, chunk in enumerate(chunks, 1):
        prompt = (
            f"{few_shot}\n\n---\n\n"
            f"Contract:\n{chunk.strip()}\n\n"
            f"Question:\n{question}\n\n"
            f"Answer:"
        )
        try:
            resp = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM},
                    {"role": "user", "content": prompt}
                ],
                temperature=TEMPERATURE,
                max_tokens=MAX_OUTPUT_TOKENS
            )
            # The message content can be None; treat that as an empty reply.
            ans = (resp.choices[0].message.content or "").strip()
            print(f"  Chunk {c_i} → {repr(ans)}")
            # Stop at the first chunk that yields a real span.
            if ans and ans.lower() not in {"none", "n/a", ""}:
                best = ans
                break
        except Exception as err:
            # Best effort: report the failure and move on to the next chunk.
            print(f" Error: {err}")

    print(f"\n GPT: {best}")
    print(f"True:{true_ans}\n")


In [None]:
import json
import os

def convert_cuad_to_qa_format(input_path, output_path):
    """Convert flat CUAD records to a column-style answers layout.

    Args:
        input_path: JSON list of records with "id", "context", "question" and
            "answers" ([{"text", "answer_start"}, ...]).
        output_path: Destination JSON file (2-space indent).

    Records with an empty "answers" list are skipped. For the rest only the
    first answer is kept, stored as
    {"text": [<text>], "answer_start": [<offset>]}.
    Prints how many records were converted and skipped.
    """
    with open(input_path, "r", encoding="utf-8") as src:
        records = json.load(src)

    def to_hf_entry(record):
        # Input answers look like [{"text": ..., "answer_start": ...}];
        # keep the first one, wrapped in single-element lists.
        first = record["answers"][0]
        return {
            "id": record["id"],
            "context": record["context"],
            "question": record["question"],
            "answers": {
                "text": [first["text"]],
                "answer_start": [first["answer_start"]],
            },
        }

    labeled = [record for record in records if record["answers"]]
    hf_qa_data = [to_hf_entry(record) for record in labeled]
    skipped = len(records) - len(labeled)

    with open(output_path, "w", encoding="utf-8") as dst:
        json.dump(hf_qa_data, dst, indent=2)

    print(f" Converted {len(hf_qa_data)} examples. Skipped {skipped} unlabeled entries.")
    print(f" Saved to {output_path}")


# Convert both flattened splits to the column-style QA layout.
for _flat_path, _qa_path in (
    ("train_flat.json", "cuad_qa_train.json"),
    ("test_flat.json", "cuad_qa_test.json"),
):
    convert_cuad_to_qa_format(_flat_path, _qa_path)


In [None]:
import os

# Set the cache location BEFORE importing datasets/transformers: the Hugging
# Face libraries work out their cache directories when they are first imported,
# so assigning HF_HOME afterwards does not redirect them.
os.environ["HF_HOME"] = "D:/huggingface_cache"

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import numpy as np
import evaluate
import torch


# === Load your converted dataset ===
dataset = load_dataset("json", data_files={
    "train": "cuad_qa_train.json",
    "validation": "cuad_qa_test.json"
})

# === TEMP: Subset for speed during testing ===
# Clamp to the split size so select() does not fail on an out-of-range index
# when a split holds fewer rows than requested.
TRAIN_SUBSET_SIZE = 300
EVAL_SUBSET_SIZE = 100
dataset["train"] = dataset["train"].select(
    range(min(TRAIN_SUBSET_SIZE, len(dataset["train"])))
)
dataset["validation"] = dataset["validation"].select(
    range(min(EVAL_SUBSET_SIZE, len(dataset["validation"])))
)

# === Load tokenizer + model ===
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# === Preprocessing ===
def preprocess(example):
    """Tokenize a batch of question/context pairs for extractive QA.

    Called with batched=True, so `example` holds lists of questions and
    contexts. Each pair is padded/truncated to 384 tokens; only the context
    (second sequence) is ever truncated, never the question.

    Besides the tokenizer outputs (including "offset_mapping"), a
    "context_mask" column is added: 1 for tokens that belong to the context,
    0 for question, special and padding tokens. add_token_positions() needs it
    because question and context offsets both start counting at character 0.

    NOTE(review): contexts longer than the window are simply cut off, so
    answers beyond it cannot be learned. Consider `stride` together with
    `return_overflowing_tokens=True` to cover the whole contract.
    """
    encoded = tokenizer(
        example["question"],
        example["context"],
        truncation="only_second",
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True
    )
    encoded["context_mask"] = [
        [int(seq_id == 1) for seq_id in encoded.sequence_ids(row)]
        for row in range(len(encoded["input_ids"]))
    ]
    return encoded

tokenized_dataset = dataset.map(preprocess, batched=True)

# === Align start/end token positions ===
def add_token_positions(example):
    """Add start/end token labels for the first answer of one example.

    Only context tokens (per "context_mask") are considered, so an answer
    near the top of the contract can no longer be matched against question
    tokens that happen to cover the same character numbers.

    If the answer is empty or not fully inside the tokenized window (for
    example because the context was truncated), both labels are set to 0,
    the first token, instead of leaving an end position that precedes the
    start position.
    """
    answer_text = example["answers"]["text"][0]
    start_char = example["answers"]["answer_start"][0]
    end_char = start_char + len(answer_text)
    offsets = example["offset_mapping"]
    context_tokens = [i for i, flag in enumerate(example["context_mask"]) if flag]

    start_token = end_token = 0
    if context_tokens and answer_text:
        window_start = offsets[context_tokens[0]][0]
        window_end = offsets[context_tokens[-1]][1]
        if window_start <= start_char and end_char <= window_end:
            # First context token ending after the answer starts, and last
            # context token starting before the answer ends. Comparing this
            # way tolerates answers that begin or end on whitespace.
            first = next(i for i in context_tokens if offsets[i][1] > start_char)
            last = next(i for i in reversed(context_tokens) if offsets[i][0] < end_char)
            if first <= last:
                start_token, end_token = first, last

    example["start_positions"] = start_token
    example["end_positions"] = end_token
    return example

tokenized_dataset = tokenized_dataset.map(
    add_token_positions,
    remove_columns=["offset_mapping", "context_mask"],
)

# === Training arguments ===
# max_steps is deliberately left unset. A positive max_steps does not cap the
# run: it replaces num_train_epochs, so on a small subset max_steps=300 keeps
# looping over the data for many epochs (saving a checkpoint after each one)
# instead of stopping after the single epoch requested here.
# NOTE(review): with a small subset, logging_steps=200 may exceed the total
# number of optimizer steps, in which case no training loss is logged.
args = TrainingArguments(
    output_dir="./cuad_roberta",
    eval_strategy="epoch",                    # use eval_strategy instead of evaluation_strategy
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,            # simulate larger batches
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=200,
    fp16=torch.cuda.is_available(),           # enable automatic mixed precision on GPU
)

# === Optional: Evaluation metrics (disabled here for speed)
# squad_metric = evaluate.load("squad")
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     return squad_metric.compute(predictions=predictions, references=labels)

# === Trainer ===
# Wires the model, the training arguments and the tokenized train/validation
# splits together; no metrics callback is supplied.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=None  # NOTE(review): if the commented-out helper above is enabled, pass the function (compute_metrics), not the squad_metric object
)

# === Train! ===
trainer.train()
