In [1]:
%pip install transformers datasets scikit-learn torch tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install fitz
%pip install PyMuPDF


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### **Parse and prepare dataset**

In [3]:
import json
import os

# Load CUAD annotations
with open("cuad/data/CUADv1.json", "r") as f:
    cuad_data = json.load(f)

training_data = []

for doc in cuad_data["data"]:
    for para in doc["paragraphs"]:
        context = para["context"]  # Full paragraph (clause)
        for qa in para["qas"]:
            if qa["is_impossible"]:
                continue  # Skip non-annotated clauses
            label = qa["question"].lower().strip().replace(" ", "_")
            training_data.append((context, label))

print(f"Total extracted clauses: {len(training_data)}")

Total extracted clauses: 6702


### **Duplicate and format for training**

In [4]:
# Optional: Deduplicate and balance
seen = set()
cleaned_data = []
for clause, label in training_data:
    if (clause, label) not in seen:
        cleaned_data.append((clause.strip(), label))
        seen.add((clause, label))

# Preview
for i in range(5):
    print(f"[{cleaned_data[i][1].upper()}] - {cleaned_data[i][0][:100]}...")

[HIGHLIGHT_THE_PARTS_(IF_ANY)_OF_THIS_CONTRACT_RELATED_TO_"DOCUMENT_NAME"_THAT_SHOULD_BE_REVIEWED_BY_A_LAWYER._DETAILS:_THE_NAME_OF_THE_CONTRACT] - EXHIBIT 10.6

                              DISTRIBUTOR AGREEMENT

         THIS  DISTRIBUTOR  AGREE...
[HIGHLIGHT_THE_PARTS_(IF_ANY)_OF_THIS_CONTRACT_RELATED_TO_"PARTIES"_THAT_SHOULD_BE_REVIEWED_BY_A_LAWYER._DETAILS:_THE_TWO_OR_MORE_PARTIES_WHO_SIGNED_THE_CONTRACT] - EXHIBIT 10.6

                              DISTRIBUTOR AGREEMENT

         THIS  DISTRIBUTOR  AGREE...
[HIGHLIGHT_THE_PARTS_(IF_ANY)_OF_THIS_CONTRACT_RELATED_TO_"AGREEMENT_DATE"_THAT_SHOULD_BE_REVIEWED_BY_A_LAWYER._DETAILS:_THE_DATE_OF_THE_CONTRACT] - EXHIBIT 10.6

                              DISTRIBUTOR AGREEMENT

         THIS  DISTRIBUTOR  AGREE...
[HIGHLIGHT_THE_PARTS_(IF_ANY)_OF_THIS_CONTRACT_RELATED_TO_"EFFECTIVE_DATE"_THAT_SHOULD_BE_REVIEWED_BY_A_LAWYER._DETAILS:_THE_DATE_WHEN_THE_CONTRACT_IS_EFFECTIVE] - EXHIBIT 10.6

                              DISTRIBUTOR AGREEM

## **Training Classifier**

In [5]:
from sklearn.model_selection import train_test_split

X = [x[0] for x in cleaned_data]
y = [x[1] for x in cleaned_data]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### **Prepare data for BERT**

In [6]:
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load CUAD
with open("cuad/data/CUADv1.json") as f:
    cuad = json.load(f)

# Extract (clause_text, clause_type)
data = []
for doc in cuad["data"]:
    for para in doc["paragraphs"]:
        context = para["context"]
        for qa in para["qas"]:
            if not qa["is_impossible"]:
                label = qa["question"].strip().lower().replace(" ", "_")
                data.append({"text": context, "label": label})

# Deduplicate and trim for sanity
data = list({(x["text"], x["label"]) for x in data})
data = [{"text": x[0], "label": x[1]} for x in data]

print("Sample:", data[0])

Sample: {'text': 'Exhibit 99.3\n\nEXECUTION COPY\n\nINTELLECTUAL PROPERTY AGREEMENT\n\nTHIS INTELLECTUAL PROPERTY AGREEMENT (this "Intellectual Property Agreement"), dated as of December 20, 2007, is made by and between NMS COMMUNICATIONS CORP., a Delaware corporation ("Seller"), and VERSO BACKHAUL SOLUTIONS, INC., a Georgia corporation ("Backhaul").\n\nRECITALS:\n\nWHEREAS, Seller and Verso Technologies, Inc., a Minnesota corporation ("Buyer"), have entered into that certain Asset Purchase Agreement, dated as of the date hereof (the "Asset Purchase Agreement"), pursuant to which Buyer has the right to acquire the Purchased Assets of Seller and its Subsidiaries, as more particularly described in the Asset Purchase Agreement (all capitalized words and terms used herein and not otherwise defined herein shall have the meanings ascribed to them in the Asset Purchase Agreement); and\n\nWHEREAS, Buyer has designated Backhaul as a Buyer Designee for purposes of the Asset Purchase Agreement, a

### **Encode labels and split**

In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
all_labels = [x["label"] for x in data]
label_encoder.fit(all_labels)

for x in data:
    x["label_id"] = label_encoder.transform([x["label"]])[0]

# Train/test split
train_data, test_data = train_test_split(data, test_size=0.15, random_state=42)

# Convert to Hugging Face dataset
train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)

# Label mapping for inverse later
id2label = {i: l for i, l in enumerate(label_encoder.classes_)}
label2id = {v: k for k, v in id2label.items()}

### **Tokenize for BERT**

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_fn(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize_fn, batched=True)
test_dataset = test_dataset.map(tokenize_fn, batched=True)

train_dataset = train_dataset.rename_column("label_id", "labels")
test_dataset = test_dataset.rename_column("label_id", "labels")

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/5689 [00:00<?, ? examples/s]

Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

### **Fine tune BERT**

In [9]:
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Choose MPS 
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Load model and push it to device
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_encoder.classes_),
    id2label=id2label,
    label2id=label2id
).to(device)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./bert-cuad-clause",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="no",
    report_to="none",  # Disable W&B if you're not using it
    remove_unused_columns=False,  # Needed for custom tokenizers or models
    no_cuda=True  # Important for MPS; disables CUDA and allows MPS to be used
)

# Hugging Face's Trainer will use MPS automatically when CUDA is disabled
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

# Train on MPS
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,3.364,3.404645
2,3.3849,3.377695


### **Save the model for reuse**

In [None]:
model.save_pretrained("legal-bert-cuad")
tokenizer.save_pretrained("legal-bert-cuad")


('legal-bert-cuad/tokenizer_config.json',
 'legal-bert-cuad/special_tokens_map.json',
 'legal-bert-cuad/vocab.txt',
 'legal-bert-cuad/added_tokens.json',
 'legal-bert-cuad/tokenizer.json')

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("legal-bert-cuad")
tokenizer = AutoTokenizer.from_pretrained("legal-bert-cuad")


### **Load the model and tokenizer**

In [2]:
from transformers import pipeline

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

test_clause = "Either party may terminate this agreement with prior written notice."
pred = classifier(test_clause)
pred_label = pred[0]["label"]

print(f"🔍 Prediction: {pred_label}")

Device set to use mps:0


🔍 Prediction: highlight_the_parts_(if_any)_of_this_contract_related_to_"document_name"_that_should_be_reviewed_by_a_lawyer._details:_the_name_of_the_contract


### **Predicting top 3 labels**

In [3]:
from transformers import pipeline

# Load classifier with top_k
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=3)

test_clause = "Either party may terminate this agreement with prior written notice."

# top_k=3 returns a list of list of dicts
predictions = classifier(test_clause)[0]  # Take first result

print("🔍 Top-3 Predicted Clause Categories:\n")
for pred in predictions:
    label = pred["label"]
    score = pred["score"]
    print(f"🧠 {label} — {score:.4f}")

Device set to use mps:0


🔍 Top-3 Predicted Clause Categories:

🧠 highlight_the_parts_(if_any)_of_this_contract_related_to_"document_name"_that_should_be_reviewed_by_a_lawyer._details:_the_name_of_the_contract — 0.1177
🧠 highlight_the_parts_(if_any)_of_this_contract_related_to_"parties"_that_should_be_reviewed_by_a_lawyer._details:_the_two_or_more_parties_who_signed_the_contract — 0.1034
🧠 highlight_the_parts_(if_any)_of_this_contract_related_to_"agreement_date"_that_should_be_reviewed_by_a_lawyer._details:_the_date_of_the_contract — 0.0996


### **Highlighting the clause based on predicted labels**

In [4]:
from termcolor import colored

highlight_map = {
    "termination": ["terminate", "termination", "notice", "written"],
    "confidentiality": ["confidential", "disclosure", "non-disclosure", "secret"],
    "dispute_resolution": ["arbitration", "dispute", "settlement", "court", "litigation"],
    "jurisdiction": ["governed", "jurisdiction", "laws of", "venue", "court"]
}

def highlight_clause(clause, predicted_label):
    label_keywords = highlight_map.get(predicted_label.lower(), [])
    highlighted = clause
    for kw in label_keywords:
        if kw.lower() in highlighted.lower():
            highlighted = highlighted.replace(
                kw,
                colored(kw, "red", attrs=["bold"])
            )
    return highlighted
top_label = predictions[0]["label"]
highlighted_clause = highlight_clause(test_clause, top_label)

print("Highlighted Clause:")
print(highlighted_clause)

Highlighted Clause:
Either party may terminate this agreement with prior written notice.


In [8]:
import fitz  # PyMuPDF
from transformers import pipeline
from termcolor import colored

# Define clause label -> keywords for highlight
highlight_map = {
    "termination": ["terminate", "termination", "notice", "written"],
    "confidentiality": ["confidential", "disclosure", "non-disclosure", "secret"],
    "dispute_resolution": ["arbitration", "dispute", "settlement", "court", "litigation"],
    "jurisdiction": ["governed", "jurisdiction", "laws of", "venue", "court"]
}

def highlight_clause(clause, predicted_label):
    label_keywords = highlight_map.get(predicted_label.lower(), [])
    highlighted = clause
    for kw in label_keywords:
        if kw.lower() in highlighted.lower():
            highlighted = highlighted.replace(
                kw, colored(kw, "red", attrs=["bold"])
            )
    return highlighted

# Load PDF
def extract_clauses_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    clauses = []
    for page in doc:
        text = page.get_text()
        lines = text.split("\n")
        for line in lines:
            if len(line.strip()) > 30:  # Avoid junk or headings
                clauses.append(line.strip())
    return clauses

In [9]:
# Load classifier with top-3 output
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=3)

# Path to your contract PDF
pdf_path = "Land-Purchase-Agreement.pdf"

clauses = extract_clauses_from_pdf(pdf_path)

# Analyze each clause
for i, clause in enumerate(clauses):
    preds = classifier(clause)[0]  # Top 3 predictions
    top_label = preds[0]["label"]

    print(f"\n Clause {i+1}:")
    print(highlight_clause(clause, top_label))

    print("\n Top-3 Categories:")
    for pred in preds:
        print(f"  - {pred['label']} ({pred['score']:.4f})")

Device set to use mps:0



 Clause 1:
This Land Purchase Agreement (“Agreement”) is made as of [Insert Date] by and between:

 Top-3 Categories:
  - highlight_the_parts_(if_any)_of_this_contract_related_to_"document_name"_that_should_be_reviewed_by_a_lawyer._details:_the_name_of_the_contract (0.1199)
  - highlight_the_parts_(if_any)_of_this_contract_related_to_"parties"_that_should_be_reviewed_by_a_lawyer._details:_the_two_or_more_parties_who_signed_the_contract (0.1051)
  - highlight_the_parts_(if_any)_of_this_contract_related_to_"agreement_date"_that_should_be_reviewed_by_a_lawyer._details:_the_date_of_the_contract (0.1020)

 Clause 2:
Seller: [Seller’s Full Name], whose address is [Seller’s Address].

 Top-3 Categories:
  - highlight_the_parts_(if_any)_of_this_contract_related_to_"document_name"_that_should_be_reviewed_by_a_lawyer._details:_the_name_of_the_contract (0.1138)
  - highlight_the_parts_(if_any)_of_this_contract_related_to_"parties"_that_should_be_reviewed_by_a_lawyer._details:_the_two_or_more_par

In [6]:
import fitz  # PyMuPDF
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Load model + tokenizer
model = AutoModelForSequenceClassification.from_pretrained("./legal-bert-cuad")
tokenizer = AutoTokenizer.from_pretrained("./legal-bert-cuad")
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=1)

# Clause label colors (optional enhancement)
label_colors = {
    "termination": (1, 1, 0),           # Yellow
    "confidentiality": (1, 0.8, 0.5),   # Orange
    "dispute_resolution": (0.8, 1, 1),  # Cyan
    "jurisdiction": (0.9, 0.9, 1)       # Blueish
}

# Annotate the PDF using bounding boxes instead of keyword search
def annotate_pdf_bboxes(input_pdf_path, output_pdf_path):
    doc = fitz.open(input_pdf_path)

    for page in doc:
        blocks = page.get_text("blocks")  # (x0, y0, x1, y1, text, block_no)
        for block in blocks:
            x0, y0, x1, y1, text = block[:5]
            clause = text.strip()

            if len(clause) < 30:
                continue

            # Get prediction
            pred = classifier(clause)[0][0]
            label = pred["label"].lower()
            confidence = pred["score"]

            # Add highlight using bounding box
            rect = fitz.Rect(x0, y0, x1, y1)
            highlight = page.add_highlight_annot(rect)

            # Optional: set custom color (not guaranteed in all PDF viewers)
            color = label_colors.get(label, (1, 1, 0))  # Default yellow
            highlight.set_colors(stroke=color)

            # Add clause classification as comment
            highlight.set_info(
                title="AI Clause Classification",
                content=f"🔍 Predicted: {label.upper()} ({confidence:.2f})"
            )
            highlight.update()

    doc.save(output_pdf_path, deflate=True)
    doc.close()
    print(f"Annotated PDF saved as: {output_pdf_path}")

Device set to use mps:0


In [7]:
annotate_pdf_bboxes("Land-Purchase-Agreement.pdf", "highlighted_contract.pdf")

Annotated PDF saved as: highlighted_contract.pdf
