# ClauseGuard — Automated Contract Clause Classification & Risk Advice

Automate contract clause understanding end-to-end: classify clauses with fine-tuned LegalBERT and RoBERTa, then generate risk analysis and mitigation guidance with GPT-5 via the OpenAI API.

In [None]:
# Install necessary libraries
# !pip install -qU transformers accelerate openai datasets scikit-learn --no-cache-dir

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    Trainer, TrainingArguments,
    BertTokenizer, BertForSequenceClassification,
    RobertaTokenizerFast, RobertaForSequenceClassification)
from sklearn.model_selection import train_test_split, StratifiedKFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datasets import load_dataset, Dataset

In [2]:
# --------- Select CPU or GPU to run ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

#--------- Check the name of the GPU --------
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU name: Tesla V100-SXM2-16GB


In [None]:
rev = "eb7c6783655ca3a984a33e9a06ad367602057b38"
data_files = {
    "train": f"https://huggingface.co/datasets/nguha/legalbench/resolve/{rev}/cuad_audit_rights/train/0000.parquet",
    "test":  f"https://huggingface.co/datasets/nguha/legalbench/resolve/{rev}/cuad_audit_rights/test/0000.parquet",
}
dataset = load_dataset("parquet", data_files=data_files)

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['answer', 'index', 'text', 'document_name'],
        num_rows: 6
    })
    test: Dataset({
        features: ['answer', 'index', 'text', 'document_name'],
        num_rows: 1216
    })
})

In [None]:
# Preprocessing Data
df_train = pd.DataFrame(dataset['test'])
df_test  = pd.DataFrame(dataset['train'])

# Vectorized text cleanup
for df in (df_train, df_test):
    df['cleaned_text'] = df['text'].astype(str).str.strip().str.lower()

# Combine, remove any stray 'index' column if present, shuffle, and reset index
df_combined = (
    pd.concat([df_train, df_test], ignore_index=True)
      .drop(columns=['index'], errors='ignore')
      .sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

# Two-stage split the data into train, val, and test sets
train_data, temp = train_test_split(
    df_combined,
    test_size=0.2,
    stratify=df_combined['answer'],
    random_state=42
)

val_data, test_data = train_test_split(
    temp,
    test_size=0.4,                  
    stratify=temp['answer'],
    random_state=42
)

print("Train, Validation, and Test sets size:")
print(f"Train: {train_data.shape}, Validation: {val_data.shape}, Test: {test_data.shape}")

Train, Validation, and Test sets size:
Train: (977, 4), Validation: (147, 4), Test: (98, 4)


In [6]:
# Create dataset class
class LegalDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

### Contract Clause Classification

In [None]:
# Training and Evaluating
def train_and_evaluate(X_train, y_train, X_val, y_val, model, tokenizer):

    # Tokenize the inputs
    train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=512)
    val_encodings = tokenizer(X_val.tolist(), truncation=True, padding=True, max_length=512)

    # Convert labels to tensor
    train_labels = torch.tensor(y_train.apply(lambda x: 1 if x.lower() == "yes" else 0).tolist())
    val_labels = torch.tensor(y_val.apply(lambda x: 1 if x.lower() == "yes" else 0).tolist())

    # Create datasets
    train_dataset = LegalDataset(train_encodings, train_labels)
    val_dataset = LegalDataset(val_encodings, val_labels)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        eval_strategy="epoch"
    )

    trainer = Trainer(
        model = model,
        args = training_args,
        train_dataset = train_dataset,
        eval_dataset = val_dataset,
    )

    trainer.train()
    predictions = trainer.predict(val_dataset)
    preds = predictions.predictions.argmax(-1)
    labels = predictions.label_ids

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)

    return accuracy, precision, recall, f1

# Training model with K-fold cross-validation

In [8]:
def run_cv_for_model(model_name: str, save_prefix: str, num_labels: int = 2, n_splits: int = 5):
    kf = StratifiedKFold(n_splits=n_splits)
    accuracies, precisions, recalls, f1s = [], [], [], []
    texts = train_data['cleaned_text']
    labels = train_data['answer']
    i = 0

    for train_index, val_index in kf.split(texts, labels):
        X_train = texts[texts.index.isin(train_index)]
        X_val   = texts[texts.index.isin(val_index)]
        y_train = labels[labels.index.isin(train_index)]
        y_val   = labels[labels.index.isin(val_index)]

        if model_name=="roberta-base":
            model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)
            tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
        else:
            model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)
            tokenizer = BertTokenizer.from_pretrained(model_name)

        accuracy, precision, recall, f1 = train_and_evaluate(X_train, y_train, X_val, y_val, model, tokenizer)

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

        model.save_pretrained(f'{save_prefix}-fold{i}')
        tokenizer.save_pretrained(f'{save_prefix}-fold{i}')
        i += 1

    # Print & return averages in the same style
    avg_metrics = {
        "model": model_name,
        "Average Accuracy": float(np.mean(accuracies)),
        "Average Precision": float(np.mean(precisions)),
        "Average Recall": float(np.mean(recalls)),
        "Average F1 Score": float(np.mean(f1s)),
    }
    print(f"[{model_name}] Average Accuracy: {avg_metrics['Average Accuracy']}")
    print(f"[{model_name}] Average Precision: {avg_metrics['Average Precision']}")
    print(f"[{model_name}] Average Recall: {avg_metrics['Average Recall']}")
    print(f"[{model_name}] Average F1 Score: {avg_metrics['Average F1 Score']}")
    return avg_metrics

In [None]:
candidates = [
    ("nlpaueb/legal-bert-base-uncased", "fine-tuned-legal-bert"),
    ("roberta-base", "fine-tuned-roberta")
]
results = []
for model_name, save_prefix in candidates:
    metrics = run_cv_for_model(model_name=model_name, save_prefix=save_prefix, num_labels=2, n_splits=5)
    results.append(metrics)

df_results = pd.DataFrame(results)
df_results_path = "./results/cv_comparison.csv"
df_results.to_csv(df_results_path, index=False)
df_results

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.0745,0.078655
2,0.1441,0.080963
3,0.0003,0.000231


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.2594,0.121921
2,0.0009,0.072437
3,0.0002,0.161886


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.3831,0.204118
2,0.1331,0.101963
3,0.0002,0.054245


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.4334,0.254963
2,0.3186,0.096709
3,0.0002,0.137122


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.1629,0.112013
2,0.0016,0.06406
3,0.0002,0.146437


[nlpaueb/legal-bert-base-uncased] Average Accuracy: 0.9859779501837485
[nlpaueb/legal-bert-base-uncased] Average Precision: 0.9832891477640843
[nlpaueb/legal-bert-base-uncased] Average Recall: 0.9901002125721228
[nlpaueb/legal-bert-base-uncased] Average F1 Score: 0.9866158278312586


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.005,0.600989
2,0.2086,0.080695
3,0.0015,0.094551


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.3312,0.078338
2,0.0004,0.105769
3,0.3898,0.12304


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.1171,0.02535
2,0.0014,0.054464
3,0.0003,0.53709


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.0149,0.263611
2,0.1544,0.125381
3,0.0003,0.15545


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,0.1341,0.159144
2,0.0009,0.252274
3,0.0016,0.261499


[roberta-base] Average Accuracy: 0.9653650310489835
[roberta-base] Average Precision: 0.9823001072720174
[roberta-base] Average Recall: 0.9498450973589472
[roberta-base] Average F1 Score: 0.9651516244333506


Unnamed: 0,model,Average Accuracy,Average Precision,Average Recall,Average F1 Score
0,nlpaueb/legal-bert-base-uncased,0.985978,0.983289,0.9901,0.986616
1,roberta-base,0.965365,0.9823,0.949845,0.965152


In [None]:
# Define the test function
def test_model(X_test, y_test, model_name, model_path, tokenizer_path):
    # Load the fine-tuned model and tokenizer
    if model_name=="roberta-base":
        model = RobertaForSequenceClassification.from_pretrained(model_path).to(device)
        tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_path)
    else:
        model = BertForSequenceClassification.from_pretrained(model_path).to(device)
        tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

    # Tokenize the test texts
    test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=512)

    # Convert labels to tensor
    test_labels_tensor = torch.tensor(y_test.apply(lambda x: 1 if x.lower() == "yes" else 0).tolist())

    # Create a test dataset
    test_dataset = LegalDataset(test_encodings, test_labels_tensor)

    # Create a DataLoader for the test dataset
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

    # Evaluate the model
    model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels.extend(batch['labels'].cpu().numpy())
            outputs = model(input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs.logits, dim=-1).cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)

    return accuracy, precision, recall, f1

# test data
test_texts = test_data['cleaned_text']
test_labels = test_data['answer']

results = []
for model_name, save_prefix in candidates:
    accuracies, precisions, recalls, f1s = [], [], [], []
    for i in range(5):
        model_path = f'{save_prefix}-fold{i}'
        tokenizer_path = f'{save_prefix}-fold{i}'
        accuracy, precision, recall, f1 = test_model(test_texts, test_labels, model_name, model_path, tokenizer_path)
        
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

        print(f"Model {model_path} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")
    
    avg_metrics = {
        "model": model_name,
        "Average Accuracy": float(np.mean(accuracies)),
        "Average Precision": float(np.mean(precisions)),
        "Average Recall": float(np.mean(recalls)),
        "Average F1 Score": float(np.mean(f1s)),
    }
    results.append(avg_metrics)

df_results = pd.DataFrame(results)
df_results_path = "./results/test_comparison.csv"
df_results.to_csv(df_results_path, index=False)
print("Testing resutls:")
df_results

Model fine-tuned-legal-bert-fold0 - Accuracy: 0.9897959183673469, Precision: 1.0, Recall: 0.9795918367346939, F1 Score: 0.9896907216494846
Model fine-tuned-legal-bert-fold1 - Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1 Score: 1.0
Model fine-tuned-legal-bert-fold2 - Accuracy: 0.9897959183673469, Precision: 1.0, Recall: 0.9795918367346939, F1 Score: 0.9896907216494846
Model fine-tuned-legal-bert-fold3 - Accuracy: 0.9693877551020408, Precision: 0.9423076923076923, Recall: 1.0, F1 Score: 0.9702970297029703
Model fine-tuned-legal-bert-fold4 - Accuracy: 0.9897959183673469, Precision: 1.0, Recall: 0.9795918367346939, F1 Score: 0.9896907216494846
Model fine-tuned-roberta-fold0 - Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1 Score: 1.0
Model fine-tuned-roberta-fold1 - Accuracy: 1.0, Precision: 1.0, Recall: 1.0, F1 Score: 1.0
Model fine-tuned-roberta-fold2 - Accuracy: 0.9387755102040817, Precision: 1.0, Recall: 0.8775510204081632, F1 Score: 0.9347826086956522
Model fine-tuned-roberta-fold3 -

Unnamed: 0,model,Average Accuracy,Average Precision,Average Recall,Average F1 Score
0,nlpaueb/legal-bert-base-uncased,0.987755,0.988462,0.987755,0.987874
1,roberta-base,0.983673,0.992,0.97551,0.982916


In [23]:
# Load model
model = BertForSequenceClassification.from_pretrained("fine-tuned-legal-bert-fold1").to(device)
tokenizer = BertTokenizer.from_pretrained("fine-tuned-legal-bert-fold1")

# Function for classification using Legal-BERT
def classify_clause(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    outputs = model(**inputs)
    preds = torch.argmax(outputs.logits, dim=-1)
    return preds.item()

# Define a test clause
test_clause = 'Neither party shall voluntarily or by operation of law assign or otherwise transfer the rights and/or obligations incurred pursuant to the terms of this Agreement without the prior written consent of the other party.'

# Get the combined result
response = "Audit clause" if classify_clause(test_clause) else "Not an Audit Clause"

# Print the combined result
print(response)


Not an Audit Clause


# Risk Analysis and Mitigation Guidance

In [None]:
from openai import OpenAI
import os
from dotenv import load_dotenv
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [None]:
# Function to run the prompt using the ChatCompletion endpoint
def llm_clause_analysis(classification_label, clause):
    message = (
        f"Below is a contract clause classified as '{classification_label}':\n\n"
        f"'{clause}'\n\n"
    )
    system_prompt = """You are a legal advisor. Provide a concise, cohesive explanation linking the clause, its classification, and the listed risks. Use the exact template below:
        Clause: <clause>
        Classification: <label>
        Key risks: <In 5–8 bullet points max, flag any risks in the clauses>
        Mitigations: <Solutions for risks>"""

    response = client.responses.create(
        model="gpt-5-mini",
        instructions=system_prompt,
        input=message,
    )
    return response.output_text

# Define a test clause
test_clause = "Neither party shall voluntarily or by operation of law assign or otherwise transfer the rights and/or obligations incurred pursuant to the terms of this Agreement without the prior written consent of the other party."

classification_result = classify_clause(test_clause)
classification_label = "Audit Clause" if classification_result else "Not an Audit Clause"

llm_response = llm_clause_analysis(classification_label, test_clause)

# Print the combined result
print(llm_response)

Clause: Neither party shall voluntarily or by operation of law assign or otherwise transfer the rights and/or obligations incurred pursuant to the terms of this Agreement without the prior written consent of the other party.

Classification: Not an Audit Clause

Key risks:
- Blocks important corporate changes (mergers, acquisitions, reorganizations) that could affect business continuity or value.
- Ambiguity over what constitutes a transfer or assignment, and whether ancillary actions (delegation, novation) trigger the clause.
- No standard requiring consent to be reasonably withheld, leading to potential arbitrary or strategic withholding.
- No carve-outs for permitted transfers (to affiliates, successors, lenders, or in connection with a sale of all or substantially all assets), increasing disruption risk.
- No defined remedies or consequences for unauthorized assignment, creating potential enforcement disputes.
- Potential negative impact on financing arrangements or strategic partn