### Load data

In [1]:
import pandas as pd
import pickle
import numpy as np

# Load saved files
df_labeled = pd.read_pickle("/content/drive/MyDrive/data_cuad_transformer/df_labeled.pkl")

with open("/content/drive/MyDrive/data_cuad_transformer/label_names.pkl", "rb") as f:
    label_names = pickle.load(f)

pos_weight = np.load("/content/drive/MyDrive/data_cuad_transformer/pos_weight.npy")

print("Loaded df_labeled, label_names, pos_weight")


Loaded df_labeled, label_names, pos_weight


### BERT Tokenization

In [4]:
import os, json, numpy as np, pandas as pd, pickle, time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from torch.optim import AdamW
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

df_labeled = pd.read_pickle("/content/drive/MyDrive/data_cuad_transformer/df_labeled.pkl")
with open("/content/drive/MyDrive/data_cuad_transformer/label_names.pkl", "rb") as f:
    label_names = pickle.load(f)

df_labeled["y_multi"] = df_labeled["y_multi"].apply(lambda v: np.asarray(v, dtype=np.float32))
y_lengths = df_labeled["y_multi"].map(lambda v: v.shape[0]).unique()
if len(y_lengths) != 1:
    raise ValueError(f"Inconsistent y_multi lengths across rows: {y_lengths}")
TRUE_NUM_LABELS = int(y_lengths[0])

if 'label_names' in globals():
    if len(label_names) != TRUE_NUM_LABELS:
        print(f"[warn] label_names len={len(label_names)}")
        if len(label_names) > TRUE_NUM_LABELS:
            label_names = list(label_names)[:TRUE_NUM_LABELS]
        else:
            label_names = list(label_names) + [f"label_{i}" for i in range(len(label_names), TRUE_NUM_LABELS)]

NUM_LABELS = TRUE_NUM_LABELS
print("NUM_LABELS:", NUM_LABELS)

def align_matrix(mat, num_labels):
    mat = np.asarray(mat, dtype=np.float32)
    if mat.ndim == 1:
        mat = mat[None, :]
    if mat.shape[1] > num_labels:
        return mat[:, :num_labels]
    if mat.shape[1] < num_labels:
        pad = np.zeros((mat.shape[0], num_labels - mat.shape[1]), dtype=np.float32)
        return np.hstack([mat, pad])
    return mat

Using device: cuda
[warn] label_names len=40
NUM_LABELS: 41


In [10]:
class ClauseDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512, num_labels=None):
        self.texts = list(texts)
        arr = np.array(list(labels), dtype=np.float32)
        if num_labels is not None:
            arr = align_matrix(arr, num_labels)
        self.labels = arr
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

In [6]:
titles = df_labeled["contract_id"].unique()
train_t, test_t = train_test_split(titles, test_size=0.20, random_state=42)
train_t, val_t  = train_test_split(train_t, test_size=0.20, random_state=42)

def take(ids): return df_labeled[df_labeled["contract_id"].isin(ids)].reset_index(drop=True)
train_df, val_df, test_df = take(train_t), take(val_t), take(test_t)

MODEL_NAME  = "bert-base-uncased"
MAX_LEN     = 512
BATCH_SIZE  = 8
NUM_LABELS  = len(label_names)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
train_ds = ClauseDataset(train_df["clause_text"], train_df["y_multi"], tokenizer, MAX_LEN, NUM_LABELS)
val_ds   = ClauseDataset(val_df["clause_text"],   val_df["y_multi"],   tokenizer, MAX_LEN, NUM_LABELS)
test_ds  = ClauseDataset(test_df["clause_text"],  test_df["y_multi"],  tokenizer, MAX_LEN, NUM_LABELS)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE)


Y_train_mat = np.array(train_df["y_multi"].tolist())
P = Y_train_mat.sum(axis=0)
N = len(Y_train_mat)
pos_w_np = (N - P) / np.clip(P, 1, None)
pos_w = torch.tensor(pos_w_np, dtype=torch.float32, device=device)
bce_loss = nn.BCEWithLogitsLoss(pos_weight=pos_w, reduction="none")

def loss_fn(logits, targets):
    loss = bce_loss(logits, targets)
    return loss.mean()

CKPT_DIR = "/content/drive/MyDrive/data_cuad_transformer/ckpt_bert_finetuned"
os.makedirs(CKPT_DIR, exist_ok=True)

config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def multilabel_metrics(y_true, y_pred):
    return {
        "micro_f1": f1_score(y_true, y_pred, average="micro", zero_division=0),
        "macro_f1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "micro_precision": precision_score(y_true, y_pred, average="micro", zero_division=0),
        "micro_recall": recall_score(y_true, y_pred, average="micro", zero_division=0),
        "macro_precision": precision_score(y_true, y_pred, average="macro", zero_division=0),
        "macro_recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
    }

def find_best_threshold_probs(y_true, y_prob, metric_key="micro_f1"):
    best_t, best_stats = 0.5, None
    for t in np.linspace(0.1, 0.9, 17):
        y_hat = (y_prob >= t).astype(int)
        stats = multilabel_metrics(y_true, y_hat)
        if best_stats is None or stats[metric_key] > best_stats[metric_key]:
            best_t, best_stats = float(t), stats
    return best_t, best_stats

def infer_loader(model, loader):
    model.eval()
    all_logits, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            labels = batch["labels"].to(device)
            logits = model(**inputs).logits
            all_logits.append(logits.detach().cpu().numpy())
            all_labels.append(labels.detach().cpu().numpy())
    return np.vstack(all_logits), np.vstack(all_labels)

In [8]:
max_epochs = 10
best_val_micro_f1 = -1.0
no_improve = 0
patience = 3


print("Training : ")
for epoch in range(1, max_epochs + 1):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        logits = model(**inputs).logits
        loss = loss_fn(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item() * labels.size(0)

    avg_loss = total_loss / len(train_loader.dataset)

    val_logits, y_val = infer_loader(model, val_loader)
    val_probs = 1.0 / (1.0 + np.exp(-val_logits))
    t_val, stats_val = find_best_threshold_probs(y_val, val_probs)

    print(f"Epoch {epoch} | TrainLoss: {avg_loss:.4f} | Val Micro-F1: {stats_val['micro_f1']:.4f} | Threshold: {t_val:.2f}")

    if stats_val["micro_f1"] > best_val_micro_f1:
        best_val_micro_f1 = stats_val["micro_f1"]
        model.save_pretrained(CKPT_DIR)
        tokenizer.save_pretrained(CKPT_DIR)
        with open(os.path.join(CKPT_DIR, "threshold.json"), "w") as f:
            json.dump({"best_threshold": float(t_val), "val_stats": {k: float(v) for k, v in stats_val.items()}}, f, indent=2)
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            print("Early stopping.")
            break

Training : 
Epoch 1 | TrainLoss: 0.9583 | Val Micro-F1: 0.4899 | Threshold: 0.45
Epoch 2 | TrainLoss: 0.9396 | Val Micro-F1: 0.4850 | Threshold: 0.40
Epoch 3 | TrainLoss: 0.9158 | Val Micro-F1: 0.5232 | Threshold: 0.45
Epoch 4 | TrainLoss: 0.8571 | Val Micro-F1: 0.5636 | Threshold: 0.45
Epoch 5 | TrainLoss: 0.7975 | Val Micro-F1: 0.5795 | Threshold: 0.45
Epoch 6 | TrainLoss: 0.7422 | Val Micro-F1: 0.5964 | Threshold: 0.45
Epoch 7 | TrainLoss: 0.7001 | Val Micro-F1: 0.6036 | Threshold: 0.45
Epoch 8 | TrainLoss: 0.6573 | Val Micro-F1: 0.6109 | Threshold: 0.45
Epoch 9 | TrainLoss: 0.6258 | Val Micro-F1: 0.6124 | Threshold: 0.40
Epoch 10 | TrainLoss: 0.5939 | Val Micro-F1: 0.6256 | Threshold: 0.40


In [12]:
with open(os.path.join(CKPT_DIR, "threshold.json"), "r") as f:
    meta = json.load(f)
best_t = float(meta["best_threshold"])
best_val_stats = {k: float(v) for k, v in meta["val_stats"].items()}

best_model = AutoModelForSequenceClassification.from_pretrained(CKPT_DIR).to(device)

test_logits, y_test = infer_loader(best_model, test_loader)
test_probs = 1.0 / (1.0 + np.exp(-test_logits))
y_test_hat = (test_probs >= best_t).astype(int)

test_stats = multilabel_metrics(y_test, y_test_hat)

print("BERT F1 Score (Micro):", f1_score(y_test, y_test_hat, average="micro", zero_division=0))
print("VAL :", {k: round(v, 4) for k, v in best_val_stats.items()})
print("TEST:", {k: round(v, 4) for k, v in test_stats.items()})

BERT F1 Score (Micro): 0.5997248968363136
VAL : {'micro_f1': 0.6256, 'macro_f1': 0.4957, 'micro_precision': 0.4986, 'micro_recall': 0.8392, 'macro_precision': 0.4125, 'macro_recall': 0.7789}
TEST: {'micro_f1': 0.5997, 'macro_f1': 0.4785, 'micro_precision': 0.4727, 'micro_recall': 0.8202, 'macro_precision': 0.3916, 'macro_recall': 0.7407}


### Sample Output

In [14]:
def names_from_vec(vec, names):
    return [names[i] for i, v in enumerate(vec) if int(v) == 1]

np.random.seed(42)
k = 3
idxs = np.random.choice(len(test_df), size=min(k, len(test_df)), replace=False)

for idx in idxs:
    clause = test_df["clause_text"].iloc[idx]
    clause_short = (clause[:600] + "...") if len(clause) > 600 else clause

    true_labels = names_from_vec(y_test[idx], label_names)
    pred_labels = names_from_vec(y_test_hat[idx], label_names)

    print("\n— Clause —")
    print(clause_short, "\n")
    print(" True:", true_labels)
    print(" Pred (BERT):", pred_labels)


— Clause —
Exhibit 10.5 STRATEGIC ALLIANCE AGREEMENT ---------------------------- THIS STRATEGIC ALLIANCE AGREEMENT (this "Agreement") is made as of 31 December, --------- 1996, between NORTHERN TELECOM LIMITED, a Canadian corporation ("NTL"), and --- ENTRUST TECHNOLOGIES INC., a Maryland corporation ("ETI"). --- WHEREAS, pursuant to an asset transfer agreement between NTL and Entrust Technologies Limited of even date (the "NTL Transfer Agreement") and an asset ---------------------- transfer agreement between Northern Telecom Inc. and ETI of even date, the Entrust Technology (as defined herein) has been... 

 True: ['Affiliate License-Licensee', 'Affiliate License-Licensor', 'Agreement Date', 'Anti-Assignment', 'Cap On Liability', 'Covenant Not To Sue', 'Effective Date', 'Governing Law', 'Insurance', 'Joint Ip Ownership', 'Liquidated Damages', 'No-Solicit Of Customers', 'Notice Period To Terminate Renewal', 'Post-Termination Services', 'Unlimited/All-You-Can-Eat-License', 'Volume Res

### RoBertA

In [20]:
import os, json, numpy as np, pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from torch.optim import AdamW
from sklearn.metrics import f1_score, precision_score, recall_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

df_labeled = pd.read_pickle("/content/drive/MyDrive/data_cuad_transformer/df_labeled.pkl")
import pickle
with open("/content/drive/MyDrive/data_cuad_transformer/label_names.pkl", "rb") as f:
    label_names = pickle.load(f)

drop_idx = 9
label_names = [lab for i, lab in enumerate(label_names) if i != drop_idx]
TARGET_L = len(label_names)

def to_fixed(v, L=TARGET_L):
    a = np.asarray(v, dtype=np.float32).ravel()
    if a.shape[0] >= L:
        return a[:L].tolist()
    else:
        return np.pad(a, (0, L - a.shape[0]), constant_values=0).tolist()

df_labeled["y_multi"] = df_labeled["y_multi"].apply(to_fixed)
lens = df_labeled["y_multi"].map(len).value_counts()


class ClauseDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512, num_labels=None):
        self.texts = list(texts)
        L = num_labels if num_labels is not None else TARGET_L
        self.labels = np.stack([to_fixed(y, L) for y in labels], axis=0).astype(np.float32)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

Using device: cuda


### Split dataset

In [21]:
from sklearn.model_selection import train_test_split
titles = df_labeled["contract_id"].unique()
train_t, test_t = train_test_split(titles, test_size=0.2, random_state=42)
train_t, val_t  = train_test_split(train_t, test_size=0.2, random_state=42)

def take(ids): return df_labeled[df_labeled["contract_id"].isin(ids)].reset_index(drop=True)
train_df, val_df, test_df = take(train_t), take(val_t), take(test_t)

MODEL_NAME = "roberta-base"
MAX_LEN = 512
BATCH_SIZE = 8
NUM_LABELS = len(label_names)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
train_ds = ClauseDataset(train_df["clause_text"], train_df["y_multi"], tokenizer, MAX_LEN)
val_ds   = ClauseDataset(val_df["clause_text"], val_df["y_multi"], tokenizer, MAX_LEN)
test_ds  = ClauseDataset(test_df["clause_text"], test_df["y_multi"], tokenizer, MAX_LEN)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE)
test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE)

class ClassBalancedFocalLoss(nn.Module):
    def __init__(self, beta=0.9999, gamma=2.0, class_counts=None, device='cpu'):
        super().__init__()
        effective_num = 1.0 - torch.pow(beta, class_counts)
        weights = (1.0 - beta) / (effective_num + 1e-8)
        weights = weights / weights.sum() * len(class_counts)
        self.weights = weights.to(device)
        self.gamma = gamma

    def forward(self, logits, targets):
        probs = torch.sigmoid(logits)
        ce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
        p_t = probs * targets + (1 - probs) * (1 - targets)
        loss = ((1 - p_t) ** self.gamma) * ce_loss * self.weights
        return loss.mean()

CKPT_DIR = "/content/drive/MyDrive/data_cuad_transformer/ckpt_roberta_finetuned"
os.makedirs(CKPT_DIR, exist_ok=True)

Y_train_mat = np.array(train_df["y_multi"].tolist())
class_counts = Y_train_mat.sum(axis=0)
loss_fn = ClassBalancedFocalLoss(class_counts=torch.tensor(class_counts), device=device)

### Fine tuning RoBertA

In [22]:
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS, problem_type="multi_label_classification")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Evaluation and threshold tuning

In [23]:
def multilabel_metrics(y_true, y_pred):
    return {
        "micro_f1": f1_score(y_true, y_pred, average="micro", zero_division=0),
        "macro_f1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "micro_precision": precision_score(y_true, y_pred, average="micro", zero_division=0),
        "micro_recall": recall_score(y_true, y_pred, average="micro", zero_division=0),
        "macro_precision": precision_score(y_true, y_pred, average="macro", zero_division=0),
        "macro_recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
    }

def find_best_threshold_probs(y_true, y_prob, metric_key="micro_f1"):
    best_t, best_stats = 0.5, None
    for t in np.linspace(0.1, 0.9, 17):
        y_hat = (y_prob >= t).astype(int)
        stats = multilabel_metrics(y_true, y_hat)
        if best_stats is None or stats[metric_key] > best_stats[metric_key]:
            best_t, best_stats = float(t), stats
    return best_t, best_stats

def infer_loader(model, loader):
    model.eval()
    all_logits, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            labels = batch["labels"].to(device)
            logits = model(**inputs).logits
            all_logits.append(logits.detach().cpu().numpy())
            all_labels.append(labels.detach().cpu().numpy())
    return np.vstack(all_logits), np.vstack(all_labels)

### Training and Evaluation

In [24]:
max_epochs = 10
best_val_micro_f1 = -1.0
no_improve = 0
patience = 3

print("Training : ")
for epoch in range(1, max_epochs + 1):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        labels = batch["labels"].to(device)
        optimizer.zero_grad()
        logits = model(**inputs).logits
        loss = loss_fn(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item() * labels.size(0)

    avg_loss = total_loss / len(train_loader.dataset)
    val_logits, y_val = infer_loader(model, val_loader)
    val_probs = 1 / (1 + np.exp(-val_logits))
    t_val, stats_val = find_best_threshold_probs(y_val, val_probs)

    print(f"Epoch {epoch} | Loss: {avg_loss:.4f} | Val Micro-F1: {stats_val['micro_f1']:.4f} | Threshold: {t_val:.2f}")

    if stats_val["micro_f1"] > best_val_micro_f1:
        best_val_micro_f1 = stats_val["micro_f1"]
        model.save_pretrained(CKPT_DIR)
        tokenizer.save_pretrained(CKPT_DIR)
        with open(os.path.join(CKPT_DIR, "threshold.json"), "w") as f:
            json.dump({"best_threshold": float(t_val), "val_stats": stats_val}, f, indent=2)
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            print("Early stopping.")
            break

Training : 
Epoch 1 | Loss: 0.0948 | Val Micro-F1: 0.6657 | Threshold: 0.45
Epoch 2 | Loss: 0.0742 | Val Micro-F1: 0.6750 | Threshold: 0.40
Epoch 3 | Loss: 0.0744 | Val Micro-F1: 0.6753 | Threshold: 0.45
Epoch 4 | Loss: 0.0741 | Val Micro-F1: 0.6718 | Threshold: 0.40
Epoch 5 | Loss: 0.0720 | Val Micro-F1: 0.6790 | Threshold: 0.40
Epoch 6 | Loss: 0.0713 | Val Micro-F1: 0.6967 | Threshold: 0.45
Epoch 7 | Loss: 0.0686 | Val Micro-F1: 0.7196 | Threshold: 0.45
Epoch 8 | Loss: 0.0647 | Val Micro-F1: 0.7161 | Threshold: 0.45
Epoch 9 | Loss: 0.0615 | Val Micro-F1: 0.7123 | Threshold: 0.40
Epoch 10 | Loss: 0.0556 | Val Micro-F1: 0.7376 | Threshold: 0.40


In [25]:
from transformers import AutoModelForSequenceClassification

with open(os.path.join(CKPT_DIR, "threshold.json"), "r") as f:
    meta_rb = json.load(f)

best_t_rb = float(meta_rb["best_threshold"])
best_val_stats_rb = {k: float(v) for k, v in meta_rb["val_stats"].items()}

roberta_best = AutoModelForSequenceClassification.from_pretrained(CKPT_DIR).to(device)

test_logits_rb, y_test = infer_loader(roberta_best, test_loader)
test_probs_rb = 1.0 / (1.0 + np.exp(-test_logits_rb))
y_test_hat_roberta = (test_probs_rb >= best_t_rb).astype(int)

test_stats_rb = multilabel_metrics(y_test, y_test_hat_roberta)

print("RoBERTa F1 Score (Micro):", f1_score(y_test, y_test_hat_roberta, average="micro", zero_division=0))
print("VAL :", {k: round(v, 4) for k, v in best_val_stats_rb.items()})
print("TEST:", {k: round(v, 4) for k, v in test_stats_rb.items()})

RoBERTa F1 Score (Micro): 0.6829752066115703
VAL : {'micro_f1': 0.7376, 'macro_f1': 0.4636, 'micro_precision': 0.6672, 'micro_recall': 0.8248, 'macro_precision': 0.429, 'macro_recall': 0.5358}
TEST: {'micro_f1': 0.683, 'macro_f1': 0.4322, 'micro_precision': 0.5988, 'micro_recall': 0.7946, 'macro_precision': 0.3972, 'macro_recall': 0.524}


### Sample Output :

In [26]:
def names_from_vec(vec, names):
    return [names[i] for i, v in enumerate(vec) if int(v) == 1]

def show_roberta_preds(k=3, seed=42):

    y_hat_roberta = globals().get("y_test_hat_roberta", globals().get("y_test_hat"))
    assert y_hat_roberta is not None, "Run RoBERTa eval first to create y_test_hat (or y_test_hat_roberta)."

    np.random.seed(seed)
    k = min(k, len(test_df))
    idxs = np.random.choice(len(test_df), size=k, replace=False)

    for idx in idxs:
        clause = test_df["clause_text"].iloc[idx]
        clause_short = (clause[:600] + "...") if len(clause) > 600 else clause

        true_labels = names_from_vec(y_test[idx], label_names)
        pred_labels = names_from_vec(y_hat_roberta[idx], label_names)

        print("\n— Clause —")
        print(clause_short, "\n")
        print(" True:", true_labels)
        print(" Pred (RoBERTa):", pred_labels)

show_roberta_preds(k=3, seed=42)


— Clause —
Exhibit 10.5 STRATEGIC ALLIANCE AGREEMENT ---------------------------- THIS STRATEGIC ALLIANCE AGREEMENT (this "Agreement") is made as of 31 December, --------- 1996, between NORTHERN TELECOM LIMITED, a Canadian corporation ("NTL"), and --- ENTRUST TECHNOLOGIES INC., a Maryland corporation ("ETI"). --- WHEREAS, pursuant to an asset transfer agreement between NTL and Entrust Technologies Limited of even date (the "NTL Transfer Agreement") and an asset ---------------------- transfer agreement between Northern Telecom Inc. and ETI of even date, the Entrust Technology (as defined herein) has been... 

 True: ['Affiliate License-Licensee', 'Affiliate License-Licensor', 'Agreement Date', 'Anti-Assignment', 'Cap On Liability', 'Covenant Not To Sue', 'Exclusivity', 'Insurance', 'Ip Ownership Assignment', 'License Grant', 'Minimum Commitment', 'No-Solicit Of Employees', 'Parties', 'Price Restrictions', 'Volume Restriction', 'Warranty Duration']
 Pred (RoBERTa): ['Affiliate License-

### Legal Bert

In [38]:
import os, json, numpy as np, pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from torch.optim import AdamW
from sklearn.metrics import f1_score, precision_score, recall_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

df_labeled = pd.read_pickle("/content/drive/MyDrive/data_cuad_transformer/df_labeled.pkl")
import pickle
with open("/content/drive/MyDrive/data_cuad_transformer/label_names.pkl", "rb") as f:
    label_names = pickle.load(f)

df_labeled["y_multi"] = df_labeled["y_multi"].apply(lambda v: np.asarray(v, dtype=np.float32))
y_lengths = df_labeled["y_multi"].map(lambda v: v.shape[0]).unique()
if len(y_lengths) != 1:
    raise ValueError(f"Inconsistent y_multi lengths across rows: {y_lengths}")

TRUE_NUM_LABELS = int(y_lengths[0])

if len(label_names) != TRUE_NUM_LABELS:
    print(f"label_names len={len(label_names)}")
    if len(label_names) > TRUE_NUM_LABELS:
        label_names = list(label_names)[:TRUE_NUM_LABELS]
    else:
        label_names = list(label_names) + [f"label_{i}" for i in range(len(label_names), TRUE_NUM_LABELS)]

NUM_LABELS = TRUE_NUM_LABELS

def _align_to_n(vec, n):
    vec = np.asarray(vec, dtype=np.float32)
    if vec.ndim == 1:
        if vec.shape[0] > n:
            return vec[:n]
        if vec.shape[0] < n:
            return np.pad(vec, (0, n - vec.shape[0]), constant_values=0.0)
        return vec
    if vec.shape[1] > n:
        return vec[:, :n]
    if vec.shape[1] < n:
        pad = np.zeros((vec.shape[0], n - vec.shape[1]), dtype=np.float32)
        return np.hstack([vec, pad])
    return vec

Using device: cuda
label_names len=40


In [39]:
class ClauseDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512, num_labels=None):
        self.texts = list(texts)
        arr = np.array(list(labels), dtype=np.float32)
        if num_labels is not None:
            arr = _align_to_n(arr, num_labels)
        self.labels = arr
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

## Split dataset

In [42]:
from sklearn.model_selection import train_test_split
titles = df_labeled["contract_id"].unique()
train_t, test_t = train_test_split(titles, test_size=0.2, random_state=42)
train_t, val_t  = train_test_split(train_t, test_size=0.2, random_state=42)

def take(ids): return df_labeled[df_labeled["contract_id"].isin(ids)].reset_index(drop=True)
train_df, val_df, test_df = take(train_t), take(val_t), take(test_t)

MODEL_NAME = "nlpaueb/legal-bert-base-uncased"
MAX_LEN = 512
BATCH_SIZE = 8
NUM_LABELS = len(label_names)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
train_ds = ClauseDataset(train_df["clause_text"], train_df["y_multi"], tokenizer, MAX_LEN, num_labels=NUM_LABELS)
val_ds   = ClauseDataset(val_df["clause_text"],   val_df["y_multi"],   tokenizer, MAX_LEN, num_labels=NUM_LABELS)
test_ds  = ClauseDataset(test_df["clause_text"],  test_df["y_multi"],  tokenizer, MAX_LEN, num_labels=NUM_LABELS)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE)

class ClassBalancedFocalLoss(nn.Module):
    def __init__(self, beta=0.9999, gamma=2.0, class_counts=None, device='cpu'):
        super().__init__()
        effective_num = 1.0 - torch.pow(beta, class_counts)
        weights = (1.0 - beta) / (effective_num + 1e-8)
        weights = weights / weights.sum() * len(class_counts)
        self.weights = weights.to(device)
        self.gamma = gamma

    def forward(self, logits, targets):
        probs = torch.sigmoid(logits)
        ce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
        p_t = probs * targets + (1 - probs) * (1 - targets)
        loss = ((1 - p_t) ** self.gamma) * ce_loss * self.weights
        return loss.mean()

CKPT_DIR = "/content/drive/MyDrive/data_cuad_transformer/ckpt_legalbert_finetuned"
os.makedirs(CKPT_DIR, exist_ok=True)

Y_train_mat = np.array(list(train_df["y_multi"]), dtype=np.float32)
Y_train_mat = _align_to_n(Y_train_mat, NUM_LABELS)
class_counts = Y_train_mat.sum(axis=0)

config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS, problem_type="multi_label_classification")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Metrics and Fine Tuning

In [43]:
def multilabel_metrics(y_true, y_pred):
    return {
        "micro_f1": f1_score(y_true, y_pred, average="micro", zero_division=0),
        "macro_f1": f1_score(y_true, y_pred, average="macro", zero_division=0),
        "micro_precision": precision_score(y_true, y_pred, average="micro", zero_division=0),
        "micro_recall": recall_score(y_true, y_pred, average="micro", zero_division=0),
        "macro_precision": precision_score(y_true, y_pred, average="macro", zero_division=0),
        "macro_recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
    }

def find_best_threshold_probs(y_true, y_prob, metric_key="micro_f1"):
    best_t, best_stats = 0.5, None
    for t in np.linspace(0.1, 0.9, 17):
        y_hat = (y_prob >= t).astype(int)
        stats = multilabel_metrics(y_true, y_hat)
        if best_stats is None or stats[metric_key] > best_stats[metric_key]:
            best_t, best_stats = float(t), stats
    return best_t, best_stats

def infer_loader(model, loader):
    model.eval()
    all_logits, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            labels = batch["labels"].to(device)
            logits = model(**inputs).logits
            all_logits.append(logits.detach().cpu().numpy())
            all_labels.append(labels.detach().cpu().numpy())
    return np.vstack(all_logits), np.vstack(all_labels)

## Training

In [44]:
max_epochs = 10
best_val_micro_f1 = -1.0
no_improve = 0
patience = 3

print("Training :")
for epoch in range(1, max_epochs + 1):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
        labels = batch["labels"].to(device)
        optimizer.zero_grad()
        logits = model(**inputs).logits
        loss = loss_fn(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item() * labels.size(0)

    avg_loss = total_loss / len(train_loader.dataset)
    val_logits, y_val = infer_loader(model, val_loader)
    val_probs = 1 / (1 + np.exp(-val_logits))
    t_val, stats_val = find_best_threshold_probs(y_val, val_probs)

    print(f"Epoch {epoch} | Loss: {avg_loss:.4f} | Val Micro-F1: {stats_val['micro_f1']:.4f} | Threshold: {t_val:.2f}")

    if stats_val["micro_f1"] > best_val_micro_f1:
        best_val_micro_f1 = stats_val["micro_f1"]
        model.save_pretrained(CKPT_DIR)
        tokenizer.save_pretrained(CKPT_DIR)
        with open(os.path.join(CKPT_DIR, "threshold.json"), "w") as f:
            json.dump({"best_threshold": float(t_val), "val_stats": stats_val}, f, indent=2)
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            print("Early stopping.")
            break

Training :
Epoch 1 | Loss: 0.0970 | Val Micro-F1: 0.6499 | Threshold: 0.45
Epoch 2 | Loss: 0.0761 | Val Micro-F1: 0.6611 | Threshold: 0.45
Epoch 3 | Loss: 0.0738 | Val Micro-F1: 0.6731 | Threshold: 0.40
Epoch 4 | Loss: 0.0715 | Val Micro-F1: 0.6759 | Threshold: 0.45
Epoch 5 | Loss: 0.0676 | Val Micro-F1: 0.6762 | Threshold: 0.45
Epoch 6 | Loss: 0.0635 | Val Micro-F1: 0.7099 | Threshold: 0.45
Epoch 7 | Loss: 0.0579 | Val Micro-F1: 0.7102 | Threshold: 0.45
Epoch 8 | Loss: 0.0525 | Val Micro-F1: 0.7118 | Threshold: 0.45
Epoch 9 | Loss: 0.0488 | Val Micro-F1: 0.7325 | Threshold: 0.45
Epoch 10 | Loss: 0.0450 | Val Micro-F1: 0.7346 | Threshold: 0.45


## Evaluation

In [45]:
from transformers import AutoModelForSequenceClassification

with open(os.path.join(CKPT_DIR, "threshold.json"), "r") as f:
    meta_lb = json.load(f)

best_t_lb = float(meta_lb["best_threshold"])
best_val_stats_lb = {k: float(v) for k, v in meta_lb["val_stats"].items()}

legalbert_best = AutoModelForSequenceClassification.from_pretrained(CKPT_DIR).to(device)
legalbert_best.eval()

test_logits_lb, y_test = infer_loader(legalbert_best, test_loader)
test_probs_lb = 1.0 / (1.0 + np.exp(-test_logits_lb))
y_test_hat_legalbert = (test_probs_lb >= best_t_lb).astype(int)

test_stats_lb = multilabel_metrics(y_test, y_test_hat_legalbert)

print("LegalBERT F1 Score (Micro):", f1_score(y_test, y_test_hat_legalbert, average="micro", zero_division=0))
print("VAL :", {k: round(v, 4) for k, v in best_val_stats_lb.items()})
print("TEST:", {k: round(v, 4) for k, v in test_stats_lb.items()})

LegalBERT F1 Score (Micro): 0.7007654836464857
VAL : {'micro_f1': 0.7346, 'macro_f1': 0.443, 'micro_precision': 0.7019, 'micro_recall': 0.7704, 'macro_precision': 0.4456, 'macro_recall': 0.4712}
TEST: {'micro_f1': 0.7008, 'macro_f1': 0.4648, 'micro_precision': 0.6518, 'micro_recall': 0.7577, 'macro_precision': 0.552, 'macro_recall': 0.4956}


## Sample Output

In [56]:
def align_cols(A, N):
    A = np.asarray(A)
    if A.ndim == 1:
        A = A[None, :]
    if A.shape[1] > N:
        return A[:, :N]
    if A.shape[1] < N:
        pad = np.zeros((A.shape[0], N - A.shape[1]), dtype=A.dtype)
        return np.hstack([A, pad])
    return A
def names_from_vec(vec, names):
    vec = np.asarray(vec).ravel()
    N = min(len(vec), len(names))
    return [names[i] for i in range(N) if int(vec[i]) == 1]

N = len(label_names)
y_test_arr = align_cols(y_test, N).astype(int)
y_pred_legalbert = align_cols(y_test_hat, N).astype(int)

np.random.seed(42)
k = 3
idxs = np.random.choice(len(test_df), size=min(k, len(test_df)), replace=False)

for idx in idxs:
    clause = test_df["clause_text"].iloc[idx]
    clause_short = (clause[:600] + "...") if len(clause) > 600 else clause

    true_labels = names_from_vec(y_test_arr[idx], label_names)
    pred_labels = names_from_vec(y_pred_legalbert[idx], label_names)

    print("\n— Clause —")
    print(clause_short, "\n")
    print(" True:", true_labels)
    print(" Pred (LegalBERT):", pred_labels)


— Clause —
Exhibit 10.5 STRATEGIC ALLIANCE AGREEMENT ---------------------------- THIS STRATEGIC ALLIANCE AGREEMENT (this "Agreement") is made as of 31 December, --------- 1996, between NORTHERN TELECOM LIMITED, a Canadian corporation ("NTL"), and --- ENTRUST TECHNOLOGIES INC., a Maryland corporation ("ETI"). --- WHEREAS, pursuant to an asset transfer agreement between NTL and Entrust Technologies Limited of even date (the "NTL Transfer Agreement") and an asset ---------------------- transfer agreement between Northern Telecom Inc. and ETI of even date, the Entrust Technology (as defined herein) has been... 

 True: ['Affiliate License-Licensee', 'Affiliate License-Licensor', 'Agreement Date', 'Anti-Assignment', 'Cap On Liability', 'Covenant Not To Sue', 'Governing Law', 'Irrevocable Or Perpetual License', 'Joint Ip Ownership', 'Minimum Commitment', 'No-Solicit Of Customers', 'Non-Disparagement', 'Price Restrictions', 'Revenue/Profit Sharing']
 Pred (LegalBERT): ['Agreement Date', 'Ca

In [57]:
import os, json, numpy as np, pandas as pd, torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import f1_score, precision_score, recall_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ClauseDatasetSimple(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512, num_labels=None):
        self.texts = list(texts)
        arr = np.array(list(labels), dtype=np.float32)
        if num_labels is not None:
            if arr.ndim == 1:
                arr = arr[None, :]
            if arr.shape[1] > num_labels:
                arr = arr[:, :num_labels]
            elif arr.shape[1] < num_labels:
                pad = np.zeros((arr.shape[0], num_labels - arr.shape[1]), dtype=np.float32)
                arr = np.hstack([arr, pad])
        self.labels = arr
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx], truncation=True, padding="max_length",
            max_length=self.max_len, return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

def infer_loader(model, loader):
    model.eval()
    all_logits, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            labels = batch["labels"].to(device)
            logits = model(**inputs).logits
            all_logits.append(logits.detach().cpu().numpy())
            all_labels.append(labels.detach().cpu().numpy())
    return np.vstack(all_logits), np.vstack(all_labels)

def evaluate_ckpt(ckpt_dir, model_name, max_len=512, batch_size=8):
    """Load tokenizer+model from ckpt_dir, rebuild test loader, evaluate with saved threshold."""
    thresh_path = os.path.join(ckpt_dir, "threshold.json")
    if not (os.path.isdir(ckpt_dir) and os.path.isfile(thresh_path)):
        return None, f"[skip] Missing checkpoint or threshold.json for {model_name} at {ckpt_dir}"
    tok = AutoTokenizer.from_pretrained(ckpt_dir, use_fast=True)
    mdl = AutoModelForSequenceClassification.from_pretrained(ckpt_dir).to(device)

    num_labels = mdl.config.num_labels
    test_ds = ClauseDatasetSimple(
        test_df["clause_text"], test_df["y_multi"],
        tokenizer=tok, max_len=max_len, num_labels=num_labels
    )
    test_loader = DataLoader(test_ds, batch_size=batch_size)

    with open(thresh_path, "r") as f:
        meta = json.load(f)
    best_t = float(meta.get("best_threshold", 0.5))
    val_stats = {k: float(v) for k, v in meta.get("val_stats", {}).items()}


    test_logits, y_test = infer_loader(mdl, test_loader)
    test_probs = 1.0 / (1.0 + np.exp(-test_logits))
    y_hat = (test_probs >= best_t).astype(int)
    test_stats = multilabel_metrics(y_test, y_hat)

    row = {
        "model": model_name,
        "n_labels": int(num_labels),
        "best_t": round(best_t, 3),
        "val_micro_f1": round(val_stats.get("micro_f1", float("nan")), 4),
        "val_macro_f1": round(val_stats.get("macro_f1", float("nan")), 4),
        "test_micro_f1": round(test_stats["micro_f1"], 4),
        "test_macro_f1": round(test_stats["macro_f1"], 4),
        "test_precision": round(test_stats["micro_precision"], 4),
        "test_recall": round(test_stats["micro_recall"], 4),
    }
    return row, None

CKPTS = [
    ("RoBERTa",   "/content/drive/MyDrive/data_cuad_transformer/ckpt_roberta_finetuned"),
    ("LegalBERT", "/content/drive/MyDrive/data_cuad_transformer/ckpt_legalbert_finetuned"),
     ("BERT",      "/content/drive/MyDrive/data_cuad_transformer/ckpt_bert_finetuned"),
]

rows, notes = [], []
for name, path in CKPTS:
    r, msg = evaluate_ckpt(path, name, max_len=512, batch_size=8)
    if r is not None:
        rows.append(r)
    if msg:
        notes.append(msg)

if rows:
    results_df = (pd.DataFrame(rows)
                    .sort_values("test_micro_f1", ascending=False)
                    .reset_index(drop=True))
    display(results_df)
    print(f"\nBest: {results_df.iloc[0]['model']} | test micro-F1={results_df.iloc[0]['test_micro_f1']:.4f} | n_labels={results_df.iloc[0]['n_labels']} | best_t={results_df.iloc[0]['best_t']:.3f}")
else:
    print("No models evaluated. Check CKPT paths/threshold.json files.")

for n in notes:
    print(n)


Unnamed: 0,model,n_labels,best_t,val_micro_f1,val_macro_f1,test_micro_f1,test_macro_f1,test_precision,test_recall
0,RoBERTa,39,0.4,0.7376,0.4636,0.5202,0.3295,0.44,0.6362
1,LegalBERT,41,0.45,0.7346,0.443,0.5201,0.3184,0.4608,0.5968
2,BERT,41,0.4,0.6256,0.4957,0.5001,0.3942,0.3794,0.7334



Best: RoBERTa | test micro-F1=0.5202 | n_labels=39 | best_t=0.400
