In [1]:
pip install transformers datasets torch scikit-learn pandas openpyxl

Note: you may need to restart the kernel to use updated packages.




In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Read the clause/label table from disk.
df = pd.read_csv("combined_clauses.csv")

# Map every distinct label to an integer id and keep the ids next to the text.
le = LabelEncoder()
df = df.assign(Label_enc=le.fit_transform(df["Label"]))

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["Clause"].tolist(),
    df["Label_enc"].tolist(),
    random_state=42,
    test_size=0.1,
)


In [None]:
# Only the last expression of a cell is rendered automatically, so the filtered
# frame has to be displayed explicitly or its result is silently discarded.
# `display` is provided by IPython without an import.
display(df[df['Label_enc'] == 41])

# Largest encoded label id present in the data.
df['Label_enc'].max()

np.int64(46)

In [3]:
from transformers import AutoTokenizer

# Checkpoint used for both the tokenizer and (later) the classifier.
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize both splits with identical settings: truncate to at most 256 tokens
# and pad within each split.
train_encodings, val_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=256)
    for texts in (train_texts, val_texts)
)


In [4]:
import torch

class LegalDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping (anything with ``.items()``) from field name to a
            per-example sequence, e.g. the output of a tokenizer call.
        labels: Sequence with one label per example.

    Raises:
        ValueError: If an encoding field does not have exactly one entry per
            label. Without this check a short label list would silently
            truncate the dataset and a long one would fail with an
            ``IndexError`` part-way through iteration.
    """

    def __init__(self, encodings, labels):
        n_labels = len(labels)
        for key, values in encodings.items():
            if len(values) != n_labels:
                raise ValueError(
                    f"Encoding field '{key}' has {len(values)} examples "
                    f"but {n_labels} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a ``labels`` entry."""
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's encodings and labels in a LegalDataset.
train_dataset, val_dataset = map(
    LegalDataset,
    (train_encodings, val_encodings),
    (train_labels, val_labels),
)


In [5]:
from transformers import AutoModelForSequenceClassification

# One output unit per class known to the label encoder.
num_labels = len(le.classes_)

# Store the class names in the model config so that a checkpoint saved from
# this model can translate predicted ids back to label names on its own.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import Trainer, TrainingArguments

import inspect

# Newer transformers releases name this argument `eval_strategy`, older ones
# only accept `evaluation_strategy`. Use whichever keyword the installed
# version exposes so the cell runs on both.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./legalbert_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key produced by compute_metrics
    **{_eval_strategy_arg: "epoch"},   # evaluate once per epoch
)

from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``. ``logits`` may
            also be a tuple of arrays, in which case its first element is
            taken as the logits.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # Some models emit extra outputs next to the logits; keep the logits only.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = logits.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    # zero_division=0 gives the same score as the default setting but without
    # a warning for every class that is never predicted.
    f1 = f1_score(labels, preds, average="weighted", zero_division=0)
    return {"accuracy": acc, "f1": f1}

# Bundle the model, hyperparameters, both datasets and the metric function.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Run fine-tuning; the returned training summary is the cell's output.
trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9509,0.808124,0.753187,0.704326
2,0.6416,0.666271,0.779838,0.743163
3,0.5214,0.656499,0.782155,0.754917




TrainOutput(global_step=2913, training_loss=0.9271633866477413, metrics={'train_runtime': 25027.7234, 'train_samples_per_second': 0.931, 'train_steps_per_second': 0.116, 'total_flos': 3065824222963200.0, 'train_loss': 0.9271633866477413, 'epoch': 3.0})

In [7]:
# Reuse the output directory the trainer was configured with instead of
# repeating the path literal, so model and tokenizer always land together.
save_dir = trainer.args.output_dir
trainer.save_model(save_dir)

# Last expression: the tuple of written tokenizer files is shown as cell output.
tokenizer.save_pretrained(save_dir)


('./legalbert_finetuned\\tokenizer_config.json',
 './legalbert_finetuned\\special_tokens_map.json',
 './legalbert_finetuned\\vocab.txt',
 './legalbert_finetuned\\added_tokens.json',
 './legalbert_finetuned\\tokenizer.json')

In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import joblib

# Re-read the source table that holds the Clause and Label columns.
df = pd.read_csv("combined_clauses.csv")

# Learn the label -> id mapping, then attach the encoded ids to the frame.
le = LabelEncoder().fit(df["Label"])
df["Label_enc"] = le.transform(df["Label"])

# Persist the fitted encoder so predictions can be decoded at inference time.
joblib.dump(le, "label_encoder.pkl")

print("✅ Label encoder recreated and saved as label_encoder.pkl")


✅ Label encoder recreated and saved as label_encoder.pkl
