### Extracting the individual clauses and labels

In [101]:
!pip install transformers
!pip install datasets
!pip install torch
!pip install 'accelerate>=0.26.0'



In [102]:
import os
import json

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from datasets import Dataset



In [103]:
# Data Loading Function
def load_clauses_data(data_dir):
    """
    Load clauses data from the specified directory
    Parameters:
        data_dir (str): Path to the directory containing service folders
    Returns:
        list: List of dictionaries containing clause data
    """
    all_clauses = []

    for service_folder in os.listdir(data_dir):
        service_path = os.path.join(data_dir, service_folder)

        if not os.path.isdir(service_path):
            continue

        clauses_file = os.path.join(service_path, 'clauses.json')
        if not os.path.exists(clauses_file):
            continue

        try:
            with open(clauses_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if 'clauses' not in data or not data['clauses']:
                print(f"⚠️ WARNING: 'clauses' list is empty in '{service_folder}/clauses.json'")
                continue

            for clause in data['clauses']:
                if not all(key in clause for key in ['clause_text', 'description', 'rating']):
                    print(f"⚠️ WARNING: Skipping a clause in '{service_folder}' due to missing required fields")
                    continue

                clause_data = {
                    'service': service_folder,
                    'clause_text': clause['clause_text'],
                    'description': clause['description'],
                    'rating': clause['rating']
                }
                all_clauses.append(clause_data)

        except Exception as e:
            print(f"Error processing {service_folder}: {str(e)}")

    return all_clauses

In [104]:
rating_map = {
    'good': 0,
    'neutral': 1,
    'bad': 2,
    'blocker': 3,
}

In [105]:
DATA_DIR = "CS224-TC/data_all_202503120623106"
if not os.path.exists(DATA_DIR):
    print(f"Directory '{DATA_DIR}' not found. Cloning repository...")
    # Remove existing directory if it exists
    !rm -rf CS224-TC
    # Clone the repository
    !git clone --depth 1 --filter=blob:none https://github.com/AI-knows-your-rights/CS224-TC.git
else:
    print(f"Directory '{DATA_DIR}' already exists. Skipping cloning.")

# !git clone --depth 1 --filter=blob:none --no-checkout https://github.com/AI-knows-your-rights/CS224-TC.git
# Enable sparse-checkout
#!git sparse-checkout init --cone
# Specify the folder you want to checkout (e.g., "your_folder")
#!git sparse-checkout set data_all_202503120623106


Directory 'CS224-TC/data_all_202503120623106' already exists. Skipping cloning.


In [106]:
# Load and prepare the dataset
clauses_data = load_clauses_data(DATA_DIR)
df = pd.DataFrame(clauses_data)

# Convert ratings to numerical values
df['rating'] = df['rating'].map(rating_map)
print("\nRating distribution after conversion:")
print(df['rating'].value_counts().sort_index())

# Split into training and testing sets

training_size = round(df.shape[0] * 0.8)

train_df, test_df = train_test_split(df, train_size=training_size, random_state=42)

print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")
print("\nRating distribution in training set:")
print(train_df['rating'].value_counts().sort_index())


Rating distribution after conversion:
rating
0    2606
1    4072
2    2896
3     219
Name: count, dtype: int64
Training set size: 7834
Testing set size: 1959

Rating distribution in training set:
rating
0    2080
1    3258
2    2328
3     168
Name: count, dtype: int64


### BERT

In [107]:
model_name = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [108]:
print("GPU Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")


GPU Available: True
Using GPU: Tesla T4


In [109]:
# Move model to GPU if available
model = model.to(device)

In [110]:
# Define tokenization function

def tokenize_function(examples):
    # Convert the input to a list of strings and ensure it's properly formatted
    texts = [str(text) for text in examples["clause_text"]]  # Ensure text is string

    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors=None  # Important: keep this as None for batched processing
    )


In [111]:
def prepare_dataset(df):
    return Dataset.from_pandas(df)

train_raw_data = prepare_dataset(train_df)
test_raw_data = prepare_dataset(test_df)

In [112]:
print(train_raw_data)

print("Dataset features:", train_raw_data.features)
print("Sample row:", train_raw_data[0])


train_dataset = train_raw_data.map(tokenize_function, batched=True)
test_dataset = test_raw_data.map(tokenize_function, batched=True)

columns_to_remove = ['service', 'clause_text', 'description']
train_dataset = train_dataset.remove_columns(columns_to_remove)
test_dataset = test_dataset.remove_columns(columns_to_remove)
train_dataset = train_dataset.rename_column('rating', 'labels')
test_dataset = test_dataset.rename_column('rating', 'labels')

train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])


Dataset({
    features: ['service', 'clause_text', 'description', 'rating', '__index_level_0__'],
    num_rows: 7834
})
Dataset features: {'service': Value(dtype='string', id=None), 'clause_text': Value(dtype='string', id=None), 'description': Value(dtype='string', id=None), 'rating': Value(dtype='int64', id=None), '__index_level_0__': Value(dtype='int64', id=None)}
Sample row: {'service': 'National Geographic', 'clause_text': 'You may download Course Content identified as available for download only for your own personal, non-commercial use.', 'description': "Users can't use the Service for commercial purposes (unless the Service consents): it is only for personal, individual purposes.", 'rating': 1, '__index_level_0__': 8183}


Map:   0%|          | 0/7834 [00:00<?, ? examples/s]

Map:   0%|          | 0/1959 [00:00<?, ? examples/s]

In [113]:
# Define metrics for evaluation
import torch.nn.functional as F

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Convert labels to PyTorch tensor before applying one_hot
    labels = torch.tensor(labels, dtype=torch.int64)
    labels = F.one_hot(labels, num_classes=4).float()  # Convert to one-hot and float

    # predictions = predictions.squeeze()
    predictions = torch.tensor(predictions).float()
    mse = ((predictions - labels) ** 2).mean()
    rmse = np.sqrt(mse.item()) # Convert mse to a python number for numpy

    # predicted_class = torch.argmax(predictions, dim=1)

    rmse = np.sqrt(mse)
    return {
        "mse": mse.item(), # Convert mse to a python number for json serialization
        "rmse": rmse
    }




In [114]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    no_cuda=False,  # Enable GPU
    fp16=True,
)



In [115]:
# To track the training
!pip install weave
!wandb login


[34m[1mwandb[0m: Currently logged in as: [33mrayhu007[0m ([33mrayhu007-stanford[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [116]:
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    # run_name="ray-legal-bert-regression",
)

In [117]:
print(train_dataset)

Dataset({
    features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 7834
})


In [118]:
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss,Mse,Rmse
1,0.9592,0.590019,1.934452,1.390846
2,0.5035,0.564782,3.34441,1.828773
3,0.3278,0.595005,4.725522,2.173827


TrainOutput(global_step=2940, training_loss=0.5525915029097577, metrics={'train_runtime': 745.0156, 'train_samples_per_second': 31.546, 'train_steps_per_second': 3.946, 'total_flos': 6183747063816192.0, 'train_loss': 0.5525915029097577, 'epoch': 3.0})

In [126]:
# Evaluate the model on test set
test_results = trainer.evaluate()
print("\nTest Results:")
print(test_results)

# Function to predict ratings for new clauses

def predict_rating(clause_text):
    # Check if clause_text is None and handle it gracefully
    if clause_text is None:
        print("Warning: Encountered a None clause_text. Returning -1.")  # Or any other default value
        return -1  # Or any other default value

    inputs = tokenizer(
        clause_text,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits.squeeze()

    predicted_class = torch.argmax(predictions).item()
    return predicted_class

# Test prediction with a sample clause
sample_clause = test_df['clause_text'].iloc[0]
predicted_rating = predict_rating(sample_clause)
actual_rating = test_df['rating'].iloc[0]

print("\nSample Prediction:")
print(f"Predicted Rating: {predicted_rating:.2f}")
print(f"Actual Rating: {actual_rating}")


Test Results:
{'eval_loss': 0.5647823810577393, 'eval_mse': 3.3444104194641113, 'eval_rmse': 1.8287729024887085, 'eval_runtime': 15.2471, 'eval_samples_per_second': 128.484, 'eval_steps_per_second': 16.069, 'epoch': 3.0}

Sample Prediction:
Predicted Rating: 1.00
Actual Rating: 3


In [128]:

# Evaluate the model on the entire test dataset
predictions = trainer.predict(test_dataset)

# Access predicted labels and metrics
predicted_labels = np.argmax(predictions.predictions, axis=1)
metrics = predictions.metrics

print("\nTest Results (Full Dataset):")
print(metrics)

correct_predictions = np.sum(predicted_labels == test_df['rating'])
total_predictions = len(test_df)
accuracy = correct_predictions / total_predictions

print(f"\nOverall Accuracy: {accuracy:.2f}")




Test Results (Full Dataset):
{'test_loss': 0.5647823810577393, 'test_mse': 3.3444104194641113, 'test_rmse': 1.8287729024887085, 'test_runtime': 15.9246, 'test_samples_per_second': 123.017, 'test_steps_per_second': 15.385}

Overall Accuracy: 0.79


In [None]:
# Predict ratings for all samples in test_df and compare with actual ratings
for index in range(len(test_df)):
    sample_clause = test_df['clause_text'].iloc[index]  # Get the clause text
    predicted_rating = predict_rating(sample_clause)   # Predict the rating
    actual_rating = test_df['rating'].iloc[index]     # Get the actual rating

    print(f"\nSample Prediction ({index + 1}):")
    print(f"Predicted Rating: {predicted_rating}")  # No need for :.2f as it's an integer
    print(f"Actual Rating: {actual_rating}")