### Extracting the individual clauses and labels

In [1]:
# For retrieving the clauses and labels.
import os
import json

In [2]:
data_dir = "data"  # Data directory containing company folders
clause_pairs = []

# Step 1: Check if data directory exists
if not os.path.exists(data_dir):
    print(f"❌ ERROR: Data directory '{data_dir}' does not exist.")
    exit()

# Step 2: Loop through all company folders inside the data directory
for company in os.listdir(data_dir):
    company_path = os.path.join(data_dir, company)

    # Check if it's a directory (company folder)
    if os.path.isdir(company_path):
        clause_file = os.path.join(company_path, "clauses.json")

        # Step 3: Check if clauses.json exists
        if not os.path.isfile(clause_file):
            print(f"❌ ERROR: 'clauses.json' not found in '{company}' folder")
            continue

        try:
            # Step 4: Check if clauses.json is valid JSON
            with open(clause_file, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Step 5: Check if 'clauses' key exists
            if "clauses" not in data:
                print(f"⚠️ WARNING: No 'clauses' key found in '{company}/clauses.json'")
                continue

            clauses = data["clauses"]
            if not clauses:
                print(f"⚠️ WARNING: 'clauses' list is empty in '{company}/clauses.json'")
                continue

            # Step 6: Extract (description, rating) pairs
            for clause in clauses:
                description = clause.get("description", "").strip() if clause.get("description") else ""
                rating = clause.get("rating", "").strip() if clause.get("rating") else ""

                if description and rating:
                    clause_pairs.append((description, rating))
                else:
                    print(f"⚠️ WARNING: Skipping a clause in '{company}' due to missing description or rating.")
        
        except json.JSONDecodeError:
            print(f"❌ ERROR: Invalid JSON in '{company}/clauses.json'")

# Final results
print(f"\n✅ Extracted {len(clause_pairs)} clause-rating pairs from all company folders.\n")
for pair in clause_pairs[:5]:  # Print first 5 for checking
    print(pair)



✅ Extracted 12971 clause-rating pairs from all company folders.

('The Service infers your acceptance from your continued usage, instead of prompting you to read the new Terms and asking for direct consent. This doesn’t apply to services that have no way to seek consent from users through their contact details or an account.', 'bad')
('The service logs the URL of the last web page you visited before visiting the Site and may use that information to gather more data about you.', 'bad')
('If the service gets acquired or is involved in a merger, bankruptcy, reorganisation or sale, your personal data may be transferred or sold.', 'bad')
("You must provide your legal name and pseudonyms aren't allowed. This case doesn't apply to Services for which transparency regarding users identities is relevant for their purposes.", 'bad')
('The service will remove personal data from its systems once it is no longer required. Depending on the type of information and the reason it was collected, data re

In [3]:
# Example of accessing the descriptions and ratings
for pair in clause_pairs[:5]:  # Print first 5 pairs for checking
    description = pair[0]  # Clause description (x)
    rating = pair[1]       # Clause rating (y)
    print(f"Description: {description}\nRating: {rating}\n")

Description: The Service infers your acceptance from your continued usage, instead of prompting you to read the new Terms and asking for direct consent. This doesn’t apply to services that have no way to seek consent from users through their contact details or an account.
Rating: bad

Description: The service logs the URL of the last web page you visited before visiting the Site and may use that information to gather more data about you.
Rating: bad

Description: If the service gets acquired or is involved in a merger, bankruptcy, reorganisation or sale, your personal data may be transferred or sold.
Rating: bad

Description: You must provide your legal name and pseudonyms aren't allowed. This case doesn't apply to Services for which transparency regarding users identities is relevant for their purposes.
Rating: bad

Description: The service will remove personal data from its systems once it is no longer required. Depending on the type of information and the reason it was collected, da

### BERT

In [4]:
import torch
import numpy as np
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [5]:
# Step 3.1: Filter out clauses with 'unknown' ratings
filtered_clause_pairs = [(desc, rating) for desc, rating in clause_pairs if rating != "unknown"]

# Split the filtered data
clauses, ratings = zip(*filtered_clause_pairs)  # Extract clauses and their ratings

# Map ratings to integers
rating_dict = {"very bad": 0, "bad": 1, "neutral": 2, "good": 3}  # Modify if you have different ratings
ratings_int = [rating_dict[r] for r in ratings]

# Step 3.2: Split data into train, dev, and test sets (80% train, 10% dev, 10% test)
X_train, X_temp, y_train, y_temp = train_test_split(clauses, ratings_int, test_size=0.2, random_state=42)
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Step 3.3: Convert into a format that Hugging Face can use
train_data = Dataset.from_dict({"text": X_train, "label": y_train})
dev_data = Dataset.from_dict({"text": X_dev, "label": y_dev})
test_data = Dataset.from_dict({"text": X_test, "label": y_test})


In [6]:
# Step 4.1: Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Step 4.2: Define a function to tokenize the input texts
def tokenize_function(examples):
    return tokenizer(examples.get('text', ""), padding='max_length', truncation=True, max_length=512)

# Step 4.3: Apply the tokenizer to the train, dev, and test datasets
train_data = train_data.map(tokenize_function, batched=True)
dev_data = dev_data.map(tokenize_function, batched=True)
test_data = test_data.map(tokenize_function, batched=True)

# Step 4.4: Set the format for PyTorch
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])
dev_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])

# Step 4.5: Remove the original text filed
train_data = train_data.map(tokenize_function, batched=True, remove_columns=["text"])
dev_data = dev_data.map(tokenize_function, batched=True, remove_columns=["text"])
test_data = test_data.map(tokenize_function, batched=True, remove_columns=["text"])

# Step 4.6: Make sure we are working with longs
train_data = train_data.map(lambda x: {"label": torch.tensor(x["label"]).long()})
dev_data = dev_data.map(lambda x: {"label": torch.tensor(x["label"]).long()})
test_data = test_data.map(lambda x: {"label": torch.tensor(x["label"]).long()})

Map:   0%|          | 0/10120 [00:00<?, ? examples/s]

Map:   0%|          | 0/1265 [00:00<?, ? examples/s]

Map:   0%|          | 0/1265 [00:00<?, ? examples/s]

Map:   0%|          | 0/10120 [00:00<?, ? examples/s]

Map:   0%|          | 0/1265 [00:00<?, ? examples/s]

Map:   0%|          | 0/1265 [00:00<?, ? examples/s]

Map:   0%|          | 0/10120 [00:00<?, ? examples/s]

  train_data = train_data.map(lambda x: {"label": torch.tensor(x["label"]).long()})


Map:   0%|          | 0/1265 [00:00<?, ? examples/s]

  dev_data = dev_data.map(lambda x: {"label": torch.tensor(x["label"]).long()})


Map:   0%|          | 0/1265 [00:00<?, ? examples/s]

  test_data = test_data.map(lambda x: {"label": torch.tensor(x["label"]).long()})


In [7]:
# Step 5.1: Take a smaller sample (e.g., 5%) of the training data
train_sample = train_data.shuffle(seed=42).select(range(int(0.1 * len(train_data))))
dev_sample = dev_data.shuffle(seed=42).select(range(int(0.2 * len(dev_data))))

In [12]:
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  # Update to eval_strategy
    save_strategy="epoch",  # Save model at each epoch
    save_total_limit=2,  # Keep last 2 checkpoints
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)  # Convert logits to predicted labels
    return {"accuracy": accuracy_score(labels, preds)}

In [14]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_sample,
    eval_dataset=dev_sample,
    compute_metrics=compute_metrics  # Corrected function
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate on the test set
test_results = trainer.evaluate(test_data)

# Print loss and accuracy
print(f"Test Loss: {test_results['eval_loss']:.4f}")
print(f"Test Accuracy: {test_results['eval_accuracy']:.4f}")  # Accuracy from compute_metrics

Test Loss: 0.8967
Test Accuracy: 0.8538
