
# Exploration Notebook - Legal NLP Assistant

This notebook explores the sample dataset used for the legal clause classification and NER tasks. It gives a quick overview of the class distribution, examples of the annotated entities, and qualitative observations about the contract fragments.

The goal is to understand the data before training the model.



In [1]:
import json
from pathlib import Path

# Path to the annotated samples, resolved relative to the kernel's working directory.
raw_path = Path("../assets/raw_data.json")

# Decode explicitly as UTF-8: without an encoding argument, open() falls back to
# the platform locale, which can fail or mis-decode non-ASCII characters in the
# contract text on systems whose default is not UTF-8.
with raw_path.open(encoding="utf-8") as f:
    data = json.load(f)

# Sanity check: number of samples plus the first record to show its structure.
len(data), data[:1]


(3,
 [{'text': 'This Agreement is made between ACME Corp and Beta Solutions Ltd. The contract shall begin on March 1, 2025. The Supplier accepts full liability for any damages without limitation. Payment of $140,000 is due within 30 days.',
   'cats': {'TERMINATION': 0.0,
    'CONFIDENTIALITY': 0.0,
    'LIABILITY': 1.0,
    'PAYMENT_TERMS': 1.0,
    'GOVERNING_LAW': 0.0,
    'FORCE_MAJEURE': 0.0},
   'entities': [{'start': 33, 'end': 42, 'label': 'PARTY'},
    {'start': 47, 'end': 65, 'label': 'PARTY'},
    {'start': 92, 'end': 104, 'label': 'EFFECTIVE_DATE'},
    {'start': 178, 'end': 186, 'label': 'PAYMENT_AMOUNT'}]}])

In [2]:
# Dump every sample in full: a numbered banner, the raw contract text, then the
# clause-label scores and the entity annotations, each behind its caption.
for i, sample in enumerate(data):
    print(f"\n--- Example {i+1} ---")
    print(sample["text"])
    for caption, key in (("Clause labels:", "cats"), ("Entities:", "entities")):
        print(caption, sample[key])



--- Example 1 ---
This Agreement is made between ACME Corp and Beta Solutions Ltd. The contract shall begin on March 1, 2025. The Supplier accepts full liability for any damages without limitation. Payment of $140,000 is due within 30 days.
Clause labels: {'TERMINATION': 0.0, 'CONFIDENTIALITY': 0.0, 'LIABILITY': 1.0, 'PAYMENT_TERMS': 1.0, 'GOVERNING_LAW': 0.0, 'FORCE_MAJEURE': 0.0}
Entities: [{'start': 33, 'end': 42, 'label': 'PARTY'}, {'start': 47, 'end': 65, 'label': 'PARTY'}, {'start': 92, 'end': 104, 'label': 'EFFECTIVE_DATE'}, {'start': 178, 'end': 186, 'label': 'PAYMENT_AMOUNT'}]

--- Example 2 ---
Either party may terminate this Agreement by providing 60 days written notice. This contract is governed by the laws of New York. Confidentiality applies to all shared data.
Clause labels: {'TERMINATION': 1.0, 'CONFIDENTIALITY': 1.0, 'LIABILITY': 0.0, 'PAYMENT_TERMS': 0.0, 'GOVERNING_LAW': 1.0, 'FORCE_MAJEURE': 0.0}
Entities: [{'start': 52, 'end': 64, 'label': 'DURATION'}, {'start': 1

In [None]:
import spacy
from spacy import displacy

# A blank English pipeline is enough here: only the tokenizer is needed.
nlp = spacy.blank("en")

# For demonstration only: build Doc objects that carry the annotated entities.
docs = []
# Annotations that could not be turned into a span, kept so they can be reported
# instead of vanishing silently: (example number, entity dict, text at the offsets).
skipped = []
for ex_idx, ex in enumerate(data):
    doc = nlp.make_doc(ex["text"])
    spans = []
    for ent in ex["entities"]:
        span = doc.char_span(ent["start"], ent["end"], label=ent["label"])
        # char_span returns None when the character offsets do not line up with
        # token boundaries; an empty span is falsy as well. Both are left out of
        # doc.ents (as before), but are now recorded for the report below.
        if not span:
            skipped.append((ex_idx + 1, ent, ex["text"][ent["start"]:ent["end"]]))
            continue
        spans.append(span)
    doc.ents = spans
    docs.append(doc)

# Surface dropped annotations so offset problems in the raw data are visible.
# NOTE(review): the offsets of the first sample shown earlier in this notebook
# do not appear to match the entity text they are meant to cover - verify the
# raw data if anything is reported here.
if skipped:
    print(f"Warning: {len(skipped)} entity annotation(s) could not be aligned to tokens and were skipped:")
    for ex_number, ent, snippet in skipped:
        print(f"  example {ex_number}: {ent} -> {snippet!r}")

displacy.render(docs[0], style="ent")


In [4]:
# Clause frequencies: tally every label whose score is exactly 1.0 across all samples.
from collections import Counter

label_counts = Counter(
    label
    for ex in data
    for label, score in ex["cats"].items()
    if score == 1.0
)

label_counts


Counter({'PAYMENT_TERMS': 2,
         'LIABILITY': 1,
         'TERMINATION': 1,
         'CONFIDENTIALITY': 1,
         'GOVERNING_LAW': 1,
         'FORCE_MAJEURE': 1})

In [None]:
# Dataset size first, then how many samples carry each clause label.
print("Number of samples:", len(data))
print("Clause frequencies:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

# Tally the NER labels one sample at a time, in order of first appearance.
entity_labels = Counter()
for ex in data:
    entity_labels.update(ent["label"] for ent in ex["entities"])

print("\nEntity label frequencies:")
for label, count in entity_labels.items():
    print(f"{label}: {count}")


Number of samples: 3
Clause frequencies:
LIABILITY: 1
PAYMENT_TERMS: 2
TERMINATION: 1
CONFIDENTIALITY: 1
GOVERNING_LAW: 1
FORCE_MAJEURE: 1

Entity label frequencies:
PARTY: 2
EFFECTIVE_DATE: 1
PAYMENT_AMOUNT: 2
DURATION: 2
LOCATION: 1
