In [1]:
import json
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Read the txt file
with open('data/TRDataChallenge2023.txt', 'r', encoding='utf-8') as file:
    content = file.read()

print(f"File size: {len(content)} characters")

File size: 331936506 characters


In [3]:
# Parse JSON dictionaries from the file
data = []
lines = content.strip().split('\n')

for i, line in enumerate(lines):
    if line.strip():
        try:
            json_obj = json.loads(line)
            data.append(json_obj)
        except json.JSONDecodeError as e:
            print(f"Error parsing line {i+1}: {e}")
            print(f"Line content: {line[:100]}...")

print(f"\nSuccessfully parsed {len(data)} JSON objects")


Successfully parsed 18000 JSON objects


In [4]:
# 2. Structure analysis
print("\n2. DATA STRUCTURE ANALYSIS")
print("-" * 30)

if data:
    # print("Sample JSON object:")
    # print(json.dumps(data[0], indent=2))
    
    print("\nKeys in the JSON objects:")
    all_keys = set()
    for obj in data:
        if isinstance(obj, dict):
            all_keys.update(obj.keys())
    
    for key in sorted(all_keys):
        print(f"- {key}")


2. DATA STRUCTURE ANALYSIS
------------------------------

Keys in the JSON objects:
- documentId
- postures
- sections


In [5]:
# 3. Convert to DataFrame for easier analysis
print("\n3. DATAFRAME CONVERSION")
print("-" * 30)

if data and isinstance(data[0], dict):
    df = pd.DataFrame(data)
    print(f"DataFrame shape: {df.shape}")
    print("\nDataFrame info:")
    df.info()
    print("\nFirst few rows:")
    print(df.head())
else:
    print("Data is not in expected dictionary format")


3. DATAFRAME CONVERSION
------------------------------
DataFrame shape: (18000, 3)

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   documentId  18000 non-null  object
 1   postures    18000 non-null  object
 2   sections    18000 non-null  object
dtypes: object(3)
memory usage: 422.0+ KB

First few rows:
                          documentId  \
0  Ib4e590e0a55f11e8a5d58a2c8dcb28b5   
1  Ib06ab4d056a011e98c7a8e995225dbf9   
2  Iaa3e3390b93111e9ba33b03ae9101fb2   
3  I0d4dffc381b711e280719c3f0e80bdd0   
4  I82c7ef10d6d111e8aec5b23c3317c9c0   

                                            postures  \
0                                        [On Appeal]   
1  [Appellate Review, Sentencing or Penalty Phase...   
2          [Motion to Compel Arbitration, On Appeal]   
3     [On Appeal, Review of Administrative Decision]   
4          

In [6]:
df

Unnamed: 0,documentId,postures,sections
0,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"[{'headtext': '', 'paragraphs': ['Plaintiff Dw..."
1,Ib06ab4d056a011e98c7a8e995225dbf9,"[Appellate Review, Sentencing or Penalty Phase...","[{'headtext': '', 'paragraphs': ['After pleadi..."
2,Iaa3e3390b93111e9ba33b03ae9101fb2,"[Motion to Compel Arbitration, On Appeal]","[{'headtext': '', 'paragraphs': ['Frederick Gr..."
3,I0d4dffc381b711e280719c3f0e80bdd0,"[On Appeal, Review of Administrative Decision]","[{'headtext': '', 'paragraphs': ['Appeal from ..."
4,I82c7ef10d6d111e8aec5b23c3317c9c0,[On Appeal],"[{'headtext': '', 'paragraphs': ['Order, Supre..."
...,...,...,...
17995,Ia5743cf0e4b611e99e94fcbef715f24d,[Appellate Review],"[{'headtext': '', 'paragraphs': ['¶1 On Februa..."
17996,I974c18f08f1611e998e8870e22e55653,[Objection to Proof of Claim],[{'headtext': 'ORDER OVERRULING DEBTOR'S OBJEC...
17997,Idaaa92f0886f11e998e8870e22e55653,"[Appellate Review, Trial or Guilt Phase Motion...","[{'headtext': '', 'paragraphs': ['A jury convi..."
17998,I247a8420677e11e9a072efd81f5238d6,"[Appellate Review, Jury Selection Challenge or...","[{'headtext': '', 'paragraphs': ['Defendant Ch..."


In [7]:
print(df['postures'].apply(type).value_counts())

postures
<class 'list'>    18000
Name: count, dtype: int64


In [8]:
def is_effectively_missing(val):
    if val is None:
        return True
    if isinstance(val, list) and len(val) == 0:
        return True
    return False

df['postures_missing'] = df['postures'].apply(is_effectively_missing)
print(df['postures_missing'].sum())

923


In [17]:
docs_with_empty_paragraphs = df['sections'].apply(
    lambda section_list: (
        isinstance(section_list, list) and
        any(
            isinstance(s, dict) and (
                'paragraphs' not in s or 
                not isinstance(s['paragraphs'], list) or 
                len(s['paragraphs']) == 0
            )
            for s in section_list
        )
    )
).sum()

print(f"Documents with at least one section missing or empty paragraphs: {docs_with_empty_paragraphs}")

Documents with at least one section missing or empty paragraphs: 638


In [19]:
num_docs_with_empty_headtext = df['sections'].apply(
    lambda section_list: (
        isinstance(section_list, list) and
        any(
            isinstance(s, dict) and 
            ('headtext' not in s or s['headtext'] is None or s['headtext'].strip() == "")
            for s in section_list
        )
    )
).sum()

print(f"Number of documents with at least one missing or empty headtext: {num_docs_with_empty_headtext}")

Number of documents with at least one missing or empty headtext: 11257


In [20]:
import json

with open('data/TRDataChallenge2023.txt', 'r', encoding='utf-8') as file:
    content = file.read()

print(f"File size: {len(content)} characters")

data = []
lines = content.strip().split('\n')

for i, line in enumerate(lines):
    if line.strip():
        try:
            json_obj = json.loads(line)

            # ➡️ Here: Process and combine text from sections
            sections = json_obj.get("sections", [])
            text_parts = []

            for sec in sections:
                headtext = sec.get("headtext", "")
                if headtext:
                    text_parts.append("[HEAD] " + headtext)
                paragraphs = sec.get("paragraphs", [])
                for para in paragraphs:
                    text_parts.append("[PARA] " + para)

            # Join sections with [SEP]
            full_text = " [SEP] ".join(text_parts)

            # Save new keys to the object
            json_obj["combined_text"] = full_text

            # Append to data list
            data.append(json_obj)

        except json.JSONDecodeError as e:
            print(f"Error parsing line {i+1}: {e}")
            print(f"Line content: {line[:100]}...")

print(f"\nSuccessfully parsed {len(data)} JSON objects")

# Example: print first processed document
print("\nExample processed text:")
print(data[0]["combined_text"][:1000])  # Print first 1000 characters to check

File size: 331936506 characters

Successfully parsed 18000 JSON objects

Example processed text:
[PARA] Plaintiff Dwight Watson (“Husband”) appeals from the trial court’s equitable distribution order entered 28 February 2017. On appeal, plaintiff contends that the trial court erred in its classification, valuation, and distribution of the parties’ property and in granting defendant Gertha  Watson (“Wife”) an unequal distribution of martial property. Because the trial court’s findings of fact do not support its conclusions of law and because the distributional factors found by the trial court are based upon some of those erroneous findings and conclusions, we reverse the equitable distribution order and remand for entry of a new equitable distribution order. [SEP] [HEAD] Background [SEP] [PARA] Husband and Wife were married in November 1989. Although the trial court’s equitable distribution order found the date of separation as October 2007, the parties stipulated in the final pretrial 

In [22]:
data[0]

{'documentId': 'Ib4e590e0a55f11e8a5d58a2c8dcb28b5',
 'postures': ['On Appeal'],
 'sections': [{'headtext': '',
   'paragraphs': ['Plaintiff Dwight Watson (“Husband”) appeals from the trial court’s equitable distribution order entered 28 February 2017. On appeal, plaintiff contends that the trial court erred in its classification, valuation, and distribution of the parties’ property and in granting defendant Gertha\u2009 Watson (“Wife”) an unequal distribution of martial property. Because the trial court’s findings of fact do not support its conclusions of law and because the distributional factors found by the trial court are based upon some of those erroneous findings and conclusions, we reverse the equitable distribution order and remand for entry of a new equitable distribution order.']},
  {'headtext': 'Background',
   'paragraphs': ['Husband and Wife were married in November 1989. Although the trial court’s equitable distribution order found the date of separation as October 2007,

In [23]:
import pandas as pd

df = pd.DataFrame([
    {
        "documentId": d["documentId"],
        "labels": d["postures"],
        "text": d["combined_text"]
    }
    for d in data
])

print(df.head())

                          documentId  \
0  Ib4e590e0a55f11e8a5d58a2c8dcb28b5   
1  Ib06ab4d056a011e98c7a8e995225dbf9   
2  Iaa3e3390b93111e9ba33b03ae9101fb2   
3  I0d4dffc381b711e280719c3f0e80bdd0   
4  I82c7ef10d6d111e8aec5b23c3317c9c0   

                                              labels  \
0                                        [On Appeal]   
1  [Appellate Review, Sentencing or Penalty Phase...   
2          [Motion to Compel Arbitration, On Appeal]   
3     [On Appeal, Review of Administrative Decision]   
4                                        [On Appeal]   

                                                text  
0  [PARA] Plaintiff Dwight Watson (“Husband”) app...  
1  [PARA] After pleading guilty, William Jerome H...  
2  [PARA] Frederick Greene, the plaintiff below, ...  
3  [PARA] Appeal from an amended judgment of the ...  
4  [PARA] Order, Supreme Court, New York County (...  


[('On Appeal', 9197), ('Appellate Review', 4652), ('Review of Administrative Decision', 2773), ('Motion to Dismiss', 1679), ('Sentencing or Penalty Phase Motion or Objection', 1342), ('Trial or Guilt Phase Motion or Objection', 1097), ("Motion for Attorney's Fees", 612), ('Post-Trial Hearing Motion', 512), ('Motion for Preliminary Injunction', 364), ('Motion to Dismiss for Lack of Subject Matter Jurisdiction', 343), ('Motion to Compel Arbitration', 255), ('Motion for New Trial', 226), ('Petition to Terminate Parental Rights', 219), ('Motion for Judgment as a Matter of Law (JMOL)/Directed Verdict', 212), ('Motion for Reconsideration', 206), ('Motion to Dismiss for Lack of Personal Jurisdiction', 204), ('Motion for Costs', 168), ('Juvenile Delinquency Proceeding', 146), ('Motion for Default Judgment/Order of Default', 143), ('Motion to Dismiss for Lack of Standing', 137)]


In [None]:
# TAKE TOPk and keep only labels from the topk + exclude documents that end up with no remaining labels
from collections import Counter

# Assume your data is a list of dicts, each with a 'postures' field
label_counter = Counter()
for entry in data:
    label_counter.update(entry["postures"])
# Print most common labels
print(label_counter.most_common(20))

k = 5  
top_k_labels = [label for label, _ in label_counter.most_common(k)]
print(f"top {5} labels :{top_k_labels}")
filtered_data = []
for entry in data:
    # Keep only labels that are in top k
    new_labels = [label for label in entry["postures"] if label in top_k_labels]

    if new_labels:
        # Update labels
        entry["postures"] = new_labels
        filtered_data.append(entry)

print(f"Remaining documents after filtering: {len(filtered_data)}")

# next steps: take care also of the kappa score

[('On Appeal', 9197), ('Appellate Review', 4652), ('Review of Administrative Decision', 2773), ('Motion to Dismiss', 1679), ('Sentencing or Penalty Phase Motion or Objection', 1342), ('Motion for Preliminary Injunction', 174), ('Motion to Dismiss for Lack of Subject Matter Jurisdiction', 149), ("Motion for Attorney's Fees", 108), ('Motion to Dismiss for Lack of Personal Jurisdiction', 66), ('Motion to Compel Arbitration', 62), ('Motion for Reconsideration', 54), ('Motion to Transfer or Change Venue', 51), ('Motion for Costs', 50), ('Certified Question', 47), ('Motion for Judgment as a Matter of Law (JMOL)/Directed Verdict', 45), ('Motion for Default Judgment/Order of Default', 45), ('Objection to Proof of Claim', 37), ('Post-Trial Hearing Motion', 34), ('Motion for New Trial', 33), ('Motion for Protective Order', 31)]
top 5 labels :['On Appeal', 'Appellate Review', 'Review of Administrative Decision', 'Motion to Dismiss', 'Sentencing or Penalty Phase Motion or Objection']
Remaining doc

In [37]:
from sklearn.model_selection import train_test_split

# Start with your filtered data
# Example: filtered_data is your list of documents

# First split: train vs (dev + test)
train_data, temp_data = train_test_split(
    filtered_data, 
    test_size=0.3,  # 30% goes to temp (dev + test)
    random_state=42,
    shuffle=True
)

# Second split: dev vs test (half-half from temp)
dev_data, test_data = train_test_split(
    temp_data, 
    test_size=0.5,  # Split remaining 30% equally
    random_state=42,
    shuffle=True
)

# Check sizes
print(f"Train set size: {len(train_data)}")
print(f"Dev set size: {len(dev_data)}")
print(f"Test set size: {len(test_data)}")

Train set size: 10962
Dev set size: 2349
Test set size: 2350


In [41]:
import random

# Fix random seed for reproducibility
random.seed(42)
# Define desired sample sizes
sample_sizes = [500,1000, 2000, 5000, 10000]
# Save all subsets
train_subsets = {}

for size in sample_sizes:
    # Check if we have enough samples
    if len(train_data) >= size:
        subset = random.sample(train_data, size)
        train_subsets[size] = subset
        print(f"Created subset with {size} samples.")
    else:
        print(f"Not enough data to create subset with {size} samples (only {len(train_data)} available).")

# Example: look at first document text in 1k subset
print(train_subsets[500][0]["combined_text"][:500])

Created subset with 500 samples.
Created subset with 1000 samples.
Created subset with 2000 samples.
Created subset with 5000 samples.
Created subset with 10000 samples.
[PARA] Proceeding pursuant to CPLR article 78 (transferred to this Court by order of the Supreme Court, entered in Albany County) to review a determination of the Department of Health which found decedent ineligible for certain Medicaid benefits. [SEP] [PARA] In September 2003, Paul Hettinger (hereinafter decedent), a widower, executed a durable general power of attorney appointing Sharon Williams, an individual identified in the record as his cousin-in-law, as his attorney-in-fact.   The power 
