In [9]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Load the raw cases. The JSON object maps a year label (a single year, or a
# range such as "1900-1940") to the list of cases filed under that label.
# The encoding is given explicitly because open() otherwise falls back to the
# platform locale, which is not UTF-8 on every system.
with open('datas/train.json', encoding='utf-8') as f:
    data = json.load(f)

all_cases = []
for year, cases in data.items():
    # A range is collapsed to the floor of the mean of its two endpoints
    # (ex: 1900-1940 -> 1920); a single year maps to itself. Computed once per
    # label since it does not depend on the individual case.
    year1, year2 = year.split('-') if '-' in year else (year, year)
    case_year = (int(year1) + int(year2)) // 2
    for case in cases:
        case['year'] = case_year
        all_cases.append(case)

df = pd.DataFrame(all_cases)

# Remove duplicates based on the 'links' column, keeping the first occurrence.
print(f"Number of duplicate URLs: {df['links'].duplicated().sum()}")
df = df.drop_duplicates(subset=['links'], keep='first')
print(f"Number of cases after removing duplicates: {len(df)}")

# Reset index to have consecutive indices from 0 to len(df)-1
df = df.reset_index(drop=True)

Number of duplicate URLs: 36
Number of cases after removing duplicates: 2720


In [None]:
# Drop every case whose summary is longer than its written opinion.
# "Length" is the sum of len() over all values of the respective mapping.
# NOTE(review): this is a character count if the values are strings and an
# item count if they are lists -- confirm which one is intended.
remove_longer_summary = []
for i, row in df.iterrows():
    opinion_length = sum(len(value) for value in row['written_opinion']['parsed'].values())
    summary_length = sum(len(value) for value in row['Summary'].values())
    if opinion_length < summary_length:
        remove_longer_summary.append(i)
print(f"Number of cases where summary is longer than opinion: {len(remove_longer_summary)}")
df = df.drop(index=remove_longer_summary)
# Re-number the remaining rows 0..len(df)-1.
df = df.reset_index(drop=True)

Number of cases where summary is longer than opinion: 0


# Remove cases before 2003

In [37]:
# Remove all the cases whose 'year' is before MIN_YEAR.
MIN_YEAR = 2003

# Vectorised comparison instead of a row-by-row loop; the result is the same
# list of index labels the loop would have collected.
remove_before_2003 = df.index[df['year'] < MIN_YEAR].tolist()
print(f"Number of cases before {MIN_YEAR}: {len(remove_before_2003)}")
df = df.drop(index=remove_before_2003)
# Re-number the remaining rows 0..len(df)-1.
df = df.reset_index(drop=True)
print(f"Number of cases after removing those before {MIN_YEAR}: {len(df)}")

Number of cases before 2003: 0
Number of cases after removing those before 2003: 1120


# Remove cases with only Per Curiam

In [46]:
# Collect the index label of every case whose parsed opinion holds exactly one
# entry, then drop those rows.
# NOTE(review): only the number of keys is checked, not which key it is --
# confirm that a single entry always means a Per Curiam-only opinion.
remove_curiam = [
    idx
    for idx, opinion in df['written_opinion'].items()
    if len(opinion['parsed'].keys()) == 1
]
print(f"Number of cases with only Per Curiam: {len(remove_curiam)}")
df = df.drop(index=remove_curiam).reset_index(drop=True)
print(f"Number of cases after removing those with only Per Curiam: {len(df)}")

Number of cases with only Per Curiam: 57
Number of cases after removing those with only Per Curiam: 1063


# Split the dataset into train, dev, and test sets

In [None]:
# Split the dataset into train, dev and test: 20% is held out first, then
# that held-out part is halved into dev and test. Both splits use a fixed
# random_state so the partition is reproducible.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
dev_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Save the datasets as CSV files, creating datas/easy if it doesn't exist.
# exist_ok=True replaces the separate existence check, so a directory that
# appears between the check and the creation no longer raises.
os.makedirs('datas/easy', exist_ok=True)
train_df.to_csv('datas/easy/train.csv', index=False)
dev_df.to_csv('datas/easy/dev.csv', index=False)
test_df.to_csv('datas/easy/test.csv', index=False)