In [1]:
## Importing required libraries
import json
import os
from pathlib import Path

import pandas as pd


In [2]:
## path for the training and testing files
# The data root can be overridden with the LEGALNLP_DATA_DIR environment variable so the
# notebook is runnable on machines other than the original author's; when the variable is
# not set the original location is used unchanged.
main_path = os.environ.get('LEGALNLP_DATA_DIR', '/Users/ahmed/Desktop/SS23/practical-legalNLP/data')

# CL corpus splits
path_cl_train = Path(main_path, 'CL', 'CL_train.json')
path_cl_dev = Path(main_path, 'CL', 'CL_dev.json')
path_cl_test = Path(main_path, 'CL', 'CL_test.json')

# IT corpus splits
path_it_train = Path(main_path, 'IT', 'IT_train.json')
path_it_dev = Path(main_path, 'IT', 'IT_dev.json')
path_it_test = Path(main_path, 'IT', 'IT_test.json')

In [3]:
# Maps each fine-grained annotation label to the coarser label used downstream.
# Labels that collapse into the same target are grouped together; the remaining
# labels map one-to-one.
label_mapper = {
    **dict.fromkeys(("Fact", "Issue"), "Fact"),
    **dict.fromkeys(("ArgumentPetitioner", "ArgumentRespondent"), "Argument"),
    **dict.fromkeys(
        ("PrecedentReliedUpon", "PrecedentNotReliedUpon", "PrecedentOverruled"),
        "Precedent",
    ),
    "RatioOfTheDecision": "Ratio",
    "RulingByLowerCourt": "RulingL",
    "RulingByPresentCourt": "RulingP",
    "Statute": "Statute",
    "Dissent": "Dissent",
    # "None" stays its own class; construct_data_labels drops sentences carrying it.
    "None": "None",
}

In [4]:
def load_json_file(path) -> dict:
    """Read the JSON file at ``path`` and return its parsed content.

    The file is decoded as UTF-8 explicitly so that the result does not depend
    on the platform's default locale encoding.

    Args:
        path: Location of the JSON file (anything accepted by ``open``).

    Returns:
        The object produced by ``json.load`` for the file content.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def construct_data_labels(d: dict, mapper: dict = None) -> dict:
    """Build per-document sentence/label lists, dropping 'None'-labelled sentences.

    For every document in ``d`` the sentences are read from the ``'sentences'``
    entry and the labels from the ``'complete'`` entry. Labels are translated
    through ``mapper`` and every sentence whose mapped label is ``'None'`` is
    removed. The total number of removed sentences is printed.

    Args:
        d: Mapping from a document name to a dict holding ``'sentences'``
            (a list, or a string holding a serialized list) and ``'complete'``
            (the labels, one per sentence).
        mapper: Mapping from original labels to the labels to emit. Defaults to
            the module-level ``label_mapper``.

    Returns:
        A dict keyed ``'file_1'``, ``'file_2'``, ... (in iteration order of
        ``d``), each value being
        ``{'sentences': [...], 'labels': [...], 'file_name': <document name>}``.

    Raises:
        KeyError: If a label is missing from ``mapper``.
        ValueError: If a document has a different number of sentences and labels.
    """
    if mapper is None:
        mapper = label_mapper

    data = {}
    num_none_labels = 0
    for i, file in enumerate(d.keys()):
        # extract sentences and parse to list
        sentences = d[file]['sentences']
        if isinstance(sentences, str):
            if sentences.strip() == '[]':
                # A serialized empty list would otherwise parse to [''] (one bogus sentence).
                sentences = []
            else:
                # Strip the leading "['" and trailing "']" and split on "', '".
                # NOTE(review): this assumes every sentence is single-quoted in the
                # serialized string; sentences quoted differently will not be split
                # correctly and are reported by the length check below.
                sentences = sentences[2:-2].split('\', \'')

        # extract labels and map to short labels
        labels = [mapper[label] for label in d[file]['complete']]

        # Sentences and labels are paired by position, so a length mismatch means the
        # pairing is wrong; fail loudly instead of silently mis-aligning the data.
        if len(sentences) != len(labels):
            raise ValueError(
                f'{file}: found {len(sentences)} sentences but {len(labels)} labels'
            )

        # remove sentences with None labels
        kept_pairs = [
            (sentence, label)
            for sentence, label in zip(sentences, labels)
            if label != 'None'
        ]
        new_sentences = [sentence for sentence, _ in kept_pairs]
        new_labels = [label for _, label in kept_pairs]
        num_none_labels += len(labels) - len(kept_pairs)

        # add entry in the data dict
        data[f'file_{i+1}'] = {'sentences': new_sentences, 'labels': new_labels, 'file_name': file}
    print(f'total of {num_none_labels} elements got deleted as of None labels')
    return data

def load_data(path) -> dict:
    """Load the JSON file at ``path`` and convert it to per-document sentence/label data.

    This chains ``load_json_file`` and ``construct_data_labels`` and returns
    the latter's result.
    """
    return construct_data_labels(load_json_file(path))

def docs_to_df(data: dict):
    """Stack all documents of ``data`` into a single DataFrame.

    Each value of ``data`` is turned into its own DataFrame and the frames are
    concatenated in the dict's iteration order. Every document keeps its own
    row index, so index values repeat across documents.
    """
    return pd.concat([pd.DataFrame(doc) for doc in data.values()])

In [5]:
# Load the train/dev/test splits of the CL corpus, then of the IT corpus.
# Each load_data call prints how many 'None'-labelled sentences were removed.
train_cl, dev_cl, test_cl = (
    load_data(split_path)
    for split_path in (path_cl_train, path_cl_dev, path_cl_test)
)

train_it, dev_it, test_it = (
    load_data(split_path)
    for split_path in (path_it_train, path_it_dev, path_it_test)
)

total of 53 elements got deleted as of None labels
total of 2 elements got deleted as of None labels
total of 1 elements got deleted as of None labels
total of 244 elements got deleted as of None labels
total of 31 elements got deleted as of None labels
total of 23 elements got deleted as of None labels


In [6]:
# Flatten every split from its per-document dict into one DataFrame per split.
train_cl_df, dev_cl_df, test_cl_df = map(docs_to_df, (train_cl, dev_cl, test_cl))

train_it_df, dev_it_df, test_it_df = map(docs_to_df, (train_it, dev_it, test_it))

In [7]:
# Number of sentences (rows) in each split of the CL corpus.
f'The CL data has: {train_cl_df.shape[0]} Train, {dev_cl_df.shape[0]} dev,{test_cl_df.shape[0]} Test Sentences.'

'The CL data has: 10540 Train, 1394 dev,1338 Test Sentences.'

In [8]:
# Number of sentences (rows) in each split of the IT corpus.
f'The IT data has: {train_it_df.shape[0]} Train, {dev_it_df.shape[0]} dev,{test_it_df.shape[0]} Test Sentences.'

'The IT data has: 6008 Train, 715 dev,835 Test Sentences.'

In [9]:
# Write every split to a CSV file in the current working directory, without the row index.
for csv_name, split_df in (
    ('train_cl.csv', train_cl_df),
    ('dev_cl.csv', dev_cl_df),
    ('test_cl.csv', test_cl_df),
    ('train_it.csv', train_it_df),
    ('dev_it.csv', dev_it_df),
    ('test_it.csv', test_it_df),
):
    split_df.to_csv(csv_name, index=False)