In [3]:
## Importing required libraries
import json
import pandas as pd
from pathlib import Path
import sys
from transformers import BertTokenizer
import json
from sklearn.model_selection import train_test_split
#from allennlp.common.util import pad_sequence_to_length
import torch

In [4]:
## Configuration: project root, tokenizer settings and label mapping
# Project root; transform_pipeline builds both its input and its output paths from it.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
main_path = '/Users/ahmed/Desktop/SS23/practical-legalNLP/'
# Name of the pretrained vocabulary passed to BertTokenizer.from_pretrained below.
BERT_VOCAB = "bert-base-uncased"
# NOTE(review): not referenced by any code visible in this notebook; the tokenizer call in
# write_in_hsln_json_format passes the literal 128 instead.
MAX_SEQ_LENGTH = 128
# Tokenizer instance handed to write_in_hsln_json_format in the cells at the end of the notebook.
tokenizer = BertTokenizer.from_pretrained(BERT_VOCAB, do_lower_case=True)

# Maps each fine-grained label to a coarser class (e.g. both argument labels map to "Argument").
# This name is reassigned in a later cell before its first use in write_in_hsln_json_format;
# in that later definition "Dissent" maps to "Fact" instead of "Dissent".
label_mapper = {
    "Fact": "Fact",
    "Issue": "Fact",
    "ArgumentPetitioner": "Argument",
    "ArgumentRespondent": "Argument",
    "PrecedentReliedUpon": "Precedent",
    "PrecedentNotReliedUpon": "Precedent",
    "PrecedentOverruled": "Precedent",
    "RatioOfTheDecision": "Ratio",
    "RulingByLowerCourt": "RulingL",
    "RulingByPresentCourt": "RulingP",
    "Statute": "Statute",
    "Dissent": "Dissent",
    "None": "None"
}

In [5]:
def load_json_file(path) -> dict:
    """Read a JSON file and return its decoded content.

    The file is opened explicitly as UTF-8 so that non-ASCII text is decoded
    the same way on every platform instead of depending on the locale's
    default encoding.

    Args:
        path: Location of the JSON file (``str`` or path-like).

    Returns:
        The object produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)



def save_dict_to_json(dictionary, file_path):
    """Serialize ``dictionary`` as JSON into ``file_path`` and print the location.

    Args:
        dictionary: JSON-serializable object to write.
        file_path: Destination file; it is opened in ``'w'`` mode, so existing
            content is replaced.
    """
    with open(file_path, mode='w') as sink:
        json.dump(obj=dictionary, fp=sink)
    print(f"Dictionary saved as JSON file: {file_path}")


In [6]:
def _parse_sentence_list(raw):
    """Turn a stringified list of sentences back into a list of ``str``."""
    import ast  # local import: keeps this cell runnable without editing the import cell

    # Presumably the string is the repr of a Python list (TODO confirm against the data);
    # literal_eval also copes with elements that repr wrote in double quotes.
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple)) and all(isinstance(item, str) for item in parsed):
        return list(parsed)
    # Fallback: legacy heuristic — strip the outer "['" / "']" and split on "', '".
    return raw[2:-2].split('\', \'')


def transform_dict_structure(input_dict, meta):
    """Convert a mapping of documents into a list of annotation records.

    Each value of ``input_dict`` must provide ``'sentences'`` (a list of
    sentence strings, or such a list stored as a single string) and
    ``'complete'`` (one label per sentence). Sentences labelled ``'None'`` get
    no annotation, but they remain part of the record's ``'data'`` text.

    Args:
        input_dict: Mapping from document name to its sentence/label entry.
        meta: Tag stored under ``'meta'`` and used as prefix of each record id.

    Returns:
        One dict per document with the keys ``'id'`` (``f"{meta}_{index}"``),
        ``'annotations'`` (``[{'result': [...]}]``), ``'data'`` (all sentences
        joined by a single space) and ``'meta'``. Every result entry has an
        ``'id'`` (running index over the kept sentences) and a ``'value'`` with
        ``'text'``, ``'labels'``, ``'start'`` and ``'end'``, where
        ``data[start:end] == text``.

    Raises:
        IndexError: If a label other than ``'None'`` has no matching sentence.
    """
    transformed_list = []

    for doc_id, document_name in enumerate(input_dict):
        document = input_dict[document_name]

        sentences = document['sentences']
        if isinstance(sentences, str):
            sentences = _parse_sentence_list(sentences)
        labels = document['complete']

        # Character offset of every sentence inside ' '.join(sentences). Computed over
        # ALL sentences so that dropped ones and the joining spaces are accounted for.
        sentence_starts = []
        cursor = 0
        for sentence in sentences:
            sentence_starts.append(cursor)
            cursor += len(sentence) + 1  # +1 for the space inserted by the join

        results = []
        for j, label in enumerate(labels):
            if label == 'None':
                # Unlabelled sentences produce no annotation.
                continue
            sentence = sentences[j]
            start = sentence_starts[j]
            value = {
                'text': sentence,
                'labels': label,
                'start': start,
                'end': start + len(sentence),
            }
            results.append({'id': len(results), 'value': value})

        transformed_list.append({
            'id': meta + '_' + str(doc_id),
            'annotations': [{'result': results}],
            'data': ' '.join(sentences),
            'meta': meta,
        })
    return transformed_list

In [7]:
def transform_pipeline(part):
    """Load, convert and re-save the train/dev/test files of one dataset part.

    Input files are read from ``<main_path>/data/malik_data/<part>/`` and the
    converted files are written to
    ``<main_path>/rhetorical-role-baseline/datasets/malik/``. The work is done
    stage by stage: all three files are loaded, then all are converted with
    ``transform_dict_structure`` (using ``part`` as meta tag), then all are saved.

    Args:
        part: Name of the dataset part; used as sub-directory, file-name prefix
            and meta tag.

    Returns:
        Tuple ``(train, dev, test)`` holding the converted structures.
    """
    file_names = (f'{part}_train.json', f'{part}_dev.json', f'{part}_test.json')

    # Build every path up front, in train / dev / test order.
    source_paths = [Path(main_path, 'data', 'malik_data', part, name) for name in file_names]
    target_paths = [
        Path(main_path, 'rhetorical-role-baseline/datasets/malik', name) for name in file_names
    ]

    # Stage 1: read the raw JSON files.
    loaded = [load_json_file(source) for source in source_paths]

    # Stage 2: convert each split to the new structure.
    converted = [transform_dict_structure(documents, part) for documents in loaded]

    # Stage 3: write the converted splits.
    for documents, target in zip(converted, target_paths):
        save_dict_to_json(documents, target)

    return tuple(converted)


In [8]:
# Convert the three splits of the 'CL' part; the results are reused by the tokenization cells below.
train, dev, test = transform_pipeline('CL')

Dictionary saved as JSON file: /Users/ahmed/Desktop/SS23/practical-legalNLP/rhetorical-role-baseline/datasets/malik/CL_train.json
Dictionary saved as JSON file: /Users/ahmed/Desktop/SS23/practical-legalNLP/rhetorical-role-baseline/datasets/malik/CL_dev.json
Dictionary saved as JSON file: /Users/ahmed/Desktop/SS23/practical-legalNLP/rhetorical-role-baseline/datasets/malik/CL_test.json


In [36]:
# Reassigns label_mapper from the configuration cell; this is the mapping that
# write_in_hsln_json_format looks up. The only difference from the earlier dict is that
# "Dissent" is mapped to "Fact" here instead of "Dissent".
label_mapper = {
    "Fact": "Fact",
    "Issue": "Fact",
    "ArgumentPetitioner": "Argument",
    "ArgumentRespondent": "Argument",
    "PrecedentReliedUpon": "Precedent",
    "PrecedentNotReliedUpon": "Precedent",
    "PrecedentOverruled": "Precedent",
    "RatioOfTheDecision": "Ratio",
    "RulingByLowerCourt": "RulingL",
    "RulingByPresentCourt": "RulingP",
    "Statute": "Statute",
    "Dissent": "Fact",
    "None": "None"
}

        
def write_in_hsln_json_format(input_dict, hsln_format_txt_dirpath, tokenizer, max_length=128):
    """Tokenize every annotated sentence and write all documents to one JSON file.

    For each record, the sentences under ``annotations[0]['result']`` are
    cleaned of ``"\\r"`` characters and encoded with ``tokenizer``. Sentences
    that are blank after cleaning are skipped entirely, so ``input_ids``,
    ``attention_mask``, ``label_ids`` and ``sentence_mask`` always have the
    same length.

    Args:
        input_dict: Iterable of records as returned by
            ``transform_dict_structure`` (each with ``'id'`` and
            ``'annotations'``).
        hsln_format_txt_dirpath: Path of the JSON file to write (despite the
            name, this is a file path, not a directory).
        tokenizer: Object with an ``encode(text, add_special_tokens=...,
            max_length=..., truncation=...)`` method returning a list of ids.
        max_length: Maximum number of token ids per sentence; longer
            sentences are truncated.

    Returns:
        None. The output file holds a dict keyed by record id; each value has
        the keys ``sentence_mask``, ``input_ids``, ``attention_mask``,
        ``label_ids`` and ``doc_name``.

    Raises:
        KeyError: If a sentence label is missing from ``label_mapper``.
    """
    all_doc_toknized = {}
    for file in input_dict:
        labels = []
        tokenized_sentences = []
        attention_masks = []
        for annotation in file['annotations'][0]['result']:
            value = annotation['value']

            sentence_txt = value['text'].replace("\r", "")
            if sentence_txt.strip() == "":
                # Nothing to tokenize: skip label and mask as well so the
                # per-sentence lists stay aligned.
                continue

            sentence_label = label_mapper[value['labels']]
            # truncation=True makes the cut at max_length explicit.
            sent_tokens = tokenizer.encode(
                sentence_txt,
                add_special_tokens=True,
                max_length=max_length,
                truncation=True,
            )

            tokenized_sentences.append(sent_tokens)
            # Mask is 0 for token id 0 (presumably the pad id — confirm for the
            # vocabulary in use) and 1 for every other id.
            attention_masks.append([0 if token_id == 0 else 1 for token_id in sent_tokens])
            labels.append(sentence_label)

        all_doc_toknized[file['id']] = {
            "sentence_mask": [1] * len(tokenized_sentences),
            "input_ids": tokenized_sentences,
            "attention_mask": attention_masks,
            "label_ids": labels,
            "doc_name": file['id']
        }

    with open(hsln_format_txt_dirpath, 'w') as fp:
        json.dump(all_doc_toknized, fp)


In [37]:
# Tokenize the converted training split and write it to datasets/malik/train_scibert.json
# (a relative path, so it resolves against the current working directory).
# NOTE(review): the file name says "scibert" but `tokenizer` was loaded from "bert-base-uncased" — confirm the naming.
write_in_hsln_json_format(train,'datasets/malik/train_scibert.json',tokenizer)

In [38]:
# Tokenize the converted dev split and write it to datasets/malik/dev_scibert.json (relative path).
write_in_hsln_json_format(dev,'datasets/malik/dev_scibert.json',tokenizer)

In [39]:
# Tokenize the converted test split and write it to datasets/malik/test_scibert.json (relative path).
write_in_hsln_json_format(test,'datasets/malik/test_scibert.json',tokenizer)