### __Task 1: Part 1__
__a.__ First, split the train data of Task_1 into training and validation sets with an 85:15 ratio (randomly
stratified).

__b.__ Implement code for BIO (Beginning-Intermediate-Outside) chunking of the given dataset of Task_1
(for the three splits). Tokenization should be done based on space, and each token needs to be
assigned a BIO label (in format B_label, I_label or O, where “label” refers to one of the 13 legal
entities). Preprocessing can be done on the text if required.

__Importing essential libraries__

In [74]:
import re
import spacy
import random
import simplejson as json

__Loading datasets from JSON__

In [75]:
# Loading train data JSON
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's default locale encoding.
with open('../data/NER_TRAIN_JUDGEMENT.json', 'r', encoding='utf-8') as f:
    train_json = json.load(f)
# Loading test data JSON
with open('../data/NER_TEST_JUDGEMENT.json', 'r', encoding='utf-8') as f:
    test_json = json.load(f)

__Data Preprocessing__

In [76]:
def remove_html(json_data):
    """
    @brief  Strips opening and closing <span> tags from every sample's text.
    @param  json_data   JSON-extracted dataset (list of samples, each with
                        a ['data']['text'] string). Modified in place.
    @return json_data   The same list, with the tags removed from each text.
    """
    span_tag = re.compile(r'<span.*?>|</span>')    # opening (with attributes) or closing tag
    for record in json_data:
        payload = record['data']
        payload['text'] = span_tag.sub('', payload['text'])
    return json_data

In [77]:
def remove_special_sequence(json_data):
    """
    @brief  This function normalises special sequences in the text data.
            Runs of new-line, tab and form-feed characters become a single
            space, and a fixed set of Unicode characters is mapped to
            plain-ASCII equivalents.
    @param  json_data   JSON-extracted dataset (list of samples, each with
                        a ['data']['text'] string). Modified in place.
    @return json_data   Processed data as a list of dictionaries.
    """
    # Single-character substitutions, applied in one pass. None of the
    # replacement strings contains a character that is itself a key, so
    # one pass is equivalent to applying the substitutions one by one.
    char_map = str.maketrans({
        '\u00ad': '-',      # soft-hyphen
        '\u2013': '-',      # en dash
        '\u2018': "'",      # left single quotation
        '\u2019': "'",      # right single quotation
        '\u20b9': 'Rs.',    # Indian Rupee symbol
        '\u201e': '"',      # double quotation variants
        '\u201f': '"',
        '\u201c': '"',
        '\u201d': '"',
        '\u00e0': 'a',      # 'a' with accent
        '\u00a0': ' ',      # non-breaking space
    })
    for sample in json_data:
        text = sample['data']['text']
        # Replace with a space rather than deleting: deleting would glue the
        # words on either side of a line break into a single token.
        processed_text = re.sub(r'[\n\t\f]+', ' ', text)
        sample['data']['text'] = processed_text.translate(char_map)
    return json_data

In [78]:
def remove_extra_spaces(json_data):
    """
    @brief  Collapses runs of spaces into a single space, both in each
            sample's text and in the text of each of its annotations.
            The sample text is additionally stripped of leading and
            trailing whitespace; annotation texts are not stripped.
    @param  json_data   JSON-extracted dataset. Modified in place.
    @return json_data   The same list of dictionaries after processing.
    """
    space_run = re.compile(' +')
    for record in json_data:
        # sample text: collapse, then trim both ends
        body = record['data']
        body['text'] = space_run.sub(' ', body['text']).strip()
        # annotation texts: collapse only
        for annotation in record['annotations'][0]['result']:
            value = annotation['value']
            value['text'] = space_run.sub(' ', value['text'])
    return json_data

In [79]:
# Run every preprocessing stage, in order, on the training data and then on
# the testing data: HTML removal, special-sequence handling, space clean-up.
for preprocess in (remove_html, remove_special_sequence, remove_extra_spaces):
    train_json = preprocess(train_json)
    test_json = preprocess(test_json)

__Splitting NER_TRAIN_JUDGEMENT into training (.85) and validation (.15)__

In [80]:
validation_split = 0.15 # ratio of validation data to be sampled
N_train = len(train_json)   # number of samples in NER_TRAIN_JUDGEMENT.json
val_size = int(N_train * validation_split) # number of samples in validation data

# Draw positions rather than the samples themselves. Membership is then a
# constant-time integer lookup instead of a deep dictionary comparison
# against every validation sample, and a training sample that happens to be
# equal to a validation sample is no longer dropped.
# NOTE: no seed is set, so the split differs from run to run; call
# random.seed(...) beforehand if a reproducible split is needed.
# NOTE(review): this is a plain random split, not a stratified one.
val_indices = random.sample(range(N_train), val_size)  # randomly sample validation positions
held_out = set(val_indices)

val_data = [train_json[i] for i in val_indices] # validation data
train_data = [  # the remaining samples, in their original order
    sample for i, sample in enumerate(train_json) if i not in held_out
]

print(f'Size of training data: {len(train_data)}')
print(f'Size of validation data: {len(val_data)}')

Size of training data: 8020
Size of validation data: 1415


__Performing BIO-encoding on TRAIN, VAL and TEST data__

In [81]:
class BIOEncoder:
    """
    To perform BIO chunking to TRAIN, VAL, and TEST data.

    The tokenizer is any callable that maps a string to an iterable of
    objects exposing a `.text` attribute.
    """

    # Stripped from tokens before they are compared: every character other
    # than letters, digits, whitespace and apostrophes, plus any 's that ends
    # at a word boundary. Compiled once for the class instead of once per
    # candidate window.
    _MATCH_NOISE = re.compile(r"[^a-zA-Z0-9\s']+|'s\b")

    def __init__(self, data, tokenizer):
        """
        @brief  Constructs an instance of BIOEncoder with the given data.
        @param  data        Dataset on which encoding is to be performed.
        @param  tokenizer   Callable used to tokenize sample and entity texts.
        """
        self.data = data
        self.tokenizer = tokenizer

    def __find_index_range(self, arr, seq, tags):
        """
        @brief  (Inner method) Finds the index range of a sequence of tokens
        in the array given the current state of tag assignment. Only windows
        whose tokens are all still tagged 'O' are considered, so a match can
        never overwrite tags assigned to an earlier entity.
        @param  arr     The list of tokens in which search is needed.
        @param  seq     Sequence of tokens to be matched.
        @param  tags    Current state of the assigned tags.
        @return Tuple of start and end indices (inclusive) of the first
                matched range, or (-1, -1) when there is no match.
        """
        width = len(seq)
        if width == 0:  # an empty entity can never be located
            return (-1, -1)
        # normalise both sides once instead of once per candidate window
        clean_arr = [self._MATCH_NOISE.sub('', token) for token in arr]
        clean_seq = [self._MATCH_NOISE.sub('', token) for token in seq]
        for i in range(len(arr) - width + 1):
            window_is_free = all(tag == 'O' for tag in tags[i:i + width])
            if window_is_free and clean_arr[i:i + width] == clean_seq:
                return (i, i + width - 1)
        return (-1, -1)

    def __helper_encode(self, text_tokens, entities):
        """
        @brief  (Inner method) Encodes a tokenized text sample for the given
        set of entities into BIO tags. Entities that cannot be located in the
        text are skipped and leave the tags untouched.
        @param  text_tokens Tokenized text sample.
        @param  entities    Set of entities to be marked into BIO tags.
        @return tags    Sequence of BIO tags from the given text.
        """
        tags = ['O'] * len(text_tokens) # initialize all tags with 'O'
        for entity in entities: # perform BIO tagging for each named-entity in the dataset
            value = entity['value']
            entity_tokens = [token.text for token in self.tokenizer(value['text'].strip())]
            start_index, end_index = self.__find_index_range(text_tokens, entity_tokens, tags)
            if start_index == -1:
                # Not found. Without this guard the -1 sentinel would be used
                # as a list index and the LAST token would be tagged.
                continue
            label = value['labels'][0]
            tags[start_index] = 'B_' + label    # perform B-tagging
            for idx in range(start_index + 1, end_index + 1):
                tags[idx] = 'I_' + label    # perform I-tagging for the rest of the span
        return tags

    def encode(self):
        """
        @brief  This method performs BIO encoding on the dataset provided to the constructor.
        @param  None.
        @return tagged_data     A dictionary keyed by sample id; each value is a
                                dictionary with 'text' (the tokens joined by a
                                single space) and 'labels' (one tag per token).
        """
        tagged_data = dict()
        for sample in self.data:    # perform sample-wise tagging
            text = sample['data']['text']
            tokenized_text = [token.text for token in self.tokenizer(text)]
            entities = sample['annotations'][0]['result']   # extract the entities to be tagged in the sample
            tags = self.__helper_encode(tokenized_text, entities)  # perform BIO-tagging on the sample
            tagged_data[sample['id']] = {'text': " ".join(tokenized_text).strip(), 'labels': tags}
        return tagged_data

In [82]:
tokenizer = spacy.blank("en")

# One encoder per split (TRAIN, VAL, TEST), all sharing the same tokenizer
tagger_train, tagger_val, tagger_test = (
    BIOEncoder(split, tokenizer) for split in (train_data, val_data, test_json)
)

# Perform BIO-encoding on each split in turn
tagged_train_data = tagger_train.encode()
tagged_val_data = tagger_val.encode()
tagged_test_data = tagger_test.encode()

__Final check to ensure equal lengths of inputs and labels__

In [83]:
def run_length_check(data):
    """
    @brief  Verifies that every entry has as many labels as space-separated
            tokens in its text. On the first mismatch, prints the entry's id,
            its text, the token count and the label count, then stops.
    @param  data    Dictionary mapping an id to a dictionary with 'text'
                    and 'labels'.
    @return 1 when every entry is consistent, 0 at the first mismatch.
    """
    for case_id in data:
        entry = data[case_id]
        n_tokens = len(entry['text'].split(' '))
        n_labels = len(entry['labels'])
        if n_tokens == n_labels:
            continue
        # report the offending entry and bail out
        print(case_id)
        print(entry['text'])
        print(n_tokens)
        print(n_labels)
        return 0
    return 1

# Check TRAIN, VAL and TEST in that order; each flag is 1 on success, 0 on failure
check_train, check_val, check_test = (
    run_length_check(split)
    for split in (tagged_train_data, tagged_val_data, tagged_test_data)
)

# Report success only when every split passed
if all((check_train, check_val, check_test)):
    print('SUCCESS!')

SUCCESS!


__Save processed data as JSON__ 

In [84]:
# Write each tagged split (TRAIN, VAL, TEST) to its own JSON file
outputs = (
    ('../data/NER_train.json', tagged_train_data),
    ('../data/NER_val.json', tagged_val_data),
    ('../data/NER_test.json', tagged_test_data),
)
for out_path, payload in outputs:
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=4)