Converts my annotation file (a JSON file exported from Label Studio) into the CoNLL-2003 "IOB" format.
It extracts each annotated text span together with its corresponding label and converts it into IOB format, i.e.:

"text"  "annotation label"

It then checks whether an annotated text span has a relation/connection with another text block of the same label, and merges the two if such a relation exists.

It then splits the IOB-format data into train, validation, and test splits.

This data is then used to fine-tune the "t-ner" model on my labels.

The last code block takes my annotation config file (an XML file), which contains all the labels I've used to annotate my data, and creates a dictionary called "label2id", which is used as an input variable when fine-tuning the t-ner model.

In [None]:
import json
from bs4 import BeautifulSoup

In [None]:
# Load the Label Studio export. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's locale default
# (which is not UTF-8 on every system and would corrupt non-ASCII text).
with open('fixed_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
def text_to_iob(text, label):
    """Tag every whitespace-separated word of ``text`` with an IOB tag.

    The first word receives ``B-<label>`` and every following word receives
    ``I-<label>``. Each entry of the result is ``"<word> <tag>"``.

    Spaces are removed from ``label`` before it is used. The output format is
    whitespace-delimited, so a label such as ``"Date Type"`` would otherwise
    add an extra column to the line. ``create_label2id`` strips spaces from
    label names in the same way, so the tags emitted here match its keys.

    Args:
        text: The annotated text span.
        label: The label assigned to the span.

    Returns:
        A list of ``"<word> <tag>"`` strings; empty if ``text`` contains no
        words.
    """
    tag = label.replace(' ', '')
    return [
        f"{word} {'B' if position == 0 else 'I'}-{tag}"
        for position, word in enumerate(text.split())
    ]



In [None]:
def merge_related_labels(annotations, relations):
    """Collect labelled spans and merge those linked by a same-label relation.

    Args:
        annotations: Iterable of result dicts. Only entries whose ``'type'``
            is ``'hypertextlabels'`` are used; from each one this function
            reads ``'id'``, ``value['text']``, the first item of
            ``value['hypertextlabels']`` and ``value['globalOffsets']``
            (``'start'`` / ``'end'``).
        relations: Mapping of ``from_id -> to_id``.

    Returns:
        Dict keyed by annotation id with ``'text'``, ``'label'``, ``'start'``
        and ``'end'`` for every span that survives merging. When two related
        spans share a label, the ``to_id`` span is folded into the ``from_id``
        span and removed from the result.
    """
    merged_annotations = {}
    for annotation in annotations:
        if annotation.get('type') != 'hypertextlabels':
            continue
        ann_id = annotation['id']
        value = annotation['value']
        text = value['text']
        start_offset = value['globalOffsets']['start']
        end_offset = value['globalOffsets']['end']
        if ann_id not in merged_annotations:
            merged_annotations[ann_id] = {
                'text': text,
                'label': value['hypertextlabels'][0],
                'start': start_offset,
                'end': end_offset,
            }
        else:
            # A repeated id extends the entry that is already recorded.
            merged_annotations[ann_id]['text'] += ' ' + text
            merged_annotations[ann_id]['end'] = end_offset

    # Maps an absorbed id to the id that absorbed it, so a later relation
    # mentioning the absorbed span (a -> b, then b -> c) still reaches the
    # surviving entry regardless of the order the relations are visited in.
    absorbed_into = {}

    def resolve(ann_id):
        while ann_id in absorbed_into:
            ann_id = absorbed_into[ann_id]
        return ann_id

    for from_id, to_id in relations.items():
        keep_id = resolve(from_id)
        drop_id = resolve(to_id)
        # Self-relations (direct, or after earlier merges) have nothing to do;
        # merging a span with itself would otherwise delete it.
        if keep_id == drop_id:
            continue
        if keep_id not in merged_annotations or drop_id not in merged_annotations:
            continue
        keep = merged_annotations[keep_id]
        drop = merged_annotations[drop_id]
        if keep['label'] != drop['label']:
            continue

        # Join the text in document order; a relation may point backwards.
        if drop['start'] < keep['start']:
            keep['text'] = drop['text'] + ' ' + keep['text']
        else:
            keep['text'] += ' ' + drop['text']
        # The merged span must cover both parts whichever comes first, so
        # that start <= end always holds.
        keep['start'] = min(keep['start'], drop['start'])
        keep['end'] = max(keep['end'], drop['end'])

        del merged_annotations[drop_id]
        absorbed_into[drop_id] = keep_id

    return merged_annotations

In [None]:
def tag_with_o(text, annotations):
    """Produce one ``"<word> <tag>"`` line per word of ``text``.

    Words inside an annotated span are tagged through ``text_to_iob``; every
    other word is tagged ``O``.

    Args:
        text: The full document text that the annotation offsets index into.
        annotations: Dict whose values carry ``'start'``, ``'end'`` and
            ``'label'``. The values are read directly, so the dict may be
            keyed by start offset (as the caller in this file does) or by
            anything else.

    Returns:
        List of ``"<word> <tag>"`` strings in document order.

    Spans are processed in ``(start, end)`` order. The output allows a single
    tag per token, so a span that begins before the previous span has ended
    (overlapping or nested) is skipped and its words fall through to the
    surrounding tagging rather than being emitted twice. Spans whose ``end``
    precedes their ``start`` are skipped as well, which keeps the read
    position from moving backwards.
    """
    spans = sorted(
        annotations.values(),
        key=lambda ann: (ann['start'], ann['end']),
    )
    current_index = 0
    o_tagged_text = []

    for ann in spans:
        start, end = ann['start'], ann['end']
        if start < current_index or end < start:
            continue
        # Untagged gap between the previous span and this one.
        o_tagged_text.extend(
            f"{word} O" for word in text[current_index:start].split()
        )
        o_tagged_text.extend(text_to_iob(text[start:end], ann['label']))
        current_index = end

    # Whatever follows the last span is untagged text.
    o_tagged_text.extend(f"{word} O" for word in text[current_index:].split())

    return o_tagged_text

In [None]:
# Accumulates the IOB lines of every annotation set; '' marks a blank line.
iob_results = []

for item in data:
    # Tasks without an HTML payload cannot be converted.
    if 'data' not in item or 'html' not in item['data']:
        continue

    document_text = BeautifulSoup(item['data']['html'], 'lxml').get_text()

    if 'annotations' not in item:
        continue

    for annotation_set in item['annotations']:
        annotations = annotation_set['result']

        # Relation entries live in the same result list as the labelled spans.
        relations = {}
        for entry in annotations:
            if entry.get('type') == 'relation':
                relations[entry['from_id']] = entry['to_id']

        merged_annotations = merge_related_labels(annotations, relations)

        # tag_with_o receives the spans keyed by their start offset.
        annotations_by_offset = {}
        for merged in merged_annotations.values():
            annotations_by_offset[merged['start']] = merged

        tagged_text = tag_with_o(document_text, annotations_by_offset)

        # Insert a blank line after every token that ends with a full stop.
        sentence_lines = []
        for line in tagged_text:
            sentence_lines.append(line)
            if line.split()[0].endswith('.'):
                sentence_lines.append('')

        # One more blank line closes the annotation set.
        iob_results.extend(sentence_lines)
        iob_results.append('')

In [None]:
# Write every collected entry on its own line; '' entries become blank lines.
with open('IOB_output.txt', 'w') as outfile:
    outfile.writelines(pair + '\n' for pair in iob_results)

In [None]:
import os

def split_text_file(file_path, train_ratio=0.7, validation_ratio=0.15, test_ratio=0.15, output_dir="data"):
    """Split a text file line-wise into train, validation and test files.

    The lines are kept in their original order: the first ``train_ratio``
    share goes to ``train.txt``, the next ``validation_ratio`` share to
    ``validation.txt`` and the remainder to ``test.txt``.

    Args:
        file_path: Path of the text file to split.
        train_ratio: Fraction of lines written to ``train.txt``.
        validation_ratio: Fraction of lines written to ``validation.txt``.
        test_ratio: Fraction of lines written to ``test.txt``.
        output_dir: Directory that receives the three files; created if it
            does not exist. Defaults to ``"data"``.

    Raises:
        ValueError: If the three ratios do not sum to 1.
    """
    # Compare with a tolerance: exact float equality rejects valid inputs
    # such as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
    tolerance = 1e-9
    if abs((train_ratio + validation_ratio + test_ratio) - 1) > tolerance:
        raise ValueError("Ratios for train, validation, and test must sum to 1.")

    with open(file_path, 'r') as f:
        lines = f.readlines()

    total_lines = len(lines)
    train_end = int(train_ratio * total_lines)
    validation_end = int((train_ratio + validation_ratio) * total_lines)

    splits = {
        "train.txt": lines[:train_end],
        "validation.txt": lines[train_end:validation_end],
        "test.txt": lines[validation_end:],
    }

    os.makedirs(output_dir, exist_ok=True)

    for file_name, split_lines in splits.items():
        with open(os.path.join(output_dir, file_name), 'w') as f:
            f.writelines(split_lines)

    print("Text file split into train, validation, and test sets.")


# Split IOB_output.txt using the function's default 70/15/15 ratios.
split_text_file(r"IOB_output.txt")

Text file split into train, validation, and test sets.


In [None]:
import xml.etree.ElementTree as ET

def parse_text(filename):
    """Read ``filename`` and return its whitespace-separated words as a list."""
    with open(filename, 'r') as handle:
        return handle.read().split()

def create_label2id(filename):
    """Build a ``label2id`` dictionary from the ``<Label>`` elements of an XML file.

    ``'O'`` is always id 0. Each distinct label then receives two consecutive
    ids, ``B-<label>`` followed by ``I-<label>``, in order of first
    appearance. Spaces are removed from the label values.

    Args:
        filename: Path (or file object) of the XML file to parse.

    Returns:
        Dict mapping every tag string to an integer id; the ids form the
        contiguous range ``0 .. len(result) - 1``.
    """
    root = ET.parse(filename).getroot()

    label2id = {'O': 0}
    for element in root.findall(".//Label"):
        value = element.get('value')
        # A <Label> without a value attribute carries no usable name.
        if value is None:
            continue
        name = value.replace(' ', '')
        begin_tag = f'B-{name}'
        # A value seen before keeps its ids. Assigning new ones would
        # overwrite the earlier entries and leave unused ids in the sequence.
        if begin_tag in label2id:
            continue
        # The next free id always equals the current size of the mapping.
        label2id[begin_tag] = len(label2id)
        label2id[f'I-{name}'] = len(label2id)

    return label2id
# Build the mapping from the annotation config file; the returned dict is the
# cell's displayed result.
create_label2id('deal_terms.xml')

{'O': 0,
 'B-Date': 1,
 'I-Date': 2,
 'B-DateType': 3,
 'I-DateType': 4,
 'B-DealClass': 5,
 'I-DealClass': 6,
 'B-DealSub-Class': 7,
 'I-DealSub-Class': 8,
 'B-AmountType': 9,
 'I-AmountType': 10,
 'B-AmountValue': 11,
 'I-AmountValue': 12,
 'B-PrincipalofValue': 13,
 'I-PrincipalofValue': 14,
 'B-FeeType': 15,
 'I-FeeType': 16,
 'B-FeeValue': 17,
 'I-FeeValue': 18,
 'B-FeePrincipalofValue': 19,
 'I-FeePrincipalofValue': 20,
 'B-FeeCondition': 21,
 'I-FeeCondition': 22,
 'B-LoanAmountType': 23,
 'I-LoanAmountType': 24,
 'B-LoanAmountValue': 25,
 'I-LoanAmountValue': 26,
 'B-LoanPrincipalofValue': 27,
 'I-LoanPrincipalofValue': 28,
 'B-LoanAmountCondition': 29,
 'I-LoanAmountCondition': 30,
 'B-SpreadIndex': 31,
 'I-SpreadIndex': 32,
 'B-SpreadType': 33,
 'I-SpreadType': 34,
 'B-SpreadValue': 35,
 'I-SpreadValue': 36,
 'B-SpreadCondition': 37,
 'I-SpreadCondition': 38}