### Data Pre Processing
- Cleaning labelled data and converting it to spaCy DocBin objects for model training
- Annotating relations in Train, Test, Evaluation data

#### Note
- Upload the Train, Test & Eval spaCy datasets
- Upload the annotation data file [.txt | .json]

In [None]:
# Install necessary libraries
!pip install -U spacy

In [None]:
# Import necessary libraries
import json
import typer
import spacy
import re
from pathlib import Path

from spacy.tokens import Span, DocBin, Doc
from spacy.vocab import Vocab
from wasabi import Printer
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.util import compile_infix_regex

In [None]:
# Initialize a blank English spaCy pipeline
nlp = spacy.blank("en")

In [None]:
# Relation labels used for annotation
SYMM_LABELS = ["Binds"]
# Every relation label maps to itself.
MAP_LABELS = {
    name: name
    for name in ("ACCESS_USING", "INPUT", "NOTIFY", "AUTHENTICATION")
}

# Console printer used for status messages
msg = Printer()

In [None]:
# Paths of the annotation file and the train / test / dev .spacy files
# NOTE(review): "realtions" looks like a typo - confirm it matches the uploaded file name
annotated_data = "/content/annotated_realtions.txt"
train_file = "/content/relations_training.spacy"
test_file = "/content/relations_test.spacy"
dev_file = "/content/relations_dev.spacy"

In [None]:
def annotator(json_data: Path, output_file: Path) -> None:
    """Create a relation-annotated spaCy corpus from a JSON annotation file.

    Each example in the JSON file is expected to provide:
      - "document": the raw text,
      - "tokens": entity spans with "start"/"end" character offsets,
        "token_start" and "entityLabel",
      - "relations": items with "head", "child" and "relationLabel".

    For every example a Doc is built whose ``doc.ents`` are the annotated
    entities and whose ``doc._.rel`` maps each ``(head, child)`` pair of
    span ``token_start`` values to ``{label: 1.0 or 0.0}``. Only documents
    with at least one positive relation are kept; they are serialized
    together as a DocBin.

    Args:
        json_data: Path of the JSON annotation file to read.
        output_file: Path the resulting DocBin is written to.
    """
    Doc.set_extension("rel", default={}, force=True)

    with open(json_data, encoding="utf8") as jsonfile:
        examples = json.load(jsonfile)

    docs = []
    for example in examples:
        # Tokenize with the shared pipeline, then rebuild a Doc from the
        # token texts and their trailing-whitespace flags.
        tokens = nlp(example["document"])
        words = [tok.text for tok in tokens]
        spaces = [bool(tok.whitespace_) for tok in tokens]
        doc = Doc(nlp.vocab, words=words, spaces=spaces)

        # Parse the entities
        entities = []
        span_starts = set()
        for span in example["tokens"]:
            entity = doc.char_span(
                span["start"], span["end"], label=span["entityLabel"]
            )
            if entity is None:
                # char_span yields None when the character offsets do not
                # line up with token boundaries; assigning None to doc.ents
                # would fail, so drop the span instead.
                msg.warn(
                    f"Skipping entity at characters {span['start']}-{span['end']}: "
                    "offsets do not align with token boundaries"
                )
                continue
            entities.append(entity)
            span_starts.add(span["token_start"])
        doc.ents = entities

        # Parse the relations: start with an empty label dict for every
        # ordered pair of entity start tokens.
        rels = {(x1, x2): {} for x1 in span_starts for x2 in span_starts}
        pos = 0
        for relation in example["relations"]:
            # 'head' and 'child' are matched against the spans' token_start
            # values, so they must name the first token of a kept entity.
            head = relation["head"]
            child = relation["child"]
            if head not in span_starts or child not in span_starts:
                msg.warn(
                    f"Skipping relation {head} -> {child}: "
                    "head or child does not match a known entity span"
                )
                continue
            label = relation["relationLabel"]
            if label not in rels[(head, child)]:
                rels[(head, child)][label] = 1.0
                pos += 1

        # The annotation is complete, so fill in zeros where the data is missing
        for pair_labels in rels.values():
            for label in MAP_LABELS.values():
                pair_labels.setdefault(label, 0.0)
        doc._.rel = rels

        # only keeping documents with at least 1 positive case
        if pos > 0:
            docs.append(doc)

    # Write to the caller-supplied location so separate calls (train / dev /
    # test) do not overwrite each other's output.
    docbin = DocBin(docs=docs, store_user_data=True)
    docbin.to_disk(output_file)

    msg.info(
        f"{len(docs)} training sentences"
    )


In [None]:
# Build the relation corpus from the annotation file
annotator(annotated_data, train_file)