In [None]:
import json
import re

from numpy import cumsum

from aymurai.text.extraction import extract_document
from glob import glob
from more_itertools import flatten

## Annotations

In [None]:
# Read annotations - JSON (change path if necessary)
json_path = "/resources/annotations/label-studio/project-4-at-2022-11-16-19-55-cb21c77f.json"

# Use a context manager so the file handle is closed as soon as parsing is done,
# and decode explicitly instead of relying on the platform's default encoding.
with open(json_path, encoding="utf-8") as json_file:
    json_annotations = json.load(json_file)

In [None]:
len(json_annotations)

In [None]:
# Read annotations - CoNLL format (change path if necessary)
annotations_path = "/resources/annotations/label-studio/project-4-at-2022-11-16-19-55-cb21c77f.conll"

with open(annotations_path) as file:
    annotations = file.read()

In [None]:
# Inspect the file
for line in annotations.splitlines()[:20]: print(line)

In [None]:
# Split annotations corresponding to different documents
annots = annotations.split("\n\n")

In [None]:
# Pop empty element
annots.pop(annots.index(""))

In [None]:
# Number of annotated documents
len(annots)

## Documents

In [None]:
# Document paths, as recorded in the metadata of each annotation task
raw_doc_paths = [task["data"]["meta_info"]["path"] for task in json_annotations]

# Correct paths: adjust the root folder and the name of the documents folder
doc_paths = []
for raw_path in raw_doc_paths:
    fixed_path = raw_path.replace("restricted", "data/restricted")
    fixed_path = fixed_path.replace(
        "RESOLUCIONES DEL JUZGADO - DOCS", "RESOLUCIONES DEL JUZGADO"
    )
    doc_paths.append(fixed_path)

doc_paths


In [None]:
# Read document
doc_path =  doc_paths[2] # Change if necessary
doc = extract_document(doc_path)

In [None]:
# Inspect the text
for line in doc.splitlines()[:20]: print(line)

In [None]:
# Number of lines
len(doc.splitlines())

In [None]:
# Number of lines removing multiple new line characters
doc = re.sub(r"\n+", "\n", doc)
len(doc.splitlines())

In [None]:
# Inspect the result
for line in doc.splitlines(): print(line)

In [None]:
set(re.findall(r"\s", doc))

In [None]:
# Replace '\t' and '\xa0' for white space
doc = re.sub(r"(?:\t|\xa0)+", " ", doc)

# Remove multiple spaces except new lines
doc = re.sub(r"[^\S\r\n]+", " ", doc)

# Replace multiple new lines with just one break
doc = re.sub(r"\n+", "\n", doc)

In [None]:
# Inspect the result
for line in doc.splitlines(): print(line)

In [None]:
# Number of tokens
len(doc.split())

## Prepare annotations

Split the document by paragraphs

In [None]:
# Document annotations
doc_annots = annots[2] # Change if necessary
# doc_annots

In [None]:
for line in doc_annots.splitlines(): print(line)

In [None]:
# Remove -DOCSTART- from annotations
doc_annots = doc_annots.replace("-DOCSTART- -X- O\n", "")

In [None]:
# Split document by line
splitted = doc.splitlines()

Insert blank lines into the annotations so that they are split into the same paragraphs as the document

In [None]:
# Find number of tokens per line
n_tokens = [len(line.split()) for line in splitted]
print(n_tokens)

In [None]:
# Find indexes where a new line character must be inserted,
idx = [idx + i for i, idx in enumerate(cumsum(n_tokens))]
print(idx)

In [None]:
# Split document annotations by line
splitted_annots = doc_annots.splitlines()
splitted_annots

In [None]:
# Insert new line character where needed
for i in idx:
    splitted_annots.insert(i, "\n")

In [None]:
# Inspect the result
splitted_annots

In [None]:
# Join the new annotations
joined_annots = "\n".join(splitted_annots)
joined_annots = re.sub("\n{3,}", "\n\n", joined_annots)

In [None]:
# Inspect the result
for line in joined_annots.splitlines(): print(line)

## Write the result

In [None]:
def _normalize_whitespace(text):
    """Return ``text`` with whitespace runs collapsed and a single break between lines."""
    # Replace '\t' and '\xa0' for white space
    text = re.sub(r"(?:\t|\xa0)+", " ", text)

    # Remove multiple spaces except new lines
    text = re.sub(r"[^\S\r\n]+", " ", text)

    # Replace multiple new lines with just one break
    return re.sub(r"\n+", "\n", text)


def _insert_paragraph_breaks(doc, annot):
    """Return ``annot`` with a blank line inserted after the tokens of each line of ``doc``.

    Args:
        doc: text of the document.
        annot: annotations of that document, one row per token.

    Returns:
        The annotations as a single string, without the -DOCSTART- row and the
        " -X- _" columns, and with runs of 3+ newlines collapsed into one blank line.
    """
    # Find number of tokens per line
    n_tokens = [len(line.split()) for line in _normalize_whitespace(doc).splitlines()]

    # Find indexes where a break must be inserted: the cumulative token count at
    # the end of each line, shifted by the breaks already inserted before it
    break_positions = [int(line_end) + shift for shift, line_end in enumerate(cumsum(n_tokens))]

    # Remove -DOCSTART- from annotations
    annot = annot.replace("-DOCSTART- -X- O\n", "")

    # Remove unused tag columns
    annot = annot.replace(" -X- _", "")

    # Split document annotations by line and insert the breaks
    rows = annot.splitlines()
    for position in break_positions:
        rows.insert(position, "\n")

    # Join the new annotations
    joined = "\n".join(rows)
    return re.sub("\n{3,}", "\n\n", joined)


# zip() silently stops at the shorter input, which would pair documents with
# the wrong annotations or drop some of them; fail loudly instead.
if len(doc_paths) != len(annots):
    raise ValueError(
        "Found {} documents but {} annotation blocks".format(len(doc_paths), len(annots))
    )

new_annots = [
    _insert_paragraph_breaks(extract_document(doc_path), annot)
    for doc_path, annot in zip(doc_paths, annots)
]

In [None]:
result = "\n".join(new_annots)
result = re.sub("\n{3,}", "\n\n", result)
print(result)

In [None]:
with open("/resources/annotations/annots.txt", "w") as file:
    file.write(result)

## Train - dev - test split

In [None]:
import random
from sklearn.model_selection import train_test_split
from aymurai.datasets.ar_juz_pcyf_10.annotations import ArgentinaJuzgadoPCyF10LabelStudioAnnotations

In [None]:
dataset = ArgentinaJuzgadoPCyF10LabelStudioAnnotations('/resources/annotations/label-studio/resos-annotations/30-nov/no-decision/')

In [None]:
len(dataset)

In [None]:
len(set([sample["path"] for sample in dataset]))

In [None]:
print(dataset[random.choice(range(1200))]["annotations"]["conll"])

In [None]:
# doc_texts = [sample["data"]["doc.text"] for sample in dataset] 
# entities = [sample["annotations"]["entities"] for sample in dataset]
# conll = [sample["annotations"]["conll"] for sample in dataset]

conll = []
paths = set()

for sample in dataset:
    if sample["path"] not in paths:
        conll.append(sample["annotations"]["conll"])
        paths.add(sample["path"])

In [None]:
len(conll)

In [None]:
print(conll[-1])

### Exploratory data analysis (EDA)

In [None]:
n_tokens = [len([line for line in sample.splitlines() if line != ""]) for sample in conll]

In [None]:
labels = [line.split()[-1] for annot in conll for line in annot.splitlines() if line != ""]

In [None]:
len(labels)

In [None]:
sum(n_tokens)

In [None]:
import pandas as pd

In [None]:
# Summary statistics of the per-document counts, including tail percentiles
percentiles = [.01, .05, .25, .5, .75, .95, .99]
pd.Series(n_tokens).describe(percentiles=percentiles)

In [None]:
import matplotlib.pyplot as plt

In [None]:
labels = pd.Series(labels)
labels.sample(10)

In [None]:
import re

In [None]:
labels = labels.map(lambda x: re.sub(r"B-|I-", "", x))

In [None]:
labels.value_counts().head(10)

In [None]:
labels.value_counts().tail(10)

In [None]:
labels.nunique()

In [None]:
plt.figure(figsize=(20,10))
labels.value_counts().plot(kind="bar");

In [None]:
plt.figure(figsize=(20,10))
labels.value_counts()[1:].plot(kind="bar");

In [None]:
# Random shuffle - documents
random.seed(42)
random.shuffle(conll)

In [None]:
# Train - val - test split
train, test = train_test_split(conll, random_state=42)
dev, test = train_test_split(test, test_size=0.5, random_state=42)

In [None]:
len(train), len(dev), len(test)

In [None]:
assert len(train) + len(dev) + len(test) == len(conll)

In [None]:
print(test[0])

In [None]:
# Random shuffle - paragraphs
# Join annotations
train = "\n\n".join(train)
dev = "\n\n".join(dev)
test = "\n\n".join(test)

# Resplit by paragraph
train = [paragraph for paragraph in train.split("\n\n") if paragraph != ""]
dev = [paragraph for paragraph in dev.split("\n\n") if paragraph != ""]
test = [paragraph for paragraph in test.split("\n\n") if paragraph != ""]

# Shuffle
random.seed(42)
random.shuffle(train)
random.shuffle(dev)
random.shuffle(test)

In [None]:
train[0]

In [None]:
dev[0]

In [None]:
test[0]

In [None]:
len(train), len(dev), len(test)

In [None]:
# Rejoin annotations
train = "\n\n".join(train)
dev = "\n\n".join(dev)
test = "\n\n".join(test)

In [None]:
!mkdir /resources/ner/flair/resos-20221130-no-decision

In [None]:
with open("/resources/ner/flair/resos-20221130-no-decision/train.txt", "w") as file:
    file.write(train)

with open("/resources/ner/flair/resos-20221130-no-decision/dev.txt", "w") as file:
    file.write(dev)

with open("/resources/ner/flair/resos-20221130-no-decision/test.txt", "w") as file:
    file.write(test)