# 3. Get corpus examples

Now that we finally know which corpus examples can be found, we will look for those corpus examples and collect them in files, aggregated by lexunit id.

To read COREX files, we use my [plk2xml](https://github.com/AntheSevenants/plk2xml) library.

In [6]:
import json
import re
import random
import lxml.etree as etree
import pandas as pd
from tqdm.auto import tqdm
from plk2xml.plk2xml.plk import plk
from plk2xml.plk2xml.plk2xml import plk2xml
from plk2xml.plk2xml.plk2json import plk2json
from glob import glob
import gc
# Number of corpus examples sampled per JSONL file in the retrieval loop;
# files with fewer usable references keep all of them.
NO_SAMPLES = 60

In [3]:
from constants import *

## Loading the dataset

First, let's load in the dataset we created in the previous notebook (the JSONL files).

In [3]:
# Paths of all JSONL reference files found in the sense_example_references folder.
jsonl_files = glob("sense_example_references/*.jsonl")

## Functions for retrieving corpus examples

This function joins all words of a sentence XML element into a single space-separated string, for either FoLiA or COREX markup.

In [4]:
def element_to_sentence(element, element_type="folia"):
    """Join the word forms of a sentence element into one string.

    Args:
        element: Sentence element exposing an lxml-style ``xpath`` method.
        element_type: ``"folia"`` to read the ``folia:w/folia:t`` children
            (resolved with the module-level ``ns`` prefixes), or ``"corex"``
            to read the ``token/form`` children.

    Returns:
        The ``text`` of every matched element, in the order returned by the
        query, separated by single spaces. An element without matches yields
        an empty string.

    Raises:
        ValueError: If ``element_type`` is neither ``"folia"`` nor ``"corex"``.
    """
    if element_type == "folia":
        t_elements = element.xpath("./folia:w/folia:t", namespaces=ns)
    elif element_type == "corex":
        t_elements = element.xpath("./token/form")
    else:
        # Fail loudly instead of hitting an unbound local further down.
        raise ValueError(
            f"Unknown element_type {element_type!r}; expected 'folia' or 'corex'"
        )

    return " ".join(t_element.text for t_element in t_elements)

Processing of a FOLIA file:

In [7]:
# Namespace prefixes passed to the XPath queries over FoLiA documents.
ns = {
    "folia": "http://ilk.uvt.nl/folia",
    "xml": "http://www.w3.org/XML/1998/namespace",
}

def folia_to_sentences(xml_filename, xml_id, docid=None):
    """Look up a sentence and its two neighbours in a FoLiA document.

    Args:
        xml_filename: Path of the FoLiA XML file to query.
        xml_id: Word reference ending in ``s.<sentence>.w.<word>``; whatever
            precedes the sentence number is reused as the sentence id prefix.
        docid: Document id prepended to the sentence ids. When omitted, the
            ``docid`` of the module-level ``row`` is used, which is what this
            function always did before the parameter existed.

    Returns:
        A tuple ``(center, left, right, word_index)``. The first three items
        are the XPath results (lists of matching ``folia:s`` elements, empty
        when no such sentence exists) for the referenced sentence, the one
        numbered one lower and the one numbered one higher. ``word_index`` is
        the word number from ``xml_id`` as an int.

    Raises:
        ValueError: If ``xml_id`` does not match the expected pattern.
    """
    # Validate the reference first so a bad id fails before any file is parsed.
    match = re.search(r"(.*?s\.)(\d+)\.w\.(\d+)$", xml_id)
    if match is None:
        raise ValueError(f"Cannot parse FoLiA word reference: {xml_id!r}")

    sentence_id_prefix, sentence_number, word_number = match.groups()
    sentence_id = int(sentence_number)
    word_index = int(word_number)

    if docid is None:
        # Legacy behaviour: rely on the reference currently being processed.
        docid = row["docid"]

    if xml_filename not in parsed_file_cache:
        root = etree.parse(xml_filename)
        # NOTE(review): the parsed tree is not stored back into
        # parsed_file_cache, so the file is re-parsed on every call unless the
        # cache is filled elsewhere - presumably to limit memory use; confirm.
    else:
        root = parsed_file_cache[xml_filename]

    def find_sentence(number):
        # Bind the id as an XPath variable rather than formatting it into the
        # expression, so quotes in an id cannot break the query.
        return root.xpath(
            "//folia:s[@xml:id=$sid]",
            namespaces=ns,
            sid=f"{docid}.{sentence_id_prefix}{number}",
        )

    center_sentence_xpath = find_sentence(sentence_id)
    left_sentence_xpath = find_sentence(sentence_id - 1)
    right_sentence_xpath = find_sentence(sentence_id + 1)

    return center_sentence_xpath, left_sentence_xpath, right_sentence_xpath, word_index

Processing of a PLK file:

In [9]:
def plk_to_sentences(plk_filename, xml_id):
    """Look up a sentence and its two neighbours in a PLK file.

    Args:
        plk_filename: Path of the PLK file to read.
        xml_id: Word reference starting with ``s.<sentence>.w.<word>``, where
            the sentence number counts from 1.

    Returns:
        A tuple ``(center, left, right, word_index)``. ``center`` is the list
        of token forms of the referenced sentence; ``left`` and ``right`` are
        the token forms of the previous and next sentence, or ``None`` at the
        start or end of the file. ``word_index`` is the word number from
        ``xml_id`` as an int.

    Raises:
        ValueError: If ``xml_id`` does not match the expected pattern.
        IndexError: If the sentence number is 0 or lies beyond the file.
    """
    match = re.search(r"^s\.(\d+)\.w\.(\d+)", xml_id)
    if match is None:
        raise ValueError(f"Cannot parse PLK word reference: {xml_id!r}")

    sentence_id = int(match.group(1)) - 1  # Python is 0-indexed
    word_index = int(match.group(2))

    if sentence_id < 0:
        # Without this guard index -1 would silently pick the last sentence.
        raise IndexError(f"Sentence numbers start at 1, got: {xml_id!r}")

    if plk_filename not in parsed_file_cache:
        plk_file = plk(plk_filename).data
        # NOTE(review): the parsed data is not stored back into
        # parsed_file_cache, so the file is re-read on every call unless the
        # cache is filled elsewhere - presumably to limit memory use; confirm.
    else:
        plk_file = parsed_file_cache[plk_filename]

    def forms(index):
        # Token forms of the sentence at the given 0-based position.
        return [token["form"] for token in plk_file[index]["tokens"]]

    center_sentence = forms(sentence_id)
    left_sentence = forms(sentence_id - 1) if sentence_id >= 1 else None
    right_sentence = forms(sentence_id + 1) if sentence_id + 1 < len(plk_file) else None

    return center_sentence, left_sentence, right_sentence, word_index

## Retrieving all examples

Now, we will go over all JSONL files and read each line. We throw away all "lost" references. Then, we sample `NO_SAMPLES` corpus examples per file (or keep all of them if fewer are available) and retrieve these examples. The exact method of retrieval is different for each source. Corpus examples are aggregated per sense, and saved as a CSV file.

In [None]:
# We go over all files with unique senses we have. For each file: read the
# references, drop the lost ones, sample NO_SAMPLES of them when enough are
# available, retrieve the sentences and write one CSV.
for jsonl_file in tqdm(jsonl_files):
    # Cache consulted by folia_to_sentences / plk_to_sentences; recreated per file
    parsed_file_cache = {}
    data = []
    candidates = []
    # Reset for every file: the output name below comes from the last row
    # handled and must never leak over from the previous file.
    row = None

    # Only reading happens inside the `with`, so the handle is closed before
    # the (much longer) retrieval step starts.
    with open(jsonl_file, "rt") as reader:
        # We loop over all references
        for json_line in reader:
            # Skip blank lines instead of failing in json.loads
            if not json_line.strip():
                continue

            row = json.loads(json_line)

            # Do not process lost tokens
            if row["lost"]:
                continue

            candidates.append(row)

    # Draw a reproducible sample when enough candidates exist; otherwise keep all
    if len(candidates) >= NO_SAMPLES:
        random.seed(1337)
        candidates = random.sample(candidates, NO_SAMPLES)

    if len(candidates) == 0:
        print(f"{jsonl_file} has no candidates!")

    # We check where the reference is coming from
    for row in tqdm(candidates, leave=False):
        # CGN
        if row["docid"].startswith("CGN"):
            cgn_id = re.search(r"_(.*?)$", row["docid"]).groups()[0]
            plk_filename = f"{SONAR_PATH}{cgn_id}.plk"

            center_sentence, left_sentence, right_sentence, word_index = plk_to_sentences(plk_filename, row["xmlid"])
        # Lassy (pre-processed)
        elif row["docid"].startswith("WR-P-P-I"):
            lassy_matches = lassy_hits.loc[(lassy_hits["docid"] == row["docid"]) &
                                           (lassy_hits["xmlid"] == row["xmlid"])]
            if lassy_matches.empty:
                raise KeyError(f"No pre-processed Lassy hit for {row['docid']} / {row['xmlid']}")

            # .iloc[0] takes the first matching row; plain [0] would be a
            # column lookup on the DataFrame.
            lassy_hit = lassy_matches.iloc[0]

            center_sentence = lassy_hit["center_sentence"]
            left_sentence = lassy_hit["left_sentence"]
            right_sentence = lassy_hit["right_sentence"]

            word_index = None
        # SONAR
        else:
            xml_filename = f"{SONAR_PATH}{row['docid']}.folia.xml"

            center_sentence_xpath, left_sentence_xpath, right_sentence_xpath, word_index = folia_to_sentences(xml_filename, row["xmlid"])

            # Each XPath result is a (possibly empty) list of sentence elements
            center_sentence = element_to_sentence(center_sentence_xpath[0], "folia") if len(center_sentence_xpath) > 0 else None
            left_sentence = element_to_sentence(left_sentence_xpath[0], "folia") if len(left_sentence_xpath) > 0 else None
            right_sentence = element_to_sentence(right_sentence_xpath[0], "folia") if len(right_sentence_xpath) > 0 else None

        data_entry = { "docid": row["docid"],
                       "lemma": row["lemma"],
                       "pos": row["pos"],
                       "sense": row["sense"],
                       "xmlid": row["xmlid"],
                       "word_index": word_index,
                       "center_sentence": center_sentence,
                       "left_sentence": left_sentence,
                       "right_sentence": right_sentence }

        data.append(data_entry)

    # If the file held no rows at all there is no sense id to name the output
    # after, so nothing is written for it.
    if row is not None:
        df = pd.DataFrame.from_dict(data)
        df.to_csv(f"{SENSE_EX}{row['sense']}.csv", index=False)

    del parsed_file_cache
    gc.collect()