In [17]:
# 1. Read Senseval's and Semeval's test data
# 2. Read Sense Inventory with Glosses
# 3. Convert the test data and glosses into prompts

In [1]:
from tqdm import tqdm

from pathlib import Path
from typing import Any, Sequence, Dict, Tuple, List

In [2]:
from xml.dom.minidom import parse

# Root directory of the English data; the inventory and evaluation paths used
# later in this notebook are built relative to it.
datadir = Path("../data/en/")


def read_keys(filepath: str) -> Dict[str, List[str]]:
    """Read a whitespace-separated key file into a mapping.

    Every non-blank line is split on whitespace: the first field becomes the
    dictionary key and the remaining fields (possibly none) become its value.
    If the same first field occurs on several lines, the last line wins.

    Args:
        filepath: Path of the key file to read.

    Returns:
        Mapping from the first field of each line to the list of the
        remaining fields on that line.
    """
    keys: Dict[str, List[str]] = {}
    with open(filepath, 'r', encoding='utf-8') as ifile:
        for line in ifile:
            row = line.split()
            if not row:
                # Blank or whitespace-only lines carry no key; indexing row[0]
                # on them would raise IndexError.
                continue
            keys[row[0]] = row[1:]
    return keys

def read_data(filepath: str) -> Sequence[Tuple[str, Any]]:
    """Lazily iterate over the ``text`` elements of an XML file.

    The file is parsed with ``xml.dom.minidom.parse`` once iteration starts.
    For each ``text`` element, in document order, a 3-tuple is produced:
    the value of its ``source`` attribute, the value of its ``id`` attribute
    and the element itself.

    Note: the return annotation is kept from the original signature; the
    function is a generator and each item has three members.
    """
    document = parse(filepath)
    for node in document.getElementsByTagName('text'):
        yield (node.getAttribute('source'), node.getAttribute('id'), node)

def read_directory(datadir: Path) -> Dict[str, Any]:
    """Load the recognised data and key files found directly in a directory.

    A file is recognised by the concatenation of all its suffixes:

    * ``.data.xml``          -> stored under ``'data'`` (list of ``read_data`` items)
    * ``.gold.key.txt``      -> stored under ``'bn-keys'`` (``read_keys`` mapping)
    * ``.gold.key.txt.pwn``  -> stored under ``'wn-keys'`` (``read_keys`` mapping)

    Entries with any other suffix combination are ignored.

    Args:
        datadir: Directory to scan (not recursive).

    Returns:
        Dictionary holding one entry per recognised kind of file that was
        found. If several files share the same suffix combination, the one
        that sorts last by path is kept.
    """
    kinds = {
        '.data.xml': 'data',
        '.gold.key.txt': 'bn-keys',
        '.gold.key.txt.pwn': 'wn-keys'
    }

    data: Dict[str, Any] = {}

    # iterdir() yields entries in arbitrary order; sorting makes the result
    # reproducible when more than one file maps to the same kind.
    for filepath in sorted(datadir.iterdir()):
        suffix = ''.join(filepath.suffixes)
        kind = kinds.get(suffix)
        if kind is None:
            continue

        if suffix.endswith('.xml'):
            data[kind] = list(read_data(str(filepath)))
        else:
            data[kind] = read_keys(str(filepath))

    return data

In [3]:
class Token:
    """Thin read-only wrapper around a DOM node taken from a sentence.

    ``pos`` and ``orth`` assume the wrapped node is an element with a text
    child and raise ``AttributeError`` otherwise. ``lemma`` and ``instance``
    are tolerant: they return ``None`` when the wrapped node does not support
    ``getAttribute`` (for example a whitespace text node between elements).
    """

    def __init__(self, elem):
        self._elem = elem

    @property
    def pos(self):
        """Value of the node's ``pos`` attribute."""
        return self._elem.getAttribute('pos')

    @property
    def lemma(self):
        """Value of the ``lemma`` attribute, or ``None`` for non-element nodes."""
        return self._attribute_or_none('lemma')

    @property
    def orth(self):
        """Text value of the node's first child."""
        return self._elem.firstChild.nodeValue

    @property
    def instance(self):
        """Value of the ``id`` attribute, or ``None`` for non-element nodes."""
        return self._attribute_or_none('id')

    def _attribute_or_none(self, name):
        """Return attribute ``name``, or ``None`` if the node has no attributes API."""
        try:
            return self._elem.getAttribute(name)
        except AttributeError:
            # Only the "node is not an element" case is treated as missing;
            # any other error is a real problem and must not be hidden.
            return None

In [4]:
from xml.dom.minidom import Element

def parse_sentence(sentence: Element) -> str:
    """Rebuild the surface text of a sentence element as a single string.

    Each child node is wrapped in ``Token``. A token whose ``pos`` is
    ``'PUNCT'`` is emitted followed by a space; any other token is emitted
    preceded by a space. Double spaces are then collapsed once and the result
    is stripped. Child nodes for which ``pos`` or ``orth`` cannot be read are
    skipped.

    Args:
        sentence: DOM element whose child nodes are the sentence tokens.

    Returns:
        The reconstructed sentence text (an empty string if nothing could be
        read).
    """
    pieces = []

    for elem in sentence.childNodes:
        token = Token(elem)
        try:
            pos, orth = token.pos, token.orth
        except AttributeError:
            # Nodes without getAttribute (e.g. whitespace text between
            # elements) and elements without a first child are not tokens.
            continue
        pieces.append(f"{orth} " if pos == 'PUNCT' else f" {orth}")

    return ''.join(pieces).replace('  ', ' ').strip()

In [5]:
from xml.dom.minidom import Element


class Inventory:
    """Base class for a sense inventory backed by two lookup tables.

    Construction reads the raw data through ``_read_data`` and then builds
    ``_lemma_index`` (looked up with a ``(lemma, pos)`` tuple, giving sense
    keys) and ``_sense_index`` (looked up with a sense key). The three hook
    methods do nothing here and return ``None``; subclasses override them.
    """

    def __init__(self, datapath: Path):
        data = self._read_data(datapath)
        # Leading digit of the part after '%' in a sense key -> POS label.
        # Both '3' and '5' map to 'ADJ'.
        self._pos_mapping = dict(
            zip('12345', ('NOUN', 'VERB', 'ADJ', 'ADV', 'ADJ'))
        )
        self._lemma_index = self._create_lemma_index(data)
        self._sense_index = self._create_sense_index(data)

    def _read_data(self, datapath: Path) -> Sequence[Any]:
        """Hook: load the raw inventory data. The base version returns ``None``."""
        return None

    def _create_lemma_index(self, data: Any) -> Dict[str, List[str]]:
        """Hook: build the ``(lemma, pos)`` lookup. The base version returns ``None``."""
        return None

    def _create_sense_index(self, data: Any) -> Dict[str, List[str]]:
        """Hook: build the sense-key lookup. The base version returns ``None``."""
        return None

    def sense_glosses(self, key: str) -> List[str]:
        """Return whatever ``_sense_index`` stores for ``key`` (``KeyError`` if absent)."""
        return self._sense_index[key]

    def lemma_glosses(self, lemma: str, pos: str, ret_key: bool = False) -> List[str]:
        """Return the sense-index entries of every sense listed for ``(lemma, pos)``.

        With ``ret_key`` set, each item is a ``(sense_key, entry)`` pair
        instead of the bare entry. Raises ``KeyError`` when ``(lemma, pos)``
        is not in the lemma index.
        """
        sense_keys = self._lemma_index[(lemma, pos)]
        if not ret_key:
            return [self._sense_index[sense_key] for sense_key in sense_keys]
        return [(sense_key, self._sense_index[sense_key]) for sense_key in sense_keys]


class XMLInventory(Inventory):
    """Inventory built from a directory that is loaded with ``read_directory``.

    The ``source`` attribute of every ``text`` item under the ``'data'`` entry
    is used as the sense key; the sentences inside that ``text`` element are
    the glosses stored for it.
    """

    def __init__(self, datadir: Path):
        super().__init__(datadir)

    def _read_data(self, datapath: Path) -> Sequence[Any]:
        """Load the directory contents via ``read_directory``."""
        return read_directory(datapath)

    def _create_lemma_index(self, inventory: Dict[str, Any]) -> Dict[str, List[str]]:
        """Group sense keys by ``(lemma, pos)``.

        The lemma is the part of the key before ``'%'``; the POS label is
        looked up from the first ``':'``-separated field after it.
        """
        by_lemma = {}

        for sense_key, _, _ in tqdm(inventory['data']):
            lemma, attrs = sense_key.split('%')
            pos = self._pos_mapping[attrs.split(':')[0]]
            by_lemma.setdefault((lemma, pos), []).append(sense_key)

        return by_lemma

    def _create_sense_index(self, inventory: Dict[str, Any]) -> Dict[str, List[str]]:
        """Map each sense key to the list of its parsed gloss sentences."""
        return {
            sense_key: [
                parse_sentence(sentence)
                for sentence in text.getElementsByTagName('sentence')
            ]
            for sense_key, _, text in tqdm(inventory['data'])
        }


class PlainInventory(Inventory):
    """Inventory built from a tab-separated plain-text file.

    Each non-blank line is one row: the first field is the sense key and the
    last field is its gloss.
    """

    def __init__(self, datapath: Path):
        super().__init__(datapath)

    def _read_data(self, datapath: Path) -> Sequence[List[str]]:
        """Read the file into a list of rows (lists of tab-separated fields).

        Blank lines are skipped: they would otherwise produce a row with an
        empty sense key, which cannot be split into lemma and attributes when
        the indexes are built.
        """
        data = []
        with open(str(datapath), 'r', encoding='utf-8') as ifile:
            for line in ifile:
                stripped = line.strip()
                if not stripped:
                    continue
                data.append(stripped.split('\t'))
        return data

    def _create_lemma_index(self, data: Any) -> Dict[str, List[str]]:
        """Group sense keys by ``(lemma, pos)``.

        The lemma is the part of the key before ``'%'``; the POS label is
        looked up from the first ``':'``-separated field after it.
        """
        index = {}
        for row in tqdm(data):
            sense_key = row[0]

            lemma, attrs = sense_key.split('%')
            pos = self._pos_mapping[attrs.split(':')[0]]

            index.setdefault((lemma, pos), []).append(sense_key)

        return index

    def _create_sense_index(self, data: Any) -> Dict[str, List[str]]:
        """Map each sense key (first field) to its gloss (last field)."""
        index = {}
        for row in tqdm(data):
            sense_key, gloss = row[0], row[-1]

            # index[sense_key] = f"{lemma}: {gloss}"
            index[sense_key] = gloss

        return index

In [23]:
# inventory = XMLInventory(datadir / "inventory")

In [24]:
# inventory.lemma_glosses('evoke', 'NOUN')

In [6]:
# Build the gloss inventory from the tab-separated plain-text gloss file.
# Two progress bars are printed: one per index built over the rows.
inventory = PlainInventory(datadir / "inventory/wngt_glosses.plain.txt")

100%|██████████| 206941/206941 [00:00<00:00, 880307.45it/s]
100%|██████████| 206941/206941 [00:00<00:00, 2154101.02it/s]


In [7]:
# Sanity check: list the (sense key, gloss) pairs stored for the verb "evoke".
inventory.lemma_glosses('evoke', 'VERB', ret_key=True)

[('evoke%2:32:01::', 'call to mind'),
 ('evoke%2:36:00::', 'evoke or provoke to appear or occur'),
 ('evoke%2:36:01::', 'deduce (a principle) or construe (a meaning)'),
 ('evoke%2:36:02::',
  'summon into action or bring into existence, often as if by magic'),
 ('evoke%2:37:00::', 'call forth (emotions, feelings, and responses)')]

In [8]:
# Load the evaluation data from the "valid" directory; the commented-out
# line is the alternative that loads the "test" directory instead.
# testdata = read_directory(datadir / "test")
testdata = read_directory(datadir / "valid")

In [9]:
from itertools import chain


class TokenWindowContext:
    """Yield a window of tokens around every sense-annotated token.

    For an annotated token at position ``i`` the window is
    ``sequence[i - wsize : i + wsize]`` (clipped at the start of the
    sequence): up to ``wsize`` tokens before the target, then the target
    itself and up to ``wsize - 1`` tokens after it.
    """

    def __init__(self, wsize: int = 30) -> None:
        self._wsize = wsize

    def _sense_annotated(self, token: 'Token') -> bool:
        """Tell whether ``token`` carries a sense annotation.

        An explicit ``is_instance`` attribute is honoured when the token
        provides one; otherwise a token counts as annotated when its
        ``instance`` value is non-empty. ``Token`` in this notebook exposes
        only ``instance``, so reading ``is_instance`` unconditionally raised
        ``AttributeError``.
        """
        flag = getattr(token, 'is_instance', None)
        if flag is not None:
            return bool(flag)
        return bool(token.instance)

    def __call__(self, sequence: Sequence['Token']) -> Sequence[Sequence['Token']]:
        """Generate one window per annotated token, in sequence order."""
        for ind, token in enumerate(sequence):
            if self._sense_annotated(token):
                # Explicit bounds instead of ``[-wsize:]``: with wsize == 0
                # the negative-slice form is ``[-0:]`` and returns the whole
                # prefix rather than nothing.
                start = max(0, ind - self._wsize)
                yield sequence[start:ind + self._wsize]


class SentenceWindowContext:
    """Yield a window of sentences around every sense-annotated token.

    For a token in sentence ``i`` the window holds up to ``wsize`` sentences
    before ``i``, sentence ``i`` itself and enough following sentences to
    reach ``2 * wsize + 1`` in total (fewer if the sequence ends first).
    """

    def __init__(self, wsize: int = 2) -> None:
        self._wsize = wsize

    def __call__(self, sequence: Sequence[Element]) -> Sequence[Sequence[Element]]:
        """Generate ``((lemma, pos, instance), window)`` pairs.

        Args:
            sequence: Sentence elements in document order; each one's child
                nodes are inspected through ``Token``.

        Yields:
            One pair per child node whose ``Token.instance`` is non-empty:
            the token's ``(lemma, pos, instance)`` triple and a fresh list
            with the surrounding sentences.
        """
        for ind, sentence in enumerate(sequence):
            # The window depends only on the sentence position, so compute
            # its bounds once per sentence rather than once per token.
            lbound = min(self._wsize, ind)
            rbound = 2 * self._wsize - lbound + 1

            for node in sentence.childNodes:
                token = Token(node)
                if not token.instance:
                    continue

                # Explicit start index instead of ``[-lbound:]``: when
                # lbound is 0 the negative-slice form is ``[-0:]``, which
                # returns the whole prefix (wrong for wsize == 0).
                window = list(sequence[ind - lbound:ind + rbound])
                yield (token.lemma, token.pos, token.instance), window

In [10]:
from itertools import chain

# Context for every annotated token: a window of sentences around the
# sentence that contains it.
contexter = SentenceWindowContext(wsize=2)

# Flatten all loaded texts into one list of sentence elements, in order.
texts = [
    sentence
    for _, _, text in testdata['data']
    for sentence in text.getElementsByTagName('sentence')
]

# Calling the contexter returns a one-shot generator. Materialise it so the
# cells that consume `contexts` can be re-run without silently getting an
# empty iteration (and therefore an empty output file).
contexts = list(contexter(texts))

In [12]:
# read gold data
# Read the gold answers: instance id -> first sense key listed on its line.
# Only the first key is kept even when a line lists several.
gold = {}
# with open(datadir / "test/ALL.gold.key.txt") as ifile:
with open(datadir / "valid/semeval2007.gold.key.txt", encoding="utf-8") as ifile:
    for line in ifile:
        row = line.split()
        if len(row) < 2:
            # Blank line, or an id without any key: nothing to record
            # (indexing row[1] here would raise IndexError).
            continue
        key, val = row[0], row[1]
        gold[key] = val

In [16]:
# Inspect the loaded data. NOTE(review): this displays the whole dictionary,
# which produces a very long output.
testdata

{'bn-keys': {'d000.s000.t000': ['refer%2:32:01::'],
  'd000.s000.t001': ['research%1:04:00::'],
  'd000.s000.t002': ['report%2:32:04::'],
  'd000.s001.t000': ['comment%1:10:00::'],
  'd000.s001.t001': ['imply%2:32:00::'],
  'd000.s001.t002': ['discover%2:31:01::'],
  'd000.s001.t003': ['cause%1:11:00::'],
  'd000.s001.t004': ['find%2:39:02::'],
  'd000.s002.t000': ['make%2:36:12::'],
  'd000.s002.t001': ['statement%1:10:06::'],
  'd000.s003.t000': ['become%2:30:00::'],
  'd000.s004.t000': ['cause%1:11:00::'],
  'd000.s004.t001': ['understand%2:31:00::'],
  'd000.s005.t000': ['quote%2:32:02::'],
  'd000.s005.t001': ['emphasize%2:32:00::'],
  'd000.s006.t000': ['note%2:32:00::'],
  'd000.s006.t001': ['people%1:14:00::'],
  'd000.s006.t002': ['examine%2:39:00::'],
  'd000.s006.t003': ['have%2:29:02::'],
  'd000.s006.t004': ['multitude%1:23:00::'],
  'd000.s007.t000': ['suffer%2:29:01::', 'suffer%2:29:03::'],
  'd000.s008.t000': ['people%1:14:00::'],
  'd000.s008.t001': ['lack%2:42:00::'],

In [17]:
# Which sense of the word „peculiar” is expressed in the following context:

# „The art of change-ringing is peculiar to the English, and, like most English peculiarities, unintelligible to the rest of the world. Dorothy L. Sayers, `` The Nine Tailors`` ASLACTON, England-- Of all scenes that evoke rural England, this is one of the loveliest: An ancient stone church stands amid the fields, the sound of bells cascading from its tower, calling the faithful to evensong. The parishioners of St. Michael and All Angels stop to chat at the church door, as members here always have. ”. The senses are as follows:

# {
# „peculiar%5:00:00:characteristic:00”: „peculiar: characteristic of one only; distinctive or special”,
# „peculiar%5:00:00:specific:00”: „peculiar: unique or specific to a person or thing or category”,
# „peculiar%5:00:00:strange:00”: „peculiar: beyond or deviating from the usual or expected”,
# „peculiar%5:00:00:unusual:00”: „peculiar: markedly different from the usual”
# }

# Return the key of the correct sense.

# Write one TSV row per annotated instance: instance id, prompt, gold key.
# The prompt contains non-ASCII quotation marks, so the encoding is fixed to
# UTF-8 instead of depending on the platform default.
with open('wsd-prompts-s07.tsv', 'w', encoding='utf-8') as ofile:

    for (lemma, pos, instance), sentences in contexts:
        if instance not in gold:
            # No gold answer for this instance: report it and skip the row.
            print(instance)
            continue

        # Candidate senses rendered as the text of a {sense key: gloss} dict.
        glosses = str(dict(inventory.lemma_glosses(lemma, pos, ret_key=True)))

        # Plain text of the surrounding sentences, joined with single spaces.
        context = ' '.join(parse_sentence(sentence) for sentence in sentences)

        prompt = f"""Which meaning of the word „{lemma}” is expressed in the following context: "{context}" The meanings are as follows: {glosses}. Return only the key of the most relevant meaning."""
        ofile.write(f"{instance}\t{prompt}\t{gold[instance]}\t\n")