In [1]:
from collections import Counter, defaultdict
from collections.abc import Sequence, Iterable
import json
import os

from tqdm import tqdm

from tokenization import text_to_tokens
from dag import EntitiesDAG, BaseEntity, ConnectingEntity, TextEntity
from analyser import (BaseAnalyser, MatchAnalyser,
                      punctuation_analyser,
                      spacing_analyser,
                      integer_analyser,
                     )

# WordAnalyser

Taking the wordforms prepared in the notebook "0_DataLoading", let's build `WordAnalyser`, which adds a `WordEntity` to the DAG for each token (or sequence of several consecutive tokens) that matches an existing wordform.

## Loading Wordforms

In [2]:
def load_wordworms(path='data'):
    """Load and concatenate every ``*wordforms.json`` file found in ``path``.

    Each matching file must contain a JSON list. The lists are concatenated in
    the order in which ``os.listdir`` returns the files, and a per-file record
    count is printed.

    Args:
        path: directory scanned for files whose name ends with
            ``wordforms.json``.

    Returns:
        A single list with the records of all matching files.

    Raises:
        AssertionError: if the top-level JSON value of a file is not a list.
    """
    all_wordforms = []
    for fn in os.listdir(path):
        if not fn.endswith('wordforms.json'):
            continue
        filepath = os.path.join(path, fn)
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale encoding, since the dictionaries contain Cyrillic text.
        with open(filepath, encoding='utf-8') as f:
            wforms = json.load(f)
        assert isinstance(wforms, list), f'{fn}: expected a JSON list'
        print(f'{len(wforms)} from {fn}')
        all_wordforms.extend(wforms)
    return all_wordforms

In [3]:
# Load every wordform file from the default 'data' directory
# and show the total number of records
all_wordforms = load_wordworms()
len(all_wordforms)

4928901 from ukr_wordforms.json
4449602 from ru_wordforms.json


9378503

## Exploring

Here is how the wordforms are stored and which fields they contain.

In [4]:
# Peek at the first three records
all_wordforms[:3]

[{'wordform': 'а',
  'main_form': 'а',
  'mphdict_word_base_id': 0,
  'mphdict_pos_name': 'вигук',
  'lang': 'українська'},
 {'wordform': 'а',
  'main_form': 'а',
  'mphdict_word_base_id': 1,
  'mphdict_pos_name': 'сполучник',
  'lang': 'українська'},
 {'wordform': 'а',
  'main_form': 'а',
  'mphdict_word_base_id': 2,
  'mphdict_pos_name': 'частка',
  'lang': 'українська'}]

In [5]:
# How many records have each feature filled:
# every key of every record is counted once.

cnt = Counter(key for record in tqdm(all_wordforms) for key in record)
cnt.most_common(100)

100%|█████████████████████████████████████████████████| 9378503/9378503 [00:10<00:00, 904334.75it/s]


[('wordform', 9378503),
 ('main_form', 9378503),
 ('lang', 9378503),
 ('mphdict_word_base_id', 4928901),
 ('mphdict_pos_name', 4928901),
 ('mphdict_gramm_category', 4916764),
 ('source', 4449602),
 ('odict_column', 4449602),
 ('odict_row', 4449602),
 ('odict_pos', 4449602),
 ('mphdict_field5', 730022),
 ('pos', 146587),
 ('person_name_part', 146587),
 ('case', 119617),
 ('case_ukr', 111697),
 ('father_name', 17255),
 ('case_ru', 7920)]

In [6]:
# Which unique values do the low-cardinality features have:

# feature name -> Counter of the values seen for that feature
feature_counters = defaultdict(Counter)

for record in tqdm(all_wordforms):
    for feature, value in record.items():
        feature_counters[feature][value] += 1

for feature, values in feature_counters.items():
    # Only features with at most 1000 distinct values are reported
    if len(values) <= 1000:
        print(feature, len(values))
        print(values.most_common(10))
        print()

# Format is "feature name", "cardinality", (new line) "top-10 values"

100%|█████████████████████████████████████████████████| 9378503/9378503 [00:44<00:00, 209424.40it/s]

mphdict_pos_name 51
[('прикметник', 1751512), ('дієслово недоконаного виду', 573273), ('дієслово доконаного виду', 522705), ('дієприкметник', 464668), ('іменник чоловічого роду', 405010), ('іменник жіночого роду', 363454), ('іменник чоловічого роду, істота', 242205), ('іменник середнього роду', 208465), ('іменник жіночого роду, істота', 96076), ('прикметник, найвищий ступінь', 57661)]

lang 2
[('українська', 4928901), ('русский', 4449602)]

mphdict_gramm_category 27
[(6, 412314), (7, 307210), (3, 304728), (1, 291101), (4, 286495), (13, 268691), (10, 256157), (2, 253108), (5, 244636), (12, 215154)]

pos 1
[('proper_name', 146587)]

person_name_part 3
[('surname', 121338), ('patronimic', 17255), ('first_name', 7994)]

case_ukr 7
[('місцевий', 19461), ('кличний', 18588), ('давальний', 17211), ('родовий', 14118), ('знахідний', 14118), ('орудний', 14118), ('називний', 14083)]

case 7
[('locative', 20781), ('vocative', 18588), ('dative', 18531), ('genitive', 15438), ('accusative', 15438), ('




## Preparing wordforms

In [7]:
# dict wordform (as string) -> list of full wordform info for every matched wordform

wf_to_matches_tmp = defaultdict(list)

for record in tqdm(all_wordforms):
    # append() avoids building a throwaway one-element list per record
    wf_to_matches_tmp[record['wordform']].append(record)
len(wf_to_matches_tmp)

100%|█████████████████████████████████████████████████| 9378503/9378503 [00:09<00:00, 959385.46it/s]


5215116

In [8]:
# Reduce different cases into single record:
# records of one wordform that differ only by the value of 'case' (after the
# ignored columns are dropped) are merged into one record that lists all of
# them under 'possible_cases'.
cols_to_ignore = {'mphdict_gramm_category', 'case_ukr', 'odict_column', 'case_ru'}

wf_to_matches = {}
for wf, matches in tqdm(wf_to_matches_tmp.items()):
    # frozen record (sorted tuple of its items, without 'case') -> cases seen
    multicases = defaultdict(set)
    for record in matches:
        fields = {k: v for k, v in record.items() if k not in cols_to_ignore}
        has_case = 'case' in fields
        case = fields.pop('case', None)
        frozen = tuple(sorted(fields.items()))
        # A plain lookup registers the record in the defaultdict without
        # discarding cases already collected for an identical record
        # (assigning a fresh set() here would silently drop them).
        cases = multicases[frozen]
        if has_case:
            cases.add(case)

    merged = []
    for frozen, cases in multicases.items():
        dct = dict(frozen)
        if cases:
            # sorted() keeps the output identical between runs; the iteration
            # order of a set of strings is not stable across interpreter runs.
            dct['possible_cases'] = sorted(cases)
        merged.append(dct)
    wf_to_matches[wf] = merged
len(wf_to_matches)

100%|█████████████████████████████████████████████████| 5215116/5215116 [00:29<00:00, 175813.56it/s]


5215116

In [9]:
# Keys of wf_to_matches, i.e. every distinct wordform string
unique_wfs = list(wf_to_matches)
print(f'Total {len(unique_wfs)} unique wordforms')
unique_wfs[:3]

Total 5215116 unique wordforms


['а', 'аахен', 'аахена']

In [10]:
# Tokenized wordforms (i.e. words with dash would consist of several tokens)

tokenized_unique_wf = [text_to_tokens(wordform) for wordform in tqdm(unique_wfs)]

100%|█████████████████████████████████████████████████| 5215116/5215116 [00:46<00:00, 111214.00it/s]


In [11]:
# Mapping of first token in a complex word to the tail of tokens

start_to_continuations = defaultdict(list)

# Pass 1: register the tail of every multi-token wordform under its first token
for tokens in tqdm(tokenized_unique_wf):
    first, *rest = tokens
    if rest:
        start_to_continuations[first].append(rest)

# Pass 2: a single-token wordform that also starts a longer wordform
# gets an empty tail, appended after the non-empty ones
for tokens in tqdm(tokenized_unique_wf):
    first, *rest = tokens
    if not rest and first in start_to_continuations:
        start_to_continuations[first].append(rest)
len(start_to_continuations)

100%|████████████████████████████████████████████████| 5215116/5215116 [00:02<00:00, 2430544.17it/s]
100%|████████████████████████████████████████████████| 5215116/5215116 [00:02<00:00, 2207117.10it/s]


6026

In [12]:
# Example: token tails registered for wordforms whose first token is 'аби'
start_to_continuations['аби']

[['-', 'аби'], ['-', 'но'], ['-', 'то'], []]

## Building WordAnalyser

`WordAnalyser` is designed to match all possible wordforms for a given token. The ambiguity between common and rare wordforms could potentially be resolved later by assigning scores to wordforms.

**Implementation of `WordAnalyser`:**

The analyser iterates over all entities in the DAG and executes the method `trigger` on those which are instances of `TextEntity`. It then checks whether there are wordforms which, when tokenized, start with the given `TextEntity`. If multi-token wordforms match, the analyser also looks for the corresponding chains of tokens in the graph (skipping `ConnectingEntity` nodes on its way). For each full match a new `WordEntity` is created. The new entity is placed into the DAG so that all entities which have edges toward the first matched token now also point to the new entity, and the new entity points to all nodes toward which the last matched token has edges (so the new entity becomes parallel to the matched sequence).<br>
This can be illustrated with an example:<br>
Let's take the DAG generated from the string "Count d'Artagnan". It consists of 11 entities (including the `ConnectingEntity` nodes):
```
•Count• •d•'•Artagnan•
```
Now we match 2 words. Let's say the first token "Count" is matched to two possible wordforms and the chain of 3 tokens "d->'->Artagnan" is matched to a single wordform. We obtain the following DAG:
```
•─Count───────────• •─d•'•Artagnan───•
├Word<count(noun)>┤ └Word<d'artagnan>┘
└Word<count(verb)>┘
```
The resulting DAG will consist of 14 entities (3 new `WordEntity` nodes added).

In [13]:
class WordEntity(BaseEntity):
    """Entity for a piece of text matched to a dictionary wordform.

    The wordform's features and the matched text are both stored in
    ``self.features``; the text lives under the ``'text_content'`` key.
    """

    def __init__(self, text_content: str, features, **kwargs):
        super().__init__(**kwargs)
        # Merge the wordform features first, so that 'text_content' always
        # ends up holding the matched text even if `features` has that key.
        self.features |= features
        self.features['text_content'] = text_content

    @property
    def text_content(self):
        """Matched text, read back from the feature mapping."""
        return self.features['text_content']

    def __str__(self):
        lemma = self.features.get('main_form')
        if lemma:
            return f'Word<{self.text_content}|{lemma}>'
        return f'Word<{self.text_content}>'


class WordAnalyser(BaseAnalyser):
    """Analyser that adds a `WordEntity` for every known wordform found in the DAG.

    Args:
        wordform_to_features: mapping of a wordform to the list of feature
            dicts of the records matching it. It is looked up with the
            lower-cased matched text.
        start_to_continuations: mapping of the first token of a multi-token
            wordform to the list of its possible token tails. An empty tail
            means the first token alone is tried as a wordform.
        allow_intermediate_types: entity classes that may be stepped over
            between two matched tokens.
    """

    def __init__(self,
                 wordform_to_features,
                 start_to_continuations,
                 allow_intermediate_types: Iterable[type[BaseEntity]] = (ConnectingEntity,)
                ):
        super().__init__(trigger_on_instances=[TextEntity])
        self.wordform_to_features = wordform_to_features
        self.start_to_continuations = start_to_continuations
        # isinstance() needs a tuple of classes
        self.allow_intermediate_types = tuple(allow_intermediate_types)

    def trigger(self, dag_entity: TextEntity):
        """Try to match every wordform that starts with the token `dag_entity`."""
        # A token that starts no multi-token wordform is only tried as a
        # stand-alone wordform (a single empty continuation).
        continuations = self.start_to_continuations.get(dag_entity.text_content.lower(), [[]])

        for next_tokens in continuations:
            if len(next_tokens) == 0:  # Last entity of sequence matched
                self.embed_result(matched_dag_entities=[dag_entity])
                continue
            self.match_sequence(dag_entity, next_tokens)

    def match_sequence(self, dag_entity: BaseEntity, next_tokens: Sequence[str], matched=None):
        """Follow the edges after `dag_entity` looking for the chain `next_tokens`.

        Every path that spells out all of `next_tokens` (stepping over
        entities of the allowed intermediate types) is passed to
        `embed_result`.

        Args:
            dag_entity: entity whose `next_entities` are examined.
            next_tokens: tokens still to be matched, compared against the
                lower-cased text of the candidate entities.
            matched: entities on the path so far; defaults to `[dag_entity]`.
                The list passed in is never mutated.
        """
        if matched is None:
            matched = [dag_entity]
        for candidate in dag_entity.next_entities:
            if isinstance(candidate, TextEntity) and candidate.text_content.lower() == next_tokens[0]:
                # Every branch gets its own copy of the path: appending to a
                # shared list would leak entities of sibling or failed
                # branches into later matches.
                path = [*matched, candidate]
                if len(next_tokens) == 1:
                    self.embed_result(path)
                else:
                    self.match_sequence(dag_entity=candidate, next_tokens=next_tokens[1:], matched=path)
            elif isinstance(candidate, self.allow_intermediate_types):  # Pass allowed entity
                self.match_sequence(dag_entity=candidate, next_tokens=next_tokens,
                                    matched=[*matched, candidate])

    def embed_result(self, matched_dag_entities: Sequence[BaseEntity]):
        """Add one `WordEntity` parallel to the matched path per matching record.

        The text of the path is the concatenation of the contents of its
        `TextEntity` members. Nothing is added when the lower-cased text is
        not a key of `wordform_to_features`.
        """
        text = ''.join(i.text_content for i in matched_dag_entities if isinstance(i, TextEntity))
        possible_features = self.wordform_to_features.get(text.lower(), [])
        for features in possible_features:
            new_entity = WordEntity(text_content=text, features=features)
            # Predecessors of the first matched entity also lead to the new entity...
            for i in matched_dag_entities[0].previous_entities:
                i.add_next(new_entity)
            # ...and the new entity leads to the successors of the last one.
            for i in matched_dag_entities[-1].next_entities:
                new_entity.add_next(i)
            for i in matched_dag_entities:
                i.part_of.append(new_entity)

    def to_json(self, filepath: str, **json_kwargs):
        """Save both lookup tables to `filepath` as JSON.

        `allow_intermediate_types` is not saved. Extra keyword arguments are
        forwarded to `json.dump`.
        """
        data = {
            'wordform_to_features': self.wordform_to_features,
            'start_to_continuations': self.start_to_continuations,
        }
        # Explicit UTF-8 so the file does not depend on the platform's locale
        # encoding, also when the caller passes `ensure_ascii=False`.
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, **json_kwargs)

    @classmethod
    def from_json(cls, filepath: str,
                  allow_intermediate_types: Iterable[type[BaseEntity]] = (ConnectingEntity,)):
        """Create an analyser from a file written by `to_json`.

        Args:
            filepath: path of the JSON file.
            allow_intermediate_types: entity classes that may be stepped over
                between matched tokens (they are not part of the file).
        """
        with open(filepath, encoding='utf-8') as f:
            data = json.load(f)
        return cls(**data, allow_intermediate_types=allow_intermediate_types)

In [14]:
# Build the analyser from the two lookup tables prepared above
word_analyser = WordAnalyser(wordform_to_features=wf_to_matches,
                             start_to_continuations=start_to_continuations)

In [15]:
# WordAnalyser may be saved locally as a JSON file
# and loaded later without repeating the wordform postprocessing

word_analyser.to_json('data/word_analyser.json', indent=0)

In [16]:
# Allowed intermediate entity types are not saved in the JSON file,
# so they are passed explicitly here (the default is `(ConnectingEntity,)`)
word_analyser = WordAnalyser.from_json('data/word_analyser.json',
                                       allow_intermediate_types=(ConnectingEntity,))

In [17]:
# Build a DAG from the tokens of a sample sentence and print it before any analysis
input_text = 'Программа Microsoft Imagine Academy для КПИ им. Игоря Сикорского...'
tokens = text_to_tokens(input_text)
dag = EntitiesDAG(tokens)
dag.pprint()

•Программа• •Microsoft• •Imagine• •Academy• •для• •КПИ• •им•.• •Игоря• •Сикорско
--------------------------------------------------------------------------------
го•.•.•.•


In [18]:
# Run the spacing, punctuation and word analysers over the same DAG,
# then print it again to see the entities they added
spacing_analyser.analyse(dag)
punctuation_analyser.analyse(dag)
word_analyser.analyse(dag)

dag.pprint()

•Программа• •Microsoft• •Imagine• •Academy• •для• •КПИ• •им•.• •Игоря• •Сикорско
                                                                                
                                                                                
                                                                                
                                                                      ␣         
                                                              ␣ Word<Игоря|игорь
                                                       ␣    Punct<.>
                                                 ␣ Word<КПИ|кпити>
                                             Word<для|для>
                                             Word<для|дляти>
                                             Word<для|длить>
           ␣           ␣         ␣         ␣ Word<для|для>
 Word<Программа|программа>
--------------------------------------------------------------------------------
го•.•.•.•
       Punct<.>
    

In [19]:
# Same steps (tokenize, build the DAG, run the three analysers, print)
# applied to another sample sentence
input_text = 'Дідусь, той що атестував, посміхнувся й спитав:'
tokens = text_to_tokens(input_text)
dag = EntitiesDAG(tokens)

spacing_analyser.analyse(dag)
punctuation_analyser.analyse(dag)
word_analyser.analyse(dag)

dag.pprint()

•Дідусь•,• •той• •що• •атестував•,• •посміхнувся• •й• •спитав•:•
                                                              Punct<:>
                                                     ␣ Word<спитав|спитати>
                                                   Word<й|й>
                                                 ␣ Word<й|й>
                                   ␣ Word<посміхнувся|посміхнутися>
                                 Punct<,>
                     ␣ Word<атестував|атестувати>
                  Word<що|що>
                  Word<що|що>
                ␣ Word<що|що>
            Word<той|той>
            Word<той|тоя>
          ␣ Word<той|той>
        Punct<,>
 Word<Дідусь|дідусь>


# NormalizeAnalyzer

Some symbols in the text can be written in multiple ways: notably, there are several different dash symbols, the apostrophe can be written with different characters, etc.

Let's normalize some of them as part of the analysis pipeline

In [20]:
# Run only the word analyser on a text whose apostrophe is written as ’
input_text = 'Всі вони будуть безпосередньо пов’язані з'
tokens = text_to_tokens(input_text)
dag = EntitiesDAG(tokens)
word_analyser.analyse(dag)
dag.pprint()

•Всі• •вони• •будуть• •безпосередньо• •пов•’•язані• •з•
                                                     Word<з|з>
                       Word<безпосередньо|безпосередньо>
              Word<будуть|бути>
       Word<вони|вон>
       Word<вони|вони>
       Word<вони|вонь>
 Word<Всі|ввесь>
 Word<Всі|весь>


Currently the word "пов’язані" is not recognized, even though it is present in the dictionary. The reason is the difference in how the apostrophe is written.

In [21]:
# Symbol found in the text (key) -> plain symbol it is normalized to (value)
normalization = {
    '’': "'",  # right single quotation mark -> apostrophe
    '–': '-',  # en dash -> hyphen-minus
    '—': '-',  # em dash -> hyphen-minus
}

def normalization_factory(matched_entities: list[BaseEntity]):
    """Build the `TextEntity` holding the normalized form of the matched text.

    The matched text is the concatenation of the `text_content` features of
    `matched_entities`; it is kept in the `normalized_from` feature of the
    returned entity.
    """
    original_text = ''.join(
        entity.features.get('text_content', '') for entity in matched_entities
    )
    normalized_entity = TextEntity(text_content=normalization[original_text])
    normalized_entity.features['normalized_from'] = original_text
    return normalized_entity

# One single-entity sequence per symbol that has a normalized form
valid_normalization_sequences = [[TextEntity(symbol)] for symbol in normalization]

normalize_analyser = MatchAnalyser(
    valid_entity_sequences=valid_normalization_sequences,
    match_entity_factory=normalization_factory,
    trigger_on_instances=[TextEntity],
)

In [22]:
# Same text, but the normalization analyser runs before the word analyser
input_text = 'Всі вони будуть безпосередньо пов’язані з'
tokens = text_to_tokens(input_text)
dag = EntitiesDAG(tokens)
normalize_analyser.analyse(dag)
word_analyser.analyse(dag)
dag.pprint()

•Всі• •вони• •будуть• •безпосередньо• •пов•’•язані• •з•
                                           '         Word<з|з>
                                       Word<пов'язані|пов'язаний>
                       Word<безпосередньо|безпосередньо>
              Word<будуть|бути>
       Word<вони|вон>
       Word<вони|вони>
       Word<вони|вонь>
 Word<Всі|ввесь>
 Word<Всі|весь>


Here `normalize_analyser` created a new `TextEntity` with the normalized apostrophe, so now there is a path "пов->'->язані" in the DAG (as well as the old one, "пов->’->язані", with the typographic apostrophe), and the word is recognized by the same `word_analyser`.