# Package installation and loading

In [1]:
!pip install simpletransformers
!pip install pyjarowinkler
!pip install Levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.7-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 5.1 MB/s 
[?25hCollecting transformers>=4.6.0
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 64.4 MB/s 
Collecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 50.7 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 65.0 MB/s 
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 69.6 MB/s 
Collecting wandb>=0.10.32
  Downloading wandb-0.12.18-py2.py3-none-any.whl 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyjarowinkler
  Downloading pyjarowinkler-1.8-py2.py3-none-any.whl (5.9 kB)
Installing collected packages: pyjarowinkler
Successfully installed pyjarowinkler-1.8
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Levenshtein
  Downloading Levenshtein-0.18.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (258 kB)
[K     |████████████████████████████████| 258 kB 5.1 MB/s 
[?25hCollecting rapidfuzz<3.0.0,>=2.0.1
  Downloading rapidfuzz-2.0.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 65.6 MB/s 
[?25hCollecting jarowinkler<1.1.0,>=1.0.2
  Downloading jarowinkler-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (103 kB)
[K     |████████████████████████████████| 103 kB 46.2 MB/s 
[?25hInstalling collected packages: jarowi

In [2]:
from enum import Enum
import logging
import pandas as pd
from simpletransformers.seq2seq import Seq2SeqModel
import torch
import Levenshtein
from pyjarowinkler import distance as jw
import numpy as np
from itertools import cycle

# Supporting functions

In [3]:
def load_conllu_dataset(datafile):
    arr = []
    with open(datafile, encoding='utf-8') as inp:
        strings = inp.readlines()
    for s in strings:
      if (s[0] != "#" and s.strip()):
          split_string = s.split('\t')
          arr.append([split_string[1] + " " + split_string[3]+ " " + split_string[5], split_string[2]])    
    return pd.DataFrame(arr, columns=["input_text", "target_text"])

In [4]:
def dameraulevenshtein(s1, s2):
    d = {}
    lenstr1 = len(s1)
    lenstr2 = len(s2)
    for i in range(-1,lenstr1+1):
        d[(i,-1)] = i+1
    for j in range(-1,lenstr2+1):
        d[(-1,j)] = j+1

    for i in range(lenstr1):
        for j in range(lenstr2):
            if s1[i] == s2[j]:
                cost = 0
            else:
                cost = 1
            d[(i,j)] = min(
                           d[(i-1,j)] + 1, # deletion
                           d[(i,j-1)] + 1, # insertion
                           d[(i-1,j-1)] + cost, # substitution
                          )
            if i and j and s1[i]==s2[j-1] and s1[i-1] == s2[j]:
                d[(i,j)] = min (d[(i,j)], d[i-2,j-2] + cost) # transposition
    return d[lenstr1-1,lenstr2-1] 

In [5]:
def prepare_metrics(given_list, metrics):
    raw_list = []    
    for i in given_list:
        raw_list.append(i[2])
    Q1, Q3 = np.percentile(raw_list, [25,75])
    IQR=Q3-Q1
    minimum=Q1-1.5*IQR
    maximum=Q3+1.5*IQR
    result_list = []
    errors_data = pd.DataFrame(columns=['TRUE', 'PRED', 'METRICS', 'RESULT'])
    counter = 0
    for i in given_list:
        if (i[2] > minimum and i[2] < maximum):
            result_list.append(i[2])
        elif (i[2] < minimum or i[2] > maximum):
            errors_data.loc[counter] = [i[0], i[1], metrics, i[2]]
            counter = counter + 1
    return raw_list, result_list, errors_data

 # Model loading

In [6]:
MODEL_NAME = "/content/drive/MyDrive/middle_russian_models/leg_2_batch_64_transformed_changed_size_1"

In [7]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)


model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name=MODEL_NAME,
    use_cuda = torch.cuda.is_available()
)

# Model evaluation

In [8]:
EVAL_NAME = "/content/drive/MyDrive/middle_russian_models/transformed_bezobrazov.conllu"

In [9]:
eval_df = load_conllu_dataset(EVAL_NAME)

In [10]:
predictions = model.predict(eval_df["input_text"].tolist())

Generating outputs:   0%|          | 0/63272 [00:00<?, ?it/s]

In [11]:
predictions[1:10]

['177-й',
 'годъ',
 'генварь',
 'въ',
 '23',
 'день',
 'привезти',
 'изъ',
 'бѣлевский']

In [12]:
print('Accuracy score: ' + str((sum([1 if label == pred else 0 for pred, label in zip(predictions, eval_df['target_text'].tolist())])/len(predictions))*100) + '%')

Accuracy score: 92.63792166740753%


In [13]:
error_datasets = []

In [14]:
levenshteins = [(label, pred, Levenshtein.distance(label, pred)) for pred, label in zip(predictions, eval_df['target_text'].tolist())]       

In [15]:
raw_levenshteins, cleared_levenshteins, errors_levenshteins = prepare_metrics(levenshteins, 'Levenshtein distance')
print('Raw average Levenshtein distance: ' + str(sum(raw_levenshteins)/len(raw_levenshteins)))
print('Normalized average Levenshtein distance: ' + (str(sum(cleared_levenshteins)/len(cleared_levenshteins)) if len(cleared_levenshteins) > 0 else str(sum(raw_levenshteins)/len(raw_levenshteins))))
error_datasets.append(errors_levenshteins) 

Raw average Levenshtein distance: 0.10885365733195042
Normalized average Levenshtein distance: 0.10885365733195042


In [16]:
damerau_levenshteins = [(label, pred, dameraulevenshtein(label, pred)) for pred, label in zip(predictions, eval_df['target_text'].tolist())]

In [17]:
raw_damerau_levenshteins, cleared_damerau_levenshteins, errors_damerau_levenshteins = prepare_metrics(damerau_levenshteins, 'Damerau-Levenshtein distance')
print('Raw average Damerau-Levenshtein distance: ' + str(sum(raw_damerau_levenshteins)/len(raw_damerau_levenshteins)))
print('Normalized average Damerau-Levenshtein distance: ' + (str(sum(cleared_damerau_levenshteins)/len(cleared_damerau_levenshteins)) if len(cleared_damerau_levenshteins) > 0 else str(sum(raw_damerau_levenshteins)/len(raw_damerau_levenshteins))))
error_datasets.append(errors_damerau_levenshteins)

Raw average Damerau-Levenshtein distance: 0.10882995011606658
Normalized average Damerau-Levenshtein distance: 0.10882995011606658


In [18]:
jaro_winklers = [(label, pred, jw.get_jaro_distance(label, pred)) for pred, label in zip(predictions, eval_df['target_text'].tolist())]

In [19]:
raw_jaro_winklers, cleared_jaro_winklers, errors_jaro_winklers = prepare_metrics(jaro_winklers, 'Jaro-Winkler distance')
print('Raw average Jaro-Winkler distance: ' + str(sum(raw_jaro_winklers)/len(raw_jaro_winklers)))
print('Normalized average Jaro-Winkler distance: ' + (str(sum(cleared_jaro_winklers)/len(cleared_jaro_winklers)) if len(cleared_jaro_winklers) > 0 else str(sum(raw_jaro_winklers)/len(raw_jaro_winklers))))
error_datasets.append(errors_jaro_winklers)

Raw average Jaro-Winkler distance: 0.988179661184422
Normalized average Jaro-Winkler distance: 0.988179661184422


In [20]:
errors = pd.concat(error_datasets)

In [21]:
errors.to_csv('/content/drive/MyDrive/middle_russian_models/errors_' + (MODEL_NAME.split('/')[len(MODEL_NAME.split('/')) - 1] if '/' in MODEL_NAME else MODEL_NAME) + '.csv', index = False, encoding='utf-8')

In [22]:
predictions = cycle(predictions)
with open(EVAL_NAME, encoding='utf-8') as inp:
    strings = inp.readlines()
    predicted = []
    for s in strings:
      if (s[0] != "#" and s.strip()):
          split_string = s.split('\t')
          split_string[2] = next(predictions)
          joined_string = '\t'.join(split_string)
          predicted.append(joined_string)
          continue
      predicted.append(s)      
    with open("/content/drive/MyDrive/middle_russian_models/predictions_bezobrazov_norm.conllu", 'w', encoding='utf-8') as out:
      out.write('\n'.join(predicted))

# Prediction

In [None]:
DATA_PRED_NAME = "/content/drive/MyDrive/middle_russian_models/all_bezobrazov.conllu"

In [None]:
pred_data = load_conllu_dataset(DATA_PRED_NAME)["input_text"].tolist()

In [None]:
predictions = cycle(model.predict(pred_data))

In [None]:
with open(DATA_PRED_NAME, encoding='utf-8') as inp:
    strings = inp.readlines()
    predicted = []
    for s in strings:
      if (s[0] != "#" and s.strip()):
          split_string = s.split('\t')
          split_string[2] = next(predictions)
          joined_string = '\t'.join(split_string)
          predicted.append(joined_string)
          continue
      predicted.append(s)      
    with open("/content/drive/MyDrive/middle_russian_models/predictions_bezobrazov.conllu", 'w', encoding='utf-8') as out:
      out.write('\n'.join(predicted))