In [None]:
from os.path import join as join_path

from tqdm import tqdm as log_progress

# Datasets

In [None]:
# Directory the evaluation datasets are downloaded into by the commented-out
# shell commands below, which interpolate this variable as `{dir}`.
# NOTE(review): the name shadows the Python builtin `dir()`; renaming it
# would also require updating every `{dir}` reference in those commands.
dir = 'data/eval'

# !wget https://rusvectores.org/static/testsets/ru_simlex965_tagged.tsv -P {dir}
# !wget https://rusvectores.org/static/testsets/ru_simlex965.tsv -P {dir}

# !wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/hj.csv -P {dir}
# !wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/rt.csv -P {dir}
# !wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-train.csv -P {dir}
# !wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-test.csv -P {dir}
# !wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/ae2.csv -P {dir}

# !wget https://tlk.s3.yandex.net/dataset/LRWC.zip -P {dir}
# !unzip -p {dir}/LRWC.zip LRWC/lrwc-1.1-aggregated.tsv > {dir}/lrwc.tsv
# !rm {dir}/LRWC.zip

In [None]:
from navec.eval.dataset import (
    Dataset,
    load_pairs as load_pairs_,

    SIMLEX_965, HJ, RT, AE, AE2, LRWC,
    CORR, CLF,
    
    noun_tagged,
    get_pos_analyzer,
    pos_tagged
)


def load_pairs(filename, **kwargs):
    """Read one evaluation file from ``data/eval`` and return its pairs as a list.

    ``filename`` is resolved relative to the ``data/eval`` directory. Any
    extra keyword arguments are forwarded unchanged to the library loader
    imported in this notebook as ``load_pairs_``. The loader's result is
    materialised with ``list`` so it can be iterated more than once.
    """
    return list(load_pairs_(join_path('data', 'eval', filename), **kwargs))


# SimLex-965 (CORR): both pair lists come straight from files; the second
# one is read from the `_tagged` file instead of being tagged here.
simlex965 = Dataset(
    SIMLEX_965,
    CORR,
    load_pairs('ru_simlex965.tsv', delimiter='\t'),
    load_pairs('ru_simlex965_tagged.tsv', delimiter='\t'),
)

# HJ (CORR): second pair list is produced by noun_tagged.
pairs = load_pairs('hj.csv')
hj = Dataset(HJ, CORR, pairs, list(noun_tagged(pairs)))

# RT (CLF): second pair list is produced by noun_tagged.
pairs = load_pairs('rt.csv')
rt = Dataset(RT, CLF, pairs, list(noun_tagged(pairs)))

# AE (CLF): the train and test files are concatenated, and the second pair
# list is produced by pos_tagged with an explicit analyzer.
analyzer = get_pos_analyzer()
pairs = load_pairs('ae-train.csv', column=3) + load_pairs('ae-test.csv')
ae = Dataset(AE, CLF, pairs, list(pos_tagged(pairs, analyzer)))

# AE2 (CLF): second pair list is produced by noun_tagged.
pairs = load_pairs('ae2.csv')
ae2 = Dataset(AE2, CLF, pairs, list(noun_tagged(pairs)))

# LRWC (CLF): tab-separated file, loaded with column=3.
pairs = load_pairs('lrwc.tsv', delimiter='\t', column=3)
lrwc = Dataset(LRWC, CLF, pairs, list(noun_tagged(pairs)))

# Order here is the order passed on to the evaluation and report cells.
datasets = [simlex965, hj, rt, ae, ae2, lrwc]

# Models

## Rusvectores

In [None]:
# dir = 'data/models/rusvectores'
# !wget http://vectors.nlpl.eu/repository/11/180.zip -O {dir}/ruscorpora_upos_cbow_300_20_2019.zip
# !unzip {dir}/ruscorpora_upos_cbow_300_20_2019.zip -d {dir}/ruscorpora_upos_cbow_300_20_2019
# !rm {dir}/ruscorpora_upos_cbow_300_20_2019.zip {dir}/ruscorpora_upos_cbow_300_20_2019/model.txt

# !wget http://vectors.nlpl.eu/repository/11/182.zip -O {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019.zip
# !unzip {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019.zip -d {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019
# !rm {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019.zip {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019/model.txt

# !wget http://vectors.nlpl.eu/repository/11/185.zip -O {dir}/tayga_upos_skipgram_300_2_2019.zip
# !unzip {dir}/tayga_upos_skipgram_300_2_2019.zip -d {dir}/tayga_upos_skipgram_300_2_2019
# !rm {dir}/tayga_upos_skipgram_300_2_2019.zip {dir}/tayga_upos_skipgram_300_2_2019/model.txt

# !wget http://vectors.nlpl.eu/repository/11/187.zip -O {dir}/tayga_none_fasttextcbow_300_10_2019.zip
# !unzip {dir}/tayga_none_fasttextcbow_300_10_2019.zip -d {dir}/tayga_none_fasttextcbow_300_10_2019
# !rm {dir}/tayga_none_fasttextcbow_300_10_2019.zip

# !wget https://rusvectores.org/static/models/rusvectores4/fasttext/araneum_none_fasttextcbow_300_5_2018.tgz -O {dir}/araneum_none_fasttextcbow_300_5_2018.tgz
# !mkdir {dir}/araneum_none_fasttextcbow_300_5_2018
# !tar xzvf {dir}/araneum_none_fasttextcbow_300_5_2018.tgz -C {dir}/araneum_none_fasttextcbow_300_5_2018
# !rm {dir}/araneum_none_fasttextcbow_300_5_2018.tgz

In [None]:
from navec.eval.model import (
    RusvectoresScheme,
    RusvectoresFasttextScheme,
)


def get_path(dir, filename='model.bin'):
    """Return the path of a Rusvectores model file.

    The result is ``data/models/rusvectores/<dir>/<filename>`` joined with
    the platform path separator. ``filename`` defaults to ``model.bin``.
    """
    root = ('data', 'models', 'rusvectores')
    return join_path(*root, dir, filename)


# Schemes built with RusvectoresScheme use get_path's default `model.bin`;
# the RusvectoresFasttextScheme ones name their model file explicitly.
ruscorpora_upos_cbow_300_20_2019 = RusvectoresScheme(
    'ruscorpora_upos_cbow_300_20_2019',
    get_path(dir='ruscorpora_upos_cbow_300_20_2019'),
)
ruwikiruscorpora_upos_skipgram_300_2_2019 = RusvectoresScheme(
    'ruwikiruscorpora_upos_skipgram_300_2_2019',
    get_path(dir='ruwikiruscorpora_upos_skipgram_300_2_2019'),
)
tayga_none_fasttextcbow_300_10_2019 = RusvectoresFasttextScheme(
    'tayga_none_fasttextcbow_300_10_2019',
    get_path(
        dir='tayga_none_fasttextcbow_300_10_2019',
        filename='model.model',
    ),
)
tayga_upos_skipgram_300_2_2019 = RusvectoresScheme(
    'tayga_upos_skipgram_300_2_2019',
    get_path(dir='tayga_upos_skipgram_300_2_2019'),
)
araneum_none_fasttextcbow_300_5_2018 = RusvectoresFasttextScheme(
    'araneum_none_fasttextcbow_300_5_2018',
    get_path(
        dir='araneum_none_fasttextcbow_300_5_2018',
        filename='araneum_none_fasttextcbow_300_5_2018.model',
    ),
)

# List order differs from construction order above: the three
# RusvectoresScheme entries come first, then the two fasttext ones.
rusvectores = [
    ruscorpora_upos_cbow_300_20_2019,
    ruwikiruscorpora_upos_skipgram_300_2_2019,
    tayga_upos_skipgram_300_2_2019,
    tayga_none_fasttextcbow_300_10_2019,
    araneum_none_fasttextcbow_300_5_2018,
]

## Navec

In [None]:
from navec.eval.model import NavecScheme

# Build one NavecScheme per model name; each model is read from
# data/models/navec/<name>.tar.
navecs = []
for name in ('hudlit_12B_500K_300d_100q', 'news_1B_250K_300d_100q'):
    path = join_path('data', 'models', 'navec', '{}.tar'.format(name))
    navec = NavecScheme(name, path)
    navecs += [navec]

# Eval

In [None]:
from navec.eval.metrics import eval_schemes

# Rusvectores schemes first, then Navec ones.
schemes = [*rusvectores, *navecs]
# The result of eval_schemes is consumed through the progress bar and
# stored as a list so later cells can reuse it.
records = list(log_progress(eval_schemes(schemes, datasets)))

In [None]:
from navec.eval.report import report_table, format_report

# Build the report table and format it in a single expression.
table = format_report(
    report_table(records, schemes, datasets),
    datasets,
)
# First the HTML source as plain text, then the rich notebook rendering.
print(table.to_html())
display(table)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>type</th>
      <th>init, s</th>
      <th>get, µs</th>
      <th>disk, mb</th>
      <th>ram, mb</th>
      <th>simlex965, spear</th>
      <th>hj, spear</th>
      <th>rt, prec</th>
      <th>ae, prec</th>
      <th>ae2, prec</th>
      <th>lrwc, prec</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>ruscorpora_upos_cbow_300_20_2019</th>
      <td>w2v</td>
      <td>11.9</td>
      <td>5.1</td>
      <td>220.6</td>
      <td>236.1</td>
      <td>0.359/1.00</td>
      <td>0.685/0.95</td>
      <td>0.852/0.54</td>
      <td>0.758/0.71</td>
      <td>0.896/0.58</td>
      <td>0.602/0.59</td>
    </tr>
    <tr>
      <th>ruwikiruscorpora_upos_skipgram_300_2_2019</th>
      <td>w2v</td>
      <td>19.7</td>
      <td>5.0</td>
      <td>290.0</td>
      <td>309.4</td>
      <td>0.321/1.00</td>
      <td>0.723/0.94</td>
      <td>0.817/0.58</td>
      <td>0.801/0.74</td>
      <td>0.860/0.63</td>
      <td>0.629/0.58</td>
    </tr>
    <tr>
      <th>tayga_upos_skipgram_300_2_2019</th>
      <td>w2v</td>
      <td>20.5</td>
      <td>6.6</td>
      <td>290.7</td>
      <td>310.9</td>
      <td>0.429/0.99</td>
      <td>0.749/0.96</td>
      <td>0.871/0.57</td>
      <td>0.771/0.76</td>
      <td>0.899/0.62</td>
      <td>0.639/0.59</td>
    </tr>
    <tr>
      <th>tayga_none_fasttextcbow_300_10_2019</th>
      <td>fasttext</td>
      <td>3.7</td>
      <td>16.3</td>
      <td>910.6</td>
      <td>909.7</td>
      <td>0.370/1.00</td>
      <td>0.643/1.00</td>
      <td>0.792/1.00</td>
      <td>0.695/1.00</td>
      <td>0.809/1.00</td>
      <td>0.533/1.00</td>
    </tr>
    <tr>
      <th>araneum_none_fasttextcbow_300_5_2018</th>
      <td>fasttext</td>
      <td>5.9</td>
      <td>12.6</td>
      <td>945.3</td>
      <td>926.5</td>
      <td>0.349/1.00</td>
      <td>0.670/1.00</td>
      <td>0.804/1.00</td>
      <td>0.717/1.00</td>
      <td>0.796/1.00</td>
      <td>0.578/1.00</td>
    </tr>
    <tr>
      <th>hudlit_12B_500K_300d_100q</th>
      <td>navec</td>
      <td>1.5</td>
      <td>22.1</td>
      <td>50.6</td>
      <td>95.3</td>
      <td>0.310/0.99</td>
      <td>0.707/0.98</td>
      <td>0.842/0.71</td>
      <td>0.931/0.95</td>
      <td>0.923/0.83</td>
      <td>0.604/0.64</td>
    </tr>
    <tr>
      <th>news_1B_250K_300d_100q</th>
      <td>navec</td>
      <td>0.7</td>
      <td>18.5</td>
      <td>25.4</td>
      <td>47.7</td>
      <td>0.230/0.93</td>
      <td>0.590/0.97</td>
      <td>0.784/0.51</td>
      <td>0.866/0.89</td>
      <td>0.861/0.66</td>
      <td>0.589/0.61</td>
    </tr>
  </tbody>
</table>

In [None]:
from navec.eval.report import format_github_report1, format_github_report2


def fix(html):
    """Return ``html`` with every ``border="1"`` attribute changed to ``border="0"``.

    Text that does not contain ``border="1"`` is returned unchanged.
    """
    bordered, borderless = 'border="1"', 'border="0"'
    # Splitting on the old attribute and re-joining with the new one
    # substitutes every occurrence.
    return borderless.join(html.split(bordered))


# Rebuild the raw report table and derive the two GitHub-formatted variants.
table = report_table(records, schemes, datasets)
table1 = format_github_report1(table)
table2 = format_github_report2(table, datasets)

# For each variant: print its HTML (unescaped, with the border attribute
# rewritten by fix), then show the rich rendering.
for github_table in (table1, table2):
    print(fix(github_table.to_html(escape=False)))
    display(github_table)