## Импорт библиотек

In [4]:
%%time
import gzip
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
%matplotlib inline
from IPython.display import clear_output
from ipywidgets.widgets import interact, interact_manual
import os
from PIL import Image
from multiprocessing.pool import Pool
from functools import partial
import sentencepiece as spm
import json
import gensim
from gensim.test.utils import datapath
import logging
import time
# Configure the root logger: INFO level, each record formatted as
# "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

CPU times: user 3.41 s, sys: 5.07 s, total: 8.48 s
Wall time: 838 ms




## Вывод всех сгенерированных файлов с последовательностями

In [5]:
# Shell out to list every entry under files/ whose name matches *seq*.
!ls files/*seq*

files/seqs_city_name.gz		     files/seqs_date_sorting_val.gz
files/seqs_cpe_manufacturer_name.gz  files/seqs_price.gz
files/seqs_cpe_model_name.gz	     files/seqs_region_name.gz
files/seqs_cpe_model_os_type.gz      files/seqs_request_cnt.gz
files/seqs_cpe_type_cd.gz	     files/seqs_url_host.gz


## Пример считывания одного файла

In [5]:
# Read one gzip-compressed sequence file as UTF-8 text; every line is stripped
# and split on single spaces into a list of tokens appended to XX.
XX = []
c = 0  # after the loop: number of lines read (stays 0 for an empty file)
with gzip.open('files/seqs_cpe_model_name.gz', 'rt', encoding='utf-8') as f:
    for c, line in enumerate(tqdm(f), start=1):
        XX.append(line.strip().split(' '))
len(XX)

0it [00:00, ?it/s]

415317

## Выставляем тэги документа для каждого юзера

In [6]:
# One single-element tag list per entry of XX: the entry's position, as a string.
tags = [[label] for label in map(str, range(len(XX)))]

## Учим Doc2Vec с разными размерами окна и размерностями эмбеддингов

In [7]:
%%time
class DocCorpus:
    """Re-iterable corpus of ``TaggedDocument`` objects for Doc2Vec.

    Every call to ``__iter__`` starts a fresh pass that pairs the i-th tag
    list with the i-th token list and yields them as a
    ``gensim.models.doc2vec.TaggedDocument``. Pairing uses ``zip``, so the
    pass stops at the shorter of the two inputs.

    Parameters
    ----------
    fold : int, optional
        Only echoed in the two progress messages printed by the constructor.
    documents : iterable of list of str, optional
        Token lists, one per document. When omitted, the notebook-level
        ``XX`` is looked up each time the corpus is iterated.
    doc_tags : iterable of list of str, optional
        Tag lists aligned with ``documents``. When omitted, the
        notebook-level ``tags`` is looked up each time the corpus is iterated.
    """

    def __init__(self, fold=1, documents=None, doc_tags=None):
        print('Start initializing %d'%fold)
        # None means "use the notebook globals"; they are resolved lazily in
        # __iter__ so that re-running the loading cells is picked up.
        self.documents = documents
        self.doc_tags = doc_tags
        print('Finished initializing %d'%fold)

    def __iter__(self):
        """Yield one ``TaggedDocument(tokens, tag_ids)`` per (tags, tokens) pair."""
        token_lists = XX if self.documents is None else self.documents
        tag_lists = tags if self.doc_tags is None else self.doc_tags
        for tag_ids, tokens in zip(tag_lists, token_lists):
            yield gensim.models.doc2vec.TaggedDocument(tokens,
                                                       tag_ids)

def train_doc2vec(args):
    """Train one Doc2Vec model for a (window, vector size) pair and save it.

    Parameters
    ----------
    args : sequence of two ints
        ``(WSIZE, EMBSIZE)``; forwarded to ``Doc2Vec`` as ``window`` and
        ``vector_size`` and embedded in the output directory name.

    Returns
    -------
    None
        The trained model is written to ``<model_path>/model``. If that file
        already exists the call does nothing.
    """
    WSIZE, EMBSIZE = args
    # NOTE(review): the directory name says ``thr=5`` while training uses
    # min_count=20 -- confirm which is intended. The name is left unchanged so
    # previously saved models are still found.
    model_path = 'word2vec_models/cpemodname_d2v_tokens_w=%d_emb=%d_thr=5'%(WSIZE, EMBSIZE)
    model_file = '%s/model'%(model_path)
    # Look for the saved model file rather than its directory: a directory
    # left behind without a model file must not be mistaken for a finished run.
    if os.path.exists(model_file):
        return
    documents = DocCorpus()
    doc2vec_model = gensim.models.doc2vec.Doc2Vec(documents,
                                                  workers=4,
                                                  window=WSIZE,
                                                  vector_size=EMBSIZE,
                                                  min_count=20,
                                                  epochs=10)
    os.makedirs(model_path, exist_ok=True)
    doc2vec_model.save(model_file)

# Hyper-parameter candidates; each pair is unpacked by train_doc2vec as
# (WSIZE, EMBSIZE).
candidates = [3, 128], [5, 64], [7, 256]
args = []
for candidate in candidates:
    args.append(list(candidate))

# Stack the pairs into a 2-D array, reorder its rows with a random
# permutation, and turn the result back into a list of rows.
args = np.array(args)
args = list(args[np.random.permutation(len(args))].copy())

# Train the candidates one after another, pausing one second between runs.
for arguments in tqdm(args):
    train_doc2vec(arguments)
    time.sleep(1)

  0%|          | 0/3 [00:00<?, ?it/s]

2023-03-04 16:14:44,781 : INFO : collecting all words and their counts
2023-03-04 16:14:44,782 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags


Start initializing 1
Finished initializing 1


2023-03-04 16:14:45,703 : INFO : PROGRESS: at example #10000, processed 7792919 words (8458121/s), 404 word types, 10000 tags
2023-03-04 16:14:46,632 : INFO : PROGRESS: at example #20000, processed 15646370 words (8465818/s), 453 word types, 20000 tags
2023-03-04 16:14:52,387 : INFO : PROGRESS: at example #30000, processed 23496756 words (1364243/s), 486 word types, 30000 tags
2023-03-04 16:14:53,358 : INFO : PROGRESS: at example #40000, processed 31484172 words (8231941/s), 509 word types, 40000 tags
2023-03-04 16:14:54,294 : INFO : PROGRESS: at example #50000, processed 39289387 words (8351158/s), 529 word types, 50000 tags
2023-03-04 16:14:55,227 : INFO : PROGRESS: at example #60000, processed 47052059 words (8332395/s), 542 word types, 60000 tags
2023-03-04 16:14:56,181 : INFO : PROGRESS: at example #70000, processed 54897738 words (8235048/s), 551 word types, 70000 tags
2023-03-04 16:14:57,134 : INFO : PROGRESS: at example #80000, processed 62672255 words (8158945/s), 563 word typ

2023-03-04 16:17:09,508 : INFO : EPOCH 1 - PROGRESS: at 19.91% examples, 1786167 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:17:10,508 : INFO : EPOCH 1 - PROGRESS: at 21.02% examples, 1786699 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:17:11,512 : INFO : EPOCH 1 - PROGRESS: at 22.13% examples, 1786864 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:17:12,515 : INFO : EPOCH 1 - PROGRESS: at 23.29% examples, 1790265 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:17:13,516 : INFO : EPOCH 1 - PROGRESS: at 24.40% examples, 1789795 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:17:14,516 : INFO : EPOCH 1 - PROGRESS: at 25.59% examples, 1792069 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:17:15,519 : INFO : EPOCH 1 - PROGRESS: at 26.77% examples, 1793040 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:17:16,520 : INFO : EPOCH 1 - PROGRESS: at 27.94% examples, 1796001 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:17:17,524 : INFO : EPOCH 1 - PROGRESS: at 29.06% examples, 1800064 words/s

2023-03-04 16:18:21,625 : INFO : EPOCH 1 - PROGRESS: at 99.11% examples, 1767095 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:18:22,434 : INFO : worker thread finished; awaiting finish of 3 more threads
2023-03-04 16:18:22,438 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-03-04 16:18:22,439 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-03-04 16:18:22,443 : INFO : worker thread finished; awaiting finish of 0 more threads
2023-03-04 16:18:22,443 : INFO : EPOCH - 1 : training on 322899435 raw words (160737820 effective words) took 91.0s, 1766794 effective words/s
2023-03-04 16:18:23,447 : INFO : EPOCH 2 - PROGRESS: at 1.02% examples, 1643254 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:18:24,447 : INFO : EPOCH 2 - PROGRESS: at 2.10% examples, 1702702 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:18:25,447 : INFO : EPOCH 2 - PROGRESS: at 3.19% examples, 1722016 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:18:26,449 : INFO : EPOCH 2

2023-03-04 16:19:30,595 : INFO : EPOCH 2 - PROGRESS: at 74.33% examples, 1758177 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:19:31,597 : INFO : EPOCH 2 - PROGRESS: at 75.44% examples, 1757250 words/s, in_qsize 8, out_qsize 0
2023-03-04 16:19:32,600 : INFO : EPOCH 2 - PROGRESS: at 76.54% examples, 1756653 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:19:33,601 : INFO : EPOCH 2 - PROGRESS: at 77.61% examples, 1757157 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:19:34,601 : INFO : EPOCH 2 - PROGRESS: at 78.73% examples, 1757150 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:19:35,605 : INFO : EPOCH 2 - PROGRESS: at 79.83% examples, 1756234 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:19:36,606 : INFO : EPOCH 2 - PROGRESS: at 80.95% examples, 1756476 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:19:37,607 : INFO : EPOCH 2 - PROGRESS: at 82.01% examples, 1756398 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:19:38,608 : INFO : EPOCH 2 - PROGRESS: at 83.09% examples, 1757104 words/s

2023-03-04 16:20:39,055 : INFO : EPOCH 3 - PROGRESS: at 48.65% examples, 1738432 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:20:40,056 : INFO : EPOCH 3 - PROGRESS: at 49.72% examples, 1737659 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:20:41,057 : INFO : EPOCH 3 - PROGRESS: at 50.75% examples, 1737297 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:20:42,057 : INFO : EPOCH 3 - PROGRESS: at 51.79% examples, 1736686 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:20:43,058 : INFO : EPOCH 3 - PROGRESS: at 52.85% examples, 1737381 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:20:44,059 : INFO : EPOCH 3 - PROGRESS: at 53.94% examples, 1737879 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:20:45,061 : INFO : EPOCH 3 - PROGRESS: at 55.00% examples, 1737418 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:20:46,063 : INFO : EPOCH 3 - PROGRESS: at 56.06% examples, 1736869 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:20:47,064 : INFO : EPOCH 3 - PROGRESS: at 57.14% examples, 1736673 words/s

2023-03-04 16:21:47,786 : INFO : EPOCH 4 - PROGRESS: at 24.25% examples, 1779299 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:21:48,786 : INFO : EPOCH 4 - PROGRESS: at 25.40% examples, 1778304 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:21:49,787 : INFO : EPOCH 4 - PROGRESS: at 26.53% examples, 1777618 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:21:50,788 : INFO : EPOCH 4 - PROGRESS: at 27.69% examples, 1779539 words/s, in_qsize 8, out_qsize 0
2023-03-04 16:21:51,790 : INFO : EPOCH 4 - PROGRESS: at 28.78% examples, 1782716 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:21:52,793 : INFO : EPOCH 4 - PROGRESS: at 29.90% examples, 1782206 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:21:53,796 : INFO : EPOCH 4 - PROGRESS: at 31.01% examples, 1783320 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:21:54,797 : INFO : EPOCH 4 - PROGRESS: at 32.12% examples, 1784814 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:21:55,799 : INFO : EPOCH 4 - PROGRESS: at 33.29% examples, 1787837 words/s

2023-03-04 16:22:56,608 : INFO : EPOCH 5 - PROGRESS: at 2.16% examples, 1736712 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:22:57,610 : INFO : EPOCH 5 - PROGRESS: at 3.20% examples, 1732798 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:22:58,612 : INFO : EPOCH 5 - PROGRESS: at 4.29% examples, 1749251 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:22:59,615 : INFO : EPOCH 5 - PROGRESS: at 5.38% examples, 1752429 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:23:00,615 : INFO : EPOCH 5 - PROGRESS: at 6.48% examples, 1746726 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:23:01,618 : INFO : EPOCH 5 - PROGRESS: at 7.58% examples, 1759173 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:23:02,618 : INFO : EPOCH 5 - PROGRESS: at 8.68% examples, 1760177 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:23:03,619 : INFO : EPOCH 5 - PROGRESS: at 9.80% examples, 1763574 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:23:04,622 : INFO : EPOCH 5 - PROGRESS: at 10.88% examples, 1762075 words/s, in_qsi

2023-03-04 16:24:08,758 : INFO : EPOCH 5 - PROGRESS: at 80.01% examples, 1736452 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:24:09,761 : INFO : EPOCH 5 - PROGRESS: at 81.07% examples, 1735428 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:24:10,763 : INFO : EPOCH 5 - PROGRESS: at 82.12% examples, 1735712 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:24:11,766 : INFO : EPOCH 5 - PROGRESS: at 83.11% examples, 1734769 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:24:12,766 : INFO : EPOCH 5 - PROGRESS: at 84.19% examples, 1734304 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:24:13,766 : INFO : EPOCH 5 - PROGRESS: at 85.24% examples, 1733404 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:24:14,767 : INFO : EPOCH 5 - PROGRESS: at 86.32% examples, 1732611 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:24:15,768 : INFO : EPOCH 5 - PROGRESS: at 87.38% examples, 1732391 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:24:16,770 : INFO : EPOCH 5 - PROGRESS: at 88.42% examples, 1731531 words/s

2023-03-04 16:25:17,828 : INFO : EPOCH 6 - PROGRESS: at 52.27% examples, 1683216 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:25:18,833 : INFO : EPOCH 6 - PROGRESS: at 53.29% examples, 1683546 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:25:19,834 : INFO : EPOCH 6 - PROGRESS: at 54.38% examples, 1683937 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:25:20,834 : INFO : EPOCH 6 - PROGRESS: at 55.41% examples, 1684659 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:25:21,839 : INFO : EPOCH 6 - PROGRESS: at 56.47% examples, 1685006 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:25:22,839 : INFO : EPOCH 6 - PROGRESS: at 57.53% examples, 1685053 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:25:23,842 : INFO : EPOCH 6 - PROGRESS: at 58.54% examples, 1683970 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:25:24,843 : INFO : EPOCH 6 - PROGRESS: at 59.57% examples, 1683779 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:25:25,845 : INFO : EPOCH 6 - PROGRESS: at 60.64% examples, 1682999 words/s

2023-03-04 16:26:26,350 : INFO : EPOCH 7 - PROGRESS: at 23.89% examples, 1677443 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:26:27,351 : INFO : EPOCH 7 - PROGRESS: at 24.97% examples, 1675980 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:26:28,353 : INFO : EPOCH 7 - PROGRESS: at 26.04% examples, 1675973 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:26:29,353 : INFO : EPOCH 7 - PROGRESS: at 27.14% examples, 1677186 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:26:30,356 : INFO : EPOCH 7 - PROGRESS: at 28.20% examples, 1678670 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:26:31,361 : INFO : EPOCH 7 - PROGRESS: at 29.22% examples, 1680114 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:26:32,369 : INFO : EPOCH 7 - PROGRESS: at 30.28% examples, 1679211 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:26:33,370 : INFO : EPOCH 7 - PROGRESS: at 31.32% examples, 1680544 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:26:34,371 : INFO : EPOCH 7 - PROGRESS: at 32.43% examples, 1685606 words/s

2023-03-04 16:27:38,444 : INFO : worker thread finished; awaiting finish of 3 more threads
2023-03-04 16:27:38,445 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-03-04 16:27:38,447 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-03-04 16:27:38,451 : INFO : worker thread finished; awaiting finish of 0 more threads
2023-03-04 16:27:38,451 : INFO : EPOCH - 7 : training on 322899435 raw words (160718381 effective words) took 95.2s, 1689008 effective words/s
2023-03-04 16:27:39,454 : INFO : EPOCH 8 - PROGRESS: at 1.01% examples, 1625322 words/s, in_qsize 8, out_qsize 0
2023-03-04 16:27:40,456 : INFO : EPOCH 8 - PROGRESS: at 2.06% examples, 1669728 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:27:41,461 : INFO : EPOCH 8 - PROGRESS: at 3.09% examples, 1669989 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:27:42,462 : INFO : EPOCH 8 - PROGRESS: at 4.14% examples, 1680223 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:27:43,462 : INFO : EPOCH 8 

2023-03-04 16:28:47,604 : INFO : EPOCH 8 - PROGRESS: at 72.76% examples, 1696572 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:28:48,605 : INFO : EPOCH 8 - PROGRESS: at 73.79% examples, 1696104 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:28:49,609 : INFO : EPOCH 8 - PROGRESS: at 74.84% examples, 1695391 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:28:50,612 : INFO : EPOCH 8 - PROGRESS: at 75.91% examples, 1694818 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:28:51,615 : INFO : EPOCH 8 - PROGRESS: at 76.98% examples, 1694566 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:28:52,618 : INFO : EPOCH 8 - PROGRESS: at 78.05% examples, 1694458 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:28:53,620 : INFO : EPOCH 8 - PROGRESS: at 79.10% examples, 1694585 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:28:54,622 : INFO : EPOCH 8 - PROGRESS: at 80.19% examples, 1694721 words/s, in_qsize 7, out_qsize 1
2023-03-04 16:28:55,623 : INFO : EPOCH 8 - PROGRESS: at 81.27% examples, 1694842 words/s

2023-03-04 16:29:56,399 : INFO : EPOCH 9 - PROGRESS: at 45.95% examples, 1718010 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:29:57,400 : INFO : EPOCH 9 - PROGRESS: at 47.01% examples, 1717397 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:29:58,400 : INFO : EPOCH 9 - PROGRESS: at 48.07% examples, 1717697 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:29:59,401 : INFO : EPOCH 9 - PROGRESS: at 49.14% examples, 1717051 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:30:00,406 : INFO : EPOCH 9 - PROGRESS: at 50.22% examples, 1717780 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:30:01,410 : INFO : EPOCH 9 - PROGRESS: at 51.27% examples, 1718668 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:30:02,412 : INFO : EPOCH 9 - PROGRESS: at 52.32% examples, 1718282 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:30:03,413 : INFO : EPOCH 9 - PROGRESS: at 53.39% examples, 1719320 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:30:04,416 : INFO : EPOCH 9 - PROGRESS: at 54.52% examples, 1720712 words/s

2023-03-04 16:31:04,782 : INFO : EPOCH 10 - PROGRESS: at 17.69% examples, 1681410 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:31:05,783 : INFO : EPOCH 10 - PROGRESS: at 18.69% examples, 1680461 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:31:06,783 : INFO : EPOCH 10 - PROGRESS: at 19.73% examples, 1677818 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:31:07,788 : INFO : EPOCH 10 - PROGRESS: at 20.76% examples, 1678124 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:31:08,789 : INFO : EPOCH 10 - PROGRESS: at 21.81% examples, 1678910 words/s, in_qsize 8, out_qsize 0
2023-03-04 16:31:09,796 : INFO : EPOCH 10 - PROGRESS: at 22.89% examples, 1679817 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:31:10,797 : INFO : EPOCH 10 - PROGRESS: at 23.93% examples, 1682104 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:31:11,800 : INFO : EPOCH 10 - PROGRESS: at 25.02% examples, 1680399 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:31:12,802 : INFO : EPOCH 10 - PROGRESS: at 26.05% examples, 167797

2023-03-04 16:32:16,915 : INFO : EPOCH 10 - PROGRESS: at 94.22% examples, 1698538 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:32:17,918 : INFO : EPOCH 10 - PROGRESS: at 95.29% examples, 1698511 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:32:18,920 : INFO : EPOCH 10 - PROGRESS: at 96.36% examples, 1699163 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:32:19,921 : INFO : EPOCH 10 - PROGRESS: at 97.36% examples, 1698197 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:32:20,924 : INFO : EPOCH 10 - PROGRESS: at 98.41% examples, 1697702 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:32:21,924 : INFO : EPOCH 10 - PROGRESS: at 99.47% examples, 1698029 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:32:22,397 : INFO : worker thread finished; awaiting finish of 3 more threads
2023-03-04 16:32:22,398 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-03-04 16:32:22,400 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-03-04 16:32:22,405 : INFO : work

Start initializing 1
Finished initializing 1


2023-03-04 16:32:25,401 : INFO : PROGRESS: at example #10000, processed 7792919 words (9112886/s), 404 word types, 10000 tags
2023-03-04 16:32:26,272 : INFO : PROGRESS: at example #20000, processed 15646370 words (9027018/s), 453 word types, 20000 tags
2023-03-04 16:32:27,130 : INFO : PROGRESS: at example #30000, processed 23496756 words (9155731/s), 486 word types, 30000 tags
2023-03-04 16:32:28,001 : INFO : PROGRESS: at example #40000, processed 31484172 words (9186996/s), 509 word types, 40000 tags
2023-03-04 16:32:28,834 : INFO : PROGRESS: at example #50000, processed 39289387 words (9384522/s), 529 word types, 50000 tags
2023-03-04 16:32:29,662 : INFO : PROGRESS: at example #60000, processed 47052059 words (9382614/s), 542 word types, 60000 tags
2023-03-04 16:32:30,511 : INFO : PROGRESS: at example #70000, processed 54897738 words (9245445/s), 551 word types, 70000 tags
2023-03-04 16:32:31,346 : INFO : PROGRESS: at example #80000, processed 62672255 words (9325588/s), 563 word typ

2023-03-04 16:34:58,776 : INFO : EPOCH 1 - PROGRESS: at 16.45% examples, 1478160 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:34:59,781 : INFO : EPOCH 1 - PROGRESS: at 17.36% examples, 1477975 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:35:00,785 : INFO : EPOCH 1 - PROGRESS: at 18.29% examples, 1479213 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:35:01,787 : INFO : EPOCH 1 - PROGRESS: at 19.25% examples, 1482751 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:35:02,791 : INFO : EPOCH 1 - PROGRESS: at 20.23% examples, 1484342 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:35:03,793 : INFO : EPOCH 1 - PROGRESS: at 21.15% examples, 1485092 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:35:04,793 : INFO : EPOCH 1 - PROGRESS: at 22.09% examples, 1486420 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:35:05,795 : INFO : EPOCH 1 - PROGRESS: at 23.06% examples, 1489059 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:35:06,798 : INFO : EPOCH 1 - PROGRESS: at 23.98% examples, 1490908 words/s

2023-03-04 16:36:10,960 : INFO : EPOCH 1 - PROGRESS: at 84.16% examples, 1501986 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:36:11,961 : INFO : EPOCH 1 - PROGRESS: at 85.12% examples, 1502070 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:36:12,967 : INFO : EPOCH 1 - PROGRESS: at 86.08% examples, 1501898 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:36:13,968 : INFO : EPOCH 1 - PROGRESS: at 87.02% examples, 1502244 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:36:14,970 : INFO : EPOCH 1 - PROGRESS: at 87.98% examples, 1502225 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:36:15,976 : INFO : EPOCH 1 - PROGRESS: at 88.89% examples, 1501835 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:36:16,978 : INFO : EPOCH 1 - PROGRESS: at 89.81% examples, 1501401 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:36:17,979 : INFO : EPOCH 1 - PROGRESS: at 90.77% examples, 1501619 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:36:18,980 : INFO : EPOCH 1 - PROGRESS: at 91.72% examples, 1500981 words/s

2023-03-04 16:37:20,023 : INFO : EPOCH 2 - PROGRESS: at 47.63% examples, 1472047 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:37:21,023 : INFO : EPOCH 2 - PROGRESS: at 48.48% examples, 1469941 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:37:22,024 : INFO : EPOCH 2 - PROGRESS: at 49.39% examples, 1469004 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:37:23,026 : INFO : EPOCH 2 - PROGRESS: at 50.28% examples, 1469171 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:37:24,031 : INFO : EPOCH 2 - PROGRESS: at 51.04% examples, 1465734 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:37:25,031 : INFO : EPOCH 2 - PROGRESS: at 51.92% examples, 1465130 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:37:26,036 : INFO : EPOCH 2 - PROGRESS: at 52.82% examples, 1465489 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:37:27,037 : INFO : EPOCH 2 - PROGRESS: at 53.71% examples, 1465540 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:37:28,038 : INFO : EPOCH 2 - PROGRESS: at 54.59% examples, 1464065 words/s

2023-03-04 16:38:28,302 : INFO : EPOCH 3 - PROGRESS: at 7.87% examples, 1420748 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:38:29,305 : INFO : EPOCH 3 - PROGRESS: at 8.71% examples, 1412982 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:38:30,310 : INFO : EPOCH 3 - PROGRESS: at 9.51% examples, 1404035 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:38:31,312 : INFO : EPOCH 3 - PROGRESS: at 10.40% examples, 1402037 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:38:32,314 : INFO : EPOCH 3 - PROGRESS: at 11.24% examples, 1402306 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:38:33,317 : INFO : EPOCH 3 - PROGRESS: at 12.14% examples, 1402870 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:38:34,326 : INFO : EPOCH 3 - PROGRESS: at 13.02% examples, 1405310 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:38:35,328 : INFO : EPOCH 3 - PROGRESS: at 13.88% examples, 1403442 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:38:36,333 : INFO : EPOCH 3 - PROGRESS: at 14.77% examples, 1404709 words/s, i

2023-03-04 16:39:40,473 : INFO : EPOCH 3 - PROGRESS: at 71.22% examples, 1415287 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:39:41,478 : INFO : EPOCH 3 - PROGRESS: at 72.15% examples, 1415462 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:39:42,482 : INFO : EPOCH 3 - PROGRESS: at 73.05% examples, 1415713 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:39:43,489 : INFO : EPOCH 3 - PROGRESS: at 73.91% examples, 1415325 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:39:44,495 : INFO : EPOCH 3 - PROGRESS: at 74.83% examples, 1415551 words/s, in_qsize 8, out_qsize 0
2023-03-04 16:39:45,496 : INFO : EPOCH 3 - PROGRESS: at 75.77% examples, 1415827 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:39:46,500 : INFO : EPOCH 3 - PROGRESS: at 76.70% examples, 1415901 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:39:47,502 : INFO : EPOCH 3 - PROGRESS: at 77.53% examples, 1415808 words/s, in_qsize 8, out_qsize 0
2023-03-04 16:39:48,504 : INFO : EPOCH 3 - PROGRESS: at 78.47% examples, 1416112 words/s

2023-03-04 16:40:49,014 : INFO : EPOCH 4 - PROGRESS: at 33.10% examples, 1480487 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:40:50,014 : INFO : EPOCH 4 - PROGRESS: at 34.04% examples, 1481386 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:40:51,016 : INFO : EPOCH 4 - PROGRESS: at 34.98% examples, 1482096 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:40:52,017 : INFO : EPOCH 4 - PROGRESS: at 35.93% examples, 1482911 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:40:53,017 : INFO : EPOCH 4 - PROGRESS: at 36.88% examples, 1483787 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:40:54,019 : INFO : EPOCH 4 - PROGRESS: at 37.83% examples, 1484325 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:40:55,020 : INFO : EPOCH 4 - PROGRESS: at 38.77% examples, 1484776 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:40:56,023 : INFO : EPOCH 4 - PROGRESS: at 39.69% examples, 1483138 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:40:57,026 : INFO : EPOCH 4 - PROGRESS: at 40.62% examples, 1482931 words/s

2023-03-04 16:42:01,183 : INFO : EPOCH 4 - PROGRESS: at 98.40% examples, 1461055 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:42:02,186 : INFO : EPOCH 4 - PROGRESS: at 99.31% examples, 1461012 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:42:02,996 : INFO : worker thread finished; awaiting finish of 3 more threads
2023-03-04 16:42:03,002 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-03-04 16:42:03,003 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-03-04 16:42:03,007 : INFO : worker thread finished; awaiting finish of 0 more threads
2023-03-04 16:42:03,008 : INFO : EPOCH - 4 : training on 322899435 raw words (160739446 effective words) took 110.1s, 1460195 effective words/s
2023-03-04 16:42:04,014 : INFO : EPOCH 5 - PROGRESS: at 0.83% examples, 1329984 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:42:05,015 : INFO : EPOCH 5 - PROGRESS: at 1.70% examples, 1390673 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:42:06,019 : INFO : EPOCH

2023-03-04 16:43:10,204 : INFO : EPOCH 5 - PROGRESS: at 59.27% examples, 1423478 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:43:11,209 : INFO : EPOCH 5 - PROGRESS: at 60.17% examples, 1423123 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:43:12,215 : INFO : EPOCH 5 - PROGRESS: at 61.08% examples, 1422986 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:43:13,216 : INFO : EPOCH 5 - PROGRESS: at 61.92% examples, 1422613 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:43:14,216 : INFO : EPOCH 5 - PROGRESS: at 62.81% examples, 1422653 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:43:15,219 : INFO : EPOCH 5 - PROGRESS: at 63.70% examples, 1422837 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:43:16,222 : INFO : EPOCH 5 - PROGRESS: at 64.56% examples, 1422762 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:43:17,225 : INFO : EPOCH 5 - PROGRESS: at 65.45% examples, 1422675 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:43:18,229 : INFO : EPOCH 5 - PROGRESS: at 66.31% examples, 1422445 words/s

2023-03-04 16:44:18,548 : INFO : EPOCH 6 - PROGRESS: at 19.23% examples, 1414234 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:44:19,550 : INFO : EPOCH 6 - PROGRESS: at 20.13% examples, 1412756 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:44:20,554 : INFO : EPOCH 6 - PROGRESS: at 21.00% examples, 1412801 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:44:21,555 : INFO : EPOCH 6 - PROGRESS: at 21.89% examples, 1413238 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:44:22,556 : INFO : EPOCH 6 - PROGRESS: at 22.78% examples, 1413778 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:44:23,561 : INFO : EPOCH 6 - PROGRESS: at 23.71% examples, 1417191 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:44:24,562 : INFO : EPOCH 6 - PROGRESS: at 24.66% examples, 1419001 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:44:25,568 : INFO : EPOCH 6 - PROGRESS: at 25.55% examples, 1418331 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:44:26,568 : INFO : EPOCH 6 - PROGRESS: at 26.39% examples, 1414742 words/s

2023-03-04 16:45:30,685 : INFO : EPOCH 6 - PROGRESS: at 86.67% examples, 1481022 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:45:31,686 : INFO : EPOCH 6 - PROGRESS: at 87.58% examples, 1480756 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:45:32,687 : INFO : EPOCH 6 - PROGRESS: at 88.53% examples, 1481247 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:45:33,691 : INFO : EPOCH 6 - PROGRESS: at 89.42% examples, 1480433 words/s, in_qsize 8, out_qsize 0
2023-03-04 16:45:34,696 : INFO : EPOCH 6 - PROGRESS: at 90.33% examples, 1480084 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:45:35,699 : INFO : EPOCH 6 - PROGRESS: at 91.32% examples, 1480159 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:45:36,701 : INFO : EPOCH 6 - PROGRESS: at 92.26% examples, 1480288 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:45:37,706 : INFO : EPOCH 6 - PROGRESS: at 93.22% examples, 1480477 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:45:38,710 : INFO : EPOCH 6 - PROGRESS: at 94.14% examples, 1480524 words/s

2023-03-04 16:46:39,216 : INFO : EPOCH 7 - PROGRESS: at 49.86% examples, 1483696 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:46:40,216 : INFO : EPOCH 7 - PROGRESS: at 50.79% examples, 1484492 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:46:41,216 : INFO : EPOCH 7 - PROGRESS: at 51.72% examples, 1485160 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:46:42,217 : INFO : EPOCH 7 - PROGRESS: at 52.65% examples, 1486152 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:46:43,222 : INFO : EPOCH 7 - PROGRESS: at 53.55% examples, 1486723 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:46:44,222 : INFO : EPOCH 7 - PROGRESS: at 54.45% examples, 1485414 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:46:45,224 : INFO : EPOCH 7 - PROGRESS: at 55.40% examples, 1486284 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:46:46,227 : INFO : EPOCH 7 - PROGRESS: at 56.32% examples, 1486544 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:46:47,228 : INFO : EPOCH 7 - PROGRESS: at 57.27% examples, 1486449 words/s

2023-03-04 16:47:47,703 : INFO : EPOCH 8 - PROGRESS: at 12.24% examples, 1413592 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:47:48,703 : INFO : EPOCH 8 - PROGRESS: at 13.07% examples, 1411063 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:47:49,705 : INFO : EPOCH 8 - PROGRESS: at 13.96% examples, 1411790 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:47:50,709 : INFO : EPOCH 8 - PROGRESS: at 14.85% examples, 1412811 words/s, in_qsize 8, out_qsize 0
2023-03-04 16:47:51,711 : INFO : EPOCH 8 - PROGRESS: at 15.71% examples, 1411674 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:47:52,712 : INFO : EPOCH 8 - PROGRESS: at 16.60% examples, 1414424 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:47:53,713 : INFO : EPOCH 8 - PROGRESS: at 17.52% examples, 1416012 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:47:54,717 : INFO : EPOCH 8 - PROGRESS: at 18.40% examples, 1416594 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:47:55,719 : INFO : EPOCH 8 - PROGRESS: at 19.28% examples, 1416722 words/s

2023-03-04 16:48:59,891 : INFO : EPOCH 8 - PROGRESS: at 76.33% examples, 1425492 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:49:00,891 : INFO : EPOCH 8 - PROGRESS: at 77.20% examples, 1425405 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:49:01,896 : INFO : EPOCH 8 - PROGRESS: at 78.11% examples, 1425359 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:49:02,897 : INFO : EPOCH 8 - PROGRESS: at 78.99% examples, 1425491 words/s, in_qsize 8, out_qsize 0
2023-03-04 16:49:03,901 : INFO : EPOCH 8 - PROGRESS: at 79.90% examples, 1425151 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:49:04,901 : INFO : EPOCH 8 - PROGRESS: at 80.82% examples, 1425285 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:49:05,904 : INFO : EPOCH 8 - PROGRESS: at 81.70% examples, 1425533 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:49:06,906 : INFO : EPOCH 8 - PROGRESS: at 82.58% examples, 1425768 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:49:07,907 : INFO : EPOCH 8 - PROGRESS: at 83.40% examples, 1425491 words/s

2023-03-04 16:50:08,583 : INFO : EPOCH 9 - PROGRESS: at 37.55% examples, 1438910 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:50:09,585 : INFO : EPOCH 9 - PROGRESS: at 38.50% examples, 1440626 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:50:10,587 : INFO : EPOCH 9 - PROGRESS: at 39.50% examples, 1442192 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:50:11,588 : INFO : EPOCH 9 - PROGRESS: at 40.46% examples, 1443843 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:50:12,592 : INFO : EPOCH 9 - PROGRESS: at 41.41% examples, 1445723 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:50:13,593 : INFO : EPOCH 9 - PROGRESS: at 42.35% examples, 1447890 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:50:14,599 : INFO : EPOCH 9 - PROGRESS: at 43.31% examples, 1449372 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:50:15,601 : INFO : EPOCH 9 - PROGRESS: at 44.26% examples, 1450789 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:50:16,602 : INFO : EPOCH 9 - PROGRESS: at 45.21% examples, 1452551 words/s

2023-03-04 16:51:17,573 : INFO : EPOCH - 9 : training on 322899435 raw words (160743933 effective words) took 111.1s, 1446880 effective words/s
2023-03-04 16:51:18,576 : INFO : EPOCH 10 - PROGRESS: at 0.85% examples, 1351606 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:51:19,578 : INFO : EPOCH 10 - PROGRESS: at 1.73% examples, 1419323 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:51:20,580 : INFO : EPOCH 10 - PROGRESS: at 2.66% examples, 1419044 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:51:21,583 : INFO : EPOCH 10 - PROGRESS: at 3.49% examples, 1411077 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:51:22,587 : INFO : EPOCH 10 - PROGRESS: at 4.36% examples, 1420079 words/s, in_qsize 8, out_qsize 0
2023-03-04 16:51:23,589 : INFO : EPOCH 10 - PROGRESS: at 5.23% examples, 1421540 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:51:24,589 : INFO : EPOCH 10 - PROGRESS: at 6.11% examples, 1412214 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:51:25,592 : INFO : EPOCH 10 - PROGRESS: at 7

2023-03-04 16:53:51,127 : INFO : PROGRESS: at example #330000, processed 256630218 words (11219357/s), 599 word types, 330000 tags
2023-03-04 16:53:51,811 : INFO : PROGRESS: at example #340000, processed 264314448 words (11260955/s), 599 word types, 340000 tags
2023-03-04 16:53:52,540 : INFO : PROGRESS: at example #350000, processed 272207371 words (10834316/s), 599 word types, 350000 tags
2023-03-04 16:53:53,223 : INFO : PROGRESS: at example #360000, processed 279845387 words (11194132/s), 599 word types, 360000 tags
2023-03-04 16:53:53,912 : INFO : PROGRESS: at example #370000, processed 287590149 words (11259473/s), 599 word types, 370000 tags
2023-03-04 16:53:54,604 : INFO : PROGRESS: at example #380000, processed 295365889 words (11247748/s), 599 word types, 380000 tags
2023-03-04 16:53:55,295 : INFO : PROGRESS: at example #390000, processed 303167760 words (11301725/s), 599 word types, 390000 tags
2023-03-04 16:53:55,995 : INFO : PROGRESS: at example #400000, processed 311002724 

2023-03-04 16:56:06,062 : INFO : EPOCH 1 - PROGRESS: at 60.51% examples, 1803572 words/s, in_qsize 8, out_qsize 0
2023-03-04 16:56:07,064 : INFO : EPOCH 1 - PROGRESS: at 61.63% examples, 1803364 words/s, in_qsize 8, out_qsize 0
2023-03-04 16:56:08,069 : INFO : EPOCH 1 - PROGRESS: at 62.75% examples, 1803594 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:56:09,072 : INFO : EPOCH 1 - PROGRESS: at 63.89% examples, 1804068 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:56:10,076 : INFO : EPOCH 1 - PROGRESS: at 65.01% examples, 1805264 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:56:11,077 : INFO : EPOCH 1 - PROGRESS: at 66.12% examples, 1804842 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:56:12,079 : INFO : EPOCH 1 - PROGRESS: at 67.25% examples, 1804493 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:56:13,082 : INFO : EPOCH 1 - PROGRESS: at 68.37% examples, 1804228 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:56:14,084 : INFO : EPOCH 1 - PROGRESS: at 69.52% examples, 1803852 words/s

2023-03-04 16:57:15,168 : INFO : EPOCH 2 - PROGRESS: at 38.44% examples, 1820345 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:57:16,169 : INFO : EPOCH 2 - PROGRESS: at 39.61% examples, 1819481 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:57:17,171 : INFO : EPOCH 2 - PROGRESS: at 40.73% examples, 1818953 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:57:18,175 : INFO : EPOCH 2 - PROGRESS: at 41.89% examples, 1821071 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:57:19,179 : INFO : EPOCH 2 - PROGRESS: at 43.00% examples, 1819754 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:57:20,180 : INFO : EPOCH 2 - PROGRESS: at 44.17% examples, 1820038 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:57:21,181 : INFO : EPOCH 2 - PROGRESS: at 45.32% examples, 1821063 words/s, in_qsize 6, out_qsize 1
2023-03-04 16:57:22,186 : INFO : EPOCH 2 - PROGRESS: at 46.45% examples, 1821396 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:57:23,190 : INFO : EPOCH 2 - PROGRESS: at 47.55% examples, 1820008 words/s

2023-03-04 16:58:23,416 : INFO : EPOCH 3 - PROGRESS: at 15.68% examples, 1814955 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:58:24,421 : INFO : EPOCH 3 - PROGRESS: at 16.84% examples, 1817773 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:58:25,423 : INFO : EPOCH 3 - PROGRESS: at 18.00% examples, 1819079 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:58:26,427 : INFO : EPOCH 3 - PROGRESS: at 19.13% examples, 1821104 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:58:27,427 : INFO : EPOCH 3 - PROGRESS: at 20.27% examples, 1820826 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:58:28,429 : INFO : EPOCH 3 - PROGRESS: at 21.33% examples, 1814862 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:58:29,429 : INFO : EPOCH 3 - PROGRESS: at 22.50% examples, 1816522 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:58:30,430 : INFO : EPOCH 3 - PROGRESS: at 23.64% examples, 1818505 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:58:31,431 : INFO : EPOCH 3 - PROGRESS: at 24.75% examples, 1813873 words/s

2023-03-04 16:59:35,582 : INFO : EPOCH 3 - PROGRESS: at 97.99% examples, 1827469 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:59:36,583 : INFO : EPOCH 3 - PROGRESS: at 99.14% examples, 1827843 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:59:37,329 : INFO : worker thread finished; awaiting finish of 3 more threads
2023-03-04 16:59:37,331 : INFO : worker thread finished; awaiting finish of 2 more threads
2023-03-04 16:59:37,332 : INFO : worker thread finished; awaiting finish of 1 more threads
2023-03-04 16:59:37,337 : INFO : worker thread finished; awaiting finish of 0 more threads
2023-03-04 16:59:37,337 : INFO : EPOCH - 3 : training on 322899435 raw words (160727305 effective words) took 87.9s, 1827624 effective words/s
2023-03-04 16:59:38,341 : INFO : EPOCH 4 - PROGRESS: at 1.12% examples, 1826443 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:59:39,347 : INFO : EPOCH 4 - PROGRESS: at 2.29% examples, 1833938 words/s, in_qsize 7, out_qsize 0
2023-03-04 16:59:40,348 : INFO : EPOCH 

2023-03-04 17:00:44,473 : INFO : EPOCH 4 - PROGRESS: at 75.03% examples, 1801052 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:00:45,474 : INFO : EPOCH 4 - PROGRESS: at 76.17% examples, 1800404 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:00:46,475 : INFO : EPOCH 4 - PROGRESS: at 77.28% examples, 1800535 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:00:47,478 : INFO : EPOCH 4 - PROGRESS: at 78.38% examples, 1799083 words/s, in_qsize 8, out_qsize 0
2023-03-04 17:00:48,482 : INFO : EPOCH 4 - PROGRESS: at 79.50% examples, 1798338 words/s, in_qsize 8, out_qsize 0
2023-03-04 17:00:49,483 : INFO : EPOCH 4 - PROGRESS: at 80.59% examples, 1797576 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:00:50,485 : INFO : EPOCH 4 - PROGRESS: at 81.66% examples, 1796962 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:00:51,485 : INFO : EPOCH 4 - PROGRESS: at 82.76% examples, 1797321 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:00:52,486 : INFO : EPOCH 4 - PROGRESS: at 83.85% examples, 1796532 words/s

2023-03-04 17:01:53,067 : INFO : EPOCH 5 - PROGRESS: at 51.28% examples, 1793974 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:01:54,070 : INFO : EPOCH 5 - PROGRESS: at 52.41% examples, 1794376 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:01:55,070 : INFO : EPOCH 5 - PROGRESS: at 53.51% examples, 1794864 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:01:56,071 : INFO : EPOCH 5 - PROGRESS: at 54.68% examples, 1796512 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:01:57,072 : INFO : EPOCH 5 - PROGRESS: at 55.84% examples, 1797999 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:01:58,074 : INFO : EPOCH 5 - PROGRESS: at 56.99% examples, 1799089 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:01:59,076 : INFO : EPOCH 5 - PROGRESS: at 58.12% examples, 1799831 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:02:00,083 : INFO : EPOCH 5 - PROGRESS: at 59.24% examples, 1799829 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:02:01,089 : INFO : EPOCH 5 - PROGRESS: at 60.36% examples, 1798727 words/s

2023-03-04 17:03:01,237 : INFO : EPOCH 6 - PROGRESS: at 28.18% examples, 1811849 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:03:02,239 : INFO : EPOCH 6 - PROGRESS: at 29.24% examples, 1811400 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:03:03,241 : INFO : EPOCH 6 - PROGRESS: at 30.38% examples, 1811374 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:03:04,243 : INFO : EPOCH 6 - PROGRESS: at 31.51% examples, 1812810 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:03:05,245 : INFO : EPOCH 6 - PROGRESS: at 32.64% examples, 1814112 words/s, in_qsize 8, out_qsize 0
2023-03-04 17:03:06,245 : INFO : EPOCH 6 - PROGRESS: at 33.79% examples, 1814532 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:03:07,250 : INFO : EPOCH 6 - PROGRESS: at 34.94% examples, 1814676 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:03:08,251 : INFO : EPOCH 6 - PROGRESS: at 36.09% examples, 1815521 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:03:09,253 : INFO : EPOCH 6 - PROGRESS: at 37.23% examples, 1815796 words/s

2023-03-04 17:04:09,718 : INFO : EPOCH 7 - PROGRESS: at 5.33% examples, 1740089 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:04:10,720 : INFO : EPOCH 7 - PROGRESS: at 6.46% examples, 1743232 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:04:11,724 : INFO : EPOCH 7 - PROGRESS: at 7.56% examples, 1754439 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:04:12,729 : INFO : EPOCH 7 - PROGRESS: at 8.66% examples, 1754686 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:04:13,729 : INFO : EPOCH 7 - PROGRESS: at 9.76% examples, 1760803 words/s, in_qsize 8, out_qsize 0
2023-03-04 17:04:14,736 : INFO : EPOCH 7 - PROGRESS: at 10.88% examples, 1762369 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:04:15,738 : INFO : EPOCH 7 - PROGRESS: at 11.95% examples, 1760724 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:04:16,740 : INFO : EPOCH 7 - PROGRESS: at 13.09% examples, 1766824 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:04:17,743 : INFO : EPOCH 7 - PROGRESS: at 14.23% examples, 1773369 words/s, in_

2023-03-04 17:05:21,875 : INFO : EPOCH 7 - PROGRESS: at 86.38% examples, 1801739 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:05:22,876 : INFO : EPOCH 7 - PROGRESS: at 87.47% examples, 1801336 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:05:23,879 : INFO : EPOCH 7 - PROGRESS: at 88.58% examples, 1800884 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:05:24,880 : INFO : EPOCH 7 - PROGRESS: at 89.69% examples, 1800044 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:05:25,881 : INFO : EPOCH 7 - PROGRESS: at 90.79% examples, 1799727 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:05:26,883 : INFO : EPOCH 7 - PROGRESS: at 91.93% examples, 1798956 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:05:27,884 : INFO : EPOCH 7 - PROGRESS: at 93.08% examples, 1799233 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:05:28,887 : INFO : EPOCH 7 - PROGRESS: at 94.20% examples, 1799166 words/s, in_qsize 8, out_qsize 0
2023-03-04 17:05:29,891 : INFO : EPOCH 7 - PROGRESS: at 95.36% examples, 1799193 words/s

2023-03-04 17:06:30,259 : INFO : EPOCH 8 - PROGRESS: at 62.16% examples, 1786685 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:06:31,262 : INFO : EPOCH 8 - PROGRESS: at 63.17% examples, 1784863 words/s, in_qsize 8, out_qsize 0
2023-03-04 17:06:32,264 : INFO : EPOCH 8 - PROGRESS: at 64.23% examples, 1782968 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:06:33,266 : INFO : EPOCH 8 - PROGRESS: at 65.27% examples, 1781023 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:06:34,266 : INFO : EPOCH 8 - PROGRESS: at 66.30% examples, 1779407 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:06:35,268 : INFO : EPOCH 8 - PROGRESS: at 67.34% examples, 1777150 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:06:36,272 : INFO : EPOCH 8 - PROGRESS: at 68.39% examples, 1775714 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:06:37,273 : INFO : EPOCH 8 - PROGRESS: at 69.45% examples, 1773258 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:06:38,277 : INFO : EPOCH 8 - PROGRESS: at 70.47% examples, 1771257 words/s

2023-03-04 17:07:38,755 : INFO : EPOCH 9 - PROGRESS: at 33.44% examples, 1682480 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:07:39,756 : INFO : EPOCH 9 - PROGRESS: at 34.48% examples, 1682613 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:07:40,758 : INFO : EPOCH 9 - PROGRESS: at 35.55% examples, 1683153 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:07:41,760 : INFO : EPOCH 9 - PROGRESS: at 36.61% examples, 1683424 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:07:42,764 : INFO : EPOCH 9 - PROGRESS: at 37.63% examples, 1682052 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:07:43,765 : INFO : EPOCH 9 - PROGRESS: at 38.66% examples, 1680680 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:07:44,772 : INFO : EPOCH 9 - PROGRESS: at 39.73% examples, 1679278 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:07:45,773 : INFO : EPOCH 9 - PROGRESS: at 40.77% examples, 1679703 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:07:46,773 : INFO : EPOCH 9 - PROGRESS: at 41.82% examples, 1679392 words/s

2023-03-04 17:08:47,106 : INFO : EPOCH 10 - PROGRESS: at 4.01% examples, 1631530 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:08:48,106 : INFO : EPOCH 10 - PROGRESS: at 5.04% examples, 1632874 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:08:49,108 : INFO : EPOCH 10 - PROGRESS: at 6.04% examples, 1629428 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:08:50,108 : INFO : EPOCH 10 - PROGRESS: at 7.08% examples, 1638314 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:08:51,109 : INFO : EPOCH 10 - PROGRESS: at 8.11% examples, 1642983 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:08:52,111 : INFO : EPOCH 10 - PROGRESS: at 9.12% examples, 1651124 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:08:53,117 : INFO : EPOCH 10 - PROGRESS: at 10.19% examples, 1648935 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:08:54,119 : INFO : EPOCH 10 - PROGRESS: at 11.23% examples, 1656108 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:08:55,119 : INFO : EPOCH 10 - PROGRESS: at 12.25% examples, 1653300 word

2023-03-04 17:09:59,288 : INFO : EPOCH 10 - PROGRESS: at 78.45% examples, 1657733 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:10:00,290 : INFO : EPOCH 10 - PROGRESS: at 79.48% examples, 1657129 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:10:01,296 : INFO : EPOCH 10 - PROGRESS: at 80.52% examples, 1657073 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:10:02,297 : INFO : EPOCH 10 - PROGRESS: at 81.55% examples, 1657261 words/s, in_qsize 6, out_qsize 1
2023-03-04 17:10:03,298 : INFO : EPOCH 10 - PROGRESS: at 82.58% examples, 1657726 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:10:04,298 : INFO : EPOCH 10 - PROGRESS: at 83.61% examples, 1658369 words/s, in_qsize 6, out_qsize 1
2023-03-04 17:10:05,301 : INFO : EPOCH 10 - PROGRESS: at 84.65% examples, 1658454 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:10:06,305 : INFO : EPOCH 10 - PROGRESS: at 85.74% examples, 1658576 words/s, in_qsize 7, out_qsize 0
2023-03-04 17:10:07,306 : INFO : EPOCH 10 - PROGRESS: at 86.79% examples, 165893

CPU times: user 2h 15min 58s, sys: 6min 47s, total: 2h 22min 45s
Wall time: 55min 37s


## Повторяем проделанное выше для файлов url_host, date и region

## Получившиеся модели

In [11]:
# List the entries under word2vec_models/ whose names contain "128".
!ls word2vec_models/*128*

word2vec_models/cpemodname_d2v_tokens_w=3_emb=128_thr=5:
model  model.docvecs.vectors_docs.npy

word2vec_models/d2v_tokens_w=3_emb=128_thr=5:
model  model.docvecs.vectors_docs.npy

word2vec_models/date_d2v_tokens_w=3_emb=128_thr=5:
model  model.docvecs.vectors_docs.npy

word2vec_models/region_d2v_tokens_w=3_emb=128_thr=5:
model  model.docvecs.vectors_docs.npy


## Загружаем модели и конкатенируем их эмбеддинги документов в один общий эмбеддинг Doc2Vec

In [5]:
# Directories under word2vec_models/ that hold the trained Doc2Vec models.
# The order of this list fixes the column order of the concatenated matrix.
model_names = ['d2v_tokens_w=3_emb=128_thr=5',
               'date_d2v_tokens_w=3_emb=128_thr=5',
               'region_d2v_tokens_w=3_emb=128_thr=5',
               'cpemodname_d2v_tokens_w=3_emb=128_thr=5']

doc_vectors = []
# Wrapping the list gives the progress bar a known total and closes it
# automatically once the loop is exhausted.
pbar = tqdm(model_names)
for model_name in pbar:
    doc2vec = gensim.models.doc2vec.Doc2Vec.load('word2vec_models/%s/model' % model_name)
    # Keep an independent copy of the document-vector array for this model.
    doc_vectors.append(doc2vec.docvecs.vectors_docs.copy())

# Join the per-model arrays along axis 1.
# NOTE(review): assumes every vectors_docs array is (n_docs, emb_size) with the
# same n_docs in the same document order -- np.concatenate raises otherwise.
feats = np.concatenate(doc_vectors, axis=1)

# protocol=-1 selects the highest pickle protocol available to this interpreter.
with gzip.open('doc2vec_feats_128x4.pickle.gz', 'wb') as f:
    pickle.dump(feats, f, protocol=-1)

0it [00:00, ?it/s]

2023-03-04 20:07:15,633 : INFO : loading Doc2Vec object from word2vec_models/d2v_tokens_w=3_emb=128_thr=5/model
2023-03-04 20:07:16,749 : INFO : loading vocabulary recursively from word2vec_models/d2v_tokens_w=3_emb=128_thr=5/model.vocabulary.* with mmap=None
2023-03-04 20:07:16,750 : INFO : loading trainables recursively from word2vec_models/d2v_tokens_w=3_emb=128_thr=5/model.trainables.* with mmap=None
2023-03-04 20:07:16,750 : INFO : loading wv recursively from word2vec_models/d2v_tokens_w=3_emb=128_thr=5/model.wv.* with mmap=None
2023-03-04 20:07:16,751 : INFO : loading docvecs recursively from word2vec_models/d2v_tokens_w=3_emb=128_thr=5/model.docvecs.* with mmap=None
2023-03-04 20:07:16,751 : INFO : loading vectors_docs from word2vec_models/d2v_tokens_w=3_emb=128_thr=5/model.docvecs.vectors_docs.npy with mmap=None
2023-03-04 20:07:16,914 : INFO : loaded word2vec_models/d2v_tokens_w=3_emb=128_thr=5/model
2023-03-04 20:07:17,196 : INFO : loading Doc2Vec object from word2vec_models/

## Проверяем наличие файла

In [6]:
# Confirm the feature file is present in the working directory.
!ls doc2vec_feats_128x4.pickle.gz

doc2vec_feats_128x4.pickle.gz
