## Импорт библиотек

In [5]:
%%time
import gzip
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
%matplotlib inline
from scipy.sparse import csr_matrix, vstack
from IPython.display import clear_output
from ipywidgets.widgets import interact, interact_manual
import os
import gc
from PIL import Image
from multiprocessing.pool import Pool
from functools import partial
import sentencepiece as spm
import json
import gensim
from gensim.test.utils import datapath
import logging
from sklearn.decomposition import TruncatedSVD
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

CPU times: user 1.09 ms, sys: 0 ns, total: 1.09 ms
Wall time: 1.11 ms


## Вывод всех файлов с последовательностями

In [2]:
!ls files/*seq*

files/seqs_city_name.gz		     files/seqs_date_sorting_val.gz
files/seqs_cpe_manufacturer_name.gz  files/seqs_price.gz
files/seqs_cpe_model_name.gz	     files/seqs_region_name.gz
files/seqs_cpe_model_os_type.gz      files/seqs_request_cnt.gz
files/seqs_cpe_type_cd.gz	     files/seqs_url_host.gz


## Загрузка последовательностей ссылок

In [2]:
XX = []
c = 0
with gzip.open('files/seqs_url_host.gz', 'rt', encoding='utf-8') as f:
    for line in tqdm(f):
        XX.append(list(map(int, line.strip().split(' '))))
        c += 1
len(XX)

0it [00:00, ?it/s]

415317

## Загрузка последовательностей дат

In [3]:
TT = []
c = 0
with gzip.open('files/seqs_date_sorting_val.gz', 'rt', encoding='utf-8') as f:
    for line in tqdm(f):
        TT.append(list(map(int, line.strip().split(' '))))
        c += 1
len(TT)

0it [00:00, ?it/s]

415317

## Статистики : mean-max-std кол-во дублей в последовательностях

In [12]:
stats = []
for tt in tqdm(TT):
    tmp = np.unique(tt, return_counts=True)[-1].copy()
    stats.append((tmp.max(), tmp.mean(), tmp.std()))
np.mean(stats, axis=0), np.max(stats, axis=0)

(array([27.77666939,  7.61395986,  5.74789045]),
 array([437.        ,  99.5       ,  69.12409523]))

## Подсчёт частоты ссылок в последовательностях

In [4]:
idx_cnt_map = {}
for xx in tqdm(XX):
    for x in xx:
        idx_cnt_map[x] = idx_cnt_map.get(x, 0) + 1
len(idx_cnt_map)

  0%|          | 0/415317 [00:00<?, ?it/s]

199683

## Оставляем элементы последовательностей с большой частотой

In [6]:
XX_ = []
TT_ = []
pbar = tqdm(total=len(XX))
for xx, tt in zip(XX, TT):
    keep_mask = [idx_cnt_map.get(xxx, 0) >= 50 for xxx in xx]
    XX_.append([xxx for xxx, remain in zip(xx, keep_mask) if remain])
    TT_.append([ttt for ttt, remain in zip(tt, keep_mask) if remain])
    pbar.update(1)
del XX, TT
gc.collect()
max(map(len, XX_))

  0%|          | 0/415317 [00:00<?, ?it/s]

27971

## Статистики уменьшились

In [7]:
stats = []
for tt in tqdm(TT_):
    if not len(tt):
        stats.append((0, 0, 0))
        continue
    tmp = np.unique(tt, return_counts=True)[-1].copy()
    stats.append((tmp.max(), tmp.mean(), tmp.std()))
np.mean(stats, axis=0), np.max(stats, axis=0)

  0%|          | 0/415317 [00:00<?, ?it/s]

(array([27.53051765,  7.58641791,  5.70882081]),
 array([392.        ,  86.        ,  58.85378115]))

## Считаем частоты встречаемости пар ссылок в наших данных

In [9]:
interaction_cnt_map = dict()
pbar = tqdm(total=len(XX_))
for xx, tt in zip(XX_, TT_):
    if not len(xx):
        #II.append([])
        pbar.update(1)
        continue
    tmp = np.array(tt)
    stoppers = list(np.where(tmp[:-1] != tmp[1:])[0] + 1)
    stoppers = [0] + stoppers + [len(tt)]
    xx = np.array(xx)
    segments = []

    for left, right in zip(stoppers[:-1], stoppers[1:]):
        segments.append(set(list(xx[left:right])))

    for left_segment, right_segment in zip(segments[:-1], segments[1:]):
        for i in left_segment:
            for j in right_segment:
                # тут сортируем индексы ссылок, чтобы уменьшить разнообразие
                key = tuple(sorted([i, j]))
                interaction_cnt_map[key] = interaction_cnt_map.get(key, 0) + 1
    pbar.update(1)

  0%|          | 0/415317 [00:00<?, ?it/s]

## Сколько всего получилось пар ссылок

In [10]:
len(interaction_cnt_map)

11803853

## Сохраняем промежуточный результат на всякий случай

In [11]:
%%time
with gzip.open('url_bigrams_keys.pickle.gz', 'wb') as f:
    pickle.dump(interaction_cnt_map, f, protocol=-1)

CPU times: user 2min 43s, sys: 3.69 s, total: 2min 47s
Wall time: 2min 48s


## Оставляем только частотные пары ссылок

In [12]:
interaction_whitelist = set([k for k, v in interaction_cnt_map.items() if v>=80])
interaction_whitelist = {k: i for i, k in enumerate(interaction_whitelist)}
len(interaction_whitelist)

653561

## Считаем биграммы на декартовых произведениях соседних временных множест

In [25]:
output_folder = 'auxilary/bigrams'
os.makedirs(output_folder, exist_ok=True)
II_rows = []
II_cols = []
II_data = []
row_offset = 0
CHUNK_SIZE = 50_000
pbar = tqdm(total=len(XX_))
for row_id, (xx, tt) in enumerate(zip(XX_, TT_)):
    if not len(xx):
        pbar.update(1)
        continue
    tmp = np.array(tt)
    stoppers = list(np.where(tmp[:-1] != tmp[1:])[0] + 1)
    stoppers = [0] + stoppers + [len(tt)]
    xx = np.array(xx)
    segments = []

    for left, right in zip(stoppers[:-1], stoppers[1:]):
        segments.append(set(list(xx[left:right])))

    ii_cols = []
    for left_segment, right_segment in zip(segments[:-1], segments[1:]):
        for i in left_segment:
            for j in right_segment:
                key = tuple(sorted([i, j]))
                if key not in interaction_whitelist: continue
                ii_cols.append(interaction_whitelist[key])
    II_cols.extend(ii_cols)
    II_rows.extend([row_id - row_offset]*len(ii_cols))
    II_data.extend([1]*len(ii_cols))
    pbar.update(1)
    if (row_id % CHUNK_SIZE == 0) and row_id:
        tmp_dat = csr_matrix((II_data, (II_rows, II_cols)),
                             shape=(II_rows[-1] + 1,
                                    len(interaction_whitelist)),
                             dtype=np.uint32)
        with gzip.open(os.path.join(output_folder, '%06d.pickle.gz'%row_id), 'wb') as f:
            pickle.dump(tmp_dat, f, protocol=-1)
        row_offset = row_id + 1
        II_rows = []
        II_cols = []
        II_data = []
if len(II_rows):
    tmp_dat = csr_matrix((II_data, (II_rows, II_cols)),
                         shape=(II_rows[-1] + 1,
                                len(interaction_whitelist)),
                         dtype=np.uint16)
    with gzip.open(os.path.join(output_folder, '%d.pickle.gz'%row_id), 'wb') as f:
        pickle.dump(tmp_dat, f, protocol=-1)
len(II_data)

  0%|          | 0/415317 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

131444619

## Склеиваем чанки в одну sparse матрицу

In [94]:
%%time
sparse_dats = []
for file in tqdm(sorted(os.listdir(output_folder))):
    if 'ipynb_checkpoints' in file:
        continue
    print(file)
    with gzip.open(os.path.join(output_folder, file), 'rb') as f:
        sparse_dats.append(pickle.load(f))
sparse_dats[2] = \
csr_matrix((sparse_dats[2].data,
            sparse_dats[2][1:].nonzero()),
           shape=(50000, sparse_dats[2].shape[1]))
sparse_dats[-1] = \
csr_matrix((sparse_dats[-1].data,
            sparse_dats[-1][1:].nonzero()),
           shape=(sparse_dats[-1].shape[0], sparse_dats[-1].shape[1]))

  0%|          | 0/10 [00:00<?, ?it/s]

050000.pickle.gz
100000.pickle.gz
150000.pickle.gz
200000.pickle.gz
250000.pickle.gz
300000.pickle.gz
350000.pickle.gz
400000.pickle.gz
415316.pickle.gz
CPU times: user 27.9 s, sys: 8.73 s, total: 36.6 s
Wall time: 36.7 s


In [95]:
sparse_data = vstack(sparse_dats[:1] +
                     [(sd[1:] if sd[0].data.size==0 else sd) for sd in sparse_dats[1:]])

In [96]:
sparse_data.shape

(415317, 653561)

## Проверяем на случайном поднаборе последовательностей, что биграмы упорядочены правильно

In [97]:
checks = 0
for row_id, (xx, tt) in tqdm(enumerate(zip(XX_, TT_))):
    if not len(xx):
        print(row_id)
        continue
    if np.random.random() < 0.98: continue
    tmp = np.array(tt)
    stoppers = list(np.where(tmp[:-1] != tmp[1:])[0] + 1)
    stoppers = [0] + stoppers + [len(tt)]
    xx = np.array(xx)
    segments = []

    for left, right in zip(stoppers[:-1], stoppers[1:]):
        segments.append(set(list(xx[left:right])))

    ii_cols = []
    for left_segment, right_segment in zip(segments[:-1], segments[1:]):
        for i in left_segment:
            for j in right_segment:
                key = tuple(sorted([i, j]))
                if key not in interaction_whitelist: continue
                ii_cols.append(interaction_whitelist[key])
    assert set(ii_cols) == set(list(sparse_data[row_id].nonzero()[-1]))
    checks += 1
checks

0it [00:00, ?it/s]

307633
357254


8382

## Сохраняем биграмы как разряженную матрицу

In [98]:
%%time
with gzip.open('auxilary/bigrams_sparse.pickle.gz', 'wb') as f:
    pickle.dump(sparse_data, f, protocol=-1)

CPU times: user 49min 30s, sys: 1min 7s, total: 50min 38s
Wall time: 50min 45s


## До этого фильтровали по кол-ву элементов, а сейчас по числу уникальных юзеров

In [100]:
feats_mask = (np.array((sparse_data>0).sum(axis=0)).flatten() > 40)
feats_mask.sum()

580437

## Понижаем размерность с 580к до 512 признаков

In [101]:
%%time
sub_data = sparse_data[:, feats_mask]
sub_data = csr_matrix(
    (sub_data.data**0.5, sub_data.nonzero()), shape=sub_data.shape, dtype=np.float32
)
latents = \
TruncatedSVD(n_components=512, random_state=42
            ).fit_transform(sub_data)

CPU times: user 43min 48s, sys: 3min 20s, total: 47min 9s
Wall time: 37min 52s


## Сохраняем

In [102]:
%%time
with gzip.open('auxilary/bigrams_dense.pickle.gz', 'wb') as f:
    pickle.dump(latents, f, protocol=-1)

CPU times: user 39 s, sys: 2.44 s, total: 41.4 s
Wall time: 41.5 s
