## Импортирование библиотек

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
from scipy.sparse import csr_matrix
import gzip
import pickle

## Распаковка набора данных

In [3]:
!unzip archive.zip

Archive:  archive.zip
  inflating: part-00000-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet  
  inflating: part-00001-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet  
  inflating: part-00002-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet  
  inflating: part-00003-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet  
  inflating: part-00004-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet  
  inflating: part-00005-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet  
  inflating: part-00006-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet  
  inflating: part-00007-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet  
  inflating: part-00008-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet  
  inflating: part-00009-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet  
  inflating: public_train.pqt        
  inflating: sample_submit.csv       
  inflating: submit_2.pqt            


In [3]:
!ls

archive.zip
files
part-00000-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet
part-00001-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet
part-00002-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet
part-00003-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet
part-00004-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet
part-00005-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet
part-00006-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet
part-00007-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet
part-00008-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet
part-00009-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet
prepare_data.ipynb
public_train.pqt
sample_submit.csv
submit_2.pqt
tmp.pickle.gz


## Проверка числа уникальных значений в столбцах

In [None]:
# Build, for every column, a dictionary {raw value -> integer index}.
# Indices are assigned in order of first appearance, so the partitions are
# visited in sorted order: os.listdir() returns names in arbitrary order,
# which would make the resulting indices differ from run to run.
colmaps = {}
part_files = sorted(name for name in os.listdir('./') if name.startswith('part'))
for file in tqdm(part_files):
    df = pd.read_parquet(file)
    for col in tqdm(df.columns):
        col_index = colmaps.setdefault(col, {})
        for val in df[col].unique():
            # keep the index of a value that was already seen in an earlier partition
            if val not in col_index:
                col_index[val] = len(col_index)

In [14]:
# Report how many distinct values were collected for each column.
for column_name in colmaps:
    print(column_name, len(colmaps[column_name]))

region_name 81
city_name 985
cpe_manufacturer_name 37
cpe_model_name 599
url_host 199683
cpe_type_cd 4
cpe_model_os_type 3
price 29332
date 396
part_of_day 4
request_cnt 15
user_id 415317


## Сохранение индексов уникальных признаков

In [None]:
# Output directory for every artifact produced below; exist_ok=True makes the
# cell safe to re-run when the directory is already there.
os.makedirs('files', exist_ok=True)

In [23]:
# Write one gzipped TSV per column with two columns:
# '<column>_idx' (0..n-1) and the raw value that received that index.
for column_name, value_index in colmaps.items():
    mapper_frame = pd.DataFrame({
        '%s_idx'%column_name: np.arange(len(value_index)),
        column_name: list(value_index),  # dict keys, in insertion (= index) order
    })
    mapper_frame.to_csv(
        'files/%s_mapper.tsv.gz'%column_name,
        sep='\t',
        index=False,
        encoding='utf-8',
        compression='gzip',
    )

## Переконвертация набора данных в индексы и pickle формат

In [4]:
%%time
for file in tqdm([file for file in os.listdir('./') if file.startswith('part')]):
    df = pd.read_parquet(file)
    df2 = df.copy()
    for col in tqdm(df.columns):
        if col == 'user_id': continue
        map_df = pd.read_csv('files/%s_mapper.tsv.gz'%col, sep='\t', dtype=str)
        df2[col] = df2[col].astype(str)
        df2 = df2.merge(map_df, on=col, how='left').drop(col, axis=1)
    df2 = df2.groupby('user_id').agg(list).reset_index()
    df2.to_pickle('files/%s.pickle.gz'%file.split('.')[0])

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12), HTML(value='')))



CPU times: user 2h 39min 25s, sys: 23min 58s, total: 3h 3min 24s
Wall time: 3h 1min 43s


## Загрузка и объединение сконвертированных файлов в один

In [None]:
%%time
users = None
for file in tqdm([file for file in os.listdir('./') if file.startswith('part')]):
    if users is not None:
        users = pd.concat([users,
                           pd.read_pickle('files/%s.pickle.gz'%file.split('.')[0])]
                         )
    else:
        users = pd.read_pickle('files/%s.pickle.gz'%file.split('.')[0])

  0%|          | 0/10 [00:00<?, ?it/s]

## Сброс индекса после конкатенации

In [None]:
# Every partition frame carries its own 0..n-1 index, so the labels repeat after
# concatenation; rebuild a unique 0..N-1 index so label-based lookups are unambiguous.
users = users.reset_index(drop=True)

## Считывание нужных мапок

In [None]:
# The order of the day parts stored in files/part_of_day_mapper.tsv.gz was
# initially misunderstood as morning-day-evening-night;
# the correct order is actually night-morning-day-evening,
# so the remapping is hard-coded here.
part_of_day_remap = [1, 2, 3, 0] 
# Original request_cnt values as a list, in the row order of the mapper file
# (position i is used below as the lookup for index i).
request_cnt_remap = pd.read_csv('files/request_cnt_mapper.tsv.gz', sep='\t').request_cnt.tolist()

## Вывод набора данных

In [6]:
# Show the combined frame: one row per user, every *_idx column holds a list of indices.
users

Unnamed: 0,user_id,region_name_idx,city_name_idx,cpe_manufacturer_name_idx,cpe_model_name_idx,url_host_idx,cpe_type_cd_idx,cpe_model_os_type_idx,price_idx,date_idx,part_of_day_idx,request_cnt_idx
0,4,"[0, 0, 0, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 0,...","[210, 210, 7, 210, 210, 210, 309, 210, 210, 21...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[222, 222, 222, 222, 222, 222, 222, 222, 222, ...","[3146, 20, 2, 24, 43, 18, 9, 4, 100, 4, 54, 13...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8...","[127, 127, 118, 134, 134, 221, 145, 127, 221, ...","[0, 2, 1, 2, 0, 1, 1, 2, 0, 0, 2, 0, 2, 1, 2, ...","[0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 1, 2, 0, 0, 0, ..."
1,16,"[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...","[427, 427, 427, 427, 427, 427, 427, 427, 427, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 3...","[2, 2, 125, 4, 136, 301, 100, 22, 2, 2, 113, 3...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4048, 4048, 4048, 4048, 4048, 4048, 4048, 404...","[208, 147, 199, 144, 215, 137, 117, 147, 139, ...","[1, 1, 3, 0, 0, 1, 2, 0, 1, 0, 0, 0, 0, 1, 1, ...","[0, 0, 0, 1, 1, 0, 1, 2, 3, 0, 0, 5, 4, 0, 0, ..."
2,18,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 9...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 2...","[2, 2, 7, 135, 29, 29, 20, 20, 125, 9, 9, 10, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1601, 1601, 1601, 1601, 1601, 1601, 1601, 160...","[131, 211, 225, 146, 215, 132, 199, 121, 146, ...","[0, 0, 1, 0, 0, 2, 1, 1, 1, 0, 1, 1, 1, 1, 0, ...","[2, 2, 0, 0, 3, 0, 4, 3, 4, 0, 0, 0, 0, 1, 0, ..."
3,26,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[186, 186, 186, 186, 186, 186, 186, 186, 186, ...","[13, 18, 128, 3, 103, 683, 137, 84, 29, 0, 2, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[206, 209, 206, 142, 226, 205, 206, 142, 200, ...","[2, 1, 0, 0, 2, 0, 0, 0, 1, 2, 0, 2, 3, 0, 2, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
4,27,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...","[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 4...","[13, 2, 304, 4, 103, 11, 100, 58, 24, 29, 20, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8...","[203, 224, 197, 131, 136, 226, 137, 224, 212, ...","[2, 3, 1, 3, 0, 2, 1, 1, 0, 1, 1, 0, 2, 2, 1, ...","[3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
415312,415271,[24],[49],[2],[136],[59],[0],[1],[100],[127],[1],[0]
415313,415275,[28],[54],[0],[78],[1],[0],[0],[949],[209],[3],[0]
415314,415276,[27],[65],[1],[177],[3],[0],[1],[720],[124],[0],[0]
415315,415288,"[65, 65, 65]","[294, 294, 294]","[2, 2, 2]","[232, 232, 232]","[20, 13, 3]","[0, 0, 0]","[1, 1, 1]","[2545, 2545, 2545]","[122, 135, 135]","[0, 2, 2]","[0, 0, 0]"


## Считывание конвертера дат в правильные индексы

In [None]:
# Map every stored date index (a string, because of dtype=str) to its rank after
# sorting the mapper rows by the `date` column.
# NOTE(review): the sort is lexicographic on strings; this equals chronological
# order only for ISO-like dates (YYYY-MM-DD) - confirm the format.
date_mapper_frame = pd.read_csv('files/date_mapper.tsv.gz', sep='\t', dtype=str)
date_idx_by_date = date_mapper_frame.sort_values(by=['date'], ascending=True).date_idx.values.copy()
date_mapper = {stored_idx: rank for rank, stored_idx in enumerate(date_idx_by_date)}

## Восстановление значений колонки request_cnt

In [None]:
%%time
users['request_cnt'] = \
users.request_cnt_idx.apply(lambda x: [request_cnt_remap[int(xx)] for xx in x])

## Проверка, что число значений в последовательности ссылок и кол-ва запросов совпадает

In [None]:
# Every user must have exactly one request count per visited url host.
request_cnt_lengths = users['request_cnt'].apply(len)
url_host_lengths = users['url_host_idx'].apply(len)
assert (request_cnt_lengths == url_host_lengths).all()

## Сохранение мешков слов с учётом request_cnt

In [25]:


def fillna(row, progress=None):
    """Convert a sequence of stringified indices to ints, mapping missing values to -1.

    Parameters
    ----------
    row : sequence
        Index values (strings or numbers); missing entries are NaN, which
        stringifies to 'nan'.
    progress : object with ``update(n)``, optional
        Progress bar advanced by one per call. When omitted, a module-level
        ``pbar`` is used if one exists, so existing ``Series.apply(fillna)``
        calls keep reporting progress; when neither is available the call is
        silent instead of failing with NameError.

    Returns
    -------
    list
        NumPy integers in the original order, with -1 in place of missing entries.
    """
    vals = np.array(row, dtype='<U')
    vals[vals == 'nan'] = '-1'
    vals = vals.astype(int)
    bar = progress if progress is not None else globals().get('pbar')
    if bar is not None:
        bar.update(1)
    return list(vals)

# Build one user x value "bag of words" matrix per column, where every event is
# weighted by its request_cnt, and store it together with the user ids.
uids = users.user_id.values.copy()
cnts = users['request_cnt'].values.copy()
# Flatten the per-user request counts once: they are the matrix values for every column.
data = np.concatenate([np.asarray(c, dtype=np.int64) for c in cnts])
for col in tqdm(
            ['region_name_idx', 'city_name_idx',
             'cpe_manufacturer_name_idx', 'cpe_model_name_idx',
             'cpe_type_cd_idx', 'cpe_model_os_type_idx', 'price_idx',
             'date_idx', 'part_of_day_idx']
    ):
    pbar = tqdm(total=users.shape[0])
    vals = users[col].apply(fillna)
    pbar.close()

    # COO coordinates as flat NumPy arrays instead of Python lists with one
    # element per event. No per-row sorting is needed: csr_matrix canonicalises
    # the input itself.
    lengths = vals.apply(len).to_numpy()
    indices = np.repeat(np.arange(len(vals)), lengths)
    jindices = np.concatenate([np.asarray(r, dtype=np.int64) for r in vals])
    assert len(indices) == len(data), 'every event needs exactly one request_cnt'

    # missing values (-1) get their own column right after the largest real index
    max_val = jindices.max() + 1
    jindices[jindices < 0] = max_val

    print(len(indices))
    # csr_matrix automatically sums the values of duplicate (i, j) entries into one
    XX = \
    csr_matrix((data, (indices, jindices)),
               shape=(users.shape[0], max_val+1),
               dtype=np.uint32)

    with gzip.open('files/%s_cbag_v2.pickle.gz'%(col.split('_idx')[0]), 'wb') as f:
        pickle.dump(dict(data=XX, uids=uids), f, protocol=-1)

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

322899435


  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

322899435


  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

322899435


  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

322899435


  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

322899435


  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

322899435


  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

322899435


  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

322899435


  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

322899435


## Восстановление упорядоченных индексов для дат и времени суток

In [None]:
%%time
users['date_ordered_idx'] = users.date_idx.apply(lambda x: [date_mapper[xx] for xx in x])
users['part_ordered_idx'] = users.part_of_day_idx.apply(lambda x: [part_of_day_remap[int(xx)] for xx in x])
users

## Получение одной колонки со временем

In [None]:
%%time
users['date_sorting_val'] = users.apply(lambda row: [x*10+y for x,y in 
                                                     zip(row.date_ordered_idx,
                                                         row.part_ordered_idx)
                                                    ], axis=1)
users

## Получение правильного упорядочивания элементов последовательностей по дате

In [None]:
%%time
users['reindex'] = users.date_sorting_val.apply(lambda x: list(np.argsort(x)))
users.head(10)

## Сохранение последовательностей в виде текстовых файлов

In [57]:
# Write one gzipped text file per column: one line per user, values separated by
# spaces and ordered by the per-user 'reindex' permutation.
cols = ['url_host_idx', 'date_sorting_val', 'region_name_idx', 'city_name_idx',
       'cpe_manufacturer_name_idx', 'cpe_model_name_idx',
       'cpe_type_cd_idx', 'cpe_model_os_type_idx', 'price_idx', 'request_cnt']

for col in tqdm(cols):
    pbar = tqdm(total=users.shape[0])
    vals = users[col].apply(fillna)
    pbar.close()

    # missing values (-1) are written as one past the largest real value
    max_val = vals.apply(max).max() + 1

    with gzip.open('files/seqs_%s.gz'%col.split('_idx')[0], 'wt') as f:
        # Pair values and permutations by position rather than by index label,
        # so the output does not depend on `users` having a default 0..N-1 index.
        for values, reindex in tqdm(zip(vals.values, users['reindex'].values),
                                    total=len(vals)):
            row = np.array(values)[reindex]  # indexing with a list already copies
            row[row < 0] = max_val
            f.write(' '.join(map(str, row)) + '\n')

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  0%|          | 0/415317 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  0%|          | 0/415317 [00:00<?, ?it/s]

  0%|          | 0/415317 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  0%|          | 0/415317 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  0%|          | 0/415317 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  0%|          | 0/415317 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  0%|          | 0/415317 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  0%|          | 0/415317 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

## Загрузка мешка слов для ссылок

In [28]:
# Load the url_host bag-of-words: a dict holding the sparse matrix ('data') and
# the matching user ids ('uids').
# NOTE(review): the bag-building cell visible in this notebook does not list
# 'url_host_idx', so this file is presumably produced elsewhere - confirm before
# relying on Restart & Run All.
# pickle.load can execute arbitrary code: only load files produced by this pipeline.
with gzip.open('files/url_host_cbag_v2.pickle.gz', 'rb') as bag_file:
    datamap = pickle.load(bag_file)
datamap

{'data': <415317x199684 sparse matrix of type '<class 'numpy.uint16'>'
 	with 32277669 stored elements in Compressed Sparse Row format>,
 'uids': array([     4,     16,     18, ..., 415276, 415288, 415293])}

## Формирование tsv файла с таргетами

In [35]:
# Targets aligned with the row order of the bag-of-words matrix: users without a
# row in public_train.pqt keep NaN targets because of the left join.
uid_frame = pd.DataFrame(dict(user_id=datamap['uids']))
train_targets = pd.read_parquet('public_train.pqt')
trg_df = uid_frame.merge(train_targets, how='left')
trg_df.to_csv('target.tsv.gz', sep='\t', index=False)
trg_df.sample(10)

Unnamed: 0,user_id,age,is_male
207980,1451,32.0,0.0
151433,267658,29.0,1.0
98679,155052,,
373776,412651,56.0,1.0
274496,251949,28.0,1.0
134467,98784,38.0,0.0
187538,210398,39.0,0.0
155840,311569,62.0,0.0
278240,288276,44.0,1.0
285197,357699,,


## Создание dummy файла скоров для удобства в будущем

In [168]:
# Dummy submission with random scores, written with and without the frame index.
sample_df = pd.read_parquet('submit_2.pqt')
# position of every user in the bag-of-words matrix (dropped again before saving)
uid_positions = pd.DataFrame(dict(user_id=datamap['uids'],
                                  idx=np.arange(len(datamap['uids']))))
sample_df = sample_df.merge(uid_positions, how='left')

# Seeded generator so that re-running the cell reproduces the same dummy files.
rng = np.random.default_rng(0)
n_rows = len(sample_df)
sample_df['is_male'] = rng.random(size=n_rows)
sample_df['age'] = rng.integers(1, 7, size=n_rows)  # integers 1..6, upper bound excluded
sample_df = sample_df.sort_values(by='user_id').drop('idx', axis=1)

sample_df.to_csv('baseline_submission_wo_index.csv',
                 sep=',',
                 index=False)

sample_df.to_csv('baseline_submission_w_index.csv',
                 sep=',',
                 index=True)