## Ограничения на ресурсы для numpy, импорт библиотек

In [1]:
import os
# Cap the thread pools of the numeric backends. This runs in the first cell,
# i.e. before the cell that imports numpy / scipy / sklearn.
# NOTE(review): MKL and numexpr are capped at 6 while the others use 4 - confirm this is intended.
os.environ["OMP_NUM_THREADS"] = "4" # export OMP_NUM_THREADS=4
os.environ["OPENBLAS_NUM_THREADS"] = "4" # export OPENBLAS_NUM_THREADS=4
os.environ["MKL_NUM_THREADS"] = "6" # export MKL_NUM_THREADS=6
os.environ["VECLIB_MAXIMUM_THREADS"] = "4" # export VECLIB_MAXIMUM_THREADS=4
os.environ["NUMEXPR_NUM_THREADS"] = "6" # export NUMEXPR_NUM_THREADS=6

In [2]:
import pandas as pd
import numpy as np
import time
from tqdm.auto import tqdm
from functools import reduce

import scipy
from scipy.sparse import csr_matrix, hstack, vstack
import gzip
import pickle

import sklearn
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import QuantileTransformer, Normalizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, f1_score

import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

## Загрузка мешка слов по регионам

In [5]:
# Load the per-user region bag-of-words counts.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with gzip.open('files/region_name_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

# Dominant region of every user = column holding the largest count in the user's row.
reg_id = np.array(regmap['data'].argmax(axis=1)).flatten()

# Keep only the regions that are dominant for more than 20 users.
_id_label, _id_cnt = np.unique(reg_id, return_counts=True)
_kept_ids = _id_label[_id_cnt > 20]
regionmap = csr_matrix(regmap['data'][:, _kept_ids])
print(regionmap.shape)

# Re-index the kept regions densely as 0..K-1; every other region shares the extra id K.
_id_map = {cid: new_id for new_id, cid in enumerate(_kept_ids)}
reg_id = np.array([_id_map.get(cid, len(_id_map)) for cid in reg_id])
del _id_label, _id_cnt, _kept_ids, _id_map
reg_id.max()

(415317, 79)


79

## Загрузка мешка слов по городам

In [6]:
# Load the per-user city bag-of-words counts.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with gzip.open('files/city_name_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

# Dominant city of every user = column holding the largest count in the user's row.
city_id = np.array(regmap['data'].argmax(axis=1)).flatten()

# Keep only the cities that are dominant for more than 20 users.
_id_label, _id_cnt = np.unique(city_id, return_counts=True)
_kept_ids = _id_label[_id_cnt > 20]
citmap = csr_matrix(regmap['data'][:, _kept_ids])
print(citmap.shape)

# Re-index the kept cities densely as 0..K-1; every other city shares the extra id K.
_id_map = {cid: new_id for new_id, cid in enumerate(_kept_ids)}
city_id = np.array([_id_map.get(cid, len(_id_map)) for cid in city_id])
del _id_label, _id_cnt, _kept_ids, _id_map
city_id.max()

(415317, 661)


661

## Загрузка мешка слов по производителю

In [7]:
# Load the per-user device-manufacturer bag-of-words counts.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with gzip.open('files/cpe_manufacturer_name_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

# Dominant manufacturer of every user = column holding the largest count in the user's row.
cpeman_id = np.array(regmap['data'].argmax(axis=1)).flatten()

# Keep only the manufacturers that are dominant for more than 20 users.
_id_label, _id_cnt = np.unique(cpeman_id, return_counts=True)
_kept_ids = _id_label[_id_cnt > 20]
cpemanmap = csr_matrix(regmap['data'][:, _kept_ids])
print(cpemanmap.shape)

# Re-index the kept manufacturers densely as 0..K-1; all others share the extra id K.
_id_map = {cid: new_id for new_id, cid in enumerate(_kept_ids)}
cpeman_id = np.array([_id_map.get(cid, len(_id_map)) for cid in cpeman_id])
del _id_label, _id_cnt, _kept_ids, _id_map
cpeman_id.max()

(415317, 27)


27

## Загрузка мешка слов по устройству

In [8]:
# Load the per-user device-model bag-of-words counts.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with gzip.open('files/cpe_model_name_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

# Dominant device model of every user = column holding the largest count in the user's row.
cpemodname_id = np.array(regmap['data'].argmax(axis=1)).flatten()

# Keep only the device models that are dominant for more than 20 users.
_id_label, _id_cnt = np.unique(cpemodname_id, return_counts=True)
_kept_ids = _id_label[_id_cnt > 20]
cpemodnamemap = csr_matrix(regmap['data'][:, _kept_ids])
print(cpemodnamemap.shape)

# Re-index the kept models densely as 0..K-1; all others share the extra id K.
_id_map = {cid: new_id for new_id, cid in enumerate(_kept_ids)}
cpemodname_id = np.array([_id_map.get(cid, len(_id_map)) for cid in cpemodname_id])
del _id_label, _id_cnt, _kept_ids, _id_map
cpemodname_id.max()

(415317, 396)


396

## Загрузка мешка слов по cpe_type_cd

In [9]:
# Load the per-user cpe_type_cd bag-of-words counts (all columns are kept).
with gzip.open('files/cpe_type_cd_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

cpetypemap = csr_matrix(regmap['data'])
print(cpetypemap.shape)

# Dominant type of every user = column holding the largest count in the user's row.
cpetype_id = np.asarray(regmap['data'].argmax(axis=1)).ravel()

# Displayed as the cell result: type ids ordered by how many users they dominate.
pd.Series(cpetype_id).value_counts().head(50).index

(415317, 5)


Int64Index([0, 2, 1, 3], dtype='int64')

## Загрузка мешка слов по датам

In [10]:
# Load the per-user date bag-of-words counts.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with gzip.open('files/date_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

# Dominant date column of every user = column holding the largest count in the user's row.
date_id = np.array(regmap['data'].argmax(axis=1)).flatten()
# Shape before filtering (printed without building a throw-away CSR copy).
print(regmap['data'].shape)

# Keep only the date columns that are dominant for more than 20 users.
_id_label, _id_cnt = np.unique(date_id, return_counts=True)
_kept_ids = _id_label[_id_cnt > 20]
datemap = csr_matrix(regmap['data'][:, _kept_ids])
print(datemap.shape)

# Re-index the kept columns densely as 0..K-1; all others share the extra id K.
_id_map = {cid: new_id for new_id, cid in enumerate(_kept_ids)}
date_id = np.array([_id_map.get(cid, len(_id_map)) for cid in date_id])
del _id_label, _id_cnt, _kept_ids, _id_map
date_id.max()

(415317, 397)
(415317, 203)


203

## Загрузка мешка слов по времени суток

In [11]:
# Load the per-user part-of-day bag-of-words counts (all columns are kept).
with gzip.open('files/part_of_day_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

podmap = csr_matrix(regmap['data'])
print(podmap.shape)

# Dominant part of day of every user = column holding the largest count in the user's row.
pod_id = np.asarray(regmap['data'].argmax(axis=1)).ravel()

# Displayed as the cell result: how many users each part of day dominates.
pd.Series(pod_id).value_counts().tail(50)

(415317, 5)


1    217543
2    126023
0     64969
3      6782
dtype: int64

## Загрузка мешка слов по ценам

In [12]:
# Load the per-user price bag-of-words counts.
with gzip.open('files/price_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

# Price per bag column: missing prices become 20_000 and one extra 20_000 entry is appended.
prices = np.array([*pd.read_csv('files/price_mapper.tsv.gz', sep='\t').price.fillna(20_000).tolist(),
                   20_000])

# Count-weighted mean price per user.
# NOTE(review): a row whose counts sum to 0 would produce 0/0 here - confirm no such rows exist.
_price_counts = regmap['data']
price_id = _price_counts.dot(prices) / np.asarray(_price_counts.sum(axis=1)).ravel()

# Re-bin the price columns into 31 k-means bins of sqrt(price) and aggregate the counts per bin.
_price_bins = KBinsDiscretizer(n_bins=31,
                               strategy='kmeans').fit_transform(prices[:, None]**0.5)
pricemap_id = _price_counts.dot(_price_bins)
del _price_counts, _price_bins

## Загрузка мешка слов по ссылкам

In [4]:
# Load the url_host bag-of-words. As the cell output shows, this is a dict with
# 'data' (sparse user-by-host count matrix) and 'uids'.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with gzip.open('files/url_host_cbag_v2.pickle.gz', 'rb') as f:
    datamap = pickle.load(f)
datamap

{'data': <415317x199684 sparse matrix of type '<class 'numpy.uint32'>'
 	with 32277669 stored elements in Compressed Sparse Row format>,
 'uids': array([     4,     16,     18, ..., 415276, 415288, 415293])}

## Загрузка признаков тайтлов ссылок

In [14]:
# Page titles for a subset of domains: dict with 'domain' and 'title' entries.
with gzip.open('auxilary/domain20k_title.pickle.gz', 'rb') as f:
    titlemap = pickle.load(f)

# url_host -> column index in datamap['data'] (also reused by later cells).
map_df = pd.read_csv('auxilary/url_host_mapper_v2.tsv.gz', sep='\t')

# Project the user-by-host counts onto title uni/bi-grams:
# (users x titled hosts) . (titled hosts x n-grams) -> users x n-grams.
titlemap = \
datamap['data'][:, pd.DataFrame(
    dict(url_host=titlemap['domain'])
        ).merge(map_df).url_host_idx.values
               ].dot(CountVectorizer(ngram_range=(1,2),
                                     min_df=2,).\
                     fit_transform(titlemap['title'])).tocsr()

# log2(1 + count) on the stored values. The CSR structure (indices/indptr) is reused
# directly instead of going through nonzero(): that avoids materialising COO index
# arrays for every stored element and stays correct if explicit zeros are stored.
titlemap = csr_matrix((np.log2(1+titlemap.data), titlemap.indices, titlemap.indptr),
                      shape=titlemap.shape, dtype=np.float32)
titlemap

<415317x13118 sparse matrix of type '<class 'numpy.float32'>'
	with 82784390 stored elements in Compressed Sparse Row format>

## Загрузка эмбеддингов скриншотов

In [15]:
# Screenshot scores per domain: dict with 'domains' and 'scores' entries.
with gzip.open('auxilary/clipVIT_scores_20k.pickle.gz', 'rb') as f:
    clipmap = pickle.load(f)

# Columns of the url_host bag that correspond to the scored domains
# (map_df comes from the title-features cell).
_clip_cols = pd.DataFrame(dict(url_host=clipmap['domains'])).merge(map_df).url_host_idx.values

# Sublinear TF-IDF weights of those hosts, used to average the per-domain scores per user.
_clip_weights = TfidfTransformer(sublinear_tf=True, norm=None).\
    fit_transform(datamap['data'][:, _clip_cols])
clipmap = _clip_weights.dot(np.array(clipmap['scores']))

# Row-normalise with sklearn's default Normalizer and store as float32.
clipmap = np.float32(Normalizer().fit_transform(clipmap))
del _clip_cols, _clip_weights

clipmap.shape

(415317, 768)

## Загрузка признаков Doc2Vec

In [16]:
# These are the embeddings for url_host only, not the ones concatenated over the 4 domains.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with gzip.open('doc2vec_feats_128.pickle.gz', 'rb') as f:
    doc2vec_feats = pickle.load(f)

## Получение частотных ссылок мешка слов url_host

In [17]:
# A url_host column is kept when more than 40 users have a positive count for it.
feats_mask = np.asarray((datamap['data'] > 0).sum(axis=0)).ravel() > 40
# Displayed as the cell result: number of kept hosts.
feats_mask.sum()

20144

## Считывание файла с таргетами

In [5]:
# Targets table; the sample below shows the columns user_id, age and is_male.
trg_df = pd.read_csv('target.tsv.gz', sep='\t')
# Peek at 10 random rows (no random_state is passed, so the rows differ between runs).
trg_df.sample(10)

Unnamed: 0,user_id,age,is_male
135832,112198,40.0,1.0
154409,297298,51.0,0.0
80640,390344,66.0,0.0
387027,131868,42.0,0.0
160476,357784,51.0,1.0
270408,211076,32.0,1.0
379821,58745,,
307483,164654,55.0,1.0
391925,181328,42.0,1.0
207270,409909,24.0,1.0


# Подготовка таргетов и поднабора юзеров из обучения

In [None]:
key = 'data'

# Training subset: users for whom at least one of the two targets is known.
# NOTE(review): the mask is applied row-wise to the feature matrices below, which assumes
# trg_df rows are in the same order as the rows of those matrices - confirm.
all_mask = (~trg_df.age.isna()|~trg_df.is_male.isna()).values.copy()
# A target that is missing is imputed: sex -> 0.5 (undecided), age -> 34.
trg_train = trg_df[all_mask].fillna({'is_male': 0.5, 'age':34})
trg_age = trg_train.age.values.copy()
trg_sex = trg_train.is_male.values.copy()


age_bins = [[0, 25], [26, 35], [36, 45], [46, 55], [56, 65], [66, 999]]

print('Train sample:', all_mask.sum())

# Age class 1..len(age_bins). Only the upper edges of the bins are used, so the classes
# have no gaps: with `between(lo, hi)` a value such as 25.5 matched no bin and received
# class 0, which the later one-hot encoding (index y_all-1 == -1) turned into the last class.
# For integer ages within the bins the result is identical to the per-bin `between`.
_age_edges = [upper for _, upper in age_bins[:-1]]
y_all = np.digitize(trg_age, _age_edges, right=True) + 1
del _age_edges

# url_host counts of the training users, restricted to the frequent hosts.
X_tr = datamap[key][all_mask][:, feats_mask]

# Объединение мешков слов второстепенных доменов в один

In [20]:
# Stack all secondary bags-of-words side by side (users x all secondary features).
cbag_all = hstack([regionmap,
                   citmap,
                   cpemanmap,
                   cpemodnamemap,
                   cpetypemap,
                   podmap,
                   datemap,
                   pricemap_id
                  ])
# Map every column onto its empirical quantiles (10 landmarks).
# random_state is fixed because the transformer estimates the quantiles on a random
# subsample of the rows; without it the features differ from run to run.
cbag_all = QuantileTransformer(n_quantiles=10, random_state=42).fit_transform(cbag_all)
cbag_all = csr_matrix(cbag_all)
cbag_all_train = cbag_all[all_mask]
cbag_all_train.shape

(270000, 1407)

# Переобозначения обучающих поднаборов признаков ради удобства

In [21]:
# Training-user rows of the dense and sparse side features.
doc2vec_feats_train = doc2vec_feats[all_mask].copy()
clipmap_train = clipmap[all_mask].copy()
titlemap = csr_matrix(titlemap, dtype=np.float32)
titlemap_train = titlemap[all_mask]

# Report the three shapes in the same order as before.
for _feats in (doc2vec_feats_train, clipmap_train, titlemap_train):
    print(_feats.shape)
del _feats

(270000, 128)
(270000, 768)
(270000, 13118)


# Подготовка мешка слов ссылок

In [22]:
# Square-root-scaled url_host counts for ALL users, restricted to the frequent hosts.
all_data = datamap[key][:, feats_mask].tocsr()
# Reuse the CSR structure (indices/indptr) instead of rebuilding it from nonzero():
# no COO index arrays are materialised and explicitly stored zeros cannot desynchronise
# the data and index arrays.
all_data_sqrt = csr_matrix((all_data.data**0.5, all_data.indices, all_data.indptr),
                           shape=all_data.shape,
                           dtype=np.float32)
del all_data

# Настройка валидации

In [34]:
RANDOM_SEED = 42
kfold = StratifiedKFold(n_splits=10,
                        shuffle=True,
                        random_state=RANDOM_SEED)
# Stratify jointly on sex and age class; the same string labels serve as X and y
# because only their length and class membership are used by the splitter.
_strat_key = (np.uint8(trg_sex*2)+y_all*10).astype(str)
folds = list(kfold.split(_strat_key, _strat_key))
del _strat_key

## Функция с архитектурой модели (старая версия)

In [26]:
def _id_embedding_branch(n_ids, emb_dim, regularizer):
    """Build `Input((1,)) -> Embedding(n_ids, emb_dim) -> Flatten`; return (input, features)."""
    id_inp = tf.keras.layers.Input((1,), sparse=False)
    emb = tf.keras.layers.Embedding(n_ids, emb_dim,
                                    embeddings_regularizer=regularizer)(id_inp)
    return id_inp, tf.keras.layers.Flatten()(emb)


def get_model_all(emb_size=8, dense_size=512, l1_reg=1e-7,
                  base_lr=2e-4, sex_weight=1, age_weight=1,
                  return_sex_feats=False,
                  **kwargs):
    """Build and compile the two-headed (sex / age) Keras model.

    Input sizes and embedding vocabularies are read from notebook globals
    (X_tr, reg_id, city_id, cpeman_id, cpemodname_id, cpetype_id, cbag_all,
    date_id, pod_id, doc2vec_feats, clipmap_train, titlemap_train).

    Parameters
    ----------
    emb_size : default width of every side-feature branch (dense branches for the
        secondary bag and the titles default to ``emb_size*2``).
    dense_size : width of the dense layer applied to the url_host input.
    l1_reg : L1 penalty used for all kernels and embeddings.
    base_lr : Adam learning rate (gradients are clipped by value at 2).
    sex_weight, age_weight : loss weights of the two heads.
    return_sex_feats : if true, the features fed to the sex head are added as a
        third model output.
    **kwargs :
        use_feats (required) - 12 flags, one per side feature, in the order
            region, city, cpeman, cpemodname, cpetype, price, cbag, date, pod,
            doc2vec, clip, title; a falsy flag removes both the branch and its input.
        e_region, e_city, e_cpeman, e_cpemodname, e_cpetype, e_price, e_cbag,
            e_date, e_pod, e_title - optional per-branch widths.
        age_extra_dim, sex_extra_dim - optional width of an extra dense layer that
            is concatenated to the shared tower in front of the respective head.
        Any other keyword is ignored.

    Returns
    -------
    A compiled ``tf.keras.models.Model`` with inputs ``[url_host] + enabled side
    inputs`` and outputs ``[sex, age]`` (plus the sex features if requested).

    Raises
    ------
    KeyError if ``use_feats`` is not passed.
    """
    layers = tf.keras.layers
    l1reg = tf.keras.regularizers.l1(l1_reg)
    use_feats = kwargs['use_feats']

    # urls
    inp = layers.Input((X_tr.shape[1],), sparse=False)
    x = layers.Dense(dense_size, activation='relu',
                     use_bias=False,
                     kernel_regularizer=l1reg)(inp)

    def embed(ids, size_key):
        # vocabulary size is taken from the full id array, +1 for the "other" id
        return _id_embedding_branch(ids.max()+1, kwargs.get(size_key, emb_size), l1reg)

    # categorical ids
    region = embed(reg_id, 'e_region')
    city = embed(city_id, 'e_city')
    cpeman = embed(cpeman_id, 'e_cpeman')
    cpemodname = embed(cpemodname_id, 'e_cpemodname')
    cpetype = embed(cpetype_id, 'e_cpetype')

    # price (single numeric input)
    price_inp = layers.Input((1,), sparse=False)
    price = (price_inp,
             layers.Dense(kwargs.get('e_price', emb_size),
                          activation='tanh',
                          kernel_regularizer=l1reg)(price_inp))

    # cbagmap
    cbag_inp = layers.Input((cbag_all.shape[-1],), sparse=False)
    cbag = (cbag_inp,
            layers.Dense(kwargs.get('e_cbag', emb_size*2),
                         activation='relu',
                         kernel_regularizer=l1reg)(cbag_inp))

    date = embed(date_id, 'e_date')
    pod = embed(pod_id, 'e_pod')

    # d2v and clip vectors are passed through unchanged
    d2v_inp = layers.Input((doc2vec_feats.shape[-1],), sparse=False)
    d2v = (d2v_inp, d2v_inp)
    clip_inp = layers.Input((clipmap_train.shape[-1],), sparse=False)
    clip = (clip_inp, clip_inp)

    # titles
    title_inp = layers.Input((titlemap_train.shape[-1],), sparse=False)
    title = (title_inp,
             layers.Dense(kwargs.get('e_title', emb_size*2),
                          activation='relu',
                          use_bias=False,
                          kernel_regularizer=l1reg)(title_inp))

    # The order of this list defines the meaning of the positions in use_feats and
    # must match the order of the arrays produced for the model inputs.
    branches = [region, city, cpeman, cpemodname, cpetype,
                price, cbag, date, pod, d2v, clip, title]
    enabled = [branch for branch, remain in zip(branches, use_feats) if remain]
    inps_extra = [branch_inp for branch_inp, _ in enabled]
    x_extra = [branch_out for _, branch_out in enabled]

    x_sex0 = layers.concatenate([x] + x_extra)
    x_age0 = x_sex0

    # Shared tower: two dense layers with dense (additive) skip connections -
    # each layer's input is the sum of the tower input and all previous layer outputs.
    skip_terms = [x_age0]
    tower = x_age0
    for _ in range(2):
        hidden = layers.Dense(tower.shape[-1], activation='relu',
                              use_bias=True,
                              kernel_regularizer=l1reg)(tower)
        skip_terms.append(hidden)
        tower = layers.add(skip_terms)
    parallel_age = [tower]

    if kwargs.get('age_extra_dim', False):
        x_age = layers.concatenate([
            layers.Dense(kwargs.get('age_extra_dim'),
                         activation='relu',
                         use_bias=True,
                         kernel_regularizer=l1reg)(x_age0)
        ] + parallel_age)
    else:
        x_age = parallel_age[0]

    # The sex head reuses the same shared tower as the age head.
    if kwargs.get('sex_extra_dim', False):
        x_sex = layers.concatenate([
            layers.Dense(kwargs.get('sex_extra_dim'),
                         activation='relu',
                         use_bias=True,
                         kernel_regularizer=l1reg)(x_sex0)
        ] + parallel_age)
    else:
        x_sex = parallel_age[0]

    out1 = layers.Dense(1, activation='sigmoid', use_bias=True, name='sex',
                        kernel_regularizer=tf.keras.regularizers.l1(l1_reg))(x_sex)

    out2 = layers.Dense(6, activation='softmax', use_bias=True, name='age',
                        kernel_regularizer=tf.keras.regularizers.l1(l1_reg))(x_age)

    output_layers = [out1, out2]
    if return_sex_feats:
        output_layers.append(x_sex)
    model = tf.keras.models.Model([inp] + inps_extra, output_layers)
    model.compile(loss={'sex':'binary_crossentropy',
                        'age':'categorical_crossentropy'},
                  loss_weights={'sex':sex_weight, 'age':age_weight},
                  optimizer=tf.keras.optimizers.Adam(learning_rate=base_lr,
                                                     clipvalue=2),
                 )
    return model


def get_scheduler(base_lr=2e-4, factor=1., offset=0.5):
    """Return a ``schedule(epoch, lr)`` callable with exponential decay per epoch.

    The returned learning rate is ``base_lr * 10 ** (offset - factor * epoch)``;
    the ``lr`` argument supplied by the caller is ignored.
    """
    def scheduler(epoch, lr):
        exponent = -epoch*factor + offset
        return base_lr * 10**exponent
    return scheduler

## Функция-генератор фолдов для обучения

In [27]:
# Square-root-scaled url_host counts of the training users.
# The CSR structure of X_tr is copied (copy=True) instead of being rebuilt from
# nonzero(): no COO index arrays are materialised, stored zeros cannot desynchronise
# data and indices, and X_train does not share buffers with X_tr.
X_train = csr_matrix((X_tr.data**0.5, X_tr.indices, X_tr.indptr),
                     shape=X_tr.shape,
                     dtype=np.float32,
                     copy=True)

# One-hot age targets: class c (1-based) -> column c-1.
# NOTE(review): a label of 0 in y_all would index column -1, i.e. the last class - confirm
# y_all only holds values >= 1.
y_ohe_age = np.zeros((y_all.size, y_all.max()))
y_ohe_age[np.arange(y_all.size), y_all-1] = 1.

def generate_folds(folds):
    """Yield the model inputs and targets of every outer fold.

    Parameters
    ----------
    folds : iterable of ``(train_ind, test_ind)`` index arrays into the training users.

    Yields
    ------
    ``[[train_inputs, [train_y_sex, train_y_age]], [val_inputs, [val_y_sex, val_y_age]]]``
    where each ``*_inputs`` list holds 13 arrays in the order url_host, region, city,
    cpeman, cpemodname, cpetype, sqrt(price), cbag, date, pod, doc2vec, clip, title.
    ``train_y_age`` is the one-hot age target blended with out-of-fold
    logistic-regression probabilities (weight ``alpha = 0.1``); the validation
    targets are left untouched.

    All feature arrays are read from notebook globals.
    """
    # Restrict the per-user arrays to the training users once instead of once per
    # fold and per split (the result does not depend on the fold).
    reg_tr = reg_id[all_mask]
    city_tr = city_id[all_mask]
    cpeman_tr = cpeman_id[all_mask]
    cpemodname_tr = cpemodname_id[all_mask]
    cpetype_tr = cpetype_id[all_mask]
    price_tr = price_id[all_mask]
    date_tr = date_id[all_mask]
    pod_tr = pod_id[all_mask]

    def fold_inputs(ind):
        # the list order must match the input order of the model
        return [X_train[ind],
                reg_tr[ind, None],
                city_tr[ind, None],
                cpeman_tr[ind, None],
                cpemodname_tr[ind, None],
                cpetype_tr[ind, None],
                price_tr[ind, None]**0.5,
                cbag_all_train[ind],
                date_tr[ind, None],
                pod_tr[ind, None],
                doc2vec_feats_train[ind],
                clipmap_train[ind],
                titlemap_train[ind],
               ]

    for train_ind, test_ind in tqdm(folds):
        train_aux_dat = fold_inputs(train_ind)
        val_aux_dat = fold_inputs(test_ind)
        train_dat = train_aux_dat[0]

        train_y_sex = trg_sex[train_ind]
        train_y_age = y_ohe_age[train_ind]
        val_y_sex = trg_sex[test_ind]
        val_y_age = y_ohe_age[test_ind]

        # logistic-regression scores are used for the initial self-distillation
        model_lr = \
        Pipeline([('tfidf', TfidfTransformer(sublinear_tf=True, norm='l2')),
                  ('model', LogisticRegression(C=1.5,
                                               penalty='l1',
                                               solver='liblinear',
                                               #class_weight='balanced',
                                               max_iter=5,
                                               dual=False)
                    )])
        # NOTE(review): the blend below assumes every age class occurs in the fold, so
        # that predict_proba returns one column per one-hot column - confirm.
        scores_lr = cross_val_predict(model_lr,
                                      train_dat,
                                      train_y_age.argmax(axis=1),
                                      cv=5,
                                      method='predict_proba',
                                      n_jobs=5)

        # age scores only: distilling sex from the logistic regression hurts quality
        alpha = 0.1
        train_y_age = train_y_age*(1-alpha)+scores_lr*alpha

        yield [[train_aux_dat, [train_y_sex, train_y_age]],
               [val_aux_dat, [val_y_sex, val_y_age]]]

## Мапка признаков для удобства

In [28]:
# Position of every side feature in `use_feats`. The positions must follow the order in
# which get_model_all lists its side branches and generate_folds lists its side inputs:
# region, city, cpeman, cpemodname, cpetype, price, cbag, date, pod, doc2vec, clip, title.
# (cbag/date/pod used to be numbered 8/6/7, so switching one of them off would have
# removed a different feature than the one named.)
feat2idx_map = dict(e_region=0,
                    e_city=1,
                    e_cpeman=2,
                    e_cpemodname=3,
                    e_cpetype=4,
                    e_price=5,
                    e_cbag=6,
                    e_date=7,
                    e_pod=8,
                    e_doc=9,
                    e_clip=10,
                    e_title=11)

## Конфигурация архитектуры нейронной сети для получения oof скоров

In [30]:
# Hyper-parameters of the network used to produce the out-of-fold scores.
# An 'e_<feature>' value of exactly 1 switches that feature off: get_oof_score derives
# 'use_feats' from these entries (and overwrites the list given here).
model_args = \
{'age_extra_dim': 22,
 'age_weight': 1.8,
 'base_lr': 0.0017782794100389228,
 'batch_size': 256,
 'dense_size': 2048,
 'e_cbag': 362,
 'e_clip': 1,
 'e_cpeman': 53,
 'e_cpemodname': 1,
 'e_cpetype': 8,
 'e_price': 38,
 'emb_size': 11,
 'epochs': 3,
 'factor': 1.25,
 'offset': -0.5,
 'sex_weight': 1.5,
 'steps_subsample': 1.0,
 'use_feats': [1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1]}

## Функция для получения oof скоров для каждого разбиения

In [42]:
def get_oof_score(args):
    """Train the network with inner 5-fold CV on every outer fold and save the OOF scores.

    For each outer fold produced by ``generate_folds(folds)`` the outer-train part is
    split into 5 stratified inner folds; a fresh model is trained on 4 of them and
    predicts the 5th. Sex scores are stored as within-inner-fold ranks scaled by the
    inner-fold size, age scores as the predicted class probabilities. The result of
    outer fold K (1-based) is written to ``oof_scores_260223/<RANDOM_SEED>/<K>.pickle.gz``.

    Parameters
    ----------
    args : dict of hyper-parameters (see ``model_args``). Entries that are None are
        dropped; ``e_*`` / ``*extra*`` entries are cast to int. An ``e_<feature>``
        value of 1 disables that feature. The dict passed in is NOT modified - the
        function works on a normalised copy.

    Reads the notebook globals ``folds``, ``RANDOM_SEED`` and ``feat2idx_map``.
    Returns None.
    """
    # Normalised private copy: drop unset values, cast branch widths to int.
    args = {k: (int(v) if k.startswith('e_') or 'extra' in k else v)
            for k, v in args.items() if v is not None}

    # A branch width of exactly 1 means "feature switched off".
    use_feats = [1] * len(feat2idx_map)
    for feat_name, pos in feat2idx_map.items():
        use_feats[pos] = int(args.get(feat_name, 100) != 1)
    args['use_feats'] = tuple(use_feats)

    # Canonicalise the numeric types (floats are rounded to 3 decimals).
    for name in ('epochs', 'batch_size', 'emb_size', 'dense_size'):
        args[name] = int(args[name])
    for name in ('steps_subsample', 'factor', 'offset', 'sex_weight', 'age_weight'):
        args[name] = float('%.3f' % args[name])

    epochs = args['epochs']
    batch_size = args['batch_size']
    steps_subsample = args['steps_subsample']
    scheduler = get_scheduler(args['base_lr'], args['factor'], args['offset'])

    print(args)

    # the url_host input (position 0) is always used
    keep_inputs = [1] + use_feats

    # generate_folds already shows a progress bar over the outer folds
    for K, cache in enumerate(generate_folds(folds), start=1):
        (train_aux_dat, train_y), _ = cache
        train_y_sex, train_y_age = train_y

        train_aux_dat = [xx for xx, remain in zip(train_aux_dat, keep_inputs) if remain]

        inner_cv = StratifiedKFold(n_splits=5,
                                   shuffle=True,
                                   random_state=RANDOM_SEED)
        inner_key = (np.uint8(train_y_sex*2)+train_y_age.argmax(axis=1)*10).astype(str)
        folds_inner = list(inner_cv.split(train_y_sex, inner_key))

        oof_scores_sex = np.zeros(train_y_sex.shape)
        oof_scores_age = np.zeros(train_y_age.shape)
        for inner_train_ind, inner_test_ind in tqdm(folds_inner):
            # Many models are built in this loop; release the Keras state of the
            # previous one so memory does not grow with every iteration.
            tf.keras.backend.clear_session()
            model_nn = get_model_all(l1_reg=1e-7, **args)

            model_nn.fit([xx[inner_train_ind] for xx in train_aux_dat],
                         [yy[inner_train_ind] for yy in train_y],
                         batch_size=batch_size,
                         steps_per_epoch=int(steps_subsample*inner_train_ind.size/batch_size),
                         epochs=epochs,
                         callbacks=[tf.keras.callbacks.LearningRateScheduler(scheduler)],
                         verbose=True)

            preds = \
            model_nn.predict([xx[inner_test_ind] for xx in train_aux_dat],
                             batch_size=1024)
            # sex: rank of the score inside the inner test fold, scaled by the fold size
            oof_scores_sex[inner_test_ind] = \
                preds[0].flatten().argsort().argsort()/inner_test_ind.size
            oof_scores_age[inner_test_ind] = preds[-1]

        # every outer fold keeps its own OOF scores to avoid target leakage
        os.makedirs('oof_scores_260223/%d'%RANDOM_SEED, exist_ok=True)
        with gzip.open('oof_scores_260223/%d/%d.pickle.gz'%(RANDOM_SEED, K), 'wb') as f:
            pickle.dump(dict(sex=oof_scores_sex, age=oof_scores_age, fold=K),
                        f,
                        protocol=-1)

## Получение oof скоров по каждому фолду для будущей self-distillation

In [44]:
# Has to be run twice, once per set of folds:
# 42 - model assessment folds, 101010 - model selection folds.
# get_oof_score reads RANDOM_SEED and folds from the notebook globals, so both are
# rebound here before every call.
_strat_key = (np.uint8(trg_sex*2)+y_all*10).astype(str)
for RANDOM_SEED in [42, 101010]:
    kfold = StratifiedKFold(n_splits=10,
                            shuffle=True,
                            random_state=RANDOM_SEED)
    folds = list(kfold.split(_strat_key, _strat_key))
    get_oof_score(model_args)
del _strat_key

{'age_extra_dim': 22, 'age_weight': 1.8, 'base_lr': 0.0017782794100389228, 'batch_size': 256, 'dense_size': 2048, 'e_cbag': 362, 'e_clip': 1, 'e_cpeman': 53, 'e_cpemodname': 1, 'e_cpetype': 8, 'e_price': 38, 'emb_size': 11, 'epochs': 3, 'factor': 1.25, 'offset': -0.5, 'sex_weight': 1.5, 'steps_subsample': 1.0, 'use_feats': (1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1)}


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
 80/759 [==>...........................] - ETA: 37s - loss: 2.7625 - sex_loss: 0.4204 - age_loss: 1.1613

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 2/3
Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 3/3
Epoch 1/3


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

