## Ограничения на ресурсы для numpy, импорт библиотек

In [3]:
import os
os.environ["OMP_NUM_THREADS"] = "4" # export OMP_NUM_THREADS=4
os.environ["OPENBLAS_NUM_THREADS"] = "4" # export OPENBLAS_NUM_THREADS=4 
os.environ["MKL_NUM_THREADS"] = "6" # export MKL_NUM_THREADS=6
os.environ["VECLIB_MAXIMUM_THREADS"] = "4" # export VECLIB_MAXIMUM_THREADS=4
os.environ["NUMEXPR_NUM_THREADS"] = "6" # export NUMEXPR_NUM_THREADS=6

In [4]:
import pandas as pd
import numpy as np
import os
import time
from tqdm.auto import tqdm
from functools import reduce

import scipy
from scipy.sparse import csr_matrix, hstack, vstack
from scipy.optimize import minimize
import gzip
import pickle

import sklearn
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import QuantileTransformer, Normalizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, f1_score

import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf

from IPython.display import clear_output

## Загрузка мешка слов по регионам

In [6]:
with gzip.open('files/region_name_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

reg_id = np.array(regmap['data'].argmax(axis=1)).flatten()
pd.Series(
    reg_id
).value_counts().head(50).index

_id_label, _id_cnt = np.unique(reg_id, return_counts=True)
_id_map = {}
regionmap = csr_matrix(regmap['data'][:, _id_label[_id_cnt>20]])
print(regionmap.shape)
for cid in _id_label[_id_cnt>20]:
    _id_map[cid] = len(_id_map)
reg_id = np.array([_id_map.get(cid, len(_id_map)) for cid in reg_id])
del _id_label, _id_cnt, _id_map
reg_id.max()

(415317, 79)


79

## Загрузка мешка слов по городам

In [7]:
with gzip.open('files/city_name_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

city_id = np.array(regmap['data'].argmax(axis=1)).flatten()
pd.Series(
    city_id
).value_counts().head(50).index

_id_label, _id_cnt = np.unique(city_id, return_counts=True)
_id_map = {}
citmap = csr_matrix(regmap['data'][:, _id_label[_id_cnt>20]])
print(citmap.shape)
for cid in _id_label[_id_cnt>20]:
    _id_map[cid] = len(_id_map)
city_id = np.array([_id_map.get(cid, len(_id_map)) for cid in city_id])
del _id_label, _id_cnt, _id_map
city_id.max()

(415317, 661)


661

## Загрузка мешка слов по производителю

In [8]:
with gzip.open('files/cpe_manufacturer_name_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

cpeman_id = np.array(regmap['data'].argmax(axis=1)).flatten()
pd.Series(
    cpeman_id
).value_counts().head(50).index

_id_label, _id_cnt = np.unique(cpeman_id, return_counts=True)
_id_map = {}
cpemanmap = csr_matrix(regmap['data'][:, _id_label[_id_cnt>20]])
print(cpemanmap.shape)
for cid in _id_label[_id_cnt>20]:
    _id_map[cid] = len(_id_map)
cpeman_id = np.array([_id_map.get(cid, len(_id_map)) for cid in cpeman_id])
del _id_label, _id_cnt, _id_map
cpeman_id.max()

(415317, 27)


27

## Загрузка мешка слов по устройству

In [9]:
with gzip.open('files/cpe_model_name_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

cpemodname_id = np.array(regmap['data'].argmax(axis=1)).flatten()
pd.Series(
    cpemodname_id
).value_counts().head(50).index

_id_label, _id_cnt = np.unique(cpemodname_id, return_counts=True)
_id_map = {}
cpemodnamemap = csr_matrix(regmap['data'][:, _id_label[_id_cnt>20]])
print(cpemodnamemap.shape)
for cid in _id_label[_id_cnt>20]:
    _id_map[cid] = len(_id_map)
cpemodname_id = np.array([_id_map.get(cid, len(_id_map)) for cid in cpemodname_id])
del _id_label, _id_cnt, _id_map
cpemodname_id.max()

(415317, 396)


396

## Загрузка мешка слов по cpe_type_cd

In [10]:
with gzip.open('files/cpe_type_cd_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

cpetype_id = np.array(regmap['data'].argmax(axis=1)).flatten()
cpetypemap = csr_matrix(regmap['data'])
print(cpetypemap.shape)
pd.Series(
    cpetype_id
).value_counts().head(50).index

(415317, 5)


Int64Index([0, 2, 1, 3], dtype='int64')

## Загрузка мешка слов по датам

In [11]:
with gzip.open('files/date_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

date_id = np.array(regmap['data'].argmax(axis=1)).flatten()
datemap = csr_matrix(regmap['data'])
print(datemap.shape)
pd.Series(
    date_id
).value_counts().tail(50)

_id_label, _id_cnt = np.unique(date_id, return_counts=True)
_id_map = {}
datemap = csr_matrix(regmap['data'][:, _id_label[_id_cnt>20]])
print(datemap.shape)
for cid in _id_label[_id_cnt>20]:
    _id_map[cid] = len(_id_map)
date_id = np.array([_id_map.get(cid, len(_id_map)) for cid in date_id])
del _id_label, _id_cnt, _id_map
date_id.max()

(415317, 397)
(415317, 203)


203

## Загрузка мешка слов по времени суток

In [12]:
with gzip.open('files/part_of_day_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

pod_id = np.array(regmap['data'].argmax(axis=1)).flatten()
podmap = csr_matrix(regmap['data'])
print(podmap.shape)
pd.Series(
    pod_id
).value_counts().tail(50)

(415317, 5)


1    217543
2    126023
0     64969
3      6782
dtype: int64

## Загрузка мешка слов по ценам

In [13]:
with gzip.open('files/price_cbag_v2.pickle.gz', 'rb') as f:
    regmap = pickle.load(f)

prices = np.array(pd.read_csv('files/price_mapper.tsv.gz', sep='\t').price.fillna(20_000).tolist()
                  + [20_000])
price_id = regmap['data']
price_id = (price_id.dot(prices)/np.array(price_id.sum(axis=1)).flatten())
pricemap_id = regmap['data'].dot(KBinsDiscretizer(n_bins=31,
                                                  strategy='kmeans',).fit_transform(prices[:, None]**0.5))

## Загрузка мешка слов по ссылкам

In [14]:
with gzip.open('files/url_host_cbag_v2.pickle.gz', 'rb') as f:
    datamap = pickle.load(f)
datamap

{'data': <415317x199684 sparse matrix of type '<class 'numpy.uint32'>'
 	with 32277669 stored elements in Compressed Sparse Row format>,
 'uids': array([     4,     16,     18, ..., 415276, 415288, 415293])}

## Загрузка признаков тайтлов ссылок

In [15]:
with gzip.open('auxilary/domain20k_title.pickle.gz', 'rb') as f:
    titlemap = pickle.load(f)

map_df = pd.read_csv('auxilary/url_host_mapper_v2.tsv.gz', sep='\t')

titlemap = \
datamap['data'][:, pd.DataFrame(
    dict(url_host=titlemap['domain'])
        ).merge(map_df).url_host_idx.values
               ].dot(CountVectorizer(ngram_range=(1,2),
                                     min_df=2,).\
                     fit_transform(titlemap['title']))

titlemap = csr_matrix((np.log2(1+titlemap.data), titlemap.nonzero()),
                      shape=titlemap.shape, dtype=np.float32)
titlemap

<415317x13118 sparse matrix of type '<class 'numpy.float32'>'
	with 82784390 stored elements in Compressed Sparse Row format>

## Загрузка эмбеддингов скриншотов

In [16]:
with gzip.open('auxilary/clipVIT_scores_20k.pickle.gz', 'rb') as f:
    clipmap = pickle.load(f)

clipmap = \
TfidfTransformer(sublinear_tf=True, norm=None).\
    fit_transform(
        datamap['data'][:, pd.DataFrame(
            dict(url_host=clipmap['domains'])
                ).merge(map_df).url_host_idx.values
                       ]
    ).\
    dot(
        np.array(clipmap['scores'])
    )

clipmap = Normalizer().fit_transform(clipmap)
clipmap = np.float32(clipmap)

clipmap.shape

(415317, 768)

## Загрузка признаков Doc2Vec

In [17]:
with gzip.open('doc2vec_feats_128x4.pickle.gz', 'rb') as f:
    doc2vec_feats = pickle.load(f)

## Загрузка признаков HTML

In [18]:
%%time

if not os.path.exists('html_feats.pickle.gz'):


    with gzip.open('auxilary/domain20k_html.txt.gz', 'rt',
                   encoding='utf-8') as f:
        html_data = f.read().split('\n=\n=\n')
        print(len(html_data))

    def tokenize(x):
        return x[0], re.findall('(?u)\\b\\w\\w+\\b', x[-1].lower())

    with Pool(20) as pool:
        pool_pbar = tqdm()
        html_tokens = [None for _ in range(len(html_data[:-1]))]
        for i, r in pool.imap(tokenize, enumerate(html_data[:-1])):
            html_tokens[i] = r
            pool_pbar.update(1)

    html_cbag = \
    CountVectorizer(ngram_range=(1,2),
                    min_df=300,
                    lowercase=False,
                    tokenizer=lambda x: x,
                    max_df=0.25).\
                   fit_transform(html_tokens)
    print(html_cbag.shape)

    html_tfidf = TfidfTransformer(sublinear_tf=True).fit_transform(html_cbag)

    from sklearn.decomposition import TruncatedSVD
    html_svd = TruncatedSVD(n_components=256,
                            random_state=10,
                            n_iter=3,).fit_transform(html_tfidf)
    print(html_svd.shape)

    with gzip.open('auxilary/domain20k_title.pickle.gz', 'rb') as f:
        htmlmap = pickle.load(f)
    htmlmap = \
    TfidfTransformer(sublinear_tf=True, norm=None).\
        fit_transform(
            datamap['data'][:, pd.DataFrame(
                dict(url_host=htmlmap['domain'])
                    ).merge(map_df).url_host_idx.values
                           ]
        ).\
        dot(
            html_svd
        )

    htmlmap = Normalizer().fit_transform(htmlmap)
    htmlmap = np.float32(htmlmap)

    print(htmlmap.shape)

    with gzip.open('auxilary/html_feats.pickle.gz', 'wb') as f:
        pickle.dump(htmlmap, f, protocol=-1)

with gzip.open('auxilary/html_feats.pickle.gz', 'rb') as f:
    htmlmap = pickle.load(f)

CPU times: user 2.96 s, sys: 616 ms, total: 3.58 s
Wall time: 3.79 s


## Загрузка признаков SimilarWeb

In [19]:
with gzip.open('auxilary/simweb_domain.pickle.gz', 'rb') as f:
    simweb_feats = pickle.load(f)

## Загрузка признаков Bigram ссылок

In [20]:
with gzip.open('auxilary/bigrams_dense.pickle.gz', 'rb') as f:
    bigrams_feats = pickle.load(f)

## Получение частотных ссылок мешка слов url_host

In [21]:
feats_mask = (np.array((datamap['data']>0).sum(axis=0)).flatten() > 40)
feats_mask.sum()

20144

## Считывание файла с таргетами

In [22]:
trg_df = pd.read_csv('target.tsv.gz', sep='\t')
trg_df.sample(10)

Unnamed: 0,user_id,age,is_male
216374,84565,32.0,1.0
273283,239794,,
331154,400472,,
2211,22680,36.0,0.0
369397,369554,22.0,0.0
45304,36231,35.0,1.0
347938,153309,,
247442,395326,34.0,1.0
49156,75103,,
184430,179214,,


# Подготовка таргетов и поднабора юзеров из обучения

In [23]:
key = 'data'

(trg_df.age.isna()|trg_df.is_male.isna()).sum(),\

all_mask = (~trg_df.age.isna()|~trg_df.is_male.isna()).values.copy()
trg_train = trg_df[all_mask].fillna({'is_male': 0.5, 'age':34})
trg_age = trg_train.age.values.copy()
trg_sex = trg_train.is_male.values.copy()

X_tr = datamap[key][all_mask][:, feats_mask]

age_bins = [[0, 25], [26, 35], [36, 45], [46, 55], [56, 65], [66, 999]]

print('Train sample:', all_mask.sum())

y_all = 0
for k, age_bin in enumerate(age_bins):
    y = pd.Series(trg_age).between(*age_bin).values.copy()
    y_all += y*(k+1)
y_all.min(), y_all.max(), X_tr.shape

Train sample: 270000


(1, 6, (270000, 20144))

# Объединение мешков слов второстепенных доменов в один

In [24]:
cbag_all = hstack([regionmap,
                   citmap,
                   cpemanmap,
                   cpemodnamemap,
                   cpetypemap,
                   podmap,
                   datemap,
                   pricemap_id
                  ])
cbag_all = QuantileTransformer(n_quantiles=10).fit_transform(cbag_all)
cbag_all = csr_matrix(cbag_all)
cbag_all_train = cbag_all[all_mask]
cbag_all_train.shape

(270000, 1407)

# Переобозначения обучающих поднаборов признаков ради удобства

In [25]:
doc2vec_feats_train = doc2vec_feats[all_mask].copy()
print(doc2vec_feats_train.shape)
clipmap_train = clipmap[all_mask].copy()
print(clipmap_train.shape)
titlemap = csr_matrix(titlemap, dtype=np.float32)
titlemap_train = titlemap[all_mask]
print(titlemap_train.shape)
htmlmap_train = htmlmap[all_mask].copy()
print(htmlmap_train.shape)
simweb_feats_train = simweb_feats[all_mask].copy()
print(simweb_feats_train.shape)
bigrams_feats_train = bigrams_feats[all_mask].copy()
print(bigrams_feats_train.shape)

(270000, 512)
(270000, 768)
(270000, 13118)
(270000, 256)
(270000, 199)
(270000, 512)


# Подготовка мешка слов ссылок

In [26]:
all_data = datamap[key][:, feats_mask]
all_data_sqrt = csr_matrix((all_data.data**0.5, all_data.nonzero()),
                     shape=all_data.shape,
                     dtype=np.float32)
del all_data

# Настройка валидации

In [27]:
kfold = StratifiedKFold(n_splits=10,
                        shuffle=True,
                        random_state=42)
folds = [(train_ind, test_ind) for train_ind, test_ind in
         kfold.split((np.uint8(trg_sex*2)+y_all*10).astype(str),
                     (np.uint8(trg_sex*2)+y_all*10).astype(str))]

## Функция с архитектурой модели (последняя версия)

In [30]:
def get_model_all(emb_size=8, dense_size=512, l1_reg=1e-7,
                  base_lr=2e-4, sex_weight=1, age_weight=1,
                  **kwargs):
    l1reg = tf.keras.regularizers.l1(l1_reg)

    # urls
    inp = tf.keras.layers.Input((X_tr.shape[1],), sparse=False)
    x = inp
    x = tf.keras.layers.Dense(dense_size, activation='relu',
                              use_bias=False,
                              kernel_regularizer=l1reg)(x)

    use_feats = kwargs['use_feats']
    activation = kwargs.get('use_emb_act', 'linear')

    # region id
    inp2 = tf.keras.layers.Input((1,), sparse=False)
    x2 = tf.keras.layers.Embedding(reg_id.max()+1,
                                   kwargs.get('e_region', emb_size),
                                   embeddings_regularizer=l1reg)(inp2)
    x2 = tf.keras.layers.Flatten()(x2)
    x2 = tf.keras.layers.Activation(activation)(x2)

    # city id
    inp3 = tf.keras.layers.Input((1,), sparse=False)
    x3 = tf.keras.layers.Embedding(city_id.max()+1,
                                   kwargs.get('e_city', emb_size),
                                   embeddings_regularizer=l1reg)(inp3)
    x3 = tf.keras.layers.Flatten()(x3)
    x3 = tf.keras.layers.Activation(activation)(x3)

    # cpeman id
    inp4 = tf.keras.layers.Input((1,), sparse=False)
    x4 = tf.keras.layers.Embedding(cpeman_id.max()+1,
                                   kwargs.get('e_cpeman', emb_size),
                                   embeddings_regularizer=l1reg)(inp4)
    x4 = tf.keras.layers.Flatten()(x4)
    x4 = tf.keras.layers.Activation(activation)(x4)

    # cpemodname id
    inp5 = tf.keras.layers.Input((1,), sparse=False)
    x5 = tf.keras.layers.Embedding(cpemodname_id.max()+1,
                                   kwargs.get('e_cpemodname', emb_size),
                                   embeddings_regularizer=l1reg)(inp5)
    x5 = tf.keras.layers.Flatten()(x5)
    x5 = tf.keras.layers.Activation(activation)(x5)

    # cpetype_id id
    inp6 = tf.keras.layers.Input((1,), sparse=False)
    x6 = tf.keras.layers.Embedding(cpetype_id.max()+1,
                                   kwargs.get('e_cpetype', emb_size),
                                   embeddings_regularizer=l1reg)(inp6)
    x6 = tf.keras.layers.Flatten()(x6)
    x6 = tf.keras.layers.Activation(activation)(x6)

    # price id
    inp7 = tf.keras.layers.Input((1,), sparse=False)
    x7 = tf.keras.layers.Dense(kwargs.get('e_price', emb_size),
                               activation='tanh',
                               kernel_regularizer=l1reg)(inp7)

    # cbagmap
    inp8 = tf.keras.layers.Input((cbag_all.shape[-1],), sparse=False)
    x8 = tf.keras.layers.Dense(kwargs.get('e_cbag', emb_size*2),
                               activation='relu',
                               kernel_regularizer=l1reg)(inp8)

    # date_id id
    inp9 = tf.keras.layers.Input((1,), sparse=False)
    x9 = tf.keras.layers.Embedding(date_id.max()+1,
                                   kwargs.get('e_date', emb_size),
                                   embeddings_regularizer=l1reg)(inp9)
    x9 = tf.keras.layers.Flatten()(x9)
    x9 = tf.keras.layers.Activation(activation)(x9)

    # pod_id id
    inp10 = tf.keras.layers.Input((1,), sparse=False)
    x10 = tf.keras.layers.Embedding(pod_id.max()+1,
                                    kwargs.get('e_pod', emb_size),
                                    embeddings_regularizer=l1reg)(inp10)
    x10 = tf.keras.layers.Flatten()(x10)
    x10 = tf.keras.layers.Activation(activation)(x10)

    # d2v
    inp13 = tf.keras.layers.Input((doc2vec_feats.shape[-1],), sparse=False)
    if kwargs.get('e_doc', emb_size) == -1:
        x13 = inp13
    else:
        x13 = tf.keras.layers.Dense(kwargs.get('e_doc', emb_size),
                                    activation='relu',
                                    kernel_regularizer=l1reg)(inp13)

    # clip
    inp14 = tf.keras.layers.Input((clipmap_train.shape[-1],), sparse=False)
    if kwargs.get('e_clip', emb_size) == -1:
        x14 = inp14
    else:
        x14 = tf.keras.layers.Dense(kwargs.get('e_clip', emb_size),
                                    activation='relu',
                                    kernel_regularizer=l1reg)(inp14)


    # titles
    inp15 = tf.keras.layers.Input((titlemap_train.shape[-1],), sparse=False)
    if kwargs.get('e_title', emb_size*2) == -1:
        x15 = inp15
    else:
        x15 = tf.keras.layers.Dense(kwargs.get('e_title', emb_size*2),
                                    activation='relu',
                                    use_bias=False,
                                    kernel_regularizer=l1reg)(inp15)

    # html svd
    inp16 = tf.keras.layers.Input((htmlmap_train.shape[-1],), sparse=False)
    if kwargs.get('e_html', emb_size) == -1:
        x16 = inp16
    else:
        x16 = tf.keras.layers.Dense(kwargs.get('e_html', emb_size),
                                    activation='relu',
                                    kernel_regularizer=l1reg)(inp16)

    # simweb
    inp17 = tf.keras.layers.Input((simweb_feats.shape[-1],), sparse=False)
    if kwargs.get('e_simweb', emb_size) == -1:
        x17 = inp17
    else:
        x17 = tf.keras.layers.Dense(kwargs.get('e_simweb', emb_size),
                                    activation='relu',
                                    kernel_regularizer=l1reg)(inp17)

    # bigrams
    inp18 = tf.keras.layers.Input((bigrams_feats.shape[-1],), sparse=False)
    if kwargs.get('e_bigram', emb_size) == -1:
        x18 = inp18
    else:
        x18 = tf.keras.layers.Dense(kwargs.get('e_bigram', emb_size),
                                    activation='relu',
                                    kernel_regularizer=l1reg)(inp18)

    x_extra = [
                 x2,
                 x3,
                 x4,
                 x5,
                 x6,
                 x7,
                 x8,
                 x9,
                 x10,
                 x13,
                 x14,
                 x15,
                 x16,
                 x17,
                 x18,
            ]

    x_extra = [xx for xx, remain in zip(x_extra, use_feats) if remain]

    x_sex0 = tf.keras.layers.concatenate([x] + x_extra)
    if kwargs.get('pre_bn', False):
        x_sex0 = tf.keras.layers.BatchNormalization(
            epsilon=1e-5, momentum=0.1)(x_sex0)
    if kwargs.get('pre_dropout', False):
        x_sex0 = tf.keras.layers.Dropout(0.1)(x_sex0)
    x_age0 = x_sex0


    parallel_age = []

    nn_act = kwargs.get('nn_act', 'relu')
    for _ in range(1):
        prev_x_age = [x_age0]
        x_age = x_age0
        for _ in range(kwargs.get('dense_con_num', 2)):
            x2 = tf.keras.layers.Dense(x_age.shape[-1], activation=nn_act,
                                       use_bias=True,
                                       kernel_regularizer=l1reg)(x_age)
            # dense connections
            prev_x_age.append(x2)
            x_age = tf.keras.layers.add(prev_x_age)
            if kwargs.get('bn', False):
                x_age = tf.keras.layers.BatchNormalization(
                    epsilon=1e-5, momentum=0.1)(x_age)
            if kwargs.get('dropout', False)>0:
                x_age = tf.keras.layers.Dropout(kwargs.get('dropout'))(x_age)
        parallel_age.append(x_age)

    if kwargs.get('age_extra_dim', False):
        x_age = tf.keras.layers.concatenate([
            tf.keras.layers.Dense(kwargs.get('age_extra_dim'),
                                  activation=nn_act,
                                  use_bias=True,
                                  kernel_regularizer=l1reg)(x_age0)
        ] + parallel_age)
    else:
        x_age = parallel_age[0]

    if kwargs.get('sex_extra_dim', False):
        x_sex = tf.keras.layers.concatenate([
            tf.keras.layers.Dense(kwargs.get('sex_extra_dim'),
                                  activation=nn_act,
                                  use_bias=True,
                                  kernel_regularizer=l1reg)(x_sex0)
        ] + parallel_age)
    else:
        x_sex = parallel_age[0]

    out1 = tf.keras.layers.Dense(1, activation='sigmoid', use_bias=True, name='sex',
                                 kernel_regularizer=tf.keras.regularizers.l1(l1_reg))(x_sex)

    out2 = tf.keras.layers.Dense(6, activation='softmax', use_bias=True, name='age',
                            kernel_regularizer=tf.keras.regularizers.l1(l1_reg))(x_age)

    inps_extra = [
        inp2,
        inp3,
        inp4,
        inp5,
        inp6,
        inp7,
        inp8,
        inp9,
        inp10,
        inp13,
        inp14,
        inp15,
        inp16,
        inp17,
        inp18
    ]
    inps_extra = [xx for xx, remain in zip(inps_extra, use_feats) if remain]

    model = tf.keras.models.Model([inp] + inps_extra, [out1, out2])

    max_weight = max(sex_weight, age_weight)
    model.compile(loss={'sex':'binary_crossentropy',
                        'age':'categorical_crossentropy'},
                  loss_weights={'sex':sex_weight/max_weight,
                                'age':age_weight/max_weight},
                  optimizer=tf.keras.optimizers.Adam(learning_rate=base_lr,
                                                     clipvalue=kwargs.get('clipvalue', 2.)),
                 )
    return model


def get_scheduler(base_lr=2e-4, factor=1., offset=0.5):
    def scheduler(epoch, lr):
        return base_lr*10**(-epoch*factor+offset)
    return scheduler

## Генератор фолдов кроссвалидации

In [31]:
X_train = csr_matrix((X_tr.data**0.5, X_tr.nonzero()),
                    shape=X_tr.shape,
                    dtype=np.float32)

y_ohe_age = np.zeros((y_all.size, y_all.max()))
y_ohe_age[np.arange(y_all.size), y_all-1] = 1.

def generate_folds(folds):
    for k, (train_ind, test_ind) in enumerate(tqdm(folds)):
        train_dat = X_train[train_ind]
        train_y_sex = trg_sex[train_ind]
        train_y_age = y_ohe_age[train_ind]
        val_dat = X_train[test_ind]
        val_y_sex = trg_sex[test_ind]
        val_y_age = y_ohe_age[test_ind]

        train_aux_dat = [train_dat,
                         reg_id[all_mask][train_ind, None],
                         city_id[all_mask][train_ind, None],
                         cpeman_id[all_mask][train_ind, None],
                         cpemodname_id[all_mask][train_ind, None],
                         cpetype_id[all_mask][train_ind, None],
                         price_id[all_mask][train_ind, None]**0.5,
                         cbag_all_train[train_ind],
                         date_id[all_mask][train_ind, None],
                         pod_id[all_mask][train_ind, None],
                         doc2vec_feats_train[train_ind],
                         clipmap_train[train_ind],
                         titlemap_train[train_ind],
                         htmlmap_train[train_ind],
                         simweb_feats_train[train_ind],
                         bigrams_feats_train[train_ind],
                        ]

        val_aux_dat = [val_dat,
                       reg_id[all_mask][test_ind, None],
                       city_id[all_mask][test_ind, None],
                       cpeman_id[all_mask][test_ind, None],
                       cpemodname_id[all_mask][test_ind, None],
                       cpetype_id[all_mask][test_ind, None],
                       price_id[all_mask][test_ind, None]**0.5,
                       cbag_all_train[test_ind],
                       date_id[all_mask][test_ind, None],
                       pod_id[all_mask][test_ind, None],
                       doc2vec_feats_train[test_ind],
                       clipmap_train[test_ind],
                       titlemap_train[test_ind],
                       htmlmap_train[test_ind],
                       simweb_feats_train[test_ind],
                       bigrams_feats_train[test_ind],
                      ]


        '''
        model_lr = \
        Pipeline([('tfidf', TfidfTransformer(sublinear_tf=True, norm='l2')),
                  ('model', LogisticRegression(C=1.5,
                                               penalty='l1',
                                               solver='liblinear',
                                               #class_weight='balanced',
                                               max_iter=5,
                                               dual=False)
                    )])
        scores_lr = cross_val_predict(model_lr,
                                      train_dat,
                                      train_y_age.argmax(axis=1),
                                      cv=5,
                                      method='predict_proba',
                                      n_jobs=5)

        alpha = 0.1
        train_y_age = train_y_age*(1-alpha)+scores_lr*alpha
        '''

        with gzip.open('oof_scores_260223/42/%d.pickle.gz'%(k+1), 'rb') as f:
            oof_scores = pickle.load(f)

        yield [[train_aux_dat, [train_y_sex, train_y_age,
                                oof_scores['sex'], oof_scores['age']]],
               [val_aux_dat, [val_y_sex, val_y_age]]]

## Мапка признаков для удобства

In [32]:
feat2idx_map = dict(e_region=0,
                    e_city=1,
                    e_cpeman=2,
                    e_cpemodname=3,
                    e_cpetype=4,
                    e_price=5,
                    e_date=6,
                    e_pod=7,
                    e_cbag=8,
                    e_doc=9,
                    e_clip=10,
                    e_title=11,
                    e_html=12,
                    e_simweb=13,
                    e_bigram=14)

## Функция для получения скоров ансамбля моделей

In [33]:
def change_state(model, mode='train'):
    for i, l in enumerate(model.layers):
        model.layers[i].trainable = (mode=='train')
    return model


def get_score(args):
    # delete useless values
    for k, v in args.copy().items():
        if v is None:
            del args[k]
            continue
        if k.startswith('e_') or 'extra' in k:
            args[k] = int(args[k])

    use_feats = [1 for _ in range(len(args.get('use_feats', feat2idx_map)))]
    for k, v in feat2idx_map.items():
        if v >= len(use_feats): break
        use_feats[v] = int(args.get(k, 100) != 1)

    args['use_feats'] = tuple(use_feats)

    epochs = args['epochs'] = int(args['epochs']) #4
    batch_size = args['batch_size'] = int(args['batch_size']) #128+256
    steps_subsample = args['steps_subsample'] = float('%.3f'%args['steps_subsample']) #0.95
    base_lr = args['base_lr'] = args['base_lr'] #2e-4
    factor = args['factor'] = float('%.3f'%args['factor']) #1.
    offset = args['offset'] = float('%.3f'%args['offset']) #0.5
    emb_size = args['emb_size'] = int(args['emb_size']) # 64
    dense_size = args['dense_size'] = int(args['dense_size']) #1024
    sex_weight = args['sex_weight'] = float('%.3f'%args['sex_weight'])
    age_weight = args['age_weight'] = float('%.3f'%args['age_weight'])
    sex_alpha = args['sex_alpha']
    age_alpha = args['age_alpha']
    scheduler = get_scheduler(base_lr, factor, offset)

    AUX_DAT = [
             all_data_sqrt,
             reg_id[:, None],
             city_id[:, None],
             cpeman_id[:, None],
             cpemodname_id[:, None],
             cpetype_id[:, None],
             price_id[:, None]**0.5,
             cbag_all,
             date_id[:, None],
             pod_id[:, None],
             doc2vec_feats,
             clipmap,
             titlemap,
             htmlmap,
             simweb_feats,
             bigrams_feats,
    ]
    AUX_DAT = [xx for xx, remain in zip(AUX_DAT, [1]+use_feats) if remain]

    print(args)

    model_weights = []
    preds_all = []

    scores = []
    for cache, (train_ind, test_ind) in zip(generate_folds(folds), tqdm(folds)):
        [train_aux_dat, train_y],\
        [val_aux_dat, val_y] = cache
        train_y_sex, train_y_age, oof_sex, oof_age = train_y
        val_y_sex, val_y_age = val_y

        train_aux_dat = [xx for xx, remain in zip(train_aux_dat, [1]+use_feats) if remain]
        val_aux_dat = [xx for xx, remain in zip(val_aux_dat, [1]+use_feats) if remain]

        for _ in range(3):
            model_nn = get_model_all(**args)

            st_time = time.time()

            weights_backup = model_nn.get_weights()

            model_nn.fit(train_aux_dat,
                      [(oof_sex*sex_alpha + train_y_sex*(1 - sex_alpha)),
                       (oof_age*age_alpha + train_y_age*(1 - age_alpha))],
                      batch_size=batch_size,
                      steps_per_epoch=int(steps_subsample*train_ind.size/batch_size),
                      epochs=epochs,
                      callbacks=[tf.keras.callbacks.LearningRateScheduler(scheduler)],
                      verbose=True)

            if args.get('pseudo', False):
                # PSEUDO-labelling
                model_nn = change_state(model_nn, 'test')
                AUX_PREDS = model_nn.predict(AUX_DAT, batch_size=1024)
                model_nn = change_state(model_nn, 'train')
                model_nn.set_weights(weights_backup)
                del weights_backup

                tf.keras.backend.set_value(model_nn.optimizer.lr, args.get('pretrain_lr', base_lr))
                model_nn.fit(AUX_DAT, AUX_PREDS,
                          batch_size=batch_size,
                          epochs=1,
                          verbose=False)

                tf.keras.backend.set_value(model_nn.optimizer.lr, base_lr)
                model_nn.fit(train_aux_dat,
                             [oof_sex*sex_alpha + train_y_sex*(1 - sex_alpha),
                              oof_age*age_alpha + train_y_age*(1 - age_alpha)],
                          batch_size=batch_size,
                          steps_per_epoch=int(steps_subsample*train_ind.size/batch_size),
                          epochs=epochs,
                          callbacks=[tf.keras.callbacks.LearningRateScheduler(scheduler)],
                          verbose=False)

            model_nn = change_state(model_nn, 'test')
            preds = \
            model_nn.predict(val_aux_dat, batch_size=1024)

            preds_all.append(preds)

            model_weights.append(model_nn.get_weights())

            end_time = time.time()
            spent_seconds = end_time - st_time

            f1 = \
            f1_score(val_y_age.argmax(axis=1),
                     preds[-1].argmax(axis=1), average='weighted')*100
            rocauc = \
            roc_auc_score(val_y_sex[val_y_sex!=0.5],
                          preds[0].flatten()[val_y_sex!=0.5])*100

            score = f1*2 + (rocauc-50)*2
            scores.append((score, f1, rocauc, spent_seconds))

            print(*scores[-1], sep='\t')
    return model_weights, preds_all, scores


## Пример набора конфигураций архитектур моделей

In [37]:
# 2023-03-29
model_argses = [
    {'age_alpha': 0.7000000000000001,
    'age_extra_dim': 32,
    'age_weight': 4.2,
    'base_lr': 0.0017782794100389228,
    'batch_size': 256,
    'bn': True,
    'clipvalue': 1.0,
    'dense_size': 608,
    'dropout': 0.05,
    'e_bigram': 107,
    'e_cbag': 181,
    'e_city': 90,
    'e_cpeman': 38,
    'e_cpemodname': 1,
    'e_cpetype': 90,
    'e_doc': -1,
    'e_html': -1,
    'e_pod': 64,
    'e_region': 90,
    'emb_size': 76,
    'epochs': 4,
    'factor': 1.0,
    'l1_reg': 3.162277660168379e-07,
    'nn_act': 'elu',
    'offset': 0.2,
    'pre_dropout': True,
    'pretrain_lr': 0.0001,
    'pseudo': False,
    'sex_alpha': 0.55,
    'sex_weight': 2.2,
    'steps_subsample': 1.0,
    'use_emb_act': 'tanh',
    'use_feats': (1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)},
    {'age_alpha': 0.7000000000000001,
    'age_extra_dim': 32,
    'age_weight': 3.9,
    'base_lr': 0.0031622776601683794,
    'batch_size': 256,
    'bn': True,
    'clipvalue': 1.0,
    'dense_size': 861,
    'dropout': 0.05,
    'e_bigram': 128,
    'e_cbag': 304,
    'e_city': 152,
    'e_cpeman': 19,
    'e_cpemodname': 1,
    'e_cpetype': 32,
    'e_doc': -1,
    'e_html': -1,
    'e_pod': 53,
    'e_region': 53,
    'emb_size': 64,
    'epochs': 4,
    'factor': 1.1,
    'l1_reg': 1e-07,
    'nn_act': 'elu',
    'offset': 0.0,
    'pre_dropout': True,
    'pretrain_lr': 0.00031622776601683794,
    'pseudo': False,
    'sex_alpha': 0.75,
    'sex_weight': 2.2,
    'steps_subsample': 1.0,
    'use_emb_act': 'tanh',
    'use_feats': (1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)}
]

## Функция для подбора оптимального смещения

In [39]:
def f1_weighted(x, *args):
    x, s = x[:len(age_bins)], scipy_s
    x = np.maximum(x, scipy_eps)
    x /= x.sum()
    return -f1_score(y_all-1,
                     (preds_age_logits+x*s).argmax(axis=1),
                     average='weighted')

## Получение предсказаний от ансамбля нейронок

In [None]:
# generate preds
savefile = 'preds_290323_age_p%d.pickle.gz'
K = 1
np.random.choice(42)
order = np.random.permutation(len(model_argses))
while True:
    if os.path.exists(savefile%K):
        K += 1
        continue

    model_args = model_argses[order[(K-1)%len(model_argses)]]
    model_args['pseudo'] = K % 3 == 0
    print(model_args['pseudo'])

    model_weights, preds_all, scores = \
        get_score(model_args)

    preds_age = np.zeros_like(y_all)
    preds_age_logits = np.zeros((y_all.shape[0], np.unique(y_all).size))
    preds_sex = np.zeros_like(trg_sex)
    for i, (train_ind, test_ind) in enumerate(tqdm(folds)):
        for k in range(len(model_weights)//len(folds)):
            offset = len(model_weights)//len(folds)*i + k
            preds_age_logits[test_ind] += preds_all[offset][-1]
            preds_sex[test_ind] += (
                preds_all[offset][0].flatten().argsort().argsort() / preds_all[offset][0].size
            )
        preds_age[test_ind] = preds_age_logits[test_ind].argmax(axis=1)
    preds_age_logits /= (len(model_weights)//len(folds))
    assert preds_age_logits.max() <= 1

    print(
        f1_score(y_all-1,
                 preds_age,
                 average='weighted'),\
        roc_auc_score(trg_sex[trg_sex!=0.5],
                      preds_sex[trg_sex!=0.5])
        )

    bscore = f1_score(y_all-1, preds_age_logits.argmax(axis=1),
                  average='weighted')
    random_biases = np.random.dirichlet(np.ones(len(age_bins))/len(age_bins),
                                        size=100)*\
                    np.random.random(size=(100, 1))*\
                    0.5
    for random_bias in tqdm(random_biases):
        score = \
        f1_score(y_all-1,
                 (preds_age_logits+random_bias).argmax(axis=1),
                 average='weighted')
        if score > bscore:
            bscore = score
            bias = random_bias
            print(bscore, bias)

    # cyborg-ml подобрал это значение, потом уже не менял
    scipy_s = 0.3
    scipy_eps = 1e-10
    scipy_res = None
    for _ in tqdm(range(5)):
        scipy_res = minimize(f1_weighted,
                             np.ones(len(age_bins)) if scipy_res is None else scipy_res.x,
                             method='COBYLA')
        scipy_bias = np.maximum(scipy_res.x, scipy_eps)/\
                     np.maximum(scipy_res.x, scipy_eps).sum()*\
                     scipy_s
        print(abs(scipy_res.fun), scipy_bias)

    all_preds_age = 0
    all_preds_sex = 0

    use_feats = [1] + list(model_args['use_feats'])
    all_feats = [all_data_sqrt,
                 reg_id[:, None],
                 city_id[:, None],
                 cpeman_id[:, None],
                 cpemodname_id[:, None],
                 cpetype_id[:, None],
                 price_id[:, None]**0.5,
                 cbag_all,
                 date_id[:, None],
                 pod_id[:, None],
                 doc2vec_feats,
                 clipmap,
                 titlemap,
                 htmlmap,
                 simweb_feats,
                 bigrams_feats,
                ]
    all_feats = [xx for xx, remain in zip(all_feats, use_feats) if remain]
    model_nn = get_model_all(**model_args)
    model_nn = change_state(model_nn, 'test')

    for ws in tqdm(model_weights):
        model_nn.set_weights(ws)
        preds = \
            model_nn.predict(all_feats, batch_size=2048)

        all_preds_sex += preds[0].flatten().argsort().argsort()/all_data_sqrt.shape[0]
        all_preds_age += preds[-1]
        del preds
    all_preds_sex /= len(model_weights)
    all_preds_age /= len(model_weights)

    all_preds_age += scipy_bias
    all_preds_age_label = all_preds_age.argmax(axis=1) + 1

    all_preds_sex[all_mask] = preds_sex
    all_preds_age[all_mask] = preds_age_logits + scipy_bias

    with gzip.open(savefile%K, 'wb') as f:
        pickle.dump(dict(sex=all_preds_sex,
                         age=all_preds_age,
                         age_bias=scipy_bias,
                         cv_stats=scores,
                         model_params=model_args),
                    f, protocol=-1)

    K += 1

    clear_output()
    print(K)

True
{'age_alpha': 0.2, 'age_extra_dim': 256, 'age_weight': 2.6, 'base_lr': 5.623413251903491e-05, 'batch_size': 192, 'bn': True, 'clipvalue': 4.0, 'dense_con_num': 3, 'dense_size': 1024, 'e_cbag': 32, 'e_city': 1, 'e_clip': 1, 'e_cpemodname': 9, 'e_cpetype': 1, 'e_pod': 1, 'e_price': 1, 'e_simweb': -1, 'e_title': 1, 'emb_size': 13, 'epochs': 4, 'factor': 1.4, 'l1_reg': 3.162277660168379e-07, 'nn_act': 'elu', 'offset': 1.0, 'pretrain_lr': 0.0005623413251903491, 'pseudo': True, 'sex_alpha': 0.35000000000000003, 'sex_weight': 2.1, 'steps_subsample': 0.95, 'use_feats': (1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1)}


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/4


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/4
Epoch 3/4
Epoch 4/4


  0%|          | 0/9 [00:00<?, ?it/s]

173.97472761871808	48.39572427590716	88.59163953345188	350.71255445480347
Epoch 1/4


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/4
Epoch 3/4
Epoch 4/4


  0%|          | 0/9 [00:00<?, ?it/s]

174.1719040565693	48.47452400504317	88.61142802324149	390.9869201183319
Epoch 1/4


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/4
Epoch 3/4
Epoch 4/4


  0%|          | 0/9 [00:00<?, ?it/s]

174.09780585947487	48.50420629196513	88.5446966377723	407.76906752586365
Epoch 1/4


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/4
Epoch 3/4
Epoch 4/4


  0%|          | 0/9 [00:00<?, ?it/s]

174.1971953611388	48.235646273810076	88.86295140675932	403.336975812912
Epoch 1/4


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/4
Epoch 3/4
Epoch 4/4


  0%|          | 0/9 [00:00<?, ?it/s]

174.23362795552833	48.230154464893396	88.88665951287076	399.49020886421204
Epoch 1/4


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/4
Epoch 3/4
Epoch 4/4


  0%|          | 0/9 [00:00<?, ?it/s]

173.43053957288572	47.89506559882889	88.82020418761397	394.3228905200958
Epoch 1/4


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/4
Epoch 3/4
Epoch 4/4


  0%|          | 0/9 [00:00<?, ?it/s]

174.00067615636695	48.19739728479435	88.80294079338913	381.6654200553894
Epoch 1/4


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/4
Epoch 3/4
Epoch 4/4


  0%|          | 0/9 [00:00<?, ?it/s]

173.74316629959458	48.02121147071683	88.85037167908045	356.58821058273315
Epoch 1/4


  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


Epoch 2/4
Epoch 3/4
Epoch 4/4

Аналогично формируются файлики скоров для пола