## Ограничения на ресурсы для numpy, импорт библиотек

In [1]:
import os
os.environ["OMP_NUM_THREADS"] = "4" # export OMP_NUM_THREADS=4
os.environ["OPENBLAS_NUM_THREADS"] = "4" # export OPENBLAS_NUM_THREADS=4 
os.environ["MKL_NUM_THREADS"] = "6" # export MKL_NUM_THREADS=6
os.environ["VECLIB_MAXIMUM_THREADS"] = "4" # export VECLIB_MAXIMUM_THREADS=4
os.environ["NUMEXPR_NUM_THREADS"] = "6" # export NUMEXPR_NUM_THREADS=6

In [19]:
import pandas as pd
import numpy as np
import os
import time
from tqdm.auto import tqdm
from functools import reduce

import scipy
from scipy.sparse import csr_matrix, hstack, vstack
from scipy.optimize import minimize
import gzip
import pickle

import sklearn
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import QuantileTransformer, Normalizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import cross_val_predict

import matplotlib.pyplot as plt
%matplotlib inline

## Загрузка мешка слов по ссылкам

In [3]:
with gzip.open('files/url_host_cbag_v2.pickle.gz', 'rb') as f:
    datamap = pickle.load(f)
datamap

{'data': <415317x199684 sparse matrix of type '<class 'numpy.uint32'>'
 	with 32277669 stored elements in Compressed Sparse Row format>,
 'uids': array([     4,     16,     18, ..., 415276, 415288, 415293])}

## Получение частотных ссылок мешка слов url_host

In [4]:
feats_mask = (np.array((datamap['data']>0).sum(axis=0)).flatten() > 40)
feats_mask.sum()

20144

## Считывание файла с таргетами

In [5]:
trg_df = pd.read_csv('target.tsv.gz', sep='\t')
trg_df.sample(10)

Unnamed: 0,user_id,age,is_male
175229,88118,39.0,0.0
240299,324114,44.0,0.0
308114,171028,26.0,1.0
369355,369120,50.0,0.0
307013,159777,52.0,1.0
13318,132129,37.0,0.0
285557,361222,36.0,0.0
125185,5367,,
150019,253386,33.0,0.0
85200,20466,,


# Подготовка таргетов и поднабора юзеров из обучения

In [6]:
key = 'data'

(trg_df.age.isna()|trg_df.is_male.isna()).sum(),\

all_mask = (~trg_df.age.isna()|~trg_df.is_male.isna()).values.copy()
trg_train = trg_df[all_mask].fillna({'is_male': 0.5, 'age':34})
trg_age = trg_train.age.values.copy()
trg_sex = trg_train.is_male.values.copy()


age_bins = [[0, 25], [26, 35], [36, 45], [46, 55], [56, 65], [66, 999]]

print('Train sample:', all_mask.sum())

y_all = 0
for k, age_bin in enumerate(age_bins):
    y = pd.Series(trg_age).between(*age_bin).values.copy()
    y_all += y*(k+1)

X_tr = datamap[key][all_mask][:, feats_mask]

y_all.min(), y_all.max(), X_tr.shape

Train sample: 270000


(1, 6, (270000, 20144))

# Настройка валидации

In [7]:
RANDOM_SEED = 42
kfold = StratifiedKFold(n_splits=10,
                        shuffle=True,
                        random_state=RANDOM_SEED)
folds = [(train_ind, test_ind) for train_ind, test_ind in
         kfold.split((np.uint8(trg_sex*2)+y_all*10).astype(str),
                     (np.uint8(trg_sex*2)+y_all*10).astype(str))]

# Загрузка скоров

In [8]:
all_preds_sex = []
all_preds_age = []
model_names = []
for k_start, k_end, file in [(1, 4, 'preds_290323_age_p'),
                             (1, 6, 'preds_290323_sex_p'),
                             (1, 5, 'preds_280323_sex_p'),
                             (1, 6, 'preds_280323_age_p'),
                             (1, 7, 'preds_270323_sex_p'),
                             (1, 10, 'preds_270323_age_p'),
                             (1, 21, 'preds_230323_age_p'),
                             (1, 16, 'preds_230323_sex_p'),
                             (1, 18, 'preds_120323_sex_p'),
                             (1, 14, 'preds_120323_age_p'),
                             (1, 14, 'preds_100323_sex_p'),
                             (1, 12, 'preds_100323_age_p'),
                             (1, 9, 'preds_070323_age_p'),
                             (1, 9, 'preds_070323_sex_p'),
                            ]:
    tmp_preds_sex = []
    tmp_preds_age = []
    for k in tqdm(range(k_start, k_end)):
        filename = '%s%d.pickle.gz'%(file, k)
        with gzip.open(filename, 'rb') as f:
            predsmap = pickle.load(f)
            tmp_preds_age.append(predsmap['age'])
            tmp_preds_sex.append(predsmap['sex'])
            model_names.append(filename)
    all_preds_sex.extend(tmp_preds_sex)
    all_preds_age.extend(tmp_preds_age)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

## Лучшая модель по полу

In [9]:
roc_aucs = []
for probs in tqdm(all_preds_sex):
    roc_aucs.append(
        roc_auc_score(trg_sex[trg_sex!=0.5],
                      probs[all_mask][trg_sex!=0.5])
    )
model_names[np.argmax(roc_aucs)], np.max(roc_aucs)

  0%|          | 0/137 [00:00<?, ?it/s]

('../preds_120323_sex_p8.pickle.gz', 0.8909671840463607)

## Лучшая модель по возрасту

In [10]:
f1s = []
for probs in tqdm(all_preds_age):
    f1s.append(
        f1_score(y_all-1,
             probs.argmax(axis=1)[all_mask],
             average='weighted'
        )
    )
model_names[np.argmax(f1s)], np.max(f1s)

  0%|          | 0/137 [00:00<?, ?it/s]

('../preds_120323_age_p1.pickle.gz', 0.4909293614502094)

## Лучший скор 1 модели

In [11]:
np.max(f1s)*2 + np.max(roc_aucs)*2 - 1

1.7637930909931399

## Stacking Logreg для пола 

In [13]:
lr = \
LogisticRegression(C=1e-2, penalty='l2',
                   solver='liblinear', max_iter=10)

probs = \
cross_val_predict(lr,
                  np.array(all_preds_sex
                                 ).T.reshape(-1,
                                             len(all_preds_sex)
                                            )[all_mask],
                  trg_sex.astype(int),
                  cv=folds,
                  verbose=2,
                  n_jobs=len(folds),
                  method='predict_proba')[:, 1]

lr.fit(np.array(all_preds_sex
             ).T.reshape(-1,
                         len(all_preds_sex)
                        )[all_mask],
       trg_sex.astype(int))

cv_scores = []
for _, test_ind in folds:
    probs[test_ind] = probs[test_ind].argsort().argsort() / test_ind.size
    cv_score = \
    roc_auc_score(trg_sex[test_ind][trg_sex[test_ind]!=0.5],
                        probs[test_ind][trg_sex[test_ind]!=0.5])
    cv_scores.append(cv_score)
    print(cv_score)

print('='*50)

print(roc_auc_score(trg_sex[trg_sex!=0.5], probs[trg_sex!=0.5]))

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:   12.1s remaining:   28.3s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:   15.3s finished


0.8918029226553036
0.8942087044774247
0.8932897496369843
0.8928045083021292
0.8935212828850132
0.8931581642824244
0.8958370687504125
0.8952329401364499
0.8934173359008301
0.8942469017355863
0.8937529397263528


## Взвешивание скоров пола по коэффициентам логрега

In [14]:
all_preds_sex = lr.coef_.flatten().dot(all_preds_sex)

## Stacking One-VS-Rest Ridge для возраста

In [16]:
scipy_tmp = np.array(all_preds_age).T
print(scipy_tmp.shape)
age_coefs = []
age_scores = []

scipy_tmp_all = np.array(scipy_tmp).transpose([1, 0, 2]).reshape(all_preds_sex.size, -1)
age_probs_cv = np.zeros((all_mask.sum(), len(age_bins)))
age_probs_train = np.zeros((all_mask.sum(), len(age_bins)))
age_probs_all = np.zeros((all_preds_sex.size, len(age_bins)))

for trg_val in tqdm(range(1, 7)):
    trg_pair = [trg_val - 1]
    age_X_tr = np.concatenate(scipy_tmp[trg_pair].transpose([0, 2, 1]), axis=0).T
    for train_ind, test_ind in tqdm(folds):
        lr = Ridge(alpha=15,
                   fit_intercept=True)

        lr.fit(age_X_tr[all_mask][train_ind],
               (y_all==trg_val)[train_ind])

        tmp_preds = lr.predict(age_X_tr) - lr.intercept_
        age_probs_cv[test_ind, trg_val-1] = tmp_preds[all_mask][test_ind]
        age_probs_train[train_ind, trg_val-1] = tmp_preds[all_mask][train_ind]
        age_probs_all[:, trg_val-1] += tmp_preds

cm = confusion_matrix(y_all - 1, age_probs_cv.argmax(axis=1))
# распределение бинов возраста в public_lb
true_weights = np.array([11.970, 32.675, 28.855, 15.886, 8.643, 1.970]) / 1e2
print((2 * np.diag(cm) / (cm.sum(axis=1) + cm.sum(axis=0))).dot(true_weights))

(6, 415317, 137)


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

0.49338040839524056


## Оценка распределения бинов на private_lb

In [17]:
# восстанавливаем границы бинов как они указаны в правилах
age_bins_ = age_bins.copy()
age_bins_[0] = [19, 25]
age_labels, age_cnts = \
np.unique(
    np.concatenate([trg_df.age.between(*age_bin).values[:, None] for age_bin in age_bins_],
               axis=1).argmax(axis=1)[all_mask],
    return_counts=True
)
# оценка распределения бинов возраста как взвешенная сумма распределений на train и public
# в train 270к примеров, по оценке в паблик 80к примеров
age_cnts_estimator = \
age_cnts / age_cnts.sum() * 270_000 + true_weights * 80_000

true_weights, \
age_cnts / age_cnts.sum(), \
age_cnts_estimator / age_cnts_estimator.sum()

(array([0.1197 , 0.32675, 0.28855, 0.15886, 0.08643, 0.0197 ]),
 array([0.12488519, 0.32322222, 0.28698519, 0.15719259, 0.08733333,
        0.02038148]),
 array([0.12370028, 0.32402931, 0.28734351, 0.15757407, 0.08712706,
        0.02022576]))

## Финальный подбор вектора-смещений, максимизирующий f1_w

In [24]:
def maximize_public_f1(bias):
    bias = np.round(bias, decimals=4)
    cm = confusion_matrix(y_all - 1,
                          (age_probs_cv + bias).argmax(axis=1))
    true_weights = age_cnts_estimator / age_cnts_estimator.sum()
    return -(2 * np.diag(cm) / (cm.sum(axis=1) + cm.sum(axis=0))).dot(true_weights)

scipy_eps = 1e-10
scipy_res = None
bf1 = 0
bias = 0.
max_iters = 2000
# стохастический поиск для инициализации смещения
for n_iter in tqdm(range(max_iters)):
    x = np.random.normal(size=len(age_bins)) \
        / np.random.choice([50., 100., 150., 200.])
    x += bias * (1 + x)
    fun = maximize_public_f1(x)
    if abs(fun) > bf1:
        print(abs(fun), x)
        bf1 = abs(fun)
        bias = np.round(x, decimals=4)
# уточнения смещения с помощью COBYLA
for _ in range(10):
    scipy_res = minimize(maximize_public_f1,
                     bias,
                     method='COBYLA')
    print(abs(scipy_res.fun), scipy_res.x)
    if abs(scipy_res.fun) > bf1:
        bf1 = abs(scipy_res.fun)
        bias = np.round(scipy_res.x, decimals=4)

  0%|          | 0/2000 [00:00<?, ?it/s]

0.4908357576361695 [-0.02874765 -0.03408524  0.00556411 -0.0418203   0.00133967 -0.00287613]
0.49182633722398944 [-4.34965430e-02 -3.34949068e-02  2.63146731e-05 -3.01812489e-02
  5.28102074e-03 -1.23504065e-02]
0.4927392314462021 [-0.06045191 -0.03874057 -0.0152765  -0.00653757 -0.01034066 -0.00627375]
0.49297593576638843 [-0.05870689 -0.04105045 -0.01174162 -0.00798199 -0.00982969 -0.00546295]
0.493005566764838 [-0.05221374 -0.03682986 -0.00980327 -0.0063023  -0.01229111 -0.00620983]
0.4930785537628196 [-0.05361593 -0.03813562 -0.01242069 -0.01628108 -0.02326835  0.01082136]
0.4932179617459513 [-0.05855348 -0.02456908 -0.01870446 -0.01645858 -0.01892284  0.00987437]
0.49330413564395165 [-0.05062976 -0.02345836 -0.02221764 -0.0259574  -0.01351932  0.02017845]
0.49343054180720747 [-0.06108646 -0.04699273 -0.02805985 -0.03302649 -0.03484441  0.0443784 ]
0.4935964470122288 [-0.05918954 -0.05068    -0.04124891 -0.03784762 -0.03133535  0.0631746 ]
0.4937103984587975 [-0.05528023 -0.0452506

## Получение бинов возраста

In [25]:
all_preds_age_label = (age_probs_all + bias).argmax(axis=1) + 1
all_preds_age_label

array([2, 3, 3, ..., 5, 3, 2])

## Скор после stacking'а

In [27]:
2 * bf1 + 2 * roc_auc_score(trg_sex[trg_sex!=0.5], probs[trg_sex!=0.5]) - 1

1.7749266763703009

## Формирование посылки

In [28]:
sample_df = pd.read_csv('baseline_submission_wo_index.csv')
sample_df = \
sample_df.merge(pd.DataFrame(dict(user_id=datamap['uids'],
                                  idx=np.arange(len(datamap['uids'])))),
                how='left')

sample_df['is_male'] = all_preds_sex[sample_df.idx.values].argsort().argsort()/sample_df.shape[0]
sample_df['age'] = all_preds_age_label[sample_df.idx.values]
sample_df = \
sample_df.sort_values(by='user_id').\
          drop('idx', axis=1)

sample_df.\
    to_csv('20230329r_submission_wo_index.csv',
           sep=',',
           index=False)