In [1]:
import os
if 'ozom671games.zip' not in os.listdir():
    from google.colab import drive
    drive.mount('/content/drive')
    !pip install transformers
    !pip install sentencepiece
    !pip install bitsandbytes
    !cp drive/MyDrive/ozon/ozom671games.zip ozom671games.zip
    !unzip ozom671games.zip
    !pip install catboost

In [2]:
import pandas as pd
import json
from tqdm.notebook import tqdm
import numpy as np
from scipy.spatial.distance import cosine, euclidean
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import StratifiedKFold, GroupKFold
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
import catboost

In [3]:
train = pd.read_parquet('train_data.parquet')
target = pd.read_parquet('train_pairs.parquet')

In [4]:
ltr = len(target)

In [5]:
test = pd.read_parquet('test_data.parquet')
test_target = pd.read_parquet('test_pairs_wo_target.parquet')

In [6]:
target = pd.concat([target, test_target]).reset_index(drop = True)

In [7]:
train['characteristic_attributes_mapping'] = train['characteristic_attributes_mapping'].fillna('{}').apply(json.loads )
train['categories'] = train['categories'].fillna('{}').apply(json.loads)

test['characteristic_attributes_mapping'] = test['characteristic_attributes_mapping'].fillna('{}').apply(json.loads )
test['categories'] = test['categories'].fillna('{}').apply(json.loads)

In [8]:
data = pd.concat([train, test[~test.variantid.isin(train['variantid'].unique())] ])

In [9]:
data = target.merge(data, right_on = 'variantid', left_on = 'variantid1', how = 'left').merge(
    data, right_on = 'variantid', left_on = 'variantid2', how = 'left', suffixes=('_1', '_2'))

In [10]:
main_pic_dist_cos = []
main_pic_dist_euc = []
for emb1, emb2 in tqdm(data[['main_pic_embeddings_resnet_v1_1', 'main_pic_embeddings_resnet_v1_2']].values):
    if type(emb1) != np.ndarray or type(emb2) != np.ndarray:
        main_pic_dist_cos += [-1]
        main_pic_dist_euc += [-1]
    else:
        main_pic_dist_cos += [ cosine(emb1[0], emb2[0]) ]
        main_pic_dist_euc += [ euclidean(emb1[0], emb2[0]) ]

  0%|          | 0/324624 [00:00<?, ?it/s]

In [34]:
metrics = ['cosine', 'euclidean']

dict_pic_dist_cos = {m:[] for m in metrics}

for emb1, emb2 in tqdm(data[['pic_embeddings_resnet_v1_1', 'pic_embeddings_resnet_v1_2']].values):
    if type(emb1) != np.ndarray or type(emb2) != np.ndarray:
        for metric in metrics:
            dict_pic_dist_cos[metric] += [(np.zeros(4) - 1).tolist()]
        continue
    list_th = [0.1, 1]
    emb1 = np.array([x for x in emb1])
    emb2 = np.array([x for x in emb2])

    for metric, th in zip(metrics, list_th):
        dist = pairwise_distances(emb1, emb2, metric = metric)
        dict_pic_dist_cos[metric] += [ [np.mean(dist), np.max(dist), np.min(dist), dist[dist <= th].shape[0]] ]

  0%|          | 0/324624 [00:00<?, ?it/s]

In [12]:
name_bert_dist_cos = []
name_bert_dist_euc = []
for emb1, emb2 in tqdm(data[['name_bert_64_1', 'name_bert_64_2']].values):
    if type(emb1) != np.ndarray or type(emb2) != np.ndarray:
        name_bert_dist_cos += [-1]
        name_bert_dist_euc += [-1]
    else:
        name_bert_dist_cos += [ cosine(emb1, emb2) ]
        name_bert_dist_euc += [ euclidean(emb1, emb2) ]

  0%|          | 0/324624 [00:00<?, ?it/s]

In [35]:
data['main_pic_dist_cos'] = main_pic_dist_cos
data['main_pic_dist_euc'] = main_pic_dist_euc

data['name_bert_dist_cos'] = main_pic_dist_cos
data['name_bert_dist_euc'] = main_pic_dist_euc

for metric in metrics:
    vals = np.array(dict_pic_dist_cos[metric])
    for i, name in enumerate(['mean', 'max', 'min', 'th']):
        data[f'{metric}_{name}'] = vals[:, i]

In [36]:
import pickle
with open('drive/MyDrive/preds_train_final.pickle', 'rb') as f:
    dict_predicts = pickle.load(f)

In [37]:
for keys in dict_predicts:
    list_predict_0, list_predict_1, train_predict_0, train_predict_1 = dict_predicts[keys]
    data[f'{keys}_pred'] = -1
    data.loc[:ltr - 1, f'{keys}_pred'] = train_predict_0
    data.loc[ltr:, f'{keys}_pred'] = np.mean(list_predict_0, 0)

    data[f'{keys}_pred_1'] = -1
    data.loc[:ltr - 1, f'{keys}_pred_1'] = train_predict_1
    data.loc[ltr:, f'{keys}_pred_1'] = np.mean(list_predict_1, 0)

In [16]:
data['cat_3'] = [x['3'] for x in data['categories_1']]

In [17]:
data['tmp'] = data['target'].astype('str') + '_' + data['cat_3']

In [18]:
def standart_split(data, target, n_splits = 5):
    split_list = []
    kf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = 228)
    for train_index, test_index in kf.split(data.loc[:ltr-1,:], data['tmp'][:ltr]) :
        split_list += [(train_index, test_index)]
    return split_list

split_list = standart_split(data, 'target')



In [19]:
inter_cat_1 = []
inter_cat_2 = []

for cat1, cat2 in data[['color_parsed_1', 'color_parsed_2']].values:
    if type(cat1) == np.ndarray and type(cat2) == np.ndarray:
        intersec = set(cat1) & set(cat2)
        inter_cat_1 += [len(intersec) - len(set(cat1))]
        inter_cat_2 += [len(intersec) - len(set(cat2))]
    else:
        inter_cat_1 += [-1]
        inter_cat_2 += [-1]       

In [20]:
data['inter_cat_1'] = inter_cat_1
data['inter_cat_2'] = inter_cat_2

In [46]:
data['full_sum'] = data[['model_ozon_3_fold_pred',
 'model_ozon_3_fold_pred_1',
 'model_ozon_5_fold_2_pred',
 'model_ozon_5_fold_2_pred_1',
 'model_ozon_4_fold_pred',
 'model_ozon_4_fold_pred_1',
 'model_ozon_5_fold_3_pred',
 'model_ozon_5_fold_3_pred_1',
 'model_ozon_9_fold_pred',
 'model_ozon_9_fold_pred_1', 
 'model_ozon_8_fold_pred',
 'model_ozon_8_fold_pred_1',
 'model_ozon_7_fold_pred',
 'model_ozon_7_fold_pred_1'
 ]].mean(1)

In [47]:
data['full_sum_1'] = data[[
 'model_ozon_5_fold_2_pred',
 'model_ozon_5_fold_2_pred_1',
 'model_ozon_4_fold_pred',
 'model_ozon_4_fold_pred_1',
 'model_ozon_9_fold_pred',
 'model_ozon_9_fold_pred_1', 
 'model_ozon_8_fold_pred',
 'model_ozon_8_fold_pred_1',
 'model_ozon_7_fold_pred',
 'model_ozon_7_fold_pred_1'
 ]].mean(1)

In [22]:
from itertools import islice

def is_sublist(source, target):
    slen = len(source)
    return any(all(item1 == item2 for (item1, item2) in zip(source, islice(target, i, i+slen))) for i in range(len(target) - slen + 1))

def long_substr_by_word(data):
    subseq = []
    data_seqs = [s.split(' ') for s in data]
    if len(data_seqs) > 1 and len(data_seqs[0]) > 0:
        for i in range(len(data_seqs[0])):
            for j in range(len(data_seqs[0])-i+1):
                if j > len(subseq) and all(is_sublist(data_seqs[0][i:i+j], x) for x in data_seqs):
                    subseq = data_seqs[0][i:i+j]
    return ' '.join(subseq)

list_ls = []
for w1,  w2 in tqdm(data[['name_1', 'name_2']].values):
    list_ls += [ long_substr_by_word([w1, w2]) ]

data['list_ls'] = list_ls

data['l1'] = [len(x[0].split()) / len(x[1].split()) for x in data[['list_ls', 'name_1']].values]
data['l2'] = [len(x[0].split()) / len(x[1].split()) for x in data[['list_ls', 'name_2']].values]

data['l3'] = [len(x[0].split()) - len(x[1].split()) for x in data[['list_ls', 'name_1']].values]
data['l4'] = [len(x[0].split()) - len(x[1].split()) for x in data[['list_ls', 'name_2']].values]
data['l5'] = [len( x[0].split()) for x in data[['list_ls', 'name_2']].values]

  0%|          | 0/324624 [00:00<?, ?it/s]

In [23]:
vc = data.loc[ltr:, 'cat_3'].value_counts().head(25).index
data['cat_30'] = data['cat_3']
data.loc[~data['cat_3'].isin(vc), 'cat_30'] = 'rest'

In [48]:
drop_cols = ['target', 'variantid1', 'variantid2', 'variantid_1', 'name_1',
       'categories_1', 'color_parsed_1', 'pic_embeddings_resnet_v1_1',
       'main_pic_embeddings_resnet_v1_1', 'name_bert_64_1',
       'characteristic_attributes_mapping_1', 'variantid_2', 'name_2',
       'categories_2', 'color_parsed_2', 'pic_embeddings_resnet_v1_2',
       'main_pic_embeddings_resnet_v1_2', 'name_bert_64_2', 
       'new_target', 'len_2', 'tmp', 'cat_3',  'type','len_1', 'list_ls', 'full_std',
       'characteristic_attributes_mapping_2', 'cat_3_check', 'cat3_grouped']
train_cols = [x for x in data.columns if x not in drop_cols]

train_cols

['main_pic_dist_cos',
 'main_pic_dist_euc',
 'name_bert_dist_cos',
 'name_bert_dist_euc',
 'cosine_mean',
 'cosine_max',
 'cosine_min',
 'cosine_th',
 'euclidean_mean',
 'euclidean_max',
 'euclidean_min',
 'euclidean_th',
 'model_ozon_3_fold_pred',
 'model_ozon_3_fold_pred_1',
 'model_ozon_4_fold_pred',
 'model_ozon_4_fold_pred_1',
 'model_ozon_5_fold_2_pred',
 'model_ozon_5_fold_2_pred_1',
 'model_ozon_5_fold_3_pred',
 'model_ozon_5_fold_3_pred_1',
 'model_ozon_7_fold_pred',
 'model_ozon_7_fold_pred_1',
 'model_ozon_8_fold_pred',
 'model_ozon_8_fold_pred_1',
 'model_ozon_9_fold_pred',
 'model_ozon_9_fold_pred_1',
 'inter_cat_1',
 'inter_cat_2',
 'full_sum',
 'l1',
 'l2',
 'l3',
 'l4',
 'l5',
 'cat_30',
 'full_sum_1']

In [41]:
params_cat = {
    'loss_function' :'Logloss', 
     'max_depth' : 6, 
    'eval_metric' :'AUC', 
    'learning_rate' : 0.05, 
    'grow_policy' : 'SymmetricTree',
    'l2_leaf_reg' : 100, 
    'random_strength' : 1,
    'random_state' : 42 ,
}

In [49]:
bst_list = []
score_list = []
preds = []
preds_train1 = np.zeros(len(data) - len(test_target))
for i , (train_index, test_index) in enumerate(split_list):
    pool_train = Pool(data.loc[train_index, train_cols], label= data.loc[train_index, 'target'], cat_features = ['cat_30'])
    pool_eval = Pool(data.loc[test_index, train_cols], label= data.loc[test_index, 'target'], cat_features = ['cat_30'])
    bst = catboost.train(pool_train, params_cat, eval_set = pool_eval, iterations = 1300, early_stopping_rounds = 250, verbose = 100)
    score_list += [bst.get_best_score()['validation']['AUC']]
    preds_train1[test_index] = bst.predict(data.loc[test_index, train_cols].values) 
    # preds += [bst.predict(test[train_cols].values)]
    bst_list += [bst]

0:	test: 0.9206331	best: 0.9206331 (0)	total: 115ms	remaining: 2m 29s
100:	test: 0.9321310	best: 0.9321310 (100)	total: 9.31s	remaining: 1m 50s
200:	test: 0.9324707	best: 0.9324707 (200)	total: 17.7s	remaining: 1m 36s
300:	test: 0.9326071	best: 0.9326079 (299)	total: 26s	remaining: 1m 26s
400:	test: 0.9326186	best: 0.9326202 (362)	total: 33.9s	remaining: 1m 16s
500:	test: 0.9326805	best: 0.9326805 (500)	total: 42.1s	remaining: 1m 7s
600:	test: 0.9327150	best: 0.9327177 (586)	total: 50.6s	remaining: 58.8s
700:	test: 0.9327400	best: 0.9327447 (682)	total: 58.4s	remaining: 49.9s
800:	test: 0.9327442	best: 0.9327516 (761)	total: 1m 6s	remaining: 41.3s
900:	test: 0.9327636	best: 0.9327651 (891)	total: 1m 13s	remaining: 32.7s
1000:	test: 0.9327744	best: 0.9327744 (1000)	total: 1m 21s	remaining: 24.3s
1100:	test: 0.9327945	best: 0.9327945 (1100)	total: 1m 28s	remaining: 16.1s
1200:	test: 0.9327944	best: 0.9327995 (1161)	total: 1m 36s	remaining: 7.97s
1299:	test: 0.9328284	best: 0.9328308 (128

In [50]:
preds = []
for i , (train_index, test_index) in enumerate(split_list):
    preds += [bst_list[i].predict(data.loc[ltr:, train_cols].values, prediction_type  = 'Probability')[:, 1] ]

In [52]:
from scipy.stats import rankdata

ss = pd.read_csv('submission_example.csv')
test_target['target'] =  np.mean([x for x in preds], 0) 
# tmp = test_target[(test_target['variantid1'].astype('str') + '_' + test_target['variantid2'].astype('str')).isin(w)]
ss = ss[['variantid1', 'variantid2']].merge(test_target, how = 'left')
ss[['variantid1', 'variantid2', 'target']].to_csv('sub_final_total.csv', index = None)