In [1]:
import pandas as pd
import numpy as np
import torch
import json
from functools import partial
import re
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from fuzzywuzzy import process, fuzz
from string import punctuation
from ordered_set import OrderedSet

from tqdm.notebook import tqdm

import os
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

In [2]:
# Alternative pair tables used in earlier experiments (swap the path to use them):
#   ./hackathon_files_for_participants_ozon/fake_ontrain.parquet
#   ./hackathon_files_for_participants_ozon/syntetic_pos_neg_bad_cats.parquet
train_pairs = pd.read_parquet("./hackathon_files_for_participants_ozon/train_pairs_after_split_v2.parquet")
val_pairs = pd.read_parquet("./hackathon_files_for_participants_ozon/val_pairs_v2.parquet")
data = pd.read_parquet("./hackathon_files_for_participants_ozon/train_data.parquet")

# Variant ids are the merge keys for the feature tables built below. Cast them
# to str in *every* table (val_pairs was previously left untouched) so the keys
# share one dtype and the inner merges cannot fail or silently drop rows.
for pairs_df in (train_pairs, val_pairs):
    for id_col in ('variantid1', 'variantid2'):
        pairs_df[id_col] = pairs_df[id_col].apply(str)
data['variantid'] = data['variantid'].apply(str)

In [3]:
# Attach the product columns to both sides of every training pair: columns of
# the first product get the suffix "1", columns of the second the suffix "2".
# Inner joins drop pairs whose product is missing from `data`.
train_left_products = data.add_suffix('1')
train_right_products = data.add_suffix('2')
train_features = train_pairs.merge(train_left_products, on="variantid1", how='inner')
train_features = train_features.merge(train_right_products, on="variantid2", how='inner')

In [4]:
# Same two-sided join as for the training pairs, applied to the validation pairs.
val_left_products = data.add_suffix('1')
val_right_products = data.add_suffix('2')
val_features = val_pairs.merge(val_left_products, on="variantid1", how='inner')
val_features = val_features.merge(val_right_products, on="variantid2", how='inner')

In [3]:
# Test pairs (no target column) and the matching product table.
# Earlier experiments swapped in fake_ontrain.parquet for the pairs and
# train_data.parquet for the products.
test_pairs = pd.read_parquet("./hackathon_files_for_participants_ozon/test_pairs_wo_target.parquet")
test_data = pd.read_parquet("./hackathon_files_for_participants_ozon/test_data.parquet")

# Merge keys are handled as strings throughout the notebook.
for test_id_col in ('variantid1', 'variantid2'):
    test_pairs[test_id_col] = test_pairs[test_id_col].map(str)
test_data['variantid'] = test_data['variantid'].map(str)

In [4]:
test_pairs.head()

Unnamed: 0,variantid1,variantid2
0,52076340,290590137
1,64525522,204128919
2,77243372,479860557
3,86065820,540678372
4,91566575,258840506


In [5]:
# Two-sided join of the test pairs with the test product table
# (suffix "1" = first product, suffix "2" = second product).
test_left_products = test_data.add_suffix('1')
test_right_products = test_data.add_suffix('2')
test_features = test_pairs.merge(test_left_products, on="variantid1", how='inner')
test_features = test_features.merge(test_right_products, on="variantid2", how='inner')

In [6]:
tokenizer = XLMRobertaTokenizer.from_pretrained("../models/ru_roberta_base-sentence/")

In [7]:
def get_words(text):
    """Split ``text`` into whole words by re-joining the tokenizer's sub-word pieces.

    The text is tokenized with the module-level ``tokenizer`` (no special
    tokens). A piece that starts with ``chr(9601)`` ("▁") opens a new word;
    every other piece is appended to the current word. For each finished word
    the marker is removed and a single trailing character from ``,.?!*;|`` is
    stripped.

    Args:
        text: The string to split.

    Returns:
        OrderedSet of words in first-occurrence order (duplicates collapsed).
        A word that becomes empty after cleaning (e.g. a lone punctuation mark)
        is kept as ``''`` so existing, already-exported features stay
        reproducible.
    """
    split_chr = chr(9601)  # "▁": the piece prefix this function treats as a word start
    punct_set = set(',.?!*;|')

    def _finish(raw_word):
        # Drop the boundary marker and at most one trailing punctuation char.
        # The emptiness check matters: a word made only of the marker used to
        # raise IndexError on `cur_text[-1]`.
        word = raw_word.replace(split_chr, '')
        if word and word[-1] in punct_set:
            word = word[:-1]
        return word

    tokens = tokenizer.convert_ids_to_tokens(tokenizer(text, add_special_tokens=False).input_ids)
    cur_words = []
    cur_text = ""
    for x in tokens:
        if x.startswith(split_chr):
            # A new word begins: flush the one collected so far.
            if cur_text:
                cur_words.append(_finish(cur_text))
            cur_text = x
        else:
            cur_text += x
    if cur_text:
        cur_words.append(_finish(cur_text))
    return OrderedSet(cur_words)

In [8]:
def split_name(name):
    """Split every product name into an alphabetic part and a remainder.

    Args:
        name: Iterable of name strings.

    Returns:
        Tuple ``(for_bert, for_fuzz)`` of two lists aligned with ``name``:
        ``for_bert[i]`` is the space-joined words of ``name[i]`` for which
        ``str.isalpha()`` is true, ``for_fuzz[i]`` the space-joined rest
        (words containing digits, punctuation, and so on).
    """
    for_bert, for_fuzz = [], []
    for title in tqdm(name, leave=False):
        alpha_words, other_words = [], []
        for word in get_words(title):
            bucket = alpha_words if word.isalpha() else other_words
            bucket.append(word)
        for_bert.append(' '.join(alpha_words))
        for_fuzz.append(' '.join(other_words))
    return for_bert, for_fuzz

In [9]:
def add_dif(name1, name2):
    """Return, for each text pair, the words unique to each side.

    Args:
        name1: Sequence of texts for the first product (needs ``len``).
        name2: Sequence of texts for the second product, aligned with ``name1``.

    Returns:
        Tuple of two lists: words of ``name1[i]`` absent from ``name2[i]`` and
        vice versa, each joined with single spaces in original word order.
    """
    only_in_first, only_in_second = [], []
    aligned = zip(name1, name2)
    for left_text, right_text in tqdm(aligned, total=len(name1), leave=False):
        left_words = get_words(left_text)
        right_words = get_words(right_text)
        only_in_first.append(' '.join(left_words - right_words))
        only_in_second.append(' '.join(right_words - left_words))
    return only_in_first, only_in_second

In [8]:
def create_text_features(arr, pos):
    """Render one product's characteristics as a single text line.

    Args:
        arr: Row-like mapping holding ``characteristic_attributes_mapping{pos}``
            (a string encoding a dict of attribute name -> iterable of values,
            or ``None``).
        pos: Side suffix of the pair, ``'1'`` or ``'2'``.

    Returns:
        ``"Характеристики: "`` followed by ``"<name>: <v1>,<v2>; "`` for every
        attribute that is not in the exclusion list; only the prefix when the
        mapping is ``None``.
    """
    prefix = "Характеристики: "
    raw_mapping = arr[f"characteristic_attributes_mapping{pos}"]
    if raw_mapping is None:
        return prefix

    # NOTE(review): eval() executes arbitrary code; acceptable only because the
    # parquet files are trusted. Consider json.loads / ast.literal_eval.
    attributes = eval(raw_mapping)

    # Attributes removed before rendering.
    excluded = (
        'Партномер',
        'Комплектация',
        'Артикул',
        'Гарантийный срок',
        'Диапазон рабочей температуры',
        'Вариант',
    )
    for key in excluded:
        attributes.pop(key, None)

    rendered = [
        f"{str(attr_name)}: {','.join(str(item) for item in attr_values)}; "
        for attr_name, attr_values in attributes.items()
    ]
    return prefix + ''.join(rendered)

In [10]:
# Row-wise rendering of the characteristics text for both sides of each pair.
for side in ('1', '2'):
    test_features[f'text_features{side}'] = test_features.apply(
        lambda row, side=side: create_text_features(row, pos=side), axis=1
    )

In [11]:
# Words of the characteristics text that appear on one side only.
chars_only_1, chars_only_2 = add_dif(
    test_features['text_features1'].values,
    test_features['text_features2'].values,
)
test_features['diff_text_characteristics_1'] = chars_only_1
test_features['diff_text_characteristics_2'] = chars_only_2

  0%|          | 0/45312 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (8727 > 4096). Running this sequence through the model will result in indexing errors


In [12]:
# Store each name as a plain list of its de-duplicated words.
for side in ('1', '2'):
    test_features[f'name_toked_{side}'] = test_features[f'name{side}'].map(
        lambda title: list(get_words(title))
    )

In [13]:
# Split every name into its alphabetic words ("clean") and the rest ("service").
for side in ('1', '2'):
    alpha_part, other_part = split_name(test_features[f'name{side}'].values)
    test_features[f'clean_name_{side}'] = alpha_part
    test_features[f'service_name_{side}'] = other_part

  0%|          | 0/45312 [00:00<?, ?it/s]

  0%|          | 0/45312 [00:00<?, ?it/s]

In [14]:
# Words of the product name that appear on one side of the pair only.
name_only_1, name_only_2 = add_dif(
    test_features['name1'].values,
    test_features['name2'].values,
)
test_features['diff_name_1'] = name_only_1
test_features['diff_name_2'] = name_only_2

  0%|          | 0/45312 [00:00<?, ?it/s]

## With BERT model

In [10]:
def mean_pooling(token_embeddings, att_mask):
    """Average token embeddings along dim 1, counting only unmasked positions.

    Args:
        token_embeddings: Tensor of per-token embeddings; dim 1 is reduced.
        att_mask: Mask with one entry per token (non-zero = keep); it is given
            a trailing axis and expanded to the shape of ``token_embeddings``.

    Returns:
        Masked sum over dim 1 divided by the mask count, with the count clamped
        to at least 1e-9 so fully masked rows do not divide by zero.
    """
    mask = att_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = torch.sum(token_embeddings * mask, 1)
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_count

In [11]:
model = XLMRobertaModel.from_pretrained("../models/ru_roberta_base-sentence/")

In [12]:
model.eval()
model.cuda(0)
print("On GPU!")

On GPU!


In [13]:
cosine = torch.nn.CosineSimilarity(dim=1)

In [14]:
def get_embs(texts, batch_size=512 * 2, max_length=512):
    """Embed texts with the module-level ``model`` using masked mean pooling.

    Args:
        texts: Sequence of strings (any iterable; it is copied into a list so
            that slicing yields the list the tokenizer is called with).
        batch_size: Number of texts per forward pass. Defaults to the
            previously hard-coded 1024.
        max_length: Token limit used for truncation. Defaults to the previously
            hard-coded 512.

    Returns:
        CPU tensor with one pooled embedding row per input text. For empty
        input an empty ``(0, hidden_size)`` tensor is returned instead of
        failing in ``torch.cat``.
    """
    texts = list(texts)
    if not texts:
        # torch.cat([]) raises; return an empty matrix of matching width instead.
        return torch.empty((0, model.config.hidden_size))

    embs = []
    for pos in tqdm(range(0, len(texts), batch_size), leave=False):
        toked = tokenizer(
            texts[pos:pos + batch_size],
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt',
        )
        # Inference only: no gradients, bfloat16 autocast for the forward pass.
        with torch.no_grad(), torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16):
            emb = model(**toked.to('cuda:0')).last_hidden_state.cpu()
            # Pooling runs on CPU tensors, as before.
            emb = mean_pooling(emb, toked.attention_mask.cpu())
        embs.append(emb)
    return torch.cat(embs, dim=0)

In [20]:
# Embed the alphabetic-only names of both sides (one row per test pair).
embs1 = get_embs(test_features['clean_name_1'].values.tolist())
embs2 = get_embs(test_features['clean_name_2'].values.tolist())

  0%|          | 0/177 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

In [21]:
test_features['bert_clean_name'] = cosine(embs1, embs2).numpy()

In [22]:
# Embed the name words that are unique to each side; embs1/embs2 are reused
# (overwritten) for every feature pair in this section.
embs1 = get_embs(test_features['diff_name_1'].values.tolist())
embs2 = get_embs(test_features['diff_name_2'].values.tolist())

  0%|          | 0/177 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

In [23]:
test_features['bert_diff_name'] = cosine(embs1, embs2).numpy()

In [24]:
# Embed the characteristics-text words that are unique to each side.
embs1 = get_embs(test_features['diff_text_characteristics_1'].values.tolist())
embs2 = get_embs(test_features['diff_text_characteristics_2'].values.tolist())

  0%|          | 0/177 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

In [25]:
test_features['bert_diff_characteristics'] = cosine(embs1, embs2).numpy()

In [18]:
test_features.to_parquet("./hackathon_files_for_participants_ozon/test_data_characteristics_diff_v1.parquet", engine='pyarrow')

In [None]:
test_features = pd.read_parquet("./hackathon_files_for_participants_ozon/test_data_characteristics_diff_v1.parquet")

In [26]:
# Export only the pair keys and the derived name/characteristics features.
# NOTE(review): the target file name mentions "syntetic_pos_neg_bad_cats" although
# the frame written here is `test_features`; the aggregation section reads
# "test_data_names_smart_tok_bert_cosine.parquet" instead — confirm the path.
test_features[[
    'variantid1', 'variantid2', 'clean_name_1', 'service_name_1',
    'clean_name_2', 'service_name_2', 'diff_name_1', 'diff_name_2',
    'bert_clean_name', 'bert_diff_name', 'name_toked_1', 'name_toked_2',
    'diff_text_characteristics_1', 'diff_text_characteristics_2',
    'bert_diff_characteristics'
]].to_parquet("./hackathon_files_for_participants_ozon/syntetic_pos_neg_bad_cats_names_smart_tok_bert_cosine+characteristics.parquet", engine='pyarrow')

## Aggregating all features

In [19]:

# Load the separately exported feature tables for the train+val pairs.
data_diff_characteristics = pd.read_parquet("./hackathon_files_for_participants_ozon/data_characteristics_diff_v1.parquet")
data_diff_names_bert = pd.read_parquet("./hackathon_files_for_participants_ozon/data_names_smart_tok_bert_cosine.parquet")
train_data_bm25 = pd.read_parquet("./hackathon_files_for_participants_ozon/train_data_bm25_name+characteristics.parquet")
val_data_bm25 = pd.read_parquet("./hackathon_files_for_participants_ozon/val_data_bm25_name+characteristics.parquet")
data_bm_25 = pd.concat([train_data_bm25, val_data_bm25], axis=0)

# Join everything on the pair key.
# NOTE(review): this rebinds `data`, which the loading cell uses for the raw
# product table — cells that expect the product table must run before this one.
data = (
    data_diff_characteristics
    .merge(data_diff_names_bert, how='inner', on=['variantid1', 'variantid2'])
    .merge(data_bm_25, how='inner', on=['variantid1', 'variantid2'])
)


In [21]:
data.to_parquet("./hackathon_files_for_participants_ozon/data_names_smart_tok_bert_diff_characteristics_bm25_char.parquet")

In [20]:

# Load the separately exported feature tables for the test pairs.
test_data_bm25 = pd.read_parquet("./hackathon_files_for_participants_ozon/test_data_bm25_name+characteristics.parquet")
test_data_diff_characteristics = pd.read_parquet("./hackathon_files_for_participants_ozon/test_data_characteristics_diff_v1.parquet")
test_data_diff_names_bert = pd.read_parquet("./hackathon_files_for_participants_ozon/test_data_names_smart_tok_bert_cosine.parquet")

# Join on the pair key.
# NOTE(review): this rebinds `test_data`, which earlier holds the raw test products.
test_data = (
    test_data_bm25
    .merge(test_data_diff_characteristics, how='inner', on=['variantid1', 'variantid2'])
    .merge(test_data_diff_names_bert, how='inner', on=['variantid1', 'variantid2'])
)


In [24]:

test_data.to_parquet("./hackathon_files_for_participants_ozon/test_data_names_smart_tok_bert_diff_characteristics_bm25_char.parquet")


## BERT similarity of shared (intersecting) attributes and of non-shared attributes

In [15]:
def creating_attributes_features(df):
    """Score how similar two products' shared and non-shared attributes are.

    For every row whose two ``characteristic_attributes_mapping{1,2}`` values
    are both present, the attribute dicts are split into

    * the attributes present on both sides (rendered once per side, in the key
      order of side 1), and
    * the attributes present on one side only,

    each rendered as ``"key: value ;key: value "``. The rendered texts are
    embedded with ``get_embs`` and compared with the module-level ``cosine``.

    Fix: the side-2 "non-intersection" text used to be always empty because the
    loop over ``d2`` tested ``k not in d2``; it now tests ``k not in d1``.
    Features exported with the old code must be regenerated for train, val and
    test so they stay comparable.

    Args:
        df: DataFrame with ``variantid1``, ``variantid2`` and the two
            ``characteristic_attributes_mapping`` columns.

    Returns:
        DataFrame with columns ``variantid1``, ``variantid2``,
        ``intersection_score`` and ``non_intersection_score``. Scores are the
        0-dim tensors produced by iterating the cosine output (callers in this
        notebook convert them with ``.item()``). Rows with a missing mapping
        are skipped; if nothing remains, an empty frame with these columns is
        returned.
    """
    def _render(attrs):
        # Text layout fed to the encoder; unchanged from the original code.
        return ';'.join(f"{k}: {v} " for k, v in attrs.items())

    non_intersections_1, non_intersections_2 = [], []
    intersections_1, intersections_2 = [], []
    ids = []

    rows = zip(
        df['variantid1'].values,
        df['variantid2'].values,
        df["characteristic_attributes_mapping1"].values,
        df["characteristic_attributes_mapping2"].values,
    )
    for id1, id2, d1, d2 in tqdm(rows, total=len(df)):
        if d1 is None or d2 is None:
            continue
        # NOTE(review): eval() on file contents executes arbitrary code; fine only
        # for trusted inputs. Consider json.loads / ast.literal_eval.
        d1 = eval(d1)
        d2 = eval(d2)

        shared_keys = [k for k in d1 if k in d2]
        intersections_1.append(_render({k: d1[k] for k in shared_keys}))
        intersections_2.append(_render({k: d2[k] for k in shared_keys}))
        non_intersections_1.append(_render({k: v for k, v in d1.items() if k not in d2}))
        # Bug fix: compare against d1 (was `k not in d2`, which is never true).
        non_intersections_2.append(_render({k: v for k, v in d2.items() if k not in d1}))
        ids.append((id1, id2))

    columns = ['variantid1', 'variantid2', 'intersection_score', 'non_intersection_score']
    if not ids:
        # Nothing to embed; avoid calling the encoder on empty batches.
        return pd.DataFrame(columns=columns)

    non_intersection_score = cosine(get_embs(non_intersections_1), get_embs(non_intersections_2))
    intersection_score = cosine(get_embs(intersections_1), get_embs(intersections_2))

    res = [
        {
            'variantid1': id1,
            'variantid2': id2,
            'intersection_score': i,
            'non_intersection_score': n_i,
        }
        for n_i, i, (id1, id2) in zip(non_intersection_score, intersection_score, ids)
    ]
    return pd.DataFrame(res)

In [16]:
test_attr_feats_2 = creating_attributes_features(test_features)

  0%|          | 0/18084 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

In [None]:
val_attr_feats_2 = creating_attributes_features(val_features)

In [14]:
train_attr_feats_2 = creating_attributes_features(train_features)

  0%|          | 0/45312 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

In [None]:
train_attr_feats = creating_attributes_features(train_features)

  0%|          | 0/286331 [00:00<?, ?it/s]

  0%|          | 0/560 [00:00<?, ?it/s]

  0%|          | 0/560 [00:00<?, ?it/s]

  0%|          | 0/560 [00:00<?, ?it/s]

  0%|          | 0/560 [00:00<?, ?it/s]

  0%|          | 0/10099 [00:00<?, ?it/s]

In [19]:
# The score columns hold 0-dim tensors; unwrap them to plain Python numbers.
for score_col in ('intersection_score', 'non_intersection_score'):
    test_attr_feats_2[score_col] = test_attr_feats_2[score_col].map(lambda score: score.item())

In [25]:
val_attr_feats.head()

Unnamed: 0,variantid1,variantid2,intersection_score,non_intersection_score,intersection_mean,intersection_max,intersection_min,intersection_sum
0,78292644,311184051,1.0,0.603996,1.0,1.0,1.0,3.0
1,78509805,90849346,0.995929,0.54267,0.93788,1.0,0.480445,32.825794
2,89722250,177811799,0.855937,0.553346,0.829601,1.0,0.488802,2.488802
3,91147277,473287073,0.985562,0.522836,0.897102,1.0,0.351171,23.324646
4,91630181,388828373,0.982966,0.542459,0.937011,1.0,0.49609,7.49609


In [20]:

test_attr_feats_2.to_parquet("./hackathon_files_for_participants_ozon/test_data_inter_non_inter_scores.parquet", engine='pyarrow')


## BM25

In [None]:
from rank_bm25 import BM25Okapi

In [None]:
# BM25 score of side 2's words against a one-document corpus built from side 1.
# NOTE(review): `text_features1/2` are only created for `test_features` in this
# notebook; they must exist on `train_features` before this cell runs.
bm25_scores = []
for left_text, right_text in tqdm(train_features[['text_features1', 'text_features2']].values):
    corpus = [list(get_words(left_text.lower()))]
    query = list(get_words(right_text.lower()))
    bm25_scores.append(BM25Okapi(corpus).get_scores(query)[0])
train_features['bm25_smart_tok'] = bm25_scores

In [None]:
# Same BM25 feature for the validation pairs.
# NOTE(review): requires `text_features1/2` on `val_features` (not created in this notebook).
bm25_scores = []
for left_text, right_text in tqdm(val_features[['text_features1', 'text_features2']].values):
    corpus = [list(get_words(left_text.lower()))]
    query = list(get_words(right_text.lower()))
    bm25_scores.append(BM25Okapi(corpus).get_scores(query)[0])
val_features['bm25_smart_tok'] = bm25_scores

In [None]:
# Same BM25 feature for the test pairs.
bm25_scores = []
for left_text, right_text in tqdm(test_features[['text_features1', 'text_features2']].values):
    corpus = [list(get_words(left_text.lower()))]
    query = list(get_words(right_text.lower()))
    bm25_scores.append(BM25Okapi(corpus).get_scores(query)[0])
test_features['bm25_smart_tok'] = bm25_scores

## Getting positives and negatives

In [None]:
from copy import deepcopy

In [None]:
train_pos = train_pairs[train_pairs.target == 1]
train_neg = train_pairs[train_pairs.target == 0]

In [None]:
# Undirected adjacency maps: id -> set of ids it is paired with,
# one map for positive pairs and one for negative pairs.
id2id_pos = {}
for id1, id2 in train_pos[['variantid1', 'variantid2']].values:
    id2id_pos.setdefault(id1, set()).add(id2)
    id2id_pos.setdefault(id2, set()).add(id1)

id2id_neg = {}
for id1, id2 in train_neg[['variantid1', 'variantid2']].values:
    id2id_neg.setdefault(id1, set()).add(id2)
    id2id_neg.setdefault(id2, set()).add(id1)

In [None]:
id2id_pos_ = deepcopy(id2id_pos)
id2id_neg_ = deepcopy(id2id_neg)

In [None]:
# Step 1: extend every id's positive set with the positives of its direct
# positives (one pass; the result depends on iteration order and is not a full
# transitive closure).
for id1 in tqdm(id2id_pos_.keys()):
    for id2 in list(id2id_pos_[id1]):  # snapshot: the set grows while we iterate
        id2id_pos_[id1].update(id2id_pos_[id2])

# Step 2: an id inherits the negatives of each of its (extended) positives.
for id1 in tqdm(id2id_pos_.keys()):
    for id2 in list(id2id_pos_[id1]):
        if id2 not in id2id_neg_:
            continue
        if id1 in id2id_neg_:
            id2id_neg_[id1].update(id2id_neg_[id2])
        else:
            # Copy instead of sharing the set object: with an alias, every later
            # update of id1's negatives also rewrote id2's negatives.
            id2id_neg_[id1] = set(id2id_neg_[id2])

In [None]:
# Collect every (id, neighbour) pair as an order-independent tuple, skipping self-pairs.
new_pairs_pos = set()
for id1 in tqdm(id2id_pos_.keys()):
    for other in id2id_pos_[id1]:
        if other != id1:
            new_pairs_pos.add(tuple(sorted([id1, other])))

new_pairs_neg = set()
for id1 in tqdm(id2id_neg_.keys()):
    for other in id2id_neg_[id1]:
        if other != id1:
            new_pairs_neg.add(tuple(sorted([id1, other])))

In [None]:
# Pairs already present in the labelled data, normalised the same way
# (sorting makes (x, y) and (y, x) identical, so one pass per table suffices).
old_pairs_pos = {
    tuple(sorted([x, y])) for x, y in train_pos[['variantid1', 'variantid2']].values
}
old_pairs_neg = {
    tuple(sorted([x, y])) for x, y in train_neg[['variantid1', 'variantid2']].values
}

In [None]:
new_pairs_pos = new_pairs_pos - old_pairs_pos
new_pairs_neg = new_pairs_neg - old_pairs_neg

In [None]:
new_pairs_pos = pd.DataFrame(list(new_pairs_pos), columns=['variantid1', 'variantid2'])
new_pairs_pos['target'] = 1
new_pairs_neg = pd.DataFrame(list(new_pairs_neg), columns=['variantid1', 'variantid2'])
new_pairs_neg['target'] = 0

In [None]:
# Attach name and category of both products to the derived positive pairs.
# NOTE(review): `cat3` is only derived later in this notebook (on `train_data`),
# and `data` is rebound to a merged feature table in the aggregation section —
# confirm which `data` this cell expects.
pos_product_cols = data[['variantid', 'name', 'cat3']]
new_pairs_pos_features = new_pairs_pos.merge(
    pos_product_cols.add_suffix('1'), on="variantid1", how='inner'
)
new_pairs_pos_features = new_pairs_pos_features.merge(
    pos_product_cols.add_suffix('2'), on="variantid2", how='inner'
)

In [None]:
# Attach name and category of both products to the derived negative pairs.
# NOTE(review): same caveat as for the positive pairs — `data` must be the
# product table and must already contain a `cat3` column.
neg_product_cols = data[['variantid', 'name', 'cat3']]
new_pairs_neg_features = new_pairs_neg.merge(
    neg_product_cols.add_suffix('1'), on="variantid1", how='inner'
)
new_pairs_neg_features = new_pairs_neg_features.merge(
    neg_product_cols.add_suffix('2'), on="variantid2", how='inner'
)

In [None]:
important_bad_cats = set([
    'Видеокарты и графические ускорители',
    'Запчасти для ноутбуков',
    'Мониторы и запчасти',
    'Чехол',
    'Оперативная память',
    'Компьютер',
    'Электронные модули',
    'Сетевые фильтры, разветвители и удлинители',
    'Системы охлаждения для компьютеров'
])

bad_cat = set(['Защитные пленки и стекла', 'Запчасти для аудио/видеотехники'])

In [None]:

# Categories of interest: the union of both hand-picked category sets.
selected_cats = important_bad_cats | bad_cat

# Positives: both products in the same, selected category.
pos_same_cat = new_pairs_pos_features['cat31'] == new_pairs_pos_features['cat32']
mask_pos = pos_same_cat & new_pairs_pos_features['cat31'].isin(selected_cats)

# Negatives: additionally require identical names.
neg_same_cat = new_pairs_neg_features['cat31'] == new_pairs_neg_features['cat32']
neg_same_name = new_pairs_neg_features['name1'] == new_pairs_neg_features['name2']
mask_neg = neg_same_cat & neg_same_name & new_pairs_neg_features['cat31'].isin(selected_cats)


In [None]:
new_positives_bad_cats = new_pairs_pos_features[mask_pos]
new_negatives_bad_cats = new_pairs_neg_features[mask_neg]

In [None]:
new_data = pd.concat([new_negatives_bad_cats, new_positives_bad_cats])

## TRAIN/VAL SPLIT

In [None]:
train_pairs = pd.read_parquet("./hackathon_files_for_participants_ozon/train_pairs.parquet")
train_pairs['variantid1'] = train_pairs['variantid1'].apply(str)
train_pairs['variantid2'] = train_pairs['variantid2'].apply(str)
train_data = pd.read_parquet("./hackathon_files_for_participants_ozon/train_data.parquet") 

In [None]:
# Level-3 category of every product, read from the JSON `categories` column.
train_data["cat3"] = train_data["categories"].apply(lambda raw: json.loads(raw)["3"])
train_data['variantid'] = train_data['variantid'].apply(str)

# Category -> integer id, numbered in value_counts() order.
cat3_counts = train_data["cat3"].value_counts().to_dict()
cat2id = {cat: idx for idx, cat in enumerate(cat3_counts)}

# Categories with at most 1000 products share one extra id, len(cat2id).
rest_group_id = len(cat2id)
train_data["cat3_grouped"] = train_data["cat3"].apply(
    lambda cat: cat2id[cat] if cat3_counts[cat] > 1000 else rest_group_id
)
id2cat = {idx: cat for cat, idx in cat2id.items()}

In [None]:
# Add the category of the first product to every pair (becomes column `cat31`).
first_product_cat = train_data[['variantid', 'cat3']].add_suffix('1')
train_pairs = train_pairs.merge(first_product_cat, on="variantid1", how='inner')

In [None]:
test_pairs = pd.read_parquet("./hackathon_files_for_participants_ozon/test_pairs_wo_target.parquet")
test_pairs['variantid1'] = test_pairs['variantid1'].apply(str)
test_pairs['variantid2'] = test_pairs['variantid2'].apply(str)

In [None]:
sub_example = pd.read_csv("./sub.csv")

In [None]:
from collections import defaultdict

# Number of rows per category group in the example submission, without the
# catch-all 'rest' group; groups that are absent read as 0.
must_have_groups = defaultdict(int)
group_sizes = sub_example.cat3_grouped.value_counts()
for group, size in group_sizes.items():
    if group != 'rest':
        must_have_groups[group] = size

In [None]:
def split_unseen(pairs, must_be_in_train, must_have_groups, val_fraction=0.06, seed=None):
    """Split labelled pairs into train/val so that no variant id is in both.

    Ids are grouped into connected components of the pair graph, and each
    component goes to one side as a whole:

    1. a component containing any id from ``must_be_in_train`` goes to train;
    2. otherwise, if any of its ids belongs to a group whose validation quota
       (``must_have_groups``) is not yet reached, it goes to val;
    3. otherwise it goes to val with probability ``val_fraction``, else train.

    Fixes over the previous version: ``random`` is imported (the random branch
    raised NameError), the component search is iterative (the recursive version
    could exceed the recursion limit on long chains and rescanned the whole
    set at every level), and a plain ``dict`` works for ``must_have_groups``.

    Args:
        pairs: DataFrame whose columns are, in this order, target, variantid1,
            variantid2, category (rows are unpacked positionally); the columns
            ``variantid1``, ``variantid2`` and ``cat31`` are also read by name.
        must_be_in_train: Container of ids that must end up in train.
        must_have_groups: Mapping group -> minimum number of ids wanted in val;
            missing groups count as 0.
        val_fraction: Probability of sending a component to val in step 3.
        seed: Optional seed for a private random generator; ``None`` keeps the
            previous non-deterministic behaviour.

    Returns:
        ``(train, val)``: two lists of ``(target, variantid1, variantid2)``.

    Raises:
        RuntimeError: if a pair touches both train and val ids, or neither.
    """
    import random
    from collections import defaultdict, deque

    rng = random.Random(seed)

    def find_examples(pairs, train_ids, val_ids):
        """Route each pair to train or val according to where its ids live."""
        train = []
        val = []
        for tgt, id1, id2, cat in pairs.values.tolist():
            in_train1 = id1 in train_ids
            in_train2 = id2 in train_ids
            in_val1 = id1 in val_ids
            in_val2 = id2 in val_ids
            if in_train1 and in_val1:
                raise RuntimeError(f"First id both in train and in val!")
            elif in_train1 and in_val2:
                raise RuntimeError(f"First in train and second in val!")
            elif in_train2 and in_val1:
                raise RuntimeError(f"First in val and second in train!")
            elif in_train2 and in_val2:
                raise RuntimeError(f"Second id both in train and in val!")
            elif in_train1 or in_train2:
                train.append((tgt, id1, id2))
            elif in_val1 or in_val2:
                val.append((tgt, id1, id2))
            else:
                raise RuntimeError("Unknown situation")
        return train, val

    def find_all_ids(cur_set):
        """Return every id reachable from ``cur_set`` through pair links."""
        reached = set(cur_set)
        frontier = deque(reached)
        while frontier:
            node = frontier.popleft()
            for neighbour in id2id[node]:
                if neighbour not in reached:
                    reached.add(neighbour)
                    frontier.append(neighbour)
        return reached

    # Ids already placed in val per group; groups without a quota start at
    # infinity so they can never look "under quota".
    cur_groups = defaultdict(lambda: np.inf, {k: 0 for k in must_have_groups.keys()})

    # Group of an id = category of the first pair it appears in.
    id2group = {}
    for id1, id2, cat in pairs[['variantid1', 'variantid2', 'cat31']].values:
        if id1 not in id2group:
            id2group[id1] = cat
        if id2 not in id2group:
            id2group[id2] = cat

    # Undirected adjacency of the pair graph.
    id2id = defaultdict(set)
    all_ids = set()
    for _, id1, id2, _ in pairs.values.tolist():
        id2id[id1].add(id2)
        id2id[id2].add(id1)
        all_ids.update([id1, id2])

    train_ids = set()
    val_ids = set()

    while all_ids:
        # Take any unassigned id and pull out its whole connected component.
        to_pop = find_all_ids({next(iter(all_ids))})
        all_ids -= to_pop

        if any(x in must_be_in_train for x in to_pop):
            train_ids.update(to_pop)
            continue

        under_quota = any(
            cur_groups[id2group[x]] < must_have_groups.get(id2group[x], 0)
            for x in to_pop
        )
        # Short-circuit keeps the original order: the random draw only happens
        # when no quota applies.
        if under_quota or rng.random() <= val_fraction:
            val_ids.update(to_pop)
            for x in to_pop:
                cur_groups[id2group[x]] += 1
        else:
            train_ids.update(to_pop)

    return find_examples(pairs, train_ids, val_ids)

In [None]:
train_p = set((x, y) for x, y in train_pairs[['variantid1', 'variantid2']].values.tolist())
test_p = set((x, y) for x, y in test_pairs[['variantid1', 'variantid2']].values.tolist())

In [None]:
# Ids taking part in any (variantid1, variantid2) pair that occurs in both the
# train and the test pair lists.
shared_pairs = test_p.intersection(train_p)
must_be_in_train = {variant_id for pair in shared_pairs for variant_id in pair}

In [None]:
train, val = split_unseen(train_pairs, must_be_in_train, must_have_groups)

In [None]:
# Group of an id = category of the first pair it appears in (first seen wins).
id2group = {}
for id1, id2, cat in train_pairs[['variantid1', 'variantid2', 'cat31']].values:
    id2group.setdefault(id1, cat)
    id2group.setdefault(id2, cat)

In [None]:
# Undirected adjacency of the pair graph: id -> ids it shares a pair with.
# Rows are unpacked positionally as (target, variantid1, variantid2, category).
id2id = defaultdict(set)
for _target, left_id, right_id, _cat in train_pairs.values.tolist():
    id2id[left_id].add(right_id)
    id2id[right_id].add(left_id)

In [None]:
val_pairs = pd.DataFrame(val, columns=['target', 'variantid1', 'variantid2'])
train_pairs = pd.DataFrame(train, columns=['target', 'variantid1', 'variantid2'])

In [None]:
val_pairs['cat3'] = val_pairs.variantid1.apply(lambda x: id2group[x])

In [None]:
test_gr = {k: v for k, v in must_have_groups.items() if v > 0}
cur_groups = {k: v for k, v in val_pairs.cat3.value_counts().items() if k in test_gr}

In [None]:
# Trim over-represented groups in the validation split: while a group's pair
# count exceeds its target (`must_have_groups[cat]`) by more than 400, mark both
# ids of the pair for moving and decrement the running count.
# The outcome depends on row order, because the count shrinks as rows are visited.
val_to_change = set()
for id1, id2, cat in val_pairs[['variantid1', 'variantid2', 'cat3']].values:
    if cat in cur_groups:
        if cur_groups[cat] > must_have_groups[cat] + 400:
            val_to_change.update([id1, id2])
            cur_groups[cat] -= 1
# Also mark the direct neighbours of every marked id (one hop only, not the
# whole connected component).
to_add = []
for x in val_to_change:
    to_add += list(id2id[x])
val_to_change.update(to_add)

In [None]:
mask = (val_pairs.variantid1.isin(val_to_change) & val_pairs.variantid2.isin(val_to_change))

In [None]:
new_val_pairs = val_pairs[~mask]

In [None]:
new_train_pairs = pd.concat([train_pairs, val_pairs[mask][['target', 'variantid1', 'variantid2',]]], axis=0)