In [262]:
import gdown
import pandas as pd

In [263]:
# gdown.download('https://drive.google.com/file/d/17NBXx6l_6znStM52RJOKKKlxnHigavzq/view', fuzzy=True)

In [264]:
# tar -xvzf hackathon_objects.tar.gz

In [265]:
# Product table: one row per `variantid` with attributes, name, categories,
# parsed colour and precomputed picture / name embeddings (previewed below).
goods_data = pd.read_parquet('files/datasets/train_data.parquet')

# Labelled product pairs: `variantid1`, `variantid2` and a binary `target`.
goods_pairs = pd.read_parquet('files/datasets/train_pairs_w_target.parquet')

In [266]:
# Preview the first labelled pairs (target, variantid1, variantid2).
goods_pairs.head()

Unnamed: 0,target,variantid1,variantid2
0,0.0,51197862,51198054
1,1.0,53062686,536165289
2,1.0,53602615,587809782
3,1.0,53888651,89598677
4,0.0,56930698,551526166


In [267]:
# Preview the product table; note that some cells (e.g. color_parsed,
# pic_embeddings_resnet_v1) can be empty for individual rows.
goods_data.head()

Unnamed: 0,variantid,characteristic_attributes_mapping,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64
0,51195767,"{""85"":[""Партнер-Электро""],""8229"":[""Удлинитель ...","Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[оранжевый],,"[[0.04603629, 0.18839523, -0.09973055, -0.6636...","[-0.47045058, 0.67237014, 0.48984158, -0.54485..."
1,51196903,"{""85"":[""TDM Electric""],""8229"":[""Удлинитель быт...",Колодка TDM Electric пятиместная без заземлени...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[белый],"[[0.42044494, -0.33794826, -0.037247024, 0.165...","[[0.16211876, -0.4455993, 0.6131705, 0.5954206...","[-0.48503304, 0.6264443, 0.6406273, -0.4696772..."
2,52061880,"{""85"":[""MINAMOTO""],""8229"":[""Батарейка""],""5111""...",Батарейка литиевая CR 1/3N 3V (CR11108) Minamo...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Батарей...",,"[[0.66729844, -0.023996592, 0.42721167, 0.0205...","[[-0.8638098, -0.1625915, -0.034600798, 0.1337...","[-0.15832633, 0.44736174, 0.31883216, -0.55559..."
3,52067481,"{""85"":[""Duracell""],""8229"":[""Батарейка""],""5111""...","Батарейки DURACELL Ultra Power AA (LR6), 4 шт","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Батарей...",,"[[0.3516068, 0.6306597, 0.80962396, -0.0104141...","[[-0.26286322, 0.22858405, 0.4346232, 0.023486...","[-0.42703134, 0.16898727, 0.53922343, -0.53523..."
4,52610752,"{""85"":[""HP""],""8229"":[""Картридж""],""5708"":[""для ...",Картридж лазерный HP 12A Q2612A черный для LJ ...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Расходн...","[черный, чер]","[[-0.009966308, 0.70965785, 0.10790472, -0.411...","[[0.09032486, 0.74205226, 0.30355096, -0.04755...","[-0.4892143, 0.47568643, 0.6505941, -0.4106509..."


In [268]:
# The main-picture embedding is stored nested one level deep ([[...]]):
# row label 0 -> first inner vector -> first component of that vector.
goods_data['main_pic_embeddings_resnet_v1'][0][0][0]

0.04603629

In [269]:
def symm_pairs(df):
    """Return ``df`` extended with the mirrored version of every pair.

    For each row ``(variantid1=a, variantid2=b, ...)`` a row with the two ID
    columns swapped is appended, so that every pair can be found through
    ``variantid1`` regardless of the order in which it was labelled. All other
    columns (e.g. ``target``) are carried over unchanged.

    Args:
        df: DataFrame containing ``variantid1`` and ``variantid2`` columns.
            It is not modified.

    Returns:
        A new DataFrame holding the original rows followed by the swapped
        rows, with exact duplicate rows removed. The original index labels
        are kept, so the index of the result is generally not unique.

    Raises:
        UserWarning: if either ID column is missing from ``df``.
    """
    # todo: complete cliques of goods
    cols = ["variantid1", "variantid2"]

    # Validate before doing any work and say which column is missing.
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise UserWarning(f"symm_pairs: missing required column(s): {missing}")

    # rename() already returns a new frame, so no explicit copy is needed.
    swapped = df.rename(columns={cols[0]: cols[1], cols[1]: cols[0]})

    # concat aligns on column names, so the swapped IDs land in the right columns.
    return pd.concat([df, swapped]).drop_duplicates()

In [270]:
import torch
import random
import numpy as np
from torch.utils.data import Dataset, DataLoader


class ContrastiveDataset(Dataset):
    """Map-style dataset yielding ``(anchor, positive, negative)`` triplets.

    Item ``i`` is built from row ``i`` (by position) of ``original_dataset``:

    * anchor   - the row's embedding columns concatenated into one 1-D tensor;
    * positive - embedding of a random partner from a ``target == 1`` pair
      whose ``variantid1`` is the anchor's id, or the anchor itself when no
      such pair exists;
    * negative - embedding of a random partner from a ``target == 0`` pair
      whose ``variantid1`` is the anchor's id, or a uniformly random row when
      no such pair exists (that random row is not checked against the anchor
      or its positives).

    Only ``variantid1`` is searched, so pass a symmetrised pairs table (see
    ``symm_pairs``) if pairs should be found from either side.

    Args:
        original_dataset: DataFrame with a ``variantid`` column and the
            embedding columns. Any index is accepted; rows are always
            addressed by position.
        pairs_dataset: DataFrame with ``variantid1``, ``variantid2`` and
            ``target`` columns.
        embedding_columns: names of the columns to concatenate, in order.

    The lookup tables are built once in ``__init__``; changes made to the
    DataFrames afterwards are not reflected.
    """

    def __init__(self, original_dataset, pairs_dataset, embedding_columns):
        self.original_dataset = original_dataset
        self.pairs_dataset = pairs_dataset
        self.embedding_columns = embedding_columns

        # Plain-Python copy of the id column, so ids used as dict keys are
        # native scalars and per-item access avoids pandas overhead.
        self._variant_ids = original_dataset["variantid"].tolist()

        # variantid -> row *position*. get_embeddings() is positional (.iloc),
        # so positions (not index labels) must be used everywhere. The first
        # occurrence wins if an id is duplicated.
        self._position_by_id = {}
        for position, variant_id in enumerate(self._variant_ids):
            self._position_by_id.setdefault(variant_id, position)

        # variantid1 -> list of variantid2, split by label. Built once instead
        # of scanning the whole pairs table for every item.
        self._positive_partners = self._build_partner_lookup(pairs_dataset, 1)
        self._negative_partners = self._build_partner_lookup(pairs_dataset, 0)

    @staticmethod
    def _build_partner_lookup(pairs_dataset, target_value):
        """Map each ``variantid1`` to its ``variantid2`` partners for one label."""
        labelled = pairs_dataset[pairs_dataset["target"] == target_value]
        lookup = {}
        for anchor_id, partner_id in zip(
            labelled["variantid1"].tolist(), labelled["variantid2"].tolist()
        ):
            lookup.setdefault(anchor_id, []).append(partner_id)
        return lookup

    def _position_of(self, variant_id):
        """Return the row position of ``variant_id`` in ``original_dataset``.

        Raises:
            KeyError: if the id does not occur in the ``variantid`` column.
        """
        try:
            return self._position_by_id[variant_id]
        except KeyError:
            raise KeyError(
                f"variantid {variant_id!r} is not present in original_dataset"
            ) from None

    def __getitem__(self, index):
        """Return the ``(anchor, positive, negative)`` tensors for row ``index``."""
        # Get original data
        original_data = self.get_embeddings(index)
        original_id = self._variant_ids[index]

        # Find positive and negative pairs
        positive_pair = self.find_positive_pair(original_id)
        negative_pair = self.find_negative_pair(original_id)

        return original_data, positive_pair, negative_pair

    def __len__(self):
        """Number of rows in ``original_dataset``."""
        return len(self.original_dataset)

    def get_embeddings(self, index):
        """Concatenate the embedding columns of the row at *position* ``index``.

        A value whose first dimension has length 1 (e.g. a vector wrapped in
        an outer array) is unwrapped before concatenation.
        """
        embeddings = []
        for column in self.embedding_columns:
            embedding = self.original_dataset[column].iloc[index]
            if embedding.shape[0] == 1:
                embedding = embedding[0]
            embeddings.append(embedding)
        return torch.tensor(np.concatenate(embeddings))

    def find_positive_pair(self, original_id):
        """Return the embedding of a random positive partner of ``original_id``.

        Falls back to the embedding of ``original_id`` itself when it has no
        positive pair.
        """
        partner_ids = self._positive_partners.get(original_id)
        if partner_ids:
            # Randomly choose a positive partner if several are available
            pair_id = random.choice(partner_ids)
            return self.get_embeddings(self._position_of(pair_id))

        # Return original data if positive pair is not found
        return self.get_embeddings(self._position_of(original_id))

    def find_negative_pair(self, original_id):
        """Return the embedding of a random negative partner of ``original_id``.

        Falls back to a uniformly random row when it has no negative pair.
        """
        partner_ids = self._negative_partners.get(original_id)
        if partner_ids:
            # Randomly choose a negative partner if several are available
            pair_id = random.choice(partner_ids)
            return self.get_embeddings(self._position_of(pair_id))

        # Return random data if negative pair is not found. A random *position*
        # is drawn because get_embeddings() is positional.
        return self.get_embeddings(random.randrange(len(self.original_dataset)))


# Usage example: a tiny synthetic table to smoke-test the triplet pipeline.
# Five products, each with a random 2-d and a random 3-d embedding.
original_dataset = pd.DataFrame(
    {
        "variantid": list(range(1, 6)),
        "main_pic_embeddings_resnet_v1": [np.random.rand(2) for _ in range(5)],
        "other_embeddings": [np.random.rand(3) for _ in range(5)],
    }
)

# A ring of pairs 1-2-3-4-5-1 with mixed labels.
pairs_dataset = pd.DataFrame(
    dict(
        variantid1=[1, 2, 3, 4, 5],
        variantid2=[2, 3, 4, 5, 1],
        target=[1, 1, 0, 1, 0],
    )
)

# Mirror every pair so each product can be looked up via variantid1.
pairs_dataset_symm = symm_pairs(pairs_dataset)

embedding_columns = ["main_pic_embeddings_resnet_v1", "other_embeddings"]

contrastive_dataset = ContrastiveDataset(
    original_dataset=original_dataset,
    pairs_dataset=pairs_dataset_symm,
    embedding_columns=embedding_columns,
)
dataloader = DataLoader(dataset=contrastive_dataset, batch_size=2, shuffle=True)

# Each batch is an (anchor, positive, negative) triple of stacked tensors.
for original_data, positive_pair, negative_pair in dataloader:
    # Use the data for training or inference
    print("Original Data:", original_data)
    print("Positive Pair:", positive_pair)
    print("Negative Pair:", negative_pair)


Original Data: tensor([[0.2153, 0.6319, 0.5349, 0.7366, 0.1102],
        [0.7397, 0.5251, 0.0387, 0.9687, 0.2887]], dtype=torch.float64)
Positive Pair: tensor([[0.4922, 0.8236, 0.9081, 0.6742, 0.3004],
        [0.4097, 0.2647, 0.2717, 0.2404, 0.0680]], dtype=torch.float64)
Negative Pair: tensor([[0.4097, 0.2647, 0.2717, 0.2404, 0.0680],
        [0.4922, 0.8236, 0.9081, 0.6742, 0.3004]], dtype=torch.float64)
Original Data: tensor([[0.4922, 0.8236, 0.9081, 0.6742, 0.3004],
        [0.9249, 0.8772, 0.0431, 0.6729, 0.5220]], dtype=torch.float64)
Positive Pair: tensor([[0.2153, 0.6319, 0.5349, 0.7366, 0.1102],
        [0.2153, 0.6319, 0.5349, 0.7366, 0.1102]], dtype=torch.float64)
Negative Pair: tensor([[0.7397, 0.5251, 0.0387, 0.9687, 0.2887],
        [0.4097, 0.2647, 0.2717, 0.2404, 0.0680]], dtype=torch.float64)
Original Data: tensor([[0.4097, 0.2647, 0.2717, 0.2404, 0.0680]], dtype=torch.float64)
Positive Pair: tensor([[0.7397, 0.5251, 0.0387, 0.9687, 0.2887]], dtype=torch.float64)
Nega

# Actual data

In [271]:
# Mirror the labelled pairs so every product with a pair appears in variantid1.
symm_goods_pairs = symm_pairs(goods_pairs)

In [272]:
# Columns concatenated into the model input: main picture + name embedding.
embedding_columns = ['main_pic_embeddings_resnet_v1', 'name_bert_64']

# Rebinds `contrastive_dataset` (previously the toy example) to the real data.
contrastive_dataset = ContrastiveDataset(goods_data, symm_goods_pairs, embedding_columns)
# One sample triplet: (anchor, positive, negative).
a = contrastive_dataset[0]

In [273]:
# Length of the concatenated anchor embedding.
a[0].shape

torch.Size([192])

In [274]:
# Batches of 128 triplets, served in table order.
# NOTE(review): shuffle=False means the training loop below sees the rows in
# the same order every epoch - confirm this is intended.
contrastive_dataloader = DataLoader(contrastive_dataset, batch_size=128, shuffle=False)
# Sanity check: shape of the anchor tensor in the first batch.
next(iter(contrastive_dataloader))[0].shape

torch.Size([128, 192])

# Basic training example

In [275]:
# Inspect one product (row position 12); its variantid is looked up in the
# pairs tables further down.
goods_data.iloc[12]

variantid                                                                     53929742
characteristic_attributes_mapping    {"85":["HP"],"8229":["Картридж"],"5708":["для ...
name                                 Картридж струйный HP (CZ109AE) Deskjet Ink Adv...
categories                           {"1": "EPG", "2": "Электроника", "3": "Расходн...
color_parsed                                                                  [черный]
pic_embeddings_resnet_v1             [[1.2208006, 0.66162866, 0.49201033, 0.1131612...
main_pic_embeddings_resnet_v1        [[0.9229982, -0.17476001, 0.37529987, 0.263404...
name_bert_64                         [-0.19159806, 0.5509367, 0.50015163, -0.362934...
Name: 12, dtype: object

In [276]:
# (rows, columns) of the product table.
goods_data.shape

(457063, 8)

In [277]:
# Number of distinct variantids; equal to the row count when ids are unique.
goods_data['variantid'].unique().shape

(457063,)

In [278]:
# Left-hand ids of all labelled pairs.
# NOTE(review): concatenating a single Series is a no-op here; this displays
# the same values as goods_pairs['variantid1'].
pd.concat([goods_pairs['variantid1']])

0          51197862
1          53062686
2          53602615
3          53888651
4          56930698
            ...    
306535    817327230
306536    817560551
306537    817854719
306538    820036017
306539    821514120
Name: variantid1, Length: 306540, dtype: int64

In [279]:
# Number of distinct variantids occurring on either side of any labelled pair.
# Compare with the number of products above: if it is smaller, some products
# take part in no pair at all.
pd.concat([goods_pairs['variantid1'], goods_pairs['variantid2']]).unique().shape

(456741,)

In [280]:
# Does the product from row 12 appear as the left-hand id of any pair?
goods_pairs[goods_pairs['variantid1'] == 53929742]

Unnamed: 0,target,variantid1,variantid2


In [281]:
# Same lookup in the mirrored table, where variantid1 covers both sides of
# every pair; an empty result means the product has no labelled pair.
symm_goods_pairs[symm_goods_pairs['variantid1'] == 53929742]

Unnamed: 0,target,variantid1,variantid2


In [282]:
# Anchor embedding length for another item, as a spot check.
contrastive_dataset[25][0].shape

torch.Size([192])

In [283]:
# Pull only the first batch from the loader and show the anchor batch shape.
for i, (original_data, positive_pair, negative_pair) in enumerate(contrastive_dataloader):
    print(i, original_data.shape)
    break  # one batch is enough for the check

0 torch.Size([128, 192])


In [284]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from tqdm.notebook import tqdm

In [285]:
#!g1.1

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
#!g1.1
class ContrastiveModel(nn.Module):
    """Small MLP that projects an input vector into an embedding space.

    Layout::

        Linear(input_size -> hidden_size) -> ReLU -> Dropout(p=0.2)
        Linear(hidden_size -> hidden_size) -> ReLU
        Linear(hidden_size -> embedding_size)

    The output layer has no activation.

    Args:
        input_size: number of input features.
        hidden_size: width of both hidden layers.
        embedding_size: number of output features.
    """

    def __init__(self, input_size, hidden_size, embedding_size):
        super().__init__()

        # Sizes are kept as plain attributes for later inspection.
        self.input_size, self.hidden_size, self.embedding_size = (
            input_size,
            hidden_size,
            embedding_size,
        )

        # Trainable layers.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, embedding_size)

        # Parameter-free layers shared by the forward pass.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        """Map ``x`` to its embedding; dropout is applied after the first layer only."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

def triplet_loss(anchor, positive, negative, margin=1.0):
    """Mean hinge triplet loss over a batch.

    For every row the loss is ``max(d(anchor, positive) - d(anchor, negative)
    + margin, 0)``, where ``d`` is the norm of the difference taken along
    dimension 1. The per-row values are averaged into a scalar tensor.

    Args:
        anchor, positive, negative: tensors of identical shape whose
            dimension 1 holds the embedding components.
        margin: required gap between negative and positive distances.

    Returns:
        A 0-dimensional tensor with the batch-mean loss.
    """
    pos_dist = (anchor - positive).norm(dim=1)
    neg_dist = (anchor - negative).norm(dim=1)
    # Rows whose negative is already farther than the positive by at least
    # `margin` contribute zero.
    hinge = (pos_dist - neg_dist + margin).clamp(min=0.0)
    return hinge.mean()

# Example parameters
input_size = 192  # length of the concatenated embedding yielded by contrastive_dataset
hidden_size = 64
embedding_size = 32
learning_rate = 0.001
num_epochs = 10

# Create an instance of the model
model = ContrastiveModel(input_size, hidden_size, embedding_size).to(device)

# Define the optimizer (the loss is the triplet_loss function)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train loop
for epoch in tqdm(range(num_epochs)):
    # Make sure dropout is active even if the model was switched to eval mode
    # by an earlier run of some other cell.
    model.train()
    total_loss = 0.0

    # Inner bar is discarded after each epoch so finished bars do not pile up.
    batch_progress = tqdm(contrastive_dataloader, leave=False)
    for original_data, positive_pair, negative_pair in batch_progress:
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass. Inputs are cast to float32 to match the model
        # parameters (a no-op when the embeddings are already float32).
        anchor = model(original_data.to(device, dtype=torch.float32))
        positive = model(positive_pair.to(device, dtype=torch.float32))
        negative = model(negative_pair.to(device, dtype=torch.float32))

        # Compute the triplet loss
        loss = triplet_loss(anchor, positive, negative)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Show the running batch loss on the progress bar instead of printing
        # one line per batch, which floods the cell output.
        batch_loss = loss.item()
        total_loss += batch_loss
        batch_progress.set_postfix(loss=f"{batch_loss:.4f}")

    # max(..., 1) avoids a ZeroDivisionError for an empty loader.
    average_loss = total_loss / max(len(contrastive_dataloader), 1)
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss:.4f}")


In [None]:
#!g1.1
