In [None]:
# Mount Google Drive at /content/drive; every data path used later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install faiss_gpu editdistance
!pip install timm
!pip install cupy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss_gpu
  Downloading faiss_gpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[K     |████████████████████████████████| 85.5 MB 1.2 MB/s 
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting timm
  Downloading timm-0.6.7-py3-none-any.whl (509 kB)
[K     |████████████████████████████████| 509 kB 8.5 MB/s 
Installing collected packages: timm
Successfully installed timm-0.6.7


In [None]:
%cd /content/drive/MyDrive/AIC_HCM/Video_Retrieval/Top2_Shopee

# Number of nearest neighbours kept per item when the precomputed kNN arrays are sliced.
k = 50
# Confidence threshold; not referenced in the cells visible here - TODO confirm where it is used.
conf_th = 0.7

import pandas as pd
from pathlib import Path


def load_data():
    """Load the posting table and the image directory.

    The CSV is read once and stacked on top of itself, so the returned frame has
    twice as many rows as the file, with a fresh 0..2n-1 index. Only the
    ``posting_id``, ``image`` and ``title`` columns are loaded.

    Returns:
        tuple: ``(df, img_dir)`` where ``df`` is the duplicated DataFrame and
        ``img_dir`` is a ``pathlib.Path`` to the image folder.
    """
    csv_path = '/content/drive/MyDrive/AIC_HCM/Video_Retrieval/Top2_Shopee/features_npy/train.csv'
    base = pd.read_csv(csv_path, usecols=['posting_id', 'image', 'title'])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. Reusing `base` also avoids parsing the same file twice.
    df = pd.concat([base, base], ignore_index=True)
    img_dir = Path('/content/drive/MyDrive/AIC_HCM/DOLG/DOLG-pytorch/dataset/data/train')
    return df, img_dir

/content/drive/.shortcut-targets-by-id/1m0rrLIYvPFV92b1Sh7VRMwu4Gyv4k4Vo/AIC_HCM/Video_Retrieval/Top2_Shopee


## GAT (graph attention network) pair classifier

In [None]:
import sys
# sys.path.append('../input/timm045/')
import timm

from itertools import zip_longest
import json
import math
import gc
import os
from pathlib import Path

import faiss
import numpy as np
# import cupy as cp
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm
from PIL import Image
import joblib
import lightgbm as lgb
from scipy.sparse import hstack, vstack, csc_matrix, csr_matrix
import editdistance
import networkx as nx

import string
import nltk
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

NUM_CLASSES = 11014
NUM_WORKERS = 2
SEED = 0

class GraphDataset(Dataset):
    """Dataset of item pairs, each with the features of both items' top neighbours.

    Each sample corresponds to one ``(i, j)`` entry of ``pair_tuples`` and yields:

    * ``feat``: ``feats[i][j]`` as a float tensor,
    * ``neighbor_feats``: ``feat`` followed by ``k`` rows for ``i``'s neighbours and
      ``k`` rows for ``j``'s neighbours (zero rows pad short neighbour lists),
    * optionally ``labels[i] == labels[j]`` when ``labels`` is given,
    * optionally ``weights[i]`` when ``weights`` is given.
    """

    def __init__(self, feats=None, labels=None, weights=None, pair_tuples=None, k=50, top_neighbors=None):
        self.feats = feats
        self.labels = labels
        self.weights = weights
        self.pair_tuples = pair_tuples
        self.k = k
        self.top_neighbors = top_neighbors

    def _neighbor_block(self, node, width):
        """Return the neighbour feature rows of ``node``, zero-padded up to ``k`` rows."""
        rows = [self.feats[node][other] for other in self.top_neighbors[node]]
        missing = self.k - len(rows)
        # A non-positive `missing` adds nothing, exactly like multiplying a list by it.
        rows.extend([0] * width for _ in range(missing))
        return torch.FloatTensor(rows)

    def __getitem__(self, index):
        left, right = self.pair_tuples[index]
        pair_feat = torch.FloatTensor(self.feats[left][right])
        width = pair_feat.shape[0]

        # Row 0 is the pair itself, then the neighbours of `left`, then those of `right`.
        stacked = torch.cat(
            [
                pair_feat.unsqueeze(0),
                self._neighbor_block(left, width),
                self._neighbor_block(right, width),
            ],
            dim=0,
        )

        sample = (pair_feat, stacked)
        if self.labels is not None:
            sample = sample + (self.labels[left] == self.labels[right],)
        if self.weights is not None:
            sample = sample + (self.weights[left],)
        return sample

    def __len__(self):
        return len(self.pair_tuples)


class GraphAttentionLayer(nn.Module):
    """One attention head applied to a batch of node sets.

    Input ``h`` has shape ``(B, N, in_features)``; the output has shape
    ``(B, N, out_features)``. Every node attends to every node (no adjacency mask).

    Args:
        in_features: size of each input node vector.
        out_features: size of each projected node vector.
        dropout: dropout probability applied to the attention weights.
        alpha: negative slope of the LeakyReLU applied to the raw scores.
        concat: when True the output is passed through ELU, otherwise returned as is.
    """

    def __init__(self, in_features, out_features, dropout=0.6, alpha=0.2, concat=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.dropout = dropout
        self.alpha = alpha
        self.concat = concat

        # The parameter names `W` and `a` are part of the state-dict keys.
        # W is initialised before a, so the RNG is consumed in the same order.
        self.W = nn.Parameter(torch.empty(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)
        self.a = nn.Parameter(torch.empty(size=(2 * out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, h):
        """Project ``h``, score every node pair, and return the attention-weighted sum."""
        projected = torch.matmul(h, self.W)  # (B, N, out_features)
        pair_inputs = self._prepare_attentional_mechanism_input(projected)  # (B, N, N, 2 * out_features)
        scores = self.leakyrelu((pair_inputs @ self.a).squeeze(3))  # (B, N, N)

        # Softmax runs over dim=1 of the (B, N, N) score tensor.
        # NOTE(review): changing this axis would change the outputs of any
        # checkpoint trained with this layer - leave as is unless retraining.
        weights = F.dropout(F.softmax(scores, dim=1), self.dropout, training=self.training)
        aggregated = torch.bmm(weights, projected)

        return F.elu(aggregated) if self.concat else aggregated

    def _prepare_attentional_mechanism_input(self, Wh):
        """Build a ``(B, N, N, 2 * D)`` tensor whose ``[b, i, j]`` entry is ``cat(Wh[b, i], Wh[b, j])``."""
        _, nodes, dim = Wh.shape

        # Each row repeated N times in a run: w0, w0, ..., w1, w1, ...
        first_of_pair = Wh.repeat_interleave(nodes, dim=1)
        # The whole sequence tiled N times: w0, w1, ..., w0, w1, ...
        second_of_pair = Wh.repeat(1, nodes, 1)

        paired = torch.cat((first_of_pair, second_of_pair), dim=2)
        return paired.view(-1, nodes, nodes, 2 * dim)

    def __repr__(self):
        return f'{self.__class__.__name__} ({self.in_features} -> {self.out_features})'


class GATPairClassifier(nn.Module):
    """Pair classifier combining raw pair features with a multi-head GAT summary.

    ``nheads`` attention heads run on the neighbour feature set, their outputs are
    concatenated, passed through a final attention layer, pooled to one vector per
    sample, concatenated with the raw pair features and fed to a small MLP.

    Args:
        nfeat: number of features per pair / per neighbour row.
        nhid: hidden size of each attention head and of the MLP.
        nclass: number of outputs of the final linear layer.
        dropout: dropout probability used on inputs and attention weights.
        alpha: LeakyReLU negative slope used inside the attention layers.
        nheads: number of parallel attention heads.
        pooling: ``'first'`` (take node 0) or ``'mean'`` (average over nodes).

    Raises:
        ValueError: if ``pooling`` is not one of the supported modes.
    """

    _SUPPORTED_POOLING = ('first', 'mean')

    def __init__(self, nfeat, nhid=8, nclass=1, dropout=0.6, alpha=0.2, nheads=8, pooling='first'):
        super().__init__()
        # Fail fast: an unknown pooling mode used to make forward_gat return None,
        # which only surfaced later as an obscure error inside torch.cat.
        if pooling not in self._SUPPORTED_POOLING:
            raise ValueError(
                'pooling must be one of {}, got {!r}'.format(self._SUPPORTED_POOLING, pooling)
            )
        self.dropout = dropout
        self.pooling = pooling

        # Heads live in a plain list and are registered as 'attention_{i}'. Those
        # names become state-dict keys, so they are kept unchanged (an nn.ModuleList
        # would rename them and stop existing checkpoints from loading).
        self.attentions = [GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True) for _ in range(nheads)]
        for i, attention in enumerate(self.attentions):
            self.add_module('attention_{}'.format(i), attention)

        self.out_att = GraphAttentionLayer(nhid * nheads, nhid, dropout=dropout, alpha=alpha, concat=False)

        self.classifier = nn.Sequential(
            nn.Linear(nfeat + nhid, nhid),
            nn.PReLU(),
            nn.BatchNorm1d(nhid),
            nn.Linear(nhid, nclass),
        )

    def forward_gat(self, x):
        """Run the attention stack on ``x`` of shape ``(B, N, nfeat)`` and pool to ``(B, nhid)``."""
        x = F.dropout(x, self.dropout, training=self.training)
        x = torch.cat([att(x) for att in self.attentions], dim=2)
        x = F.dropout(x, self.dropout, training=self.training)
        x = F.elu(self.out_att(x))
        if self.pooling == 'first':
            return x[:, 0]
        if self.pooling == 'mean':
            return x.mean(dim=1)
        # Reachable only if `pooling` was reassigned after construction.
        raise ValueError(
            'pooling must be one of {}, got {!r}'.format(self._SUPPORTED_POOLING, self.pooling)
        )

    def forward(self, feats, neighbor_feats):
        """Return one logit per sample (the output is squeezed on dim 1)."""
        gat_feats = self.forward_gat(neighbor_feats)
        cat_feats = torch.cat([feats, gat_feats], dim=1)
        return self.classifier(cat_feats).squeeze(1)


import time
from contextlib import contextmanager
from collections import defaultdict
# Accumulated seconds per timer title, summed over every use of `timer`.
map_used_time = defaultdict(float)
@contextmanager
def timer(title):
    """Time the enclosed block, add the duration to ``map_used_time[title]`` and print it.

    The duration is recorded even when the block raises, so a failing step still
    shows up in the totals; the exception itself propagates unchanged.
    """
    # perf_counter is monotonic, so the measurement is not affected by clock adjustments.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        tt = time.perf_counter() - t0
        map_used_time[title] += tt
        print("  {} - done in {:.5f}s".format(title, tt))


# Posting table plus the image directory (img_dir is not used again in the visible cells).
df, img_dir = load_data()

# Words excluded from the TF-IDF vocabulary (presumably promotional filler terms).
stop_words = set([
    'promo','diskon','baik','terbaik', 'murah',
    'termurah', 'harga', 'price', 'best', 'seller',
    'bestseller', 'ready', 'stock', 'stok', 'limited',
    'bagus', 'kualitas', 'berkualitas', 'hari', 'ini',
    'jadi', 'gratis',
])

# Build the punctuation -> space table once instead of once per title.
punct_to_space = str.maketrans({_: ' ' for _ in string.punctuation})
titles = [
    title.translate(punct_to_space)
    for title in df['title'].str.lower().values
]

tokenizer = TweetTokenizer()
# stop_words is passed as a list: scikit-learn's parameter validation accepts
# 'english', a list or None, and newer releases reject a set.
# NOTE(review): scikit-learn documents token_pattern as unused when a custom
# tokenizer is supplied; it is kept here unchanged.
tfidf_vectorizer = TfidfVectorizer(stop_words=sorted(stop_words),
                                   binary=True,
                                   min_df=2,
                                   token_pattern='(?u)\\b\\w+\\b',
                                   tokenizer=tokenizer.tokenize,
                                   dtype=np.float32,
                                   norm='l2')
tfidf_feats = tfidf_vectorizer.fit_transform(titles)
# Rows are L2-normalised (norm='l2'), so this product holds pairwise cosine similarities.
simmat_tfidf = tfidf_feats @ tfidf_feats.T
# print(simmat_tfidf.shape)

def _load_knn_table(similarity_path, index_path):
    """Load one precomputed kNN result and build its pair lookup.

    Both arrays are truncated to the first ``k`` columns. The lookup maps
    ``(query_row, neighbour_index)`` to the stored similarity.

    Returns:
        tuple: ``(similarities, indexes, lookup)``.
    """
    similarities = np.load(similarity_path)[:, :k]
    indexes = np.load(index_path)[:, :k]
    # Query row number repeated k times, lining up with the flattened neighbour lists.
    query_rows = np.arange(len(indexes)).repeat(k)
    lookup = {
        (query, neighbor): sim
        for query, neighbor, sim in zip(query_rows, indexes.ravel(), similarities.ravel())
    }
    return similarities, indexes, lookup


with timer('load'):
    # NOTE: joblib.load unpickles the file - only load files from a trusted source.
    st_sizes, img_hs, img_ws = joblib.load('/content/drive/MyDrive/AIC_HCM/Video_Retrieval/Top2_Shopee/features_npy/lyk_img_meta_data.pkl')

    similarities_img, indexes_img, simmat_img = _load_knn_table(
        '/content/drive/MyDrive/AIC_HCM/Video_Retrieval/Top2_Shopee/features_npy/img_D_qe.npy',
        '/content/drive/MyDrive/AIC_HCM/Video_Retrieval/Top2_Shopee/features_npy/img_I_qe.npy',
    )
    similarities_bert, indexes_bert, simmat_bert = _load_knn_table(
        '/content/drive/MyDrive/AIC_HCM/Video_Retrieval/Top2_Shopee/features_npy/brt_D_qe.npy',
        '/content/drive/MyDrive/AIC_HCM/Video_Retrieval/Top2_Shopee/features_npy/brt_I_qe.npy',
    )
    similarities_mm, indexes_mm, simmat_mm = _load_knn_table(
        '/content/drive/MyDrive/AIC_HCM/Video_Retrieval/Top2_Shopee/features_npy/mut_D_qe.npy',
        '/content/drive/MyDrive/AIC_HCM/Video_Retrieval/Top2_Shopee/features_npy/mut_I_qe.npy',
    )

# The flattened temporaries now live inside the helper, so there is nothing to `del` here.
gc.collect()

# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
# map_location='cpu' makes loading independent of the device the checkpoint was
# saved from; load_state_dict later copies the weights onto the model's device.
ckpt = torch.load('/content/drive/MyDrive/AIC_HCM/Video_Retrieval/Top2_Shopee/features_npy/v135.pth',
                  map_location='cpu')
params = ckpt['params']

# top_neighbors[i]: up to params['k'] candidate indexes of item i, best summed score first.
top_neighbors = defaultdict(list)
# feats[i][j]: [sim_img, sim_tfidf, sim_bert, sim_mm] for the candidate pair (i, j).
# defaultdict(dict) behaves like the former lambda-based factory but is picklable.
feats = defaultdict(dict)

pair_tuples = []
for i in tqdm(range(len(df))):
    right_indexes = set(indexes_img[i, :k].tolist() + indexes_bert[i, :k].tolist())
    # discard (not remove): an item is not guaranteed to appear in its own
    # neighbour list, and remove() would raise KeyError in that case.
    right_indexes.discard(i)  # remove self

    scores = {}
    for j in right_indexes:
        sim_img = simmat_img.get((i, j), 0)
        sim_bert = simmat_bert.get((i, j), 0)
        if sim_img == 0 and sim_bert == 0:
            # Skipped before the pair is recorded, so pair_tuples, feats and
            # scores always describe exactly the same set of pairs.
            continue
        sim_mm = simmat_mm.get((i, j), 0)
        sim_tfidf = simmat_tfidf[i, j]

        pair_tuples.append((i, j))
        feats[i][j] = [
            sim_img,
            sim_tfidf,
            sim_bert,
            sim_mm,
        ]
        scores[j] = sim_img + sim_tfidf + sim_bert + sim_mm

    # Rank only the candidates that actually have a score (and therefore features).
    top_neighbors[i] = sorted(scores, key=scores.get, reverse=True)[:params['k']]

dataset = GraphDataset(
    feats=feats,
    pair_tuples=pair_tuples,
    k=params['k'],
    top_neighbors=top_neighbors,
)
# Use the NUM_WORKERS constant defined with the other settings instead of a literal.
loader = DataLoader(dataset, batch_size=2 ** 5, shuffle=False, drop_last=False,
                    num_workers=NUM_WORKERS, pin_memory=True)

# Take the feature width from any stored pair vector rather than from the loop
# variables i, j left over by the pair-building loop, which may point at a pair
# that has no features.
nfeat = len(next(iter(next(iter(feats.values())).values())))
gat = GATPairClassifier(nfeat=nfeat, nhid=params['nhid'],
                        dropout=params['dropout'], nheads=params['nheads'], pooling=params['pooling'])
gat.to('cuda').eval()
gat.load_state_dict(ckpt['model'])

# The TF-IDF matrix is no longer needed once simmat_tfidf has been built from it.
del tfidf_feats
gc.collect()
###
###

# Score every pair with the GAT classifier; preds[n] is the sigmoid output for pair_tuples[n]
# (the loader is not shuffled, so the order is preserved).
preds = []
with torch.no_grad():
    # Batch variables get their own names so the module-level `feats` dict (still
    # referenced by the dataset) is not overwritten and this section can be re-run.
    for batch_feats, batch_neighbor_feats in tqdm(loader, desc='predict', leave=False):
        batch_feats = batch_feats.to('cuda', non_blocking=True)
        batch_neighbor_feats = batch_neighbor_feats.to('cuda', non_blocking=True)
        batch_pred = gat(batch_feats, batch_neighbor_feats).sigmoid()
        preds.extend(batch_pred.cpu().numpy().tolist())

# Offset subtracted from every prediction before the table is displayed.
conf_th_gcn = 0.3

# Split the (i, j) pairs into source and target row positions into df.
col, row = list(zip(*pair_tuples))
posting_ids = df['posting_id'].values

# One row per candidate pair: source posting, target posting, shifted prediction.
df_pair = pd.DataFrame({
    'posting_id': posting_ids[np.asarray(col)],
    'posting_id_target': posting_ids[np.asarray(row)],
    'pred': preds,
})
df_pair['pred'] -= conf_th_gcn

# df_pair.to_pickle('submission_lyak_gcn.pkl')
df_pair

68500
(68500, 3)
(68500, 68500)
(68500, 50)
[[    0 34250 67411 ... 46015 56472 22222]
 [    1 34251 52933 ... 10444 14206 48456]
 [    2 34252 49918 ... 37179 46093 11843]
 ...
 [68497 34247 57306 ... 53171 48952 14702]
 [68498 34248 13887 ... 11229 13554 47804]
 [68499 34249 68042 ...  4576 13998 48249]]
  load - done in 12.56996s


100%|██████████| 68500/68500 [03:16<00:00, 348.46it/s]


Unnamed: 0,posting_id,posting_id_target,pred
0,train_129225211,train_378997795,-0.299922
1,train_129225211,train_3982974369,-0.299712
2,train_129225211,train_3829647969,-0.299964
3,train_129225211,train_2058577656,-0.299969
4,train_129225211,train_752725135,-0.299929
...,...,...,...
5745293,train_1792180725,train_1060076630,-0.299979
5745294,train_1792180725,train_1954734897,-0.299984
5745295,train_1792180725,train_2193000950,-0.299973
5745296,train_1792180725,train_511635458,-0.299985


In [None]:
# !pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113

In [None]:
# import torch
# checkpoint1 = torch.load('/content/drive/MyDrive/AIC_HCM/Video_Retrieval/Top2_Shopee/features_npy/v45.pth')
# checkpoint3 = torch.load('/content/drive/MyDrive/AIC_HCM/Video_Retrieval/Top2_Shopee/features_npy/v79.pth')
# params1 = checkpoint1['params']
# params3 = checkpoint3['params']

# checkpoint2 = torch.load('/content/drive/MyDrive/AIC_HCM/Video_Retrieval/Top2_Shopee/features_npy/v34.pth')
# params2 = checkpoint2['params']
# print(params2['backbone'], params1['backbone'],params3['backbone'])

dm_nfnet_f0 vit_deit_base_distilled_patch16_384 dm_nfnet_f0
