In [None]:
import csv
import np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch import Tensor
from torch.utils.tensorboard import SummaryWriter

In [None]:
def word_hash(word):
    """Break a phrase into de-duplicated windows of three overlapping letter slices.

    The phrase is lower-cased and split on whitespace. Each token is walked
    with a stride of 3, starting at 0 and stopping before ``len(token) - 3``.
    Every start position yields one tuple of the slices beginning at
    ``start``, ``start + 1`` and ``start + 2`` (each at most 3 characters).

    Consequences of those bounds:
      * tokens of three characters or fewer contribute nothing;
      * slices close to the end of a token can be shorter than three
        characters (e.g. ``"abcd"`` gives ``("abc", "bcd", "cd")``).

    Args:
        word: Text to hash; may contain several whitespace-separated tokens.

    Returns:
        list[tuple[str, str, str]]: the distinct windows. The list is built
        from a set, so its order is not guaranteed.
    """
    windows = {
        (
            token[start:start + 3],
            token[start + 1:start + 4],
            token[start + 2:start + 5],
        )
        for token in word.lower().split()
        for start in range(0, len(token) - 3, 3)
    }
    return list(windows)

In [None]:
# Quick sanity check: show the windows word_hash produces for a two-word phrase.
sample_hashes = word_hash("hello world")
print(sample_hashes)

In [None]:
# Both splits are joined with the same description table, so read and parse
# that file once instead of once per split.
product_descriptions: pd.DataFrame = pd.read_csv(
    "data/product_descriptions.csv", encoding="ISO-8859-1"
)

# pd.merge defaults to an inner join: rows are matched on "product_uid".
train: pd.DataFrame = pd.merge(
    pd.read_csv("data/train.csv", encoding="ISO-8859-1"),
    product_descriptions,
    on="product_uid",
)
test: pd.DataFrame = pd.merge(
    pd.read_csv("data/test.csv", encoding="ISO-8859-1"),
    product_descriptions,
    on="product_uid",
)

In [None]:
# Peek at every training row whose search term is exactly "angle bracket".
train.loc[train["search_term"].eq("angle bracket")]

In [None]:
# Tunable sizes. trigram_dimension is the input width of the DSSM's first
# layer; batch_size is not referenced by the cells shown in this notebook.
trigram_dimension = 30000
batch_size = 1024

# Filled by the feature-building cell with
# {"positive": [...], "negative": [...]} entries, one per query.
features = dict()

# Distinct search terms of the training frame (pandas returns them in order
# of first appearance).
a = train["search_term"].unique()

In [None]:
# Build, for every search term, the hashed documents that count as positive
# and negative examples for it.
#
# Columns are read by position with .iloc; integer keys on a label-indexed
# Series are deprecated in recent pandas.
# NOTE(review): positions 2 / 3 / 4 / 5 are presumably product title, search
# term, relevance and product description -- confirm against the CSV headers.
#
# NOTE(review): `train` must still be the DataFrame here; a later cell defines
# a function with the same name, so re-running this cell afterwards fails.
for i in a:
    for _, row in train[train["search_term"] == i].iterrows():
        document = word_hash(f"{row.iloc[2]} {row.iloc[5]}")

        # word_hash returns a list (unhashable, arbitrary order). Sorting and
        # freezing it into a tuple gives a dict key that is stable for a given
        # query text.
        query_key = tuple(sorted(word_hash(row.iloc[3])))

        # Create the bucket on first sight *and* still classify this row, so
        # the first document of every query is kept.
        bucket = features.setdefault(query_key, {"positive": [], "negative": []})
        if row.iloc[4] < 2:
            bucket["negative"].append(document)
        else:
            bucket["positive"].append(document)

        # Top up every *other* query with this document as a negative until it
        # has four, unless that query already holds the document.
        for other_key, other in features.items():
            if (
                other_key != query_key
                and len(other["negative"]) < 4
                and document not in other["positive"]
                and document not in other["negative"]
            ):
                other["negative"].append(document)

In [None]:
# Persist the feature mapping to disk. `features` is a plain dict, so NumPy
# stores it as a pickled object array; reading it back requires
# allow_pickle=True.
np.save(file="data/features.npy", arr=features)

In [None]:
class DSSM(nn.Module):
    """Three fully connected layers, each followed by tanh.

    The network maps an input of width ``input_dim`` through two hidden
    layers of width ``hidden_dim`` to an ``output_dim``-wide embedding. All
    three weight matrices are initialised with Xavier-uniform; biases keep
    the ``nn.Linear`` default initialisation.

    Args:
        input_dim: Width of the input vectors. When ``None`` (default) the
            notebook-level ``trigram_dimension`` is used, so ``DSSM()`` keeps
            working exactly as before.
        hidden_dim: Width of the two hidden layers (default 300).
        output_dim: Width of the returned embedding (default 128).
    """

    def __init__(self, input_dim=None, hidden_dim=300, output_dim=128):
        super().__init__()
        if input_dim is None:
            # Fall back to the notebook's configuration cell.
            input_dim = trigram_dimension

        # Each layer is initialised right after it is created so the order in
        # which random numbers are drawn stays the same as before.
        self.l1 = nn.Linear(input_dim, hidden_dim)
        nn.init.xavier_uniform_(self.l1.weight)
        self.l2 = nn.Linear(hidden_dim, hidden_dim)
        nn.init.xavier_uniform_(self.l2.weight)
        self.l3 = nn.Linear(hidden_dim, output_dim)
        nn.init.xavier_uniform_(self.l3.weight)

    def forward(self, x):
        """Return the tanh-activated embedding of ``x``.

        Args:
            x: Tensor whose last dimension equals the first layer's input
                width.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``output_dim``; values lie in [-1, 1] because of the
            final tanh.
        """
        # torch.tanh replaces F.tanh, which PyTorch has flagged as deprecated.
        x = torch.tanh(self.l1(x))
        x = torch.tanh(self.l2(x))
        x = torch.tanh(self.l3(x))
        return x

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DSSM().to(device)
# Plain SGD with momentum over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

In [None]:
def train(features, epoch=5, net=None, opt=None):
    """Fit the network with a softmax-over-cosine ranking loss.

    For every query in ``features`` and every positive document of that
    query, one optimisation step is taken on

        -log softmax([cos(q, p), cos(q, n1), ..., cos(q, n4)])[0]

    where ``n1..n4`` are the first four negatives stored for the query.
    Queries that hold fewer than four negatives are skipped.

    Args:
        features: Mapping ``query -> {"positive": [...], "negative": [...]}``.
            The query key and every document are converted with
            ``torch.as_tensor(..., dtype=torch.float32)``.
            NOTE(review): they therefore have to be numeric vectors whose
            width matches the network input -- confirm that the cell that
            builds ``features`` produces such vectors.
        epoch: Number of passes over ``features``.
        net: Network to train. Defaults to the notebook-level ``model``.
        opt: Optimizer that steps ``net``'s parameters. Defaults to the
            notebook-level ``optimizer``.

    Returns:
        None. ``net`` is updated in place through ``opt``.

    NOTE(review): this definition rebinds the name ``train``, which an
    earlier cell uses for the training DataFrame.
    """
    net = model if net is None else net
    opt = optimizer if opt is None else opt

    num_negatives = 4

    # Build inputs on the device the network lives on.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    def encode(vector):
        """Convert one example to a float tensor with a leading batch axis."""
        tensor = torch.as_tensor(vector, dtype=torch.float32, device=device)
        # cosine_similarity(dim=1) below needs (batch, width) inputs.
        return tensor.unsqueeze(0) if tensor.dim() == 1 else tensor

    net.train()
    for _ in range(epoch):
        for feature, docs in features.items():
            negatives = docs["negative"]
            if len(negatives) < num_negatives:
                # Not enough negatives to form the 1-positive / 4-negative
                # group; indexing them would raise IndexError.
                continue

            for positive in docs["positive"]:
                opt.zero_grad()

                out_q = net(encode(feature))
                candidates = [positive, *negatives[:num_negatives]]

                # One cosine per candidate, each of size (batch,); stacking
                # along a new axis gives (batch, 5) with the positive in
                # column 0. (torch.cat along dim 1 fails on 1-D tensors.)
                cos_uni = torch.stack(
                    [
                        torch.cosine_similarity(out_q, net(encode(doc)), dim=1)
                        for doc in candidates
                    ],
                    dim=1,
                )

                # -log(prod(softmax)) == -sum(log_softmax); the log-domain
                # form does not underflow when the product gets tiny.
                loss = -F.log_softmax(cos_uni, dim=1)[:, 0].sum()

                loss.backward()
                opt.step()

def predict(data, model):
    """Run ``model`` on every (query, title, description) triple in ``data``.

    Args:
        data: Iterable yielding 3-item tuples ``(q, r, d)``.
        model: Callable invoked as ``model(q, f"{r} {d}")``; each call must
            return a tensor (or something ``torch.as_tensor`` accepts), and
            all results must share one shape.

    Returns:
        numpy.ndarray holding one entry per triple, in input order (the
        per-triple outputs stacked along a new first axis). An empty array
        of shape ``(0,)`` is returned when ``data`` yields nothing.

    NOTE(review): the notebook passes a DataFrame and a DSSM instance here.
    Iterating a DataFrame yields column labels rather than rows, and
    ``DSSM.forward`` accepts a single tensor, so both inputs need adapting
    before this runs end to end -- confirm the intended inputs.
    """
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = [model(q, f"{r} {d}") for q, r, d in data]

    if not outputs:
        return torch.empty(0).numpy()

    # Keep every result (not only the first) so callers can pair predictions
    # with rows; move to CPU because .numpy() cannot read GPU memory.
    stacked = torch.stack([torch.as_tensor(out).detach().cpu() for out in outputs])
    return stacked.numpy()

In [None]:
# The features were saved as a pickled 0-d object array wrapping a dict, so
# they must be loaded with allow_pickle=True (memory-mapping is not possible
# for object arrays) and unwrapped with .item().
# Unpickling can execute arbitrary code: only load a file this notebook wrote.
features = np.load("data/features.npy", allow_pickle=True).item()

# train's second positional parameter is the epoch count, not the model;
# rely on its default.
train(features)

In [None]:
# Switch to inference mode once, before any prediction is made.
model.eval()
scores = predict(test, model)

res = {}
# zip pairs each test row with its prediction. The loop variable is named
# `score` so the `predict` function is not overwritten.
for (_, row), score in zip(test.iterrows(), scores):
    # Key by the first column of the test frame (presumably the row id --
    # TODO confirm) and shift the raw score by 2.
    res[row.iloc[0]] = score + 2

# newline='' is what the csv module requires to avoid blank lines on Windows.
with open('submission.csv', 'w', newline='') as f:
    w = csv.writer(f)
    # One (id, prediction) pair per line instead of a single very wide row.
    # NOTE(review): header names follow the usual Kaggle `id,relevance`
    # layout -- confirm against the competition's sample submission.
    w.writerow(["id", "relevance"])
    w.writerows(res.items())