In [1]:
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

%load_ext autoreload
%autoreload 2
import my_baselines
import utils

In [2]:
with open('train.json') as f:
    data = f.readlines()

import ast
data = [ast.literal_eval(x) for x in data]

In [3]:
data[0]

{'categories': [['Clothing, Shoes & Jewelry', 'Women'],
  ['Clothing, Shoes & Jewelry',
   'Novelty, Costumes & More',
   'Novelty',
   'Clothing',
   'Women',
   'Leggings']],
 'categoryID': 0,
 'helpful': {'nHelpful': 0, 'outOf': 0},
 'itemID': 'I402344648',
 'rating': 4.0,
 'reviewHash': 'R798569390',
 'reviewText': "The model in this picture has them rolled up at the top because they are actually very high waisted! that's my only complaint though, because they are very good quality, and fit really well! I am 5'2&#34; 120lbs with thick thighs and i love them i can't wait to wear them out!",
 'reviewTime': '09 26, 2013',
 'reviewerID': 'U490934656',
 'summary': 'High Waisted',
 'unixReviewTime': 1380153600}

In [4]:
item_popularity = {}
user_buy_count = {}
item2cat = {}

alpha = 0.
beta_user = {}
beta_item = {}
beta_category = {}

for d in data:
    reviewer = d['reviewerID']
    item = d['itemID']
    rating = d['rating']
    cat = d['categoryID']
    
    item2cat[item] = cat
    
    p = item_popularity.get(item, 0)
    item_popularity[item] = p + 1
    
    uc = user_buy_count.get(reviewer, 0)
    user_buy_count[reviewer] = uc + 1
    
    alpha += rating
    
    bu = beta_user.get(reviewer, (0., 0))
    bu = (bu[0]+rating, bu[1]+1)
    beta_user[reviewer] = bu
    
    bi = beta_item.get(item, (0., 0))
    bi = (bi[0]+rating, bi[1]+1)
    beta_item[item] = bi
    
    bc = beta_category.get(cat, (0., 0))
    bc = (bc[0]+rating, bc[1]+1)
    beta_category[cat] = bc

item_rank = sorted(item_popularity.items(), key=lambda x: x[1], reverse=True)
item_rank = {x[0]:rank for rank, x in enumerate(item_rank)}

user_rank = sorted(user_buy_count.items(), key=lambda x: x[1], reverse=True)
user_rank = {x[0]:rank for rank, x in enumerate(user_rank)}

alpha /= len(data)
beta_user = {r:(b[0]/b[1]) for r,b in beta_user.items()}
beta_item = {r:(b[0]/b[1]) for r,b in beta_item.items()}
beta_category = {r:(b[0]/b[1]) for r,b in beta_category.items()}

In [21]:
def getFeature(u, i):
    x = [1.]
    
    try:
        c = item2cat[i]
        x.append(item_rank[i]) # Item popularity
        x.append(beta_item[i]) # Item average rating
        x.append(beta_category[c]) # User rating on this category
    except:
        x.append(alpha)
        x.append(alpha)
        x.append(alpha)
        
    try:
        x.append(user_rank[u]) # User buy rank
        x.append(beta_user[u]) # User rating on this item
    except:
        x.append(alpha)
        x.append(alpha)
    
    return x

def getFeatures(X):
    return np.array([getFeature(x[0], x[1]) for x in X])

def create_batch(X, Y, batch_size):
    m = X.shape[0]
    n_batch = int(m / batch_size)

    X_batches = []
    Y_batches = []

    permutation = np.random.permutation(m)
    X_shuffle = X[permutation, :]
    Y_shuffle = Y[permutation, :]

    for i in range(n_batch):
        X_batch = X_shuffle[i * batch_size: (i+1) * batch_size, :]
        Y_batch = Y_shuffle[i * batch_size: (i+1) * batch_size, :]
        X_batches.append(X_batch)
        Y_batches.append(Y_batch)

    if m % n_batch != 0:
        X_batch = X_shuffle[n_batch * batch_size:, :]
        Y_batch = Y_shuffle[n_batch * batch_size:, :]
        X_batches.append(X_batch)
        Y_batches.append(Y_batch)
        n_batch += 1

    return X_batches, Y_batches, n_batch

In [17]:
reviewer_item_pair = {}
reviewer_list = []
item_list = []

for d in data:
    reviewer = d['reviewerID']
    item = d['itemID']
    pair = reviewer_item_pair.get(reviewer, [])
    pair.append(item)
    reviewer_item_pair[reviewer] = pair
    reviewer_list.append(reviewer)
    item_list.append(item)
    
import random
cnt = 0
neg_pair = []

while(cnt < 200000):
    reviewer = random.choice(reviewer_list)
    item = random.choice(item_list)
    if item not in reviewer_item_pair[reviewer]:
        neg_pair.append((reviewer, item))
        cnt += 1    
        
dataX = []
dataY = []

for d in data:
    dataX.append((d['reviewerID'], d['itemID']))
    dataY.append(1)

dataX += neg_pair
dataY += len(neg_pair) * [0]

holdout_ratio = 0.2
trainX, trainY, valX, valY = utils.split_data(dataX, dataY, holdout_ratio, shuffle=True)

In [48]:
trainX = getFeatures(trainX)
trainY = np.array(trainY).reshape(-1, 1)
valX = getFeatures(valX)
valY = np.array(valY).reshape(-1, 1)
trainX.shape, trainY.shape, valX.shape, valY.shape

((320000, 6), (320000, 1), (80000, 6), (80000, 1))

In [49]:
class Net(nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(6, 6)
        self.fc2 = nn.Linear(6, 6)
        self.fc3 = nn.Linear(6, 1)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [50]:
net = Net().to(device)

In [51]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

In [64]:
def targetTensor(target):
    return torch.tensor(target, dtype=torch.long, device=device).view(-1)

def inputTensor(x):
    return torch.tensor(x, dtype=torch.float, device=device)

In [65]:
def train(x, y):
    optimizer.zero_grad()
    output = net(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    return loss.item()

def trainIter(X, Y, n_iter, batch_size=16):
    for it in range(n_iter):
        X_batches, Y_batches, n_batch = create_batch(X, Y, batch_size)

        loss = 0.
        for i in range(n_batch):
            x = inputTensor(X_batches[i])
            y = targetTensor(Y_batches[i])
            loss += train(x, y)

        loss /= X.shape[0]
        print('#%d iters, loss:%f' % (it, loss))

In [66]:
trainIter(trainX, trainY, 10)

torch.Size([16])


RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorMath.cu:15

In [26]:
trainX[0], trainY[0]

(array([1.00000000e+00, 1.35650000e+04, 3.80000000e+00, 4.18738596e+00,
        1.36480000e+04, 1.80000000e+00]), array([1]))

In [56]:
trainY[0]

array([1])