In [1]:
import pandas as pd
import numpy as np
import torch
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import random

In [2]:
# Load the training table (user_id, item_id, context_feature_id per the preview)
# and the item feature table.
# NOTE(review): items_features is not referenced by any later visible cell —
# confirm it is still needed.
data = pd.read_csv('training.csv')
items_features = pd.read_csv('item_feature.csv')
# Preview the first rows of the training table.
data.head()

Unnamed: 0,user_id,item_id,context_feature_id
0,0,28366,2
1,0,16109,2
2,0,11500,3
3,0,20750,2
4,0,8759,2


In [3]:
# Number of rows in the raw training table.
# NOTE(review): `n` is not used by any later visible cell.
n = data.shape[0]

# Zero Sampling

In [4]:
# Label every row of the training table with feedback = 1;
# zero_sampling later appends sampled rows with feedback = 0.
data['feedback'] = 1

In [5]:
# (rows, columns) of the training table after adding the feedback column.
data.shape

(970245, 4)

In [6]:
def zero_sampling(df_, users_list, item_list, context_list, all_users = False, all_items = False):
    """Append randomly generated feedback = 0 rows to ``df_``.

    One (user, item, context) triple is drawn with ``random.choice`` for every
    row of ``df_``. Optionally one extra row is generated for every entry of
    ``users_list`` and/or ``item_list``. The generated rows are concatenated
    after ``df_`` and duplicates on (user_id, item_id) are dropped keeping the
    first occurrence, so a row of ``df_`` wins over a generated row that has
    the same user/item pair.

    Args:
        df_: DataFrame to extend; it is not modified.
        users_list: sequence of user ids to draw from.
        item_list: sequence of item ids to draw from.
        context_list: sequence of context ids to draw from.
        all_users: if True, add one generated row per entry of ``users_list``.
        all_items: if True, add one generated row per entry of ``item_list``.

    Returns:
        A new DataFrame: ``df_`` followed by the generated rows, de-duplicated
        on (user_id, item_id). The original index labels are kept, so labels
        may repeat.
    """
    pick = random.choice

    # One fully random negative per row of the input frame.
    negatives = [
        (pick(users_list), pick(item_list), pick(context_list), 0)
        for _ in range(df_.shape[0])
    ]

    # Optionally guarantee one generated row for every user ...
    if all_users:
        negatives.extend(
            (user, pick(item_list), pick(context_list), 0)
            for user in users_list
        )

    # ... and one generated row for every item.
    if all_items:
        negatives.extend(
            (pick(users_list), item, pick(context_list), 0)
            for item in item_list
        )

    sampled = pd.DataFrame(
        negatives,
        columns=['user_id', 'item_id', 'context_feature_id', 'feedback'],
    )
    combined = pd.concat([df_, sampled])
    return combined.drop_duplicates(subset=['user_id', 'item_id'], keep='first')

In [7]:
# Sorted arrays of the distinct users, items and context values in the raw data
# (np.unique returns its result already sorted).
users = np.unique(data["user_id"].to_numpy())
unique_items = np.unique(data["item_id"].to_numpy())
context = np.unique(data["context_feature_id"].to_numpy())

In [8]:
# call zero sampling to get dataframe with zeros
# zero_sampling draws with Python's `random` module, which is otherwise never
# seeded in this notebook; seed it here so a Restart & Run All reproduces the
# same sampled rows (and therefore the same split and losses).
random.seed(0)
df = zero_sampling(data, users, unique_items, context, all_users=True, all_items=True)

# Train Test Split

In [9]:
# train test split
# Random row-wise split: each row goes to train with probability 0.8.
# The fixed numpy seed makes the mask reproducible.
np.random.seed(0)
mask = np.random.rand(len(df)) < 0.8
# drop=True discards the old index (its labels repeat after the concat inside
# zero_sampling) instead of copying it into a spurious 'index' column.
train = df[mask].reset_index(drop=True)
valid = df[~mask].reset_index(drop=True)

# Matrix Factorization

In [10]:
class MF(nn.Module):
    """Matrix factorisation with user/item bias terms and a sigmoid output.

    The prediction for a (user, item) pair is
    ``sigmoid(dot(user_emb, item_emb) + user_bias + item_bias)``.

    Args:
        num_users: number of rows in the user embedding and bias tables.
        num_items: number of rows in the item embedding and bias tables.
        emb_size: width of the user and item embeddings.
        seed: value passed to ``torch.manual_seed`` before the layers are
            created (this reseeds torch's global generator).
    """

    def __init__(self, num_users, num_items, emb_size=100, seed=23):
        super().__init__()
        torch.manual_seed(seed)

        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)

        # Overwrite the default initialisation with small uniform values:
        # embeddings in [-0.05, 0.05], biases in [-0.01, 0.01].
        init_plan = (
            (self.user_emb, 0.05),
            (self.item_emb, 0.05),
            (self.user_bias, 0.01),
            (self.item_bias, 0.01),
        )
        for layer, bound in init_plan:
            nn.init.uniform_(layer.weight, -bound, bound)

    def forward(self, u, v):
        """Return the predicted probability for each (u[i], v[i]) pair.

        ``u`` and ``v`` are index tensors into the user and item tables.
        """
        dot = (self.user_emb(u) * self.item_emb(v)).sum(1)
        logits = dot + self.user_bias(u).squeeze() + self.item_bias(v).squeeze()
        return torch.sigmoid(logits)

In [11]:
# Embedding table sizes: distinct training ids plus 10 spare rows. The spare
# rows leave room for the extra index that cold-start ids are mapped to later.
num_users = train["user_id"].unique().size + 10
num_items = train["item_id"].unique().size + 10
print(num_users, num_items)

169707 37988


In [12]:
# def change_id(df_, col, random_choices):
#     """This function will randomly assign user_id or item_id
#     if the id = -1'"""
# #     random_ = np.arange(1, len(random_choices)-1)
#     for index, row in df_.iterrows():
#         if row[col] == -1:
#             rand = np.random.choice(random_choices)
#             df_.at[index, col] = rand

In [13]:
# Build a dense 0..n-1 index over the user ids present in the training split.
train_user_ids = np.unique(train["user_id"].to_numpy())
userid2idx = dict(zip(train_user_ids, range(len(train_user_ids))))

# Apply the encoding; validation users that never occur in training become -1.
train["user_id"] = train["user_id"].map(userid2idx)
valid["user_id"] = valid["user_id"].map(userid2idx).fillna(-1).astype("int64")

In [14]:
# Build a dense 0..n-1 index over the item ids present in the training split.
train_item_ids = np.unique(train["item_id"].to_numpy())
itemid2idx = dict(zip(train_item_ids, range(len(train_item_ids))))

# Apply the encoding; validation items that never occur in training become -1.
train["item_id"] = train["item_id"].map(itemid2idx)
valid["item_id"] = valid["item_id"].map(itemid2idx).fillna(-1).astype("int64")

In [15]:
# Sorted distinct encoded ids present in the training frame.
# NOTE(review): neither array is referenced by any later visible cell.
new_user_ids = np.unique(train["user_id"].to_numpy())
new_item_ids = np.unique(train["item_id"].to_numpy())

In [16]:
def change_id_to_max(df_, col, max_choice):
    """Replace every -1 in ``df_[col]`` with ``max_choice``, in place.

    Uses one boolean-mask assignment instead of iterating row by row. Besides
    being much faster on large frames, the mask selects rows by position, so
    only the rows that actually hold -1 are changed even when the index
    contains repeated labels.

    Args:
        df_: DataFrame to modify in place.
        col: name of the column to patch.
        max_choice: value written wherever ``df_[col] == -1``.

    Returns:
        None. ``df_`` is mutated.
    """
    df_.loc[df_[col] == -1, col] = max_choice

In [17]:
# Index one past the largest encoded training id; used as the slot that
# ids unseen in training (-1) are mapped to.
max_userid = train["user_id"].max() + 1
max_itemid = train["item_id"].max() + 1

In [18]:
# Map cold-start ids in the validation frame (encoded as -1 because they do not
# occur in training) to the reserved index max_userid / max_itemid. Nothing is
# assigned randomly: every unseen user shares one index, as does every unseen item.
change_id_to_max(valid, 'user_id', max_userid)
change_id_to_max(valid, 'item_id', max_itemid)

In [19]:
def valid_loss(model):
    """Return the binary cross-entropy of ``model`` on the global ``valid`` frame.

    The model is switched to eval mode and is left in that mode on return.

    Args:
        model: module called as ``model(users, items)`` with two LongTensors,
            returning one probability per row.

    Returns:
        The loss as a Python float.
    """
    model.eval()
    users = torch.LongTensor(valid.user_id.values)
    items = torch.LongTensor(valid.item_id.values)
    feedback = torch.FloatTensor(valid.feedback.values)
    # Evaluation never backpropagates, so do not build an autograd graph.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.binary_cross_entropy(y_hat, feedback)
    return loss.item()

In [20]:
# here we are not using data loaders because our data fits well in memory
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    """Train ``model`` with full-batch Adam on the global ``train`` frame.

    Each epoch is a single optimizer step over all training rows, followed by
    a call to ``valid_loss`` and a printed progress line. A new Adam optimizer
    is created on every call, so optimizer state is not carried over between
    successive calls.

    Args:
        model: module called as ``model(users, items)`` returning probabilities.
        epochs: number of full-batch steps to run.
        lr: Adam learning rate.
        wd: Adam weight decay.

    Returns:
        None. ``model`` is updated in place.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    # The training tensors are the same for every epoch, so build them once
    # instead of converting the DataFrame columns on each iteration.
    users = torch.LongTensor(train.user_id.values)
    items = torch.LongTensor(train.item_id.values)
    feedback = torch.FloatTensor(train.feedback.values)

    for i in range(epochs):
        # valid_loss puts the model in eval mode; switch back before each step.
        model.train()

        y_hat = model(users, items)

        loss = F.binary_cross_entropy(y_hat, feedback)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        testloss = valid_loss(model)
        print('Epoch Number:', i+1,"train loss %.3f valid loss %.3f" % (loss.item(), testloss)) 

In [21]:
# Model used for the validation experiment; table sizes come from the training split.
model = MF(num_users, num_items, emb_size=100) 

In [22]:
# Stage 1: 13 full-batch epochs at lr 0.1 with weight decay 1e-7.
train_epocs(model, epochs=13, lr=0.1, wd = 0.0000001)

Epoch Number: 1 train loss 0.693 valid loss 0.656
Epoch Number: 2 train loss 0.614 valid loss 0.609
Epoch Number: 3 train loss 0.429 valid loss 0.557
Epoch Number: 4 train loss 0.231 valid loss 0.513
Epoch Number: 5 train loss 0.090 valid loss 0.496
Epoch Number: 6 train loss 0.027 valid loss 0.503
Epoch Number: 7 train loss 0.008 valid loss 0.520
Epoch Number: 8 train loss 0.002 valid loss 0.533
Epoch Number: 9 train loss 0.001 valid loss 0.538
Epoch Number: 10 train loss 0.001 valid loss 0.534
Epoch Number: 11 train loss 0.000 valid loss 0.522
Epoch Number: 12 train loss 0.000 valid loss 0.504
Epoch Number: 13 train loss 0.001 valid loss 0.482


In [23]:
# Stage 2: continue training the same model for 12 epochs at the lower lr 0.05.
# train_epocs creates a new Adam optimizer, so the optimizer state starts fresh.
train_epocs(model, epochs=12, lr=0.05, wd = 0.0000001)

Epoch Number: 1 train loss 0.001 valid loss 0.451
Epoch Number: 2 train loss 0.002 valid loss 0.429
Epoch Number: 3 train loss 0.005 valid loss 0.412
Epoch Number: 4 train loss 0.009 valid loss 0.399
Epoch Number: 5 train loss 0.016 valid loss 0.388
Epoch Number: 6 train loss 0.025 valid loss 0.378
Epoch Number: 7 train loss 0.032 valid loss 0.368
Epoch Number: 8 train loss 0.036 valid loss 0.357
Epoch Number: 9 train loss 0.036 valid loss 0.345
Epoch Number: 10 train loss 0.033 valid loss 0.334
Epoch Number: 11 train loss 0.028 valid loss 0.323
Epoch Number: 12 train loss 0.023 valid loss 0.314


- 5 epochs and then 10 epochs for the final model

# Test Model

In [24]:
# Final fit: train on the full zero-sampled frame and load the Kaggle test set.
# NOTE(review): `valid` is not rebuilt here. It is a subset of `df` (so it now
# overlaps `train`) and it stays encoded with the id mapping from the earlier
# split, so the "valid loss" printed by the later train_epocs calls is not a
# held-out metric — confirm whether that is intended.
train = df.copy()
test = pd.read_csv('test_kaggle.csv')
# Preview the first rows of the test table.
test.head()

Unnamed: 0,id,user_id,item_id,context_feature_id
0,0,4,16835,2
1,1,4,22590,3
2,2,4,1978,1
3,3,4,28916,1
4,4,4,14427,2


In [25]:
# Embedding table sizes for the final model: distinct ids in the full training
# frame plus 10 spare rows, which leave room for the cold-start index.
num_users = train["user_id"].unique().size + 10
num_items = train["item_id"].unique().size + 10
print(num_users, num_items)

169708 37988


In [26]:
# Build a dense 0..n-1 index over the user ids present in the full training frame.
train_user_ids = np.unique(train["user_id"].to_numpy())
userid2idx = dict(zip(train_user_ids, range(len(train_user_ids))))

# Apply the encoding; test users that never occur in training become -1.
train["user_id"] = train["user_id"].map(userid2idx)
test["user_id"] = test["user_id"].map(userid2idx).fillna(-1).astype("int64")

In [27]:
# Build a dense 0..n-1 index over the item ids present in the full training frame.
train_item_ids = np.unique(train["item_id"].to_numpy())
itemid2idx = dict(zip(train_item_ids, range(len(train_item_ids))))

# Apply the encoding; test items that never occur in training become -1.
train["item_id"] = train["item_id"].map(itemid2idx)
test["item_id"] = test["item_id"].map(itemid2idx).fillna(-1).astype("int64")

In [28]:
# Reserved index: one past the largest encoded training id.
max_userid = train["user_id"].max() + 1
max_itemid = train["item_id"].max() + 1

# Map cold-start ids in the test frame (encoded as -1) to the reserved index.
# Nothing is random here: all unseen users share one index, as do all unseen items.
change_id_to_max(test, 'user_id', max_userid)
change_id_to_max(test, 'item_id', max_itemid)

In [29]:
# Fresh model for the final fit; num_users / num_items were recomputed from the full frame.
model = MF(num_users, num_items, emb_size=100) 

In [30]:
# Final fit, stage 1: 13 full-batch epochs at lr 0.1.
# NOTE(review): weight decay here is 1e-6, ten times the 1e-7 used in the
# validation run above — confirm the change is deliberate.
train_epocs(model, epochs=13, lr=0.1, wd = 0.000001)

Epoch Number: 1 train loss 0.693 valid loss 0.655
Epoch Number: 2 train loss 0.654 valid loss 0.613
Epoch Number: 3 train loss 0.549 valid loss 0.570
Epoch Number: 4 train loss 0.436 valid loss 0.526
Epoch Number: 5 train loss 0.314 valid loss 0.485
Epoch Number: 6 train loss 0.213 valid loss 0.451
Epoch Number: 7 train loss 0.149 valid loss 0.427
Epoch Number: 8 train loss 0.121 valid loss 0.412
Epoch Number: 9 train loss 0.118 valid loss 0.401
Epoch Number: 10 train loss 0.126 valid loss 0.393
Epoch Number: 11 train loss 0.134 valid loss 0.388
Epoch Number: 12 train loss 0.139 valid loss 0.385
Epoch Number: 13 train loss 0.141 valid loss 0.385


In [31]:
# Final fit, stage 2: 12 more epochs on the same model at lr 0.05 (weight decay 1e-6).
train_epocs(model, epochs=12, lr=0.05, wd = 0.000001)

Epoch Number: 1 train loss 0.144 valid loss 0.381
Epoch Number: 2 train loss 0.155 valid loss 0.378
Epoch Number: 3 train loss 0.154 valid loss 0.377
Epoch Number: 4 train loss 0.156 valid loss 0.375
Epoch Number: 5 train loss 0.158 valid loss 0.373
Epoch Number: 6 train loss 0.160 valid loss 0.371
Epoch Number: 7 train loss 0.160 valid loss 0.369
Epoch Number: 8 train loss 0.158 valid loss 0.366
Epoch Number: 9 train loss 0.154 valid loss 0.364
Epoch Number: 10 train loss 0.152 valid loss 0.362
Epoch Number: 11 train loss 0.150 valid loss 0.360
Epoch Number: 12 train loss 0.149 valid loss 0.359


In [32]:
# get the predictions
# Inference only: make the eval mode explicit and disable autograd so that no
# computation graph is kept for the test-set forward pass.
model.eval()
with torch.no_grad():
    test_users = torch.LongTensor(test.user_id.values)
    test_items = torch.LongTensor(test.item_id.values)
    y_hat = model(test_users, test_items)

In [33]:
# Detach the predictions from the autograd graph and convert them to a numpy array.
y_hat = y_hat.detach().numpy()

In [34]:
# Store the predicted probabilities next to the test rows they belong to.
test['rating'] = y_hat

In [35]:
# Keep only the prediction column (a Series indexed like `test`).
pred = test['rating']

In [36]:
# Write the predictions to a file literally named 'weight_decay' (no extension).
# NOTE(review): only the Series and its index are written, not the test `id`
# column — confirm this matches the expected submission format.
pred.to_csv('weight_decay')