In [1]:
# PyTorch imports
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch import nn
from torch.utils.data import Dataset, DataLoader
# Fix PyTorch's global RNG seed so repeated runs start from the same state.
torch.manual_seed(0)

# Python imports
from time import time
import numpy as np
import scipy.sparse as sp
import pickle
# Seed NumPy's global RNG; MovieLensDataset.get_train_instances draws its
# negative samples with np.random.randint.
np.random.seed(7)

# Device string used for tensors in this notebook (main() passes it to
# train_one_epoch).
device = "cpu"

In [None]:
class MovieLensDataset(Dataset):
    """Implicit-feedback training set of (user, item, label) samples.

    Every positive interaction found in the rating file produces one sample
    labelled 1, immediately followed by ``num_negatives_train`` samples
    labelled 0 whose items are drawn at random from the items the same user
    has not interacted with.
    """

    def __init__(self, train_file_name, num_negatives_train=5):
        """Load the rating file and materialise every training sample.

        Args:
            train_file_name: Path to a tab-separated file whose first three
                columns are user id, item id and rating.
            num_negatives_train: Number of negatives sampled per positive.

        Raises:
            ValueError: If negatives are requested for a user that has no
                un-interacted item left to sample.
        """
        # Sparse 0/1 interaction matrix of shape (max_user_id + 1, max_item_id + 1).
        self.trainMatrix = self.load_rating_file_as_matrix(train_file_name)
        self.num_users, self.num_items = self.trainMatrix.shape
        # Parallel lists: sample k is (user_input[k], item_input[k], ratings[k]).
        self.user_input, self.item_input, self.ratings = self.get_train_instances(
            self.trainMatrix, num_negatives_train)

    def __len__(self):
        """Return the total number of samples (positives plus negatives)."""
        return len(self.user_input)

    def __getitem__(self, index):
        """Return sample ``index`` as a dict with keys user_id, item_id, rating."""
        return {'user_id': self.user_input[index],
                'item_id': self.item_input[index],
                'rating': self.ratings[index]}

    def get_train_instances(self, train, num_negatives):
        """Expand the interaction matrix into positive and negative samples.

        Args:
            train: Sparse matrix exposing ``shape`` and ``keys()`` (a
                ``scipy.sparse.dok_matrix``) whose stored entries are the
                positive (user, item) pairs.
            num_negatives: Negatives to draw for each positive pair.

        Returns:
            Three parallel lists ``(user_input, item_input, ratings)``.

        Raises:
            ValueError: If ``num_negatives > 0`` and some user has already
                interacted with every item id in ``1..num_items-1``; the
                rejection-sampling loop could never terminate for that user.
        """
        user_input, item_input, ratings = [], [], []
        num_items = train.shape[1]

        # Group the positives per user once so membership checks are plain
        # set lookups and saturated users can be detected up front.
        positive_pairs = list(train.keys())
        items_by_user = {}
        for u, i in positive_pairs:
            items_by_user.setdefault(u, set()).add(i)

        # Negatives are drawn from ids 1..num_items-1 (id 0 is never sampled),
        # so a positive on item 0 does not reduce the candidate pool.
        saturated_users = {
            u for u, items in items_by_user.items()
            if len(items) - (0 in items) >= num_items - 1
        }

        for u, i in positive_pairs:
            # positive instance
            user_input.append(u)
            item_input.append(i)
            ratings.append(1)

            if num_negatives <= 0:
                continue
            if u in saturated_users:
                raise ValueError(
                    "user {} has interacted with every item id in 1..{}; "
                    "cannot sample negative instances".format(u, num_items - 1))

            # negative instances: rejection-sample until an unseen item is hit
            seen_items = items_by_user[u]
            for _ in range(num_negatives):
                j = np.random.randint(1, num_items)
                while j in seen_items:
                    j = np.random.randint(1, num_items)
                user_input.append(u)
                item_input.append(j)
                ratings.append(0)
        return user_input, item_input, ratings

    def load_rating_file_as_matrix(self, filename):
        """Read a tab-separated rating file into a binary interaction matrix.

        Each non-blank line must start with ``user<TAB>item<TAB>rating``.
        Blank lines are skipped.

        Args:
            filename: Path of the rating file.

        Returns:
            A ``scipy.sparse.dok_matrix`` (float32) of shape
            ``(max_user_id + 1, max_item_id + 1)`` holding 1.0 wherever the
            file lists a rating greater than zero.
        """
        max_user, max_item = 0, 0
        positive_pairs = []
        # Single pass: track the largest ids (over all rows) and remember the
        # positive pairs in file order.
        with open(filename, "r") as f:
            for line in f:
                if not line.strip():
                    continue
                arr = line.split("\t")
                user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
                max_user = max(max_user, user)
                max_item = max(max_item, item)
                if rating > 0:
                    positive_pairs.append((user, item))

        # Construct matrix
        mat = sp.dok_matrix((max_user + 1, max_item + 1), dtype=np.float32)
        for user, item in positive_pairs:
            mat[user, item] = 1.0
        return mat

In [2]:
class MLP(nn.Module):
    """Multi-layer perceptron scorer over concatenated user/item embeddings.

    A user embedding and an item embedding (each ``layers[0] // 2`` wide) are
    concatenated, passed through ``Linear -> ReLU -> Dropout`` stages sized by
    consecutive pairs of ``layers``, and mapped to one sigmoid output.
    """

    def __init__(self, n_users, n_items, layers=[16, 8], dropout=False):
        """Build the embeddings and dense layers.

        Args:
            n_users: Number of rows in the user embedding table.
            n_items: Number of rows in the item embedding table.
            layers: Layer widths; ``layers[0]`` is the width of the
                concatenated embedding and must be even. The default list is
                only read, never mutated.
            dropout: Dropout probability applied after every hidden layer
                (``False``/``0`` disables dropout).
        """
        super().__init__()
        assert (layers[0] % 2 == 0), "layers[0] must be an even number"
        self.__alias__ = "MLP {}".format(layers)
        self.__dropout__ = dropout

        # user and item embedding layers; each takes half of the input width
        embedding_dim = layers[0] // 2
        self.user_embedding = torch.nn.Embedding(n_users, embedding_dim)
        self.item_embedding = torch.nn.Embedding(n_items, embedding_dim)

        # hidden dense layers, one per consecutive pair of widths
        self.fc_layers = torch.nn.ModuleList(
            [torch.nn.Linear(in_size, out_size)
             for in_size, out_size in zip(layers[:-1], layers[1:])])
        # final prediction layer
        self.output_layer = torch.nn.Linear(layers[-1], 1)

    def forward(self, feed_dict):
        """Score a batch of (user, item) pairs.

        Args:
            feed_dict: Mapping with integer index tensors under ``'user_id'``
                and ``'item_id'``.

        Returns:
            Tensor of sigmoid scores with a trailing dimension of size 1
            (shape ``(B, 1)`` for 1-D id tensors of length ``B``).
        """
        users = feed_dict['user_id']
        items = feed_dict['item_id']
        # concatenate user and item embeddings to form input
        x = torch.cat([self.user_embedding(users), self.item_embedding(items)], 1)
        # The constructor default for dropout is the bool False, so coerce it
        # to a real probability before handing it to F.dropout.
        drop_p = float(self.__dropout__)
        for layer in self.fc_layers:
            x = F.dropout(F.relu(layer(x)), p=drop_p, training=self.training)
        logit = self.output_layer(x)
        return torch.sigmoid(logit)

    def predict(self, feed_dict):
        """Score array inputs for inference and return a NumPy array.

        The caller's ``feed_dict`` is left untouched. Scoring runs in eval
        mode (dropout off) without gradient tracking, and the module's
        previous train/eval mode is restored afterwards.

        Args:
            feed_dict: Mapping whose non-``None`` values are integer index
                arrays (NumPy arrays, sequences or tensors).

        Returns:
            ``numpy.ndarray`` holding the scores produced by ``forward``.
        """
        # Place inputs wherever the model's parameters live.
        target_device = self.user_embedding.weight.device
        inputs = {
            key: None if value is None
            else torch.as_tensor(value).to(dtype=torch.long, device=target_device)
            for key, value in feed_dict.items()
        }

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                output_scores = self.forward(inputs)
        finally:
            self.train(was_training)
        return output_scores.cpu().detach().numpy()

    def get_alias(self):
        """Return the display name built in ``__init__`` (``"MLP <layers>"``)."""
        return self.__alias__

In [3]:
def train_one_epoch(model, data_loader, loss_fn, optimizer, device):
    """Run one optimisation pass over ``data_loader`` and return the mean loss.

    Args:
        model: Module called as ``model(feed_dict)`` to produce predictions.
        data_loader: Iterable of batch dicts; every non-``None`` value is a
            tensor and the dict contains a ``'rating'`` entry.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            tensor.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses, or NaN when ``data_loader`` yields no
        batches.
    """
    model.train()
    batch_losses = []
    for feed_dict in data_loader:
        # Cast every tensor in the batch to int64 on the target device; the
        # batch dict is updated in place.
        for key in feed_dict:
            if feed_dict[key] is not None:
                feed_dict[key] = feed_dict[key].to(dtype=torch.long, device=device)
        prediction = model(feed_dict)

        # The loss needs float targets shaped exactly like the predictions.
        rating = feed_dict['rating'].float().view(prediction.size())
        loss = loss_fn(prediction, rating)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    if not batch_losses:
        # np.mean([]) would also give NaN but emits a RuntimeWarning.
        return float("nan")
    return np.mean(batch_losses)

In [15]:
def main():
    """Train the MLP on the MovieLens implicit data and save test predictions.

    Steps: load the test user/item arrays, build the training dataset with
    sampled negatives, train for a fixed number of epochs (printing the mean
    loss of each), score every (users, items) row of the test arrays and
    write the list of score arrays to ``Predictions/mlp`` via ``np.save``.
    """
    import os  # local import: only needed to create the output directory

    # Hyper-parameters
    train_data_path = 'Data/movielens.train_implicit_ds'
    layers = [16, 32, 16, 8]
    weight_decay = 0.00001
    num_negatives_train = 4
    dropout = 0
    learning_rate = 0.001
    batch_size = 256
    epochs = 30

    # Load the evaluation inputs and prepare the output directory before the
    # long training loop so a bad path fails immediately.
    # NOTE(review): these arrays are read from '../Data/' while the training
    # file is read from 'Data/' -- confirm both paths are intended.
    # NOTE(security): allow_pickle=True can execute arbitrary code when the
    # file is untrusted; only load arrays you produced yourself.
    test_items = np.load('../Data/test_items.npy', allow_pickle=True)
    test_users = np.load('../Data/test_users.npy', allow_pickle=True)
    os.makedirs('Predictions', exist_ok=True)

    full_dataset = MovieLensDataset(train_data_path, num_negatives_train=num_negatives_train)
    num_users, num_items = full_dataset.trainMatrix.shape

    training_data_generator = DataLoader(full_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

    # Keep the model on the same device the batches are moved to.
    model = MLP(num_users, num_items, layers=layers, dropout=dropout).to(device)

    loss_fn = torch.nn.BCELoss()
    # Pass the configured learning rate explicitly instead of relying on the
    # optimizer's default.
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    for epoch in range(epochs):
        epoch_loss = train_one_epoch(model, training_data_generator, loss_fn, optimizer, device)
        print("Epoch = {} loss = {}".format(epoch, epoch_loss))

    # One score array per (users, items) row of the test arrays.
    predictions = []
    for users, items in zip(test_users, test_items):
        feed_dict = {'user_id': users, 'item_id': items}
        predictions.append(model.predict(feed_dict))

    np.save('Predictions/mlp', predictions)

In [16]:
# Run the whole pipeline: build the dataset, train, score the test arrays, save predictions.
main()

Epoch = 0 loss = 0.4368752856895289
Epoch = 1 loss = 0.363552815384335
Epoch = 2 loss = 0.3566942249850709
Epoch = 3 loss = 0.3530544972234918
Epoch = 4 loss = 0.349941099721948
Epoch = 5 loss = 0.3468975736035241
Epoch = 6 loss = 0.3432164560531769
Epoch = 7 loss = 0.33800392281023417
Epoch = 8 loss = 0.33125228371269017
Epoch = 9 loss = 0.3247277679831483
Epoch = 10 loss = 0.31915132159256504
Epoch = 11 loss = 0.31444638052652046
Epoch = 12 loss = 0.31053612923745344
Epoch = 13 loss = 0.30703486965762244
Epoch = 14 loss = 0.3038026545090885
Epoch = 15 loss = 0.30069175192398
Epoch = 16 loss = 0.29755062993028675
Epoch = 17 loss = 0.2944786645106259
Epoch = 18 loss = 0.2915656596345187
Epoch = 19 loss = 0.28892169788826344
Epoch = 20 loss = 0.2866338291448524
Epoch = 21 loss = 0.2844755217625497
Epoch = 22 loss = 0.2827113013935952
Epoch = 23 loss = 0.28096287657894214
Epoch = 24 loss = 0.2793001576911571
Epoch = 25 loss = 0.2778460839691088
Epoch = 26 loss = 0.2763638101498902
Epoch 