In [2]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
# This import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install implicit
import numpy as np
import pandas as pd
from tqdm import trange
%cd /content/drive/My Drive/Colab Notebooks/Thesis

/content/drive/My Drive/Colab Notebooks/Thesis


In [4]:
pip install -r final/bpr-master/requirements.txt



In [5]:
import os
import gzip
import json
import math
import random
import pickle
import pprint
import argparse

import numpy as np
import pandas as pd


class DatasetLoader:
    """Abstract base class for dataset readers; subclasses must override `load`."""

    def load(self):
        """Load the dataset.

        Minimum conditions every implementation must satisfy:
          * each user has at least one item record;
          * each item has at least one user record.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

class Hetrec(DatasetLoader):
    """Loader for the `user_artists.dat` interaction file of hetrec2011-lastfm-2k."""

    # Fallback location, used when `data_dir` does not contain the data file.
    DEFAULT_DIR = '/content/drive/My Drive/Colab Notebooks/Thesis/final/hetrec2011-lastfm-2k'
    FILE_NAME = 'user_artists.dat'

    def __init__(self, data_dir):
        """Resolve the path of the interaction file.

        Args:
            data_dir (str): directory expected to contain `user_artists.dat`.
                If the file is not found there, the historical hard-coded
                Drive location (`DEFAULT_DIR`) is used instead, so existing
                callers keep working.
        """
        candidate = os.path.join(data_dir, self.FILE_NAME) if data_dir else None
        if candidate is not None and os.path.isfile(candidate):
            self.fpath = candidate
        else:
            self.fpath = os.path.join(self.DEFAULT_DIR, self.FILE_NAME)

    @staticmethod
    def _looks_like_header(fpath):
        """Return True when the first line of `fpath` starts with a non-numeric token."""
        with open(fpath, 'r', encoding='utf-8', errors='replace') as f:
            tokens = f.readline().split()
        if not tokens:
            return False
        try:
            float(tokens[0])
        except ValueError:
            return True
        return False

    def load(self):
        """Read the interaction file into a DataFrame.

        Returns:
            pandas.DataFrame: columns `userID`, `itemID`, `weight`, one row
            per line of the file, with a default RangeIndex.
        """
        # Because `names=` is supplied, pandas would otherwise treat a leading
        # column-name line as a data row (turning every column into strings and
        # creating a bogus user/item). Skip it when the first line is not numeric.
        # NOTE(review): the distributed file is presumed to start with such a
        # header line -- confirm against the actual data.
        skip = 1 if self._looks_like_header(self.fpath) else 0
        df = pd.read_csv(self.fpath,
                         sep=r"\s+",  # raw string: "\s" is an invalid escape otherwise
                         engine='python',
                         skiprows=skip,
                         names=['userID', 'itemID', 'weight'])
        # TODO: optionally drop low-weight interactions (this frame has no `rate` column).
        return df


def convert_unique_idx(df, column_name):
    """Replace the values of `column_name` by dense integer indices.

    Indices are assigned in order of first appearance, starting at 0.

    Args:
        df (pandas.DataFrame): frame to re-index. The column is overwritten
            in place (same as before); the same frame is also returned.
        column_name (str): name of the column holding the raw IDs.

    Returns:
        (pandas.DataFrame, dict): the frame and the mapping raw ID -> index.

    Raises:
        ValueError: if the column contains missing values, which cannot be
            turned into a valid list index downstream.
    """
    if df[column_name].isna().any():
        raise ValueError("column %r contains missing IDs" % (column_name,))
    column_dict = {x: i for i, x in enumerate(df[column_name].unique())}
    # Vectorised lookup; the explicit integer dtype guarantees the result can
    # be used directly as a list index (no rounding step is needed).
    df[column_name] = df[column_name].map(column_dict).astype('int64')
    return df, column_dict


def create_user_list(df, user_size):
    """Group item ids per user.

    Args:
        df: frame with `userID` and `itemID` columns; `userID` values are used
            as list positions.
        user_size (int): number of buckets to allocate.

    Returns:
        list(list): entry `u` holds the `itemID` of every row whose `userID`
        is `u`, in row order; users without rows get an empty list.
    """
    buckets = [[] for _ in range(user_size)]
    for user, item in zip(df['userID'].tolist(), df['itemID'].tolist()):
        buckets[user].append(item)
    return buckets


def split_train_test(df, user_size, test_size=0.25, time_order=False):
    """Split a dataset into `train_user_list` and `test_user_list`.

    Args:
        df (pandas.DataFrame): interactions with `userID` and `itemID` columns.
        user_size (int): number of users; both returned lists have this length.
        test_size (float): fraction of interactions held out for testing.
        time_order (bool): if False, hold out a random sample of rows (drawn
            from NumPy's global RNG). If True, hold out the last
            `test_size` fraction of each user's items in row order.

    Returns:
        (list(list), list(list)): per-user item lists for training and testing.
    """
    # TODO: Handle duplicated items
    if not time_order:
        n_rows = len(df)
        n_test = int(n_rows * test_size)
        # Sample WITHOUT replacement so that every row lands in exactly one of
        # the two splits and the test set holds no duplicated rows.
        test_idx = np.random.permutation(n_rows)[:n_test]
        is_test = np.zeros(n_rows, dtype=bool)
        is_test[test_idx] = True
        # Positional selection: does not rely on `df` having a default RangeIndex.
        test_df = df.iloc[is_test].reset_index(drop=True)
        train_df = df.iloc[~is_test].reset_index(drop=True)
        test_user_list = create_user_list(test_df, user_size)
        train_user_list = create_user_list(train_df, user_size)
    else:
        total_user_list = create_user_list(df, user_size)
        train_user_list = [None] * len(total_user_list)
        test_user_list = [None] * len(total_user_list)
        for user, item_list in enumerate(total_user_list):
            # The per-user lists carry item ids only (no timestamps), so the
            # row order of `df` is taken as the chronological order.
            # NOTE(review): assumes `df` is already sorted by time -- confirm.
            cut = math.ceil(len(item_list) * (1 - test_size))
            train_user_list[user] = item_list[:cut]
            test_user_list[user] = item_list[cut:]
    return train_user_list, test_user_list


def create_pair(user_list):
    """Flatten per-user item lists into `(user, item)` tuples.

    Tuples are ordered by user index, then by position in that user's list.
    """
    return [(user, item)
            for user, items in enumerate(user_list)
            for item in items]


def main(dataset="hetrec",
         data_dir="hetrec2011-lastfm-2k",
         output_data=os.path.join('preprocessed', 'hetrec.pickle'),
         test_size=0.25,
         time_order=False,
         seed=None):
    """Preprocess a raw interaction dataset and pickle the result.

    Called without arguments it uses the same settings as before.

    Args:
        dataset (str): dataset key; selects the loader class.
        data_dir (str): directory handed to the loader.
        output_data (str): path of the pickle file to write (relative paths
            are resolved against the current working directory).
        test_size (float): fraction of interactions held out for testing.
        time_order (bool): forwarded to `split_train_test`.
        seed (int or None): if given, seeds NumPy's global RNG so that the
            random split is reproducible; None leaves the RNG untouched.

    Raises:
        NotImplementedError: if `dataset` is not a known key.
    """
    if seed is not None:
        np.random.seed(seed)

    # NOTE(review): MovieLens1M, MovieLens20M and AmazonBeauty are not defined
    # in the visible part of this notebook; selecting those keys raises
    # NameError unless the classes are defined elsewhere.
    if dataset == 'ml-1m':
        df = MovieLens1M(data_dir).load()
    elif dataset == 'hetrec':
        df = Hetrec(data_dir).load()
    elif dataset == 'ml-20m':
        df = MovieLens20M(data_dir).load()
    elif dataset == 'amazon-beauty':
        df = AmazonBeauty(data_dir).load()
    else:
        raise NotImplementedError
    df, user_mapping = convert_unique_idx(df, 'userID')
    df, item_mapping = convert_unique_idx(df, 'itemID')
    print('Complete assigning unique index to user and item')

    user_size = df['userID'].nunique()
    item_size = df['itemID'].nunique()

    train_user_list, test_user_list = split_train_test(df,
                                                       user_size,
                                                       test_size=test_size,
                                                       time_order=time_order)
    print('Complete spliting items for training and testing')

    train_pair = create_pair(train_user_list)
    print('Complete creating pair')

    # Separate name for the output dict: `dataset` stays the dataset key.
    preprocessed = {'user_size': user_size, 'item_size': item_size,
                    'user_mapping': user_mapping, 'item_mapping': item_mapping,
                    'train_user_list': train_user_list, 'test_user_list': test_user_list,
                    'train_pair': train_pair}
    dirname = os.path.dirname(os.path.abspath(output_data))
    os.makedirs(dirname, exist_ok=True)
    with open(output_data, 'wb') as f:
        pickle.dump(preprocessed, f, protocol=pickle.HIGHEST_PROTOCOL)

if __name__ == '__main__':
    # Run the preprocessing with main()'s built-in defaults.
    main()
    # NOTE: the argparse-based command line below is disabled and kept for
    # reference only. main() does not accept an `args` object, so it would
    # have to be adapted before `main(args)` can be re-enabled.
    # # Parse argument
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--dataset',
    #                     choices=['ml-1m', 'ml-20m', 'amazon-beauty', 'gowalla','hetrec'])
    # parser.add_argument('--data_dir',
    #                     type=str,
    #                     default=os.path.join('data', 'hetrec'),
    #                     help="File path for raw data")
    # parser.add_argument('--output_data',
    #                     type=str,
    #                     default=os.path.join('preprocessed', 'hetrec.pickle'),
    #                     help="File path for preprocessed data")
    # parser.add_argument('--test_size',
    #                     type=float,
    #                     default=0.25,
    #                     help="Proportion for training and testing split")

    # args = parser.parse_args()
    # main(args)

Complete assigning unique index to user and item
Complete spliting items for training and testing
Complete creating pair


In [11]:
import os
import random
import pickle
import argparse
from collections import deque

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
from torch.utils.data import IterableDataset, DataLoader, get_worker_info
from torch.utils.tensorboard import SummaryWriter


class TripletUniformPair(IterableDataset):
    """Stream of `(user, positive item, negative item)` training triplets.

    Each `(user, item)` entry of `pair` is emitted `num_epochs` times, paired
    with a negative item drawn uniformly from `range(num_item)` and rejected
    while it belongs to the user's own item list.

    Args:
        num_item (int): number of items; negatives are drawn from `range(num_item)`.
        user_list (list): `user_list[u]` is the collection of items of user `u`.
            Every user must have at least one item outside this collection,
            otherwise the rejection loop never terminates.
        pair (list): `(user, item)` positive pairs.
        shuffle (bool): whether to shuffle the pair order on every pass.
        num_epochs (int): number of passes over `pair`.
    """

    def __init__(self, num_item, user_list, pair, shuffle, num_epochs):
        self.num_item = num_item
        self.user_list = user_list
        # Built once: set membership is O(1), whereas testing against the
        # per-user lists costs O(len(list)) for every negative draw.
        self._user_item_sets = [set(items) for items in user_list]
        self.pair = pair
        self.shuffle = shuffle
        self.num_epochs = num_epochs

    def __iter__(self):
        """Reset the iteration state (per worker process) and return self."""
        worker_info = get_worker_info()
        # Total number of examples across all workers and epochs.
        self.example_size = self.num_epochs * len(self.pair)
        self.example_index_queue = deque([])
        # Shuffle seed; it starts at 0 in every worker, so all workers derive
        # their shard from the same permutation of each pass.
        self.seed = 0
        if worker_info is not None:
            # Worker w starts at global position w and advances by num_workers.
            self.start_list_index = worker_info.id
            self.num_workers = worker_info.num_workers
            self.index = worker_info.id
        else:
            self.start_list_index = None
            self.num_workers = 1
            self.index = 0
        return self

    def __next__(self):
        if self.index >= self.example_size:
            raise StopIteration
        # If `example_index_queue` is used up, replenish it with the next pass.
        while len(self.example_index_queue) == 0:
            index_list = list(range(len(self.pair)))
            if self.shuffle:
                random.Random(self.seed).shuffle(index_list)
                self.seed += 1
            if self.start_list_index is not None:
                # Keep only this worker's strided share of the pass.
                index_list = index_list[self.start_list_index::self.num_workers]
                # Offset for the next pass, so the stride carries on across
                # the pass boundary when len(pair) is not a multiple of num_workers.
                self.start_list_index = (self.start_list_index + (self.num_workers - (len(self.pair) % self.num_workers))) % self.num_workers
            self.example_index_queue.extend(index_list)
        result = self._example(self.example_index_queue.popleft())
        self.index += self.num_workers
        return result

    def _example(self, idx):
        """Build the triplet for `pair[idx]` by rejection-sampling a negative item."""
        user = self.pair[idx][0]
        pos_item = self.pair[idx][1]
        seen = self._user_item_sets[user]
        neg_item = np.random.randint(self.num_item)
        while neg_item in seen:
            neg_item = np.random.randint(self.num_item)
        return user, pos_item, neg_item


class BPR(nn.Module):
    """Matrix-factorisation model trained with the BPR pairwise ranking loss.

    Args:
        user_size (int): number of users (rows of `W`).
        item_size (int): number of items (rows of `H`).
        dim (int): embedding dimension.
        weight_decay (float): coefficient of the L2 penalty applied to the
            embeddings used in each batch.
    """

    def __init__(self, user_size, item_size, dim, weight_decay):
        super().__init__()
        self.W = nn.Parameter(torch.empty(user_size, dim))
        self.H = nn.Parameter(torch.empty(item_size, dim))
        nn.init.xavier_normal_(self.W.data)
        nn.init.xavier_normal_(self.H.data)
        self.weight_decay = weight_decay

    def forward(self, u, i, j):
        """Return loss value.

        Args:
            u(torch.LongTensor): tensor stored user indexes. [batch_size,]
            i(torch.LongTensor): tensor stored item indexes which is prefered by user. [batch_size,]
            j(torch.LongTensor): tensor stored item indexes which is not prefered by user. [batch_size,]

        Returns:
            torch.FloatTensor: scalar, summed (not averaged) over the batch.
        """
        user_vec = self.W[u, :]
        pos_vec = self.H[i, :]
        neg_vec = self.H[j, :]
        x_ui = torch.mul(user_vec, pos_vec).sum(dim=1)
        x_uj = torch.mul(user_vec, neg_vec).sum(dim=1)
        x_uij = x_ui - x_uj
        log_prob = F.logsigmoid(x_uij).sum()
        # Squared L2 norm of every embedding row used in the batch; summing
        # the squares directly equals norm(dim=1).pow(2).sum() without the
        # intermediate square root.
        squared_norms = (user_vec.pow(2).sum()
                         + pos_vec.pow(2).sum()
                         + neg_vec.pow(2).sum())
        regularization = self.weight_decay * squared_norms
        return -log_prob + regularization

    def recommend(self, u):
        """Return recommended item list given users.
        Args:
            u(torch.LongTensor): tensor stored user indexes. [batch_size,]
        Returns:
            pred(torch.LongTensor): recommended item list sorted by preference,
                most preferred item first. [batch_size, item_size]
        """
        user_vec = self.W[u, :]
        x_ui = torch.mm(user_vec, self.H.t())
        # Highest score first: an ascending argsort would put the least
        # preferred item at position 0.
        pred = torch.argsort(x_ui, dim=1, descending=True)
        return pred


def precision_and_recall_k(user_emb, item_emb, train_user_list, test_user_list, klist, batch=512):
    """Compute precision and recall at several cut-offs k.

    Scores are `sigmoid(user_emb @ item_emb.T)`; items a user already has in
    `train_user_list` get score 0 so they rank last. All tensor work happens
    on the device of `user_emb` (CPU or GPU).

    Args:
        user_emb (torch.Tensor): embedding for user [user_num, dim]
        item_emb (torch.Tensor): embedding for item [item_num, dim]
        train_user_list (list): per-user collection of training item indexes.
        test_user_list (list): per-user collection of held-out item indexes.
        klist (list(int)): cut-offs to evaluate.
        batch (int): number of users scored at once.
    Returns:
        (list(float), list(float)): precision and recall for each k in
        `klist`, averaged over all users. Precision divides the hit count by
        `min(k, number of test items)` (at least 1); recall divides it by the
        number of test items (at least 1). Users without test items count as 0.
    """
    num_users, num_items = user_emb.shape[0], item_emb.shape[0]
    if num_users == 0:
        # Nothing to average over.
        return [0.0 for _ in klist], [0.0 for _ in klist]

    # topk cannot return more entries than there are items.
    max_k = min(max(klist), num_items)
    device = user_emb.device

    topk_chunks = []
    with torch.no_grad():
        for start in range(0, num_users, batch):
            end = min(start + batch, num_users)
            # Calculate prediction value
            scores = torch.sigmoid(torch.mm(user_emb[start:end, :], item_emb.t()))
            assert not torch.any(torch.isnan(scores))
            # Make zero for already observed item (sigmoid is > 0, so zero ranks last)
            for row, user in enumerate(range(start, end)):
                seen = list(train_user_list[user])
                if seen:
                    seen_idx = torch.tensor(seen, dtype=torch.long, device=device)
                    scores[row, seen_idx] = 0.0
            _, top_items = torch.topk(scores, k=max_k, dim=1)
            topk_chunks.append(top_items)

    # One concatenation and one device transfer for all users.
    ranked = torch.cat(topk_chunks, dim=0).cpu().tolist()
    test_sets = [set(test_user_list[user]) for user in range(num_users)]

    precisions, recalls = [], []
    for k in klist:
        precision, recall = 0, 0
        for user in range(num_users):
            test = test_sets[user]
            hits = len(test & set(ranked[user][:k]))
            precision += hits / max([min([k, len(test)]), 1])
            recall += hits / max([len(test), 1])
        precisions.append(precision / num_users)
        recalls.append(recall / num_users)
    return precisions, recalls

def main():
    """Train the BPR model on the preprocessed HetRec data and log metrics.

    Reads the pickle written by the preprocessing step, trains on a CUDA
    device, writes the training loss and P@k / R@k to TensorBoard, prints
    progress, and saves a checkpoint every `save_every` steps.
    """
    data = os.path.join('/content/drive/My Drive/Colab Notebooks/Thesis/preprocessed', 'hetrec.pickle')
    seed = 0
    dim = 100
    lr = 1e-3
    weight_decay = 0.025
    n_epochs = 40
    batch_size = 4096
    print_every = 20
    eval_every = 100
    save_every = 100000
    # Kept under its own name: it used to be called `model` and was then
    # overwritten by the BPR module, which broke the checkpoint branch below.
    model_path = os.path.join('output', 'bpr.pt')

    # Initialize seed
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Load preprocess data
    # NOTE(security): pickle.load can execute arbitrary code; only open files
    # produced by this notebook's own preprocessing step.
    with open(data, 'rb') as f:
        dataset = pickle.load(f)
    user_size, item_size = dataset['user_size'], dataset['item_size']
    train_user_list, test_user_list = dataset['train_user_list'], dataset['test_user_list']
    train_pair = dataset['train_pair']
    print('Load complete')

    # Create dataset, model, optimizer
    triplets = TripletUniformPair(item_size, train_user_list, train_pair, True, n_epochs)
    loader = DataLoader(triplets, batch_size=batch_size, num_workers=16)
    model = BPR(user_size, item_size, dim, weight_decay).cuda()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    writer = SummaryWriter()
    start = time.time()
    # Training
    # Exponential moving average of the loss. It starts at 0, so the first
    # printed values are biased low and rise even while the raw loss falls.
    smooth_loss = 0.0
    try:
        for idx, (u, i, j) in enumerate(loader):
            optimizer.zero_grad()
            loss = model(u, i, j)
            loss.backward()
            optimizer.step()
            # Plain float: accumulating the tensor itself would keep every
            # step's autograd history reachable through `smooth_loss`.
            loss_value = loss.item()
            writer.add_scalar('train/loss', loss_value, idx)
            smooth_loss = smooth_loss*0.99 + loss_value*0.01
            if idx % print_every == (print_every - 1):
                print('loss: %.4f' % smooth_loss)
            if idx % eval_every == (eval_every - 1):
                plist, rlist = precision_and_recall_k(model.W.detach(),
                                                      model.H.detach(),
                                                      train_user_list,
                                                      test_user_list,
                                                      klist=[1,5,10])
                print('P@1: %.4f, P@5: %.4f P@10: %.4f, R@1: %.4f, R@5: %.4f, R@10: %.4f' % (plist[0], plist[1], plist[2], rlist[0], rlist[1], rlist[2]))

                writer.add_scalars('eval', {'P@1': plist[0],
                                            'P@5': plist[1],
                                            'P@10': plist[2]}, idx)
                writer.add_scalars('eval', {'R@1': rlist[0],
                                            'R@5': rlist[1],
                                            'R@10': rlist[2]}, idx)
            if idx % save_every == (save_every - 1):
                dirname = os.path.dirname(os.path.abspath(model_path))
                os.makedirs(dirname, exist_ok=True)
                torch.save(model.state_dict(), model_path)
    finally:
        # Flush pending TensorBoard events even if training is interrupted.
        writer.close()
    print('time',time.time()-start)
if __name__ == '__main__':
  main()

Load complete
loss: 518.8221
loss: 941.4379
loss: 1285.1345
loss: 1560.8456
loss: 1771.7299
P@1: 0.2171, P@5: 0.1334 P@10: 0.1032, R@1: 0.0210, R@5: 0.0617, R@10: 0.0881
loss: 1913.7683
loss: 1984.9093
loss: 1995.9729
loss: 1973.0909
loss: 1930.4454
P@1: 0.2520, P@5: 0.1687 P@10: 0.1293, R@1: 0.0245, R@5: 0.0786, R@10: 0.1104
loss: 1879.7168
loss: 1825.8030
loss: 1777.7896
loss: 1731.8922
loss: 1690.2333
P@1: 0.1764, P@5: 0.1408 P@10: 0.1183, R@1: 0.0170, R@5: 0.0657, R@10: 0.1017
loss: 1653.7018
loss: 1619.6022
loss: 1589.1387
loss: 1560.9141
loss: 1532.0055
P@1: 0.0887, P@5: 0.0983 P@10: 0.0888, R@1: 0.0089, R@5: 0.0456, R@10: 0.0758
loss: 1505.7972
loss: 1483.5688
loss: 1464.1344
loss: 1443.6583
loss: 1427.2758
P@1: 0.0977, P@5: 0.0876 P@10: 0.0924, R@1: 0.0095, R@5: 0.0402, R@10: 0.0788
loss: 1411.8478
loss: 1397.9319
loss: 1384.1697
loss: 1372.7397
loss: 1361.4570
P@1: 0.1442, P@5: 0.1160 P@10: 0.1090, R@1: 0.0134, R@5: 0.0543, R@10: 0.0937
loss: 1349.8633
loss: 1341.0901
loss: 13