In [1]:
from __future__ import division, print_function, absolute_import, unicode_literals

In [2]:
# packages initialization

import torch
from torch.utils.data import TensorDataset, DataLoader

from scipy import sparse
import numpy as np
# time
# math
# heapq

## Configuration

In [3]:
class Config(object):
    """Hard-coded run settings for the CAMRa2011 notebook.

    Every instance starts from the same literal values assigned in
    ``__init__``; nothing is read from disk or from the environment.
    Edit the values here to change a run.
    """

    def __init__(self):
        # Prefix that the dataset loader concatenates verbatim with each file
        # name, hence the trailing slash.
        self.data_dir = './data/CAMRa2011/'

        self.embedding_size = 32
        self.epoch = 30
        self.num_negatives = 6
        self.batch_size = 256

        # Three learning-rate values, listed from largest to smallest.
        self.lr = [5e-6, 1e-6, 5e-7]

        self.drop_ratio = 0.2
        self.topK = 5

In [4]:
# Instantiate the settings object; `config.data_dir` is used below to locate
# the dataset files.
config = Config()

## DataLoader

In [6]:
class CAMRa2011Dataset(object):
    """CAMRa2011 dataset: parses the rating files and builds training loaders.

    All files are read eagerly in ``__init__``. The IDs found in the files are
    used directly as row/column indices of the interaction matrices.
    """

    def __init__(self, dataset_dir):
        """Load every train/test file located under ``dataset_dir``.

        Args:
            dataset_dir: Prefix of the data files. It is concatenated with the
                file names as-is, so it must end with a path separator.
        """
        self.pathes = {
            'train': {
                'user': dataset_dir + "userRatingTrain.txt",
                'group': dataset_dir + "groupRatingTrain.txt"
            },
            'test': {
                'user': dataset_dir + "userRatingTest.txt",
                'user_negative': dataset_dir + "userRatingNegative.txt",
                'group': dataset_dir + "groupRatingTest.txt",
                'group_negative': dataset_dir + "groupRatingNegative.txt",
            },
            'group_user': dataset_dir + "groupMember.txt"
        }

        # mapping of groups to their members
        # format: {gid: [uid, uid, ..], gid: [uid, uid, ..], ...}
        self.group_members = self.get_group_user_mapping()

        # interaction matrix built from the uid-iid training set
        # a key (uid, iid) is stored (with value 1.0) for every interaction
        self.train_user_matrix = self.get_interaction_matrix(self.pathes['train']['user'])

        # format: [[uid, iid], [uid, iid], ...]
        # only (user, item) pairs listed in the test file appear here
        self.test_user_list = self.get_interaction_list(self.pathes['test']['user'])

        # format: [[iid, ...], [iid, ...], ...]
        # test_user_negative_list & test_user_list follow the same order
        # e.g. test_user_negative_list[0] is for test_user_list[0]
        self.test_user_negative_list = self.get_negatives(self.pathes['test']['user_negative'])

        # interaction matrix built from the gid-iid training set
        self.train_group_matrix = self.get_interaction_matrix(self.pathes['train']['group'])

        # (group, item) pairs to be tested
        self.test_group_list = self.get_interaction_list(self.pathes['test']['group'])

        # negative item lists, in the same order as test_group_list
        self.test_group_negative_list = self.get_negatives(self.pathes['test']['group_negative'])

    def get_user_dataloader(self, batch_size=256, shuffle=True, num_negatives=6):
        """Build a DataLoader over (user, [positive item, negative item]) samples.

        Args:
            batch_size: Number of samples per batch.
            shuffle: Whether the loader reshuffles the samples.
            num_negatives: Negative items sampled per training interaction.

        Returns:
            A ``DataLoader`` yielding ``(ids, pairs)`` float tensors, where
            ``ids`` has shape ``(batch,)`` and ``pairs`` has shape ``(batch, 2)``.
        """
        return self._build_dataloader(
            self.train_user_matrix, batch_size, shuffle, num_negatives)

    def get_group_dataloader(self, batch_size=256, shuffle=True, num_negatives=6):
        """Build a DataLoader over (group, [positive item, negative item]) samples.

        Same contract as ``get_user_dataloader`` but based on the group
        training matrix.
        """
        return self._build_dataloader(
            self.train_group_matrix, batch_size, shuffle, num_negatives)

    def _build_dataloader(self, interaction_matrix, batch_size, shuffle, num_negatives):
        """Sample training instances from a matrix and wrap them in a DataLoader."""
        ids, positives_negatives = self.get_train_instances(
            interaction_matrix, num_negatives)

        # NOTE: IDs are stored as float tensors (kept from the original code so
        # existing consumers see the same dtype); cast to long before using
        # them as indices.
        samples = TensorDataset(
            torch.tensor(ids, dtype=torch.float),
            torch.tensor(positives_negatives, dtype=torch.float))

        return DataLoader(samples, batch_size=batch_size, shuffle=shuffle)

    def get_train_instances(self, interaction_matrix, num_negatives=6):
        """Pair every stored interaction with randomly sampled negative items.

        For each ``(uid, iid)`` key of ``interaction_matrix`` this emits
        ``num_negatives`` samples; each sample repeats the positive item and
        adds one column index that is not stored for that row.

        Args:
            interaction_matrix: DOK sparse matrix whose keys are the positive
                ``(row id, item id)`` pairs.
            num_negatives: Number of negatives drawn per positive pair.

        Returns:
            ``(users, positives_negatives)`` where ``users`` is a list of row
            ids and ``positives_negatives`` is the aligned list of
            ``[positive_iid, negative_iid]`` pairs.

        Raises:
            ValueError: If a row has an interaction stored in every column, so
                no negative item exists for it.
        """
        users, positives_negatives = [], []

        # shape[1] is already (max item id + 1); valid columns are
        # 0 .. num_items - 1, which is exactly the range randint(num_items) draws.
        num_items = interaction_matrix.shape[1]

        # Count stored interactions per row so a row without any free column is
        # reported instead of making the rejection loop below spin forever.
        interactions_per_row = {}
        for (uid, _) in interaction_matrix.keys():
            interactions_per_row[uid] = interactions_per_row.get(uid, 0) + 1

        for (uid, iid) in interaction_matrix.keys():

            if num_negatives > 0 and interactions_per_row[uid] >= num_items:
                raise ValueError(
                    "row {} interacted with every item; cannot sample a negative".format(uid))

            for _ in range(num_negatives):
                # rejection sampling: redraw until the item is not an interaction
                negative_iid = np.random.randint(num_items)
                while (uid, negative_iid) in interaction_matrix:
                    negative_iid = np.random.randint(num_items)

                users.append(uid)
                positives_negatives.append([iid, negative_iid])

        return users, positives_negatives

    @staticmethod
    def _read_rows(file_path):
        """Yield the whitespace-separated fields of each non-blank line of a file."""
        with open(file_path, "r") as file:
            for line in file:
                fields = line.split()
                if fields:
                    yield fields

    def get_group_user_mapping(self):
        """Parse the group membership file.

        Each line has the format ``[gid] [uid 1],[uid 2],[uid 3],...``.
        Blank lines are skipped.

        Returns:
            Dict mapping each group id to the list of its member user ids.
        """
        mapping = {}
        for fields in self._read_rows(self.pathes['group_user']):
            gid = int(fields[0])
            mapping[gid] = [int(uid) for uid in fields[1].split(',')]
        return mapping

    # parse all interactions in dataset to 2D sparse matrix
    def get_interaction_matrix(self, rating_file_path):
        """Build a sparse interaction matrix from a rating file.

        Each line is ``uid iid`` or ``uid iid rating``. A pair is stored with
        value 1.0 when no rating is given or when the rating is positive.
        Blank lines are skipped.

        Args:
            rating_file_path: Path of the rating file.

        Returns:
            ``scipy.sparse.dok_matrix`` of shape ``(max uid + 1, max iid + 1)``
            and dtype float32. Every line, including those with a non-positive
            rating, contributes to the shape.
        """
        max_uid, max_iid = 0, 0
        positive_pairs = []

        # single pass: track the largest ids and remember the positive pairs
        for fields in self._read_rows(rating_file_path):
            uid, iid = int(fields[0]), int(fields[1])
            max_uid = max(max_uid, uid)
            max_iid = max(max_iid, iid)
            if len(fields) <= 2 or int(fields[2]) > 0:
                positive_pairs.append((uid, iid))

        # dok_matrix: Dictionary Of Keys based sparse matrix, an efficient
        # structure for constructing sparse matrices incrementally.
        # +1 so that the largest id is itself a valid index.
        matrix = sparse.dok_matrix((max_uid + 1, max_iid + 1), dtype=np.float32)
        for uid, iid in positive_pairs:
            matrix[uid, iid] = 1.0

        return matrix

    # parse all interactions in dataset to list
    def get_interaction_list(self, rating_file_path):
        """Read the first two columns of a rating file as ``[uid, iid]`` pairs.

        Returns:
            List of ``[uid, iid]`` lists in file order; blank lines are skipped.
        """
        return [[int(fields[0]), int(fields[1])]
                for fields in self._read_rows(rating_file_path)]

    # parse negative sample lists for pairs in test set
    # negative samples: the items which never been interacted
    # the order of returned sample lists must be paired with test list
    def get_negatives(self, file_path):
        """Read the negative item lists of a test set.

        The first field of every line is ignored; the remaining fields are
        parsed as item ids. Blank lines are skipped.

        Returns:
            List of item-id lists, one per non-blank line, in file order.
        """
        return [[int(iid) for iid in fields[1:]]
                for fields in self._read_rows(file_path)]

In [7]:
# Parse all CAMRa2011 train/test files up front; the constructor performs the
# file I/O, so this cell needs the data directory to exist.
dataset = CAMRa2011Dataset(config.data_dir)

In [8]:
# Take the batch size from the shared config instead of relying on the
# method's hard-coded default, so editing Config actually affects the loader.
user_loader = dataset.get_user_dataloader(batch_size=config.batch_size)

In [9]:
# Take the batch size from the shared config instead of relying on the
# method's hard-coded default, so editing Config actually affects the loader.
group_loader = dataset.get_group_dataloader(batch_size=config.batch_size)