In [1]:
from __future__ import division, print_function, absolute_import, unicode_literals

In [2]:
# packages initialization

import torch
from scipy import sparse
import numpy as np
# time
# math
# heapq

## Configuration

In [3]:
class Config(object):
    """Dataset locations and numeric settings for this notebook.

    All values are hard-coded; instantiate the class to read them as attributes.
    """

    def __init__(self):
        # Dataset locations. `user_dataset` and `group_dataset` are file-name
        # prefixes: the loading code appends "Train.txt" / "Test.txt" /
        # "Negative.txt" to them.
        root = './data/CAMRa2011/'
        self.data_dir = root
        self.user_dataset = root + 'userRating'
        self.group_dataset = root + 'groupRating'
        self.group_user_mapping = root + "groupMember.txt"

        # Numeric settings (their consumers are not visible in this part of
        # the notebook).
        self.embedding_size = 32
        self.epoch = 30
        self.num_negatives = 6
        self.batch_size = 256
        # three values, presumably successive learning rates -- TODO confirm
        # against the training loop
        self.lr = [5e-6, 1e-6, 5e-7]
        self.drop_ratio = 0.2
        self.topK = 5

In [4]:
# single configuration object; the dataset-loading cell below reads its paths
config = Config()

## Dataset Loading

In [9]:
def get_group_user_mapping(path):
    """Read the group-membership file into a ``{gid: [uid, ...]}`` dict.

    Each non-blank line is expected to look like
    ``[gid] [uid 1],[uid 2],[uid 3]``: the group id, whitespace, then a
    comma-separated list of member user ids. Anything after the second
    whitespace-separated field is ignored.

    Args:
        path: Path of the mapping file to read.

    Returns:
        dict mapping every integer group id to the list of its integer member
        ids, in file order. If a group id appears on several lines, the last
        line wins.

    Raises:
        IndexError: If a non-blank line holds a group id but no member field.
        ValueError: If an id is not a valid integer (e.g. a trailing comma).
    """
    mapping = {}

    with open(path, 'r') as handle:
        for line in handle:
            # Split on any run of whitespace so tabs, repeated spaces and the
            # trailing newline are all tolerated.
            sequences = line.split()
            if not sequences:
                # A blank line must not end the read: stopping here would
                # silently drop every group listed after it.
                continue

            gid = int(sequences[0])
            mapping[gid] = [int(uid) for uid in sequences[1].split(',')]

    return mapping

# parse all interactions in dataset to 2D sparse matrix
def get_interaction_matrix(rating_file_path):
    """Build a binary interaction matrix from a rating file.

    Every non-blank line holds whitespace-separated integers: a row id (user
    or group), an item id and, optionally, a rating. A row marks
    ``matrix[row_id, iid] = 1.0`` when it has no rating column or when its
    rating is greater than zero. Rows with a rating <= 0 leave their cell
    empty but still count towards the matrix dimensions.

    Args:
        rating_file_path: Path of the rating file to read.

    Returns:
        ``scipy.sparse.dok_matrix`` of dtype float32 and shape
        ``(max row id + 1, max item id + 1)``; index 0 is included so ids can
        be used directly as indices.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
        ValueError: If a field is not a valid integer.
    """
    num_users, num_items = 0, 0
    positives = []  # (uid, iid) pairs that must be set to 1.0

    # Single pass: track the largest ids (needed for the shape) and remember
    # the positive pairs, instead of parsing the whole file twice.
    with open(rating_file_path, "r") as handle:
        for line in handle:
            # Whitespace split drops the newline and tolerates trailing
            # spaces, which would otherwise show up as a bogus extra column.
            arr = line.split()
            if not arr:
                continue  # blank line (typically at end of file)

            uid, iid = int(arr[0]), int(arr[1])
            num_users = max(num_users, uid)
            num_items = max(num_items, iid)

            if len(arr) > 2 and int(arr[2]) <= 0:
                continue  # explicit non-positive rating: not an interaction
            positives.append((uid, iid))

    # dok_matrix: Dictionary Of Keys based sparse matrix, an efficient
    # structure for constructing sparse matrices incrementally.
    matrix = sparse.dok_matrix((num_users + 1, num_items + 1), dtype=np.float32)  # iid and uid start from 1
    for uid, iid in positives:
        matrix[uid, iid] = 1.0

    return matrix

# parse all interactions in dataset to list
def get_interaction_list(rating_file_path):
    """Read the first two integer columns of every line of a rating file.

    Args:
        rating_file_path: Path of the rating file; each non-blank line starts
            with two whitespace-separated integers (row id, item id). Any
            further columns are ignored.

    Returns:
        list of ``[uid, iid]`` pairs, one per non-blank line, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
        ValueError: If one of the two ids is not a valid integer.
    """
    interaction_list = []

    with open(rating_file_path, "r") as handle:
        for line in handle:
            arr = line.split()
            if not arr:
                # A blank line (e.g. at end of file) carries no pair; feeding
                # it to int() would raise ValueError.
                continue
            interaction_list.append([int(arr[0]), int(arr[1])])

    return interaction_list

# parse negative sample lists for pairs in test set
# negative samples: the items which never been interacted
# the order of returned sample lists must be paired with test list
def get_negative_samples(file_path):
    """Read the per-line lists of negative item ids.

    On every non-blank line the first whitespace-separated field is skipped
    and each remaining field is parsed as an integer item id.

    Args:
        file_path: Path of the negative-samples file.

    Returns:
        list with one entry per non-blank line, in file order; each entry is
        the list of integer item ids found on that line (empty when the line
        has only its first field).

    Raises:
        ValueError: If a field after the first is not a valid integer.
    """
    negative_samples_list = []

    with open(file_path, "r") as handle:
        for line in handle:
            # Whitespace split keeps a trailing space or the newline from
            # turning into an unparsable empty item id.
            arr = line.split()
            if not arr:
                # Skip blank lines: emitting an empty entry for them would
                # shift every later entry out of step with the test list.
                continue
            negative_samples_list.append([int(iid) for iid in arr[1:]])

    return negative_samples_list

In [10]:
# group membership read from the mapping file
# format: {gid: [uid, uid, ..], gid: [uid, uid, ..], ...}
group_members = get_group_user_mapping(config.group_user_mapping)

# user-item interaction matrix built from the user training file
# train_user_matrix[uid, iid] is 1.0 where an interaction was recorded, 0 otherwise
train_user_matrix = get_interaction_matrix(config.user_dataset + "Train.txt")

# format: [[uid, iid], [uid, iid], ...]
# one pair per line of the user test file, in file order
test_user_list = get_interaction_list(config.user_dataset + "Test.txt")

# format: [[iid, iid, ...], [iid, iid, ...], ...]
# one list of negative item ids per line of the negative file, in file order;
# test_user_negative_list[i] is meant to belong to test_user_list[i], which
# relies on the two files listing the same pairs in the same order
test_user_negative_list = get_negative_samples(config.user_dataset + "Negative.txt")

# group-item interaction matrix built from the group training file
# train_group_matrix[gid, iid] is 1.0 where an interaction was recorded, 0 otherwise
train_group_matrix = get_interaction_matrix(config.group_dataset + "Train.txt")

# format: [[gid, iid], [gid, iid], ...]
# group-item pairs to be tested, one per line of the group test file
test_group_list = get_interaction_list(config.group_dataset + "Test.txt")

# negative item ids for the group test pairs; same index pairing as for users
test_group_negative_list = get_negative_samples(config.group_dataset + "Negative.txt")