In [1]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
import torch.nn.functional as F
import dgl
import dgl.function as fn
import dgl.nn as dglnn
from surprise import Dataset
from surprise.model_selection import train_test_split
import sys
sys.path.append("/c/users/.dgl/config.json")

Using backend: pytorch


In [2]:
trainset  = pd.read_csv('./ydata-ymusic-rating-study-v1_0-train.txt', header = None, sep = '\t',  encoding='latin-1', error_bad_lines=False ) 
testset  = pd.read_csv('./ydata-ymusic-rating-study-v1_0-test.txt', header = None, sep = '\t',  encoding='latin-1', error_bad_lines=False ) 


In [3]:
trainset.rename({0 : 'userId', 1 : 'itemId', 2 : 'rating'}, axis = 1, inplace = True)
testset.rename({0 : 'userId', 1 : 'itemId', 2 : 'rating'}, axis = 1, inplace = True)

In [4]:
def buildTensor(list, bool): 

    if bool:
        list = torch.LongTensor(list.astype('category')
                                .cat.codes.values) # Konvertiere zu category damit cat.codes ausgeführt werden kann.
    else:
        list = torch.LongTensor(list.values)
    return list

In [5]:
userId = torch.LongTensor(trainset['userId'].astype('category').cat.codes.values) 
itemId = torch.LongTensor(trainset['itemId'].astype('category').cat.codes.values) 

userIdTest = torch.LongTensor(testset['userId'].astype('category').cat.codes.values) 
itemIdTest = torch.LongTensor(testset['itemId'].astype('category').cat.codes.values) 


graph = dgl.heterograph({
    ('user', 'rated', 'item'): (userId, itemId),
    ('item', 'rated-by', 'user'): (itemId, userId)
})
 
print(graph) 

Graph(num_nodes={'item': 1000, 'user': 15400},
      num_edges={('item', 'rated-by', 'user'): 311704, ('user', 'rated', 'item'): 311704},
      metagraph=[('item', 'user', 'rated-by'), ('user', 'item', 'rated')])


  """Entry point for launching an IPython kernel.


In [6]:
yah = buildTensor(trainset['rating'], False)
yahTest = buildTensor(testset['rating'], False) 
graph.edges['rated'].data['rating'] = yah
graph.edges['rated-by'].data['rating'] = yah

In [7]:
tensorTrainset = TensorDataset(userId, itemId, yah)
tensorTestset = TensorDataset(userIdTest, itemIdTest, yahTest)


In [10]:
class MinibatchSampler(object):
    
    def __init__(self, graph, num_layers):
        self.graph = graph
        self.num_layers = num_layers
        
    def sample(self, batch):
        users, items, ratings = zip(*batch)
        users = torch.stack(users)
        items = torch.stack(items)
        ratings = torch.stack(ratings)
        
        pair_graph = dgl.heterograph(
            {('user', 'rated', 'item'): (users, items)},
            num_nodes_dict={'user': self.graph.number_of_nodes('user'), 'item': self.graph.number_of_nodes('item')})
        
        pair_graph = dgl.compact_graphs(pair_graph) 
        pair_graph.edata['rating'] = ratings 
        
        seeds = {'user': pair_graph.nodes['user'].data[dgl.NID], 
                 'item': pair_graph.nodes['item'].data[dgl.NID]}
        blocks = self.construct_blocks(seeds, (users, items)) 
        
        for feature_name in self.graph.nodes['user'].data.keys():
            blocks[0].srcnodes['user'].data[feature_name] = \
                self.graph.nodes['user'].data[feature_name][blocks[0].srcnodes['user'].data[dgl.NID]] 
            
        for feature_name in self.graph.nodes['item'].data.keys():
            blocks[0].srcnodes['item'].data[feature_name] = \
                self.graph.nodes['item'].data[feature_name][blocks[0].srcnodes['item'].data[dgl.NID]]
        

        return pair_graph, blocks 
    

    def construct_blocks(self, seeds, user_item_pairs_to_remove):
        blocks = []
        users, items = user_item_pairs_to_remove
        for i in range(self.num_layers):
            
            sampled_graph = dgl.in_subgraph(self.graph, seeds) 

            sampled_eids = sampled_graph.edges['rated'].data[dgl.EID]
            sampled_eids_rev = sampled_graph.edges['rated-by'].data[dgl.EID]
            

            _, _, edges_to_remove = sampled_graph.edge_ids(users, items, etype='rated', return_uv=True)  
            _, _, edges_to_remove_rev = sampled_graph.edge_ids(items, users, etype='rated-by', return_uv=True)
            
            sampled_with_edges_removed = sampled_graph
            if len(edges_to_remove) > 0:
                sampled_with_edges_removed = dgl.remove_edges(
                    sampled_with_edges_removed, edges_to_remove, 'rated')
                sampled_eids = sampled_eids[sampled_with_edges_removed.edges['rated'].data[dgl.EID]]
                
            if len(edges_to_remove_rev) > 0:
                sampled_with_edges_removed = dgl.remove_edges(
                    sampled_with_edges_removed, edges_to_remove_rev, 'rated-by')
                sampled_eids_rev = sampled_eids_rev[sampled_with_edges_removed.edges['rated-by'].data[dgl.EID]]
            

            block = dgl.to_block(sampled_with_edges_removed, seeds)
            blocks.insert(0, block)
            seeds = {'user': block.srcnodes['user'].data[dgl.NID],
                     'item': block.srcnodes['item'].data[dgl.NID]}
            

            block.edges['rated'].data['rating'] = \
                self.graph.edges['rated'].data['rating'][sampled_eids]
            block.edges['rated-by'].data['rating'] = \
                self.graph.edges['rated-by'].data['rating'][sampled_eids_rev]
            
        return blocks

In [11]:
from torch import nn
import torch.nn.functional as F
import dgl.function as fn
import dgl.nn as dglnn

class GCMCConv(nn.Module): 
    
    def __init__(self, hidden_dims, num_ratings):
        super().__init__()
        
        self.W_r = nn.Parameter(torch.randn(num_ratings + 1, hidden_dims, hidden_dims)) 
        self.W_i = nn.Linear(hidden_dims * 2, hidden_dims) 
        
    def compute_message(self, W, edges): 
        W_r = W[edges.data['rating']]
        h = edges.src['h'] 
        m = (W_r @ h.unsqueeze(-1)).squeeze(2) 
        return m

    def forward(self, graph, node_features):
        with graph.local_scope():
            src_features, dst_features = node_features
            
            graph.srcdata['h'] = src_features 
            graph.dstdata['h'] = dst_features 
            
            graph.apply_edges(lambda edges: {'m': self.compute_message(self.W_r, edges)})
            
            graph.update_all(fn.copy_e('m', 'm'), fn.mean('m', 'h_neigh'))  
            
            result = F.relu(self.W_i(torch.cat([graph.dstdata['h'], graph.dstdata['h_neigh']], 1))) 
            return result 


In [12]:
class GCMCLayer(nn.Module):

    def __init__(self, hidden_dims, num_ratings):
        super().__init__()
        
        self.heteroconv = dglnn.HeteroGraphConv(
            {'rated': GCMCConv(hidden_dims, num_ratings), 'rated-by': GCMCConv(hidden_dims, num_ratings)},
            aggregate='sum')
                
    def forward(self, block, input_user_features, input_item_features):
        with block.local_scope():

            h_user = input_user_features 
            h_item = input_item_features
            
            src_features = {'user': h_user, 'item': h_item} 
            dst_features = {'user': h_user[:block.number_of_dst_nodes('user')],
                            'item': h_item[:block.number_of_dst_nodes('item')]} 
            
            result = self.heteroconv(block, (src_features, dst_features))
            return result['user'], result['item']


In [13]:
class GCMCLayer(nn.Module):

    def __init__(self, hidden_dims, num_ratings):
        super().__init__()

        self.heteroconv = dglnn.HeteroGraphConv(
            {'rated': GCMCConv(hidden_dims, num_ratings), 'rated-by': GCMCConv(hidden_dims, num_ratings)},
            aggregate='sum')
                
    def forward(self, block, input_user_features, input_item_features):
        with block.local_scope():
            h_user = input_user_features 
            h_item = input_item_features
            
            src_features = {'user': h_user, 'item': h_item} 
            dst_features = {'user': h_user[:block.number_of_dst_nodes('user')],
                            'item': h_item[:block.number_of_dst_nodes('item')]} 
            
            result = self.heteroconv(block, (src_features, dst_features))
            return result['user'], result['item']

In [16]:
class GCMCRating(nn.Module):
    def __init__(self, num_users, num_items, hidden_dims, num_ratings, num_layers):
        super().__init__()
        
        self.user_embeddings = nn.Embedding(num_users, hidden_dims)  
        self.item_embeddings = nn.Embedding(num_items, hidden_dims)
        
        self.layers = nn.ModuleList([
            GCMCLayer(hidden_dims, num_ratings) for _ in range(num_layers)]) 
        
        self.W_u = nn.Linear(hidden_dims, hidden_dims)
        self.W_v = nn.Linear(hidden_dims, hidden_dims)
        
    def forward(self, blocks):
        user_embeddings = self.user_embeddings(blocks[0].srcnodes['user'].data[dgl.NID])
        item_embeddings = self.item_embeddings(blocks[0].srcnodes['item'].data[dgl.NID])
        
        for block, layer in zip(blocks, self.layers):
            user_embeddings, item_embeddings = layer(block, user_embeddings, item_embeddings) 
        
        z_u = self.W_u(user_embeddings) 
        z_v = self.W_v(item_embeddings)
        
        return z_u, z_v
    
    def compute_score(self, pair_graph, z_u, z_v):
        with pair_graph.local_scope():
            pair_graph.nodes['user'].data['h'] = z_u 
            pair_graph.nodes['item'].data['h'] = z_v
            
            pair_graph.apply_edges(fn.u_dot_v('h', 'h', 'r')) 
            
            return pair_graph.edata['r']

In [20]:
import tqdm
from sklearn.metrics import mean_squared_error

def trainingLoop(NUM_LAYERS, BATCH_SIZE, NUM_EPOCHS, HIDDEN_DIMS, NUM_RATINGS, printing = True):
    sampler = MinibatchSampler(graph, NUM_LAYERS) 
    
    train_dataloader = DataLoader(tensorTrainset, batch_size=BATCH_SIZE, collate_fn=sampler.sample, shuffle=True)
    test_dataloader = DataLoader(tensorTestset, batch_size=BATCH_SIZE, collate_fn=sampler.sample, shuffle=False)
        
    model = GCMCRating(graph.number_of_nodes('user'), graph.number_of_nodes('item'), HIDDEN_DIMS, NUM_RATINGS, NUM_LAYERS) 
    
    opt = torch.optim.SGD(model.parameters(), lr=0.01) 
    
    rmse = []
    
    for i in range(NUM_EPOCHS):
        
        model.train() 
        
        with tqdm.tqdm(train_dataloader) as t: 
            for pair_graph, blocks in t:
                user_emb, item_emb = model(blocks)
                prediction = model.compute_score(pair_graph, user_emb, item_emb)
                loss = ((prediction - pair_graph.edata['rating']) ** 2).mean()
                opt.zero_grad() 
                loss.backward() 
                opt.step() 
                
        model.eval() 
        
        with tqdm.tqdm(test_dataloader) as t: 
            with torch.no_grad():
                predictions = []
                ratings = []
                for pair_graph, blocks in t:

                    user_emb, item_emb = model(blocks) 

                    prediction = model.compute_score(pair_graph, user_emb, item_emb) 
                    predictions.append(prediction) 
                    ratings.append(pair_graph.edata['rating']) 
                    
                predictions = torch.cat(predictions, 0)
                ratings = torch.cat(ratings, 0)
        
        if printing:
            print('RMSE:', mean_squared_error(predictions, ratings, squared=True).item(),'- after',i+1,'Epoch:')
        
        rmse.append(mean_squared_error(predictions, ratings, squared=True).item())
    
    if printing:
        print('\n\nEvaluation for the following hyper parameters: \n',
              'NUM_LAYERS','=', NUM_LAYERS, '\n',
              'BATCH_SIZE','=', BATCH_SIZE, '\n',
              'NUM_EPOCHS','=', NUM_EPOCHS, '\n',
              'HIDDEN_DIMS','=', HIDDEN_DIMS, '\n') 
        print('Endgültiger RMSE:', mean_squared_error(predictions, ratings, squared=True).item())
    
    return rmse


In [22]:
NUM_LAYERS = 1 
BATCH_SIZE = 1000 
NUM_EPOCHS = 15 
HIDDEN_DIMS = 8 
NUM_RATINGS = len(set(trainset['rating'])) 

rmse = trainingLoop(NUM_LAYERS, BATCH_SIZE, NUM_EPOCHS, HIDDEN_DIMS, NUM_RATINGS) 


  0%|                                                                                          | 0/312 [00:00<?, ?it/s]


IndexError: index 232905 is out of bounds for dimension 0 with size 232290