## Install Libraries & Prepare the Working Directory

In [39]:
# Colab-only setup: install dependencies, mount Google Drive and make the
# project folder both importable and the working directory.
# NOTE(review): both `dgl` (CPU build) and `dgl-cu101` (CUDA 10.1 build) are
# installed and neither is version-pinned — presumably only one is needed;
# confirm which build the runtime should use.
!pip install scikit-optimize
!pip install boto3
!pip install dgl
!pip install dgl-cu101

from google.colab import drive
drive.mount('/content/drive')
import sys

# Relative paths used later (e.g. 'data/user_feat.csv') resolve against this folder.
sys.path.append('/content/drive/My Drive/DL-PROJECT/')
%cd /content/drive/My\ Drive/DL-PROJECT/


from torch.multiprocessing import Pool, Process, set_start_method
# Request the 'spawn' start method. A RuntimeError is ignored — presumably it
# is raised when the start method was already set by an earlier run of this cell.
try:
    set_start_method('spawn')
except RuntimeError:
    pass

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/DL-PROJECT


In [40]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as torch_nn_func
import dgl
import dgl.nn.pytorch as dglnn
import dgl.function as dgl_func
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from datetime import timedelta
from collections import defaultdict
from typing import Tuple
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from datetime import datetime
import textwrap
import math
import sys
import time
import pickle


## Read Data

In [41]:
# Load user features, user-movie interactions and movie features from the
# project's data/ folder (paths are relative to the working directory).
user_feat_df = pd.read_csv('data/user_feat.csv')
interact_df = pd.read_csv('data/interact.csv')
movie_feat_df = pd.read_csv('data/item_feat.csv')

# sort_values returns a NEW frame; the result must be assigned, otherwise the
# positional train/test split further down is not chronological.
# A stable sort keeps the file order among rows with equal timestamps.
interact_df = interact_df.sort_values(by='Timestamp', kind='mergesort', ignore_index=True)
interact_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,6040,858,4,956703932
1,6040,2384,4,956703954
2,6040,593,5,956703954
3,6040,1961,4,956703977
4,6040,2019,5,956703977


## Split Train & Test Data

In [42]:
# Positional hold-out: the first (1 - test_size) share of the interaction rows
# is used for training, the remaining rows for testing.
test_size = 0.28
split_idx = int((1 - test_size) * len(interact_df))
user_movie_train = interact_df.iloc[:split_idx]
user_movie_test = interact_df.iloc[split_idx:]

## Extract User Information

In [43]:
# Map every user that appears in the training interactions to a contiguous
# integer id (its row position in `user_id`), in order of first appearance.
user_id_type = 'UserID'

unique_train_users = user_movie_train[user_id_type].unique()
user_id = pd.DataFrame({user_id_type: unique_train_users})
user_id['user_new_id'] = user_id.index

## Extract Movie Information

In [44]:
# Map movies to contiguous integer ids: movies seen in training come first
# (in order of first appearance), followed by movies that only exist in the
# movie feature table.
movie_id_type = 'MovieID'

train_movie = user_movie_train[movie_id_type].unique().tolist()
all_movie = movie_feat_df[movie_id_type].unique().tolist()

# Membership is tested against a set so the scan stays linear in the number
# of movies instead of rescanning the training list for every movie.
train_movie_set = set(train_movie)
unseen_movie = [movie for movie in all_movie if movie not in train_movie_set]
train_movie.extend(unseen_movie)
movie_id = pd.DataFrame(train_movie, columns=[movie_id_type])
movie_id['movie_new_id'] = movie_id.index

## User-Movie Data for Training

In [45]:
# Attach the contiguous user/movie ids to the training interactions, drop
# repeated (Rating, user, movie) rows keeping the last one, then order the
# rows: first by (Rating, user, movie), finally by Timestamp.
user_movie_train = (
    user_movie_train
    .merge(user_id, how='left', on=user_id_type)
    .merge(movie_id, how='left', on=movie_id_type)
    .drop_duplicates(subset=['Rating', 'user_new_id', 'movie_new_id'], keep='last')
    .sort_values(by=['Rating', 'user_new_id', 'movie_new_id'], ignore_index=True)
    .sort_values(by='Timestamp', ignore_index=True)
)

## User-Movie Data for Testing

In [46]:
# Inner merges keep only test interactions whose user and movie both have a
# contiguous id; the resulting id pairs form the test ground truth.
user_movie_test = (
    user_movie_test
    .merge(user_id, how='inner', on=user_id_type)
    .merge(movie_id, how='inner', on=movie_id_type)
)
test_src = user_movie_test['user_new_id'].values
test_dst = user_movie_test['movie_new_id'].values
ground_truth_test = (test_src, test_dst)

## Create Adjacency Dict


In [47]:
# Edge list of the training interactions as three parallel arrays:
# rating, source user id and destination movie id.
adjacency_dict = {
    'user_movie_rating': user_movie_train['Rating'].values,
    'user_movie_src': user_movie_train['user_new_id'].values,
    'user_movie_dst': user_movie_train['movie_new_id'].values,
}

## Create Graph Schema based on Adjacency Dict

In [48]:
# Each interaction yields two directed edges: user -> movie ('buys') and the
# mirrored movie -> user ('watched-by'), listed in the same order.
user_to_movie_edges = list(zip(adjacency_dict['user_movie_src'], adjacency_dict['user_movie_dst']))
movie_to_user_edges = list(zip(adjacency_dict['user_movie_dst'], adjacency_dict['user_movie_src']))

graph_schema = {
    ('user', 'buys', 'movie'): user_to_movie_edges,
    ('movie', 'watched-by', 'user'): movie_to_user_edges,
}

## Create a Heterogeneous graph

In [49]:
# Build the DGL heterograph from the two edge lists in `graph_schema` and
# print its summary (node/edge counts per type).
hetero_graph = dgl.heterograph(graph_schema)
print('Graph --> \n', hetero_graph) 

Graph --> 
 Graph(num_nodes={'movie': 3646, 'user': 4971},
      num_edges={('movie', 'watched-by', 'user'): 720150, ('user', 'buys', 'movie'): 720150},
      metagraph=[('movie', 'user', 'watched-by'), ('user', 'movie', 'buys')])


## Create User Features

In [50]:
# Build the (num_user_nodes, 2) user feature matrix from the 'F' and 'M'
# columns; rows of users without an entry in user_feat_df stay all-zero.
user_feat_df = user_feat_df.merge(user_id, how='inner', on=user_id_type)
ids = user_feat_df['user_new_id'].values.astype(int)
feats = np.stack([user_feat_df[col].values for col in ('F', 'M')], axis=1)

user_feat = np.zeros((hetero_graph.number_of_nodes('user'), 2))
user_feat[ids] = feats

user_feat = torch.tensor(user_feat).float()

## Create Movie Features

In [51]:
# Build the (num_movie_nodes, n_genres) movie feature matrix from the genre
# indicator columns; movies whose new id falls outside the graph are dropped
# and graph movies without a feature row stay all-zero.
movie_feat_df = movie_feat_df.merge(movie_id, how='left', on=movie_id_type)
in_graph = movie_feat_df['movie_new_id'] < hetero_graph.number_of_nodes('movie')
movie_feat_df = movie_feat_df[in_graph]

ids = movie_feat_df['movie_new_id'].values.astype(int)

genre_columns = [
    'Action', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary',
    'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western',
]
feats = np.stack([movie_feat_df[col].values for col in genre_columns], axis=1)

movie_feat = np.zeros((hetero_graph.number_of_nodes('movie'), feats.shape[1]))
movie_feat[ids] = feats
movie_feat = torch.tensor(movie_feat).float()

## Create Features Dict from Users and Movies Features

In [52]:
# Collect the per-node-type feature tensors under one dictionary.
features_dict = {
    'user_feat': user_feat,
    'movie_feat': movie_feat,
}

## Assign Features to Nodes

In [53]:
# Store the user and movie feature tensors as node data under the key
# 'features' (read back later via blocks[0].srcdata['features']).
hetero_graph.nodes['user'].data['features'] = features_dict['user_feat']
hetero_graph.nodes['movie'].data['features'] = features_dict['movie_feat']

## Assign Features to Edges

In [54]:
# Store the rating of every interaction on both edge types. The same rating
# array is used for 'buys' and 'watched-by' because both edge lists were built
# from the same interactions in the same order.
hetero_graph.edges['buys'].data['rating'] = torch.tensor(adjacency_dict['user_movie_rating'])
hetero_graph.edges['watched-by'].data['rating'] = torch.tensor(adjacency_dict['user_movie_rating'])

## Node Embeddings

* We compute node embeddings to represent nodes as vectors and capture the topology of the network.
* The embeddings are based on similarity between nodes.
* We will use the embeddings for prediction tasks.

In [55]:
class NodeEmbedding(nn.Module):
    """Project raw node features to a new dimensionality with one linear layer.

    Args:
        input_features: size of the incoming feature vectors.
        output_features: size of the projected vectors.
    """

    def __init__(self, input_features, output_features,):
        super().__init__()
        self.project_features = nn.Linear(in_features=input_features, out_features=output_features)

    def forward(self, node_features):
        """Return the linear projection of ``node_features``."""
        projected = self.project_features(node_features)
        return projected

## Message passing layer

In [56]:
class MessagePassing(nn.Module):
    """One message-passing step for a single relation.

    Neighbour features are passed through a linear layer + ReLU, averaged per
    destination node, combined with the destination node's own (linearly
    transformed) features, passed through ReLU and scaled to unit L2 norm.

    Args:
        input_features: pair ``(neighbour_dim, self_dim)``.
        output_features: size of the produced node representations.
        dropout: dropout probability applied to both inputs.
    """

    def __init__(self, input_features, output_features, dropout,):
        super().__init__()
        neigh_dim, self_dim = input_features
        self._in_neigh_feats = neigh_dim
        self._in_self_feats = self_dim
        self._output_features = output_features
        self.dropout = nn.Dropout(dropout)
        self.fc_self = nn.Linear(self_dim, output_features, bias=False)
        self.fc_neighbour = nn.Linear(neigh_dim, output_features, bias=False)
        self.fc_pre_agg = nn.Linear(neigh_dim, neigh_dim, bias=False)

    def forward(self, graph, x):
        """Return the normalised destination-node representations.

        ``x`` is the pair ``(neighbour_features, self_features)``.
        """
        neigh_feats, self_feats = x
        self_feats = self.dropout(self_feats)
        neigh_feats = self.dropout(neigh_feats)

        # Transform neighbour features, then mean-aggregate them per destination node.
        graph.srcdata['h'] = torch_nn_func.relu(self.fc_pre_agg(neigh_feats))
        graph.update_all(dgl_func.copy_src('h', 'm'), dgl_func.mean('m', 'neigh'))
        aggregated = graph.dstdata['neigh']

        # Combine the node's own features with the aggregated neighbourhood.
        combined = torch_nn_func.relu(self.fc_self(self_feats) + self.fc_neighbour(aggregated))

        # Row-wise L2 normalisation; all-zero rows are divided by 1 instead of 0.
        row_norm = combined.norm(2, 1, keepdim=True)
        safe_norm = torch.where(row_norm == 0, torch.ones_like(row_norm), row_norm)
        return combined / safe_norm

## Calculate Similarity between Users & Movies

Ref: https://docs.dgl.ai/en/0.6.x/guide/training-edge.html#guide-training-edge-classification

### Prediction for NN Similarity

In [57]:
class NnSimilarityPredictingLayer(nn.Module):
    """Small MLP that scores a concatenated (head, tail) embedding pair.

    Architecture: Linear -> ReLU -> Linear -> ReLU -> Linear(1) -> Sigmoid,
    so every score lies in (0, 1).

    Args:
        embedding_layer_dim: size of ONE node embedding; the input to the
            network is two concatenated embeddings (``2 * embedding_layer_dim``).
        hidden_dim_1: width of the first hidden layer. When omitted, the
            notebook-level ``hidden_layer_1_output_dim`` is used.
        hidden_dim_2: width of the second hidden layer. When omitted, the
            notebook-level ``hidden_layer_2_output_dim`` is used.
    """

    def __init__(self, embedding_layer_dim: int, hidden_dim_1: int = None, hidden_dim_2: int = None):
        super(NnSimilarityPredictingLayer, self).__init__()
        # Explicit sizes make the class usable before the configuration cell
        # has run; the global fallback keeps existing one-argument calls working.
        if hidden_dim_1 is None:
            hidden_dim_1 = hidden_layer_1_output_dim
        if hidden_dim_2 is None:
            hidden_dim_2 = hidden_layer_2_output_dim

        self.hidden_layer_1 = nn.Linear(2 * embedding_layer_dim, hidden_dim_1)
        self.hidden_layer_2 = nn.Linear(hidden_dim_1, hidden_dim_2)
        self.output = nn.Linear(hidden_dim_2, 1)
        self.relu_layer = nn.ReLU()
        self.sigmoid_layer = nn.Sigmoid()

    def forward(self, x):
        """Return one score per row of ``x`` with shape ``(rows, 1)``."""
        x = self.hidden_layer_1(x)
        x = self.relu_layer(x)
        x = self.hidden_layer_2(x)
        x = self.relu_layer(x)
        x = self.output(x)
        x = self.sigmoid_layer(x)
        return x

class NnPredictingModule(nn.Module):
    """Score every user/movie edge of a graph with the MLP similarity layer.

    Args:
        predicting_layer: accepted for interface compatibility; not used.
        embed_dim: size of one node embedding.
    """

    def __init__(self, predicting_layer, embed_dim: int):
        super(NnPredictingModule, self).__init__()
        self.layer_nn = NnSimilarityPredictingLayer(embed_dim)

    def forward(self, graph, h ):
        """Return ``{canonical_etype: scores}`` with scores of shape ``(num_edges, 1)``.

        ``h`` maps node type to the embedding matrix of that type.
        """
        scored_node_types = ('user', 'movie')
        ratings_dict = {}

        for canonical_etype in graph.canonical_etypes:
            u_type, _, v_type = canonical_etype
            if u_type not in scored_node_types or v_type not in scored_node_types:
                continue

            src_nid, dst_nid = graph.all_edges(etype=canonical_etype)
            # Concatenate head and tail embeddings of every edge, then score the pairs.
            pair_features = torch.cat((h[u_type][src_nid], h[v_type][dst_nid]), 1)
            scores = self.layer_nn(pair_features)
            ratings_dict[canonical_etype] = torch.unsqueeze(torch.flatten(scores), 1)

        return ratings_dict

### Prediction for Cosine Similarity

In [58]:
class CosinePrediction(nn.Module):
    """Score each edge with the cosine similarity of its endpoint embeddings."""

    def __init__(self):
        super().__init__()

    def forward(self, graph, h):
        """Return the per-edge cosine scores stored under ``'cos'``.

        Args:
            graph: DGL graph whose edges are scored.
            h: mapping from node type to that type's embedding matrix.

        Returns:
            ``graph.edata['cos']`` — presumably a dict keyed by canonical edge
            type when the graph has several edge types (DGL behaviour; confirm
            for the installed version).
        """
        with graph.local_scope():
            for edge_type in graph.canonical_etypes:
                src_type, _, dst_type = edge_type
                # Relations whose endpoint types have no embedding cannot be scored.
                if src_type not in h or dst_type not in h:
                    continue
                # Unit-normalise both sides so the dot product equals the cosine.
                graph.nodes[src_type].data['norm_h'] = torch_nn_func.normalize(h[src_type], p=2, dim=-1)
                graph.nodes[dst_type].data['norm_h'] = torch_nn_func.normalize(h[dst_type], p=2, dim=-1)
                graph.apply_edges(dgl_func.u_dot_v('norm_h', 'norm_h', 'cos'), etype=edge_type)
            ratings = graph.edata['cos']
        return ratings

### Prediction for Dot-Product Similarity

Refs:
- https://docs.dgl.ai/en/0.6.x/guide/minibatch-link.html
- https://issueexplorer.com/issue/dmlc/dgl/3447

In [59]:
class DotProductPredictor(nn.Module):
    """Score each edge with the dot product of its endpoint embeddings."""

    def __init__(self):
        super().__init__()

    def forward(self, graph, h):
        """Return the per-edge dot-product scores stored under ``'score'``.

        Args:
            graph: DGL graph whose edges are scored.
            h: mapping from node type to that type's embedding matrix.

        Returns:
            ``graph.edata['score']`` — presumably a dict keyed by canonical
            edge type when the graph has several edge types (DGL behaviour;
            confirm for the installed version).
        """
        with graph.local_scope():
            for edge_type in graph.canonical_etypes:
                src_type, _, dst_type = edge_type
                # Explicit replacement for the former silent `except KeyError`:
                # relations whose endpoint types have no embedding are skipped.
                if src_type not in h or dst_type not in h:
                    continue
                graph.nodes[src_type].data['h'] = h[src_type]
                graph.nodes[dst_type].data['h'] = h[dst_type]
                graph.apply_edges(dgl_func.u_dot_v('h', 'h', 'score'), etype=edge_type)
            ratings = graph.edata['score']

        return ratings

### Prediction for PairWise Distance Similarity

In [60]:
class PairWiseDistancePredictor(nn.Module):
    """Score each edge with the pairwise (L2) distance of its endpoint embeddings.

    The score is the raw distance, returned with a trailing dimension of 1.
    """

    def __init__(self):
        super().__init__()

    def forward(self, graph, h):
        """Return the per-edge distances stored under ``'pwdist'``.

        Args:
            graph: DGL graph whose edges are scored.
            h: mapping from node type to that type's embedding matrix.

        Returns:
            ``graph.edata['pwdist']`` — presumably a dict keyed by canonical
            edge type when the graph has several edge types (DGL behaviour;
            confirm for the installed version).
        """

        def _edge_distance(edges):
            # keepdim=True yields (num_edges, 1), the same layout the other
            # predictors produce.
            return {'pwdist': torch_nn_func.pairwise_distance(edges.src['h'], edges.dst['h'], keepdim=True)}

        with graph.local_scope():
            for edge_type in graph.canonical_etypes:
                src_type, _, dst_type = edge_type
                # Explicit replacement for the former silent `except KeyError`:
                # relations whose endpoint types have no embedding are skipped.
                if src_type not in h or dst_type not in h:
                    continue
                graph.nodes[src_type].data['h'] = h[src_type]
                graph.nodes[dst_type].data['h'] = h[dst_type]
                graph.apply_edges(_edge_distance, etype=edge_type)
            ratings = graph.edata['pwdist']

        return ratings

## GNN Model

In [61]:
class GNNModel(nn.Module):
    """Heterogeneous GNN: optional feature embedding, message-passing layers
    and an edge-scoring head.

    Args:
        g: graph whose canonical edge types define one MessagePassing module
            per relation in every layer.
        n_layers: nominal depth. With ``embedding_layer=True`` the embedding
            replaces the input conv layer, so ``n_layers - 1`` conv layers are
            built; otherwise ``n_layers`` conv layers are built.
        dim_dict: sizes under the keys ``'user'``, ``'movie'``, ``'hidden'``
            and ``'out'``.
        dropout: dropout probability forwarded to every MessagePassing module.
        pred: scoring head, one of ``'cos'``, ``'nn'``, ``'dotprod'``, ``'pw'``.
        aggregator_hetero: cross-relation aggregation passed to HeteroGraphConv.
        embedding_layer: whether to project raw features with NodeEmbedding.

    Raises:
        KeyError: if ``pred`` is not one of the supported names.
    """

    def __init__(self, g, n_layers: int, dim_dict, dropout, pred, aggregator_hetero, embedding_layer,):

        super().__init__()
        self.embedding_layer = embedding_layer

        if embedding_layer:
            self.user_embed = NodeEmbedding(dim_dict['user'], dim_dict['hidden'])
            self.item_embed = NodeEmbedding(dim_dict['movie'], dim_dict['hidden'])

        self.layers = nn.ModuleList()

        # input layer (only needed when raw features are fed straight into the convs)
        if not embedding_layer:
            self.layers.append(
                dglnn.HeteroGraphConv(
                    {etype[1]: MessagePassing((dim_dict[etype[0]], dim_dict[etype[2]]), dim_dict['hidden'], dropout) for etype in g.canonical_etypes},
                    aggregate=aggregator_hetero)
                    )

        # hidden layers
        for i in range(n_layers - 2):
            self.layers.append(
                dglnn.HeteroGraphConv(
                    {etype[1]: MessagePassing((dim_dict['hidden'], dim_dict['hidden']), dim_dict['hidden'], dropout) for etype in g.canonical_etypes},
                    aggregate=aggregator_hetero)
                    )

        # output layer
        self.layers.append(
            dglnn.HeteroGraphConv(
                {etype[1]: MessagePassing((dim_dict['hidden'], dim_dict['hidden']), dim_dict['out'], dropout) for etype in g.canonical_etypes},
                aggregate=aggregator_hetero)
                )

        if pred == 'cos':
            self.pred_fn = CosinePrediction()
        elif pred == 'nn':
            self.pred_fn = NnPredictingModule(NnSimilarityPredictingLayer, dim_dict['out'])
        elif pred == 'dotprod':
            self.pred_fn = DotProductPredictor()
        elif pred == 'pw':
            self.pred_fn = PairWiseDistancePredictor()
        else:
            # The raise already aborts construction; nothing after it can run.
            raise KeyError('Prediction function does not exist')

    def get_repr(self, blocks, h):
        """Run ``h`` through one conv layer per block and return the result."""

        for i in range(len(blocks)):
            layer = self.layers[i]
            h = layer(blocks[i], h)

        return h

    def forward(self, blocks, h, pos_g, neg_g, embedding_layer: bool=True, ):
        """Compute node representations and score positive/negative edges.

        Args:
            blocks: message-flow graphs, one per conv layer.
            h: mapping node type -> input features; it is not modified.
            pos_g: graph holding the positive edges to score.
            neg_g: graph holding the sampled negative edges to score.
            embedding_layer: set to False to skip the feature embedding. The
                embedding is also skipped when the model was built without it.

        Returns:
            ``(h, pos_score, neg_score)``.
        """

        # Only embed when the modules exist (model built with embedding_layer=True).
        if embedding_layer and self.embedding_layer:
            # Work on a shallow copy so the caller's feature dict is left untouched.
            h = dict(h)
            h['user'] = self.user_embed(h['user'])
            h['movie'] = self.item_embed(h['movie'])

        h = self.get_repr(blocks, h)
        pos_score = self.pred_fn(pos_g, h)
        neg_score = self.pred_fn(neg_g, h)

        return h, pos_score, neg_score

## Save Results in output file

In [62]:
def save_txt(data_to_save, filepath, mode='a'):
    """Write ``data_to_save`` followed by a newline to ``filepath``.

    Args:
        data_to_save: text to write.
        filepath: destination file.
        mode: file mode passed to ``open``; appends by default.
    """
    with open(filepath, mode) as out_file:
        out_file.write(''.join((data_to_save, '\n')))

## Max Margin Loss


In [63]:
def max_margin_loss(pos_score, neg_score, delta, neg_sample_size, negative_mask, cuda, device):
    """Mean hinge loss ``max(0, delta - pos + neg - mask)``.

    Scores are read per edge type, but only those of ``edge_types[0]`` (a
    notebook-level variable) contribute to the returned mean.

    Args:
        pos_score: mapping edge type -> positive edge scores.
        neg_score: mapping edge type -> negative edge scores, reshaped to
            ``(-1, neg_sample_size)``.
        delta: margin.
        neg_sample_size: number of negatives per positive edge.
        negative_mask: mapping edge type -> mask subtracted from the negative
            scores, reshaped like them.
        cuda: whether tensors are moved to ``device``.
        device: target device when ``cuda`` is true.
    """
    all_scores = torch.empty(0)
    if cuda:
        all_scores = all_scores.to(device)

    for etype, pos_score_tensor in pos_score.items():
        neg_score_tensor = neg_score[etype].reshape(-1, neg_sample_size)
        negative_mask_tensor = negative_mask[etype].reshape(-1, neg_sample_size)
        if cuda:
            negative_mask_tensor = negative_mask_tensor.to(device)

        hinge = (delta - pos_score_tensor + neg_score_tensor - negative_mask_tensor).clamp(min=0)

        if etype == edge_types[0]:
            all_scores = torch.cat((all_scores, hinge), 0)

    return torch.mean(all_scores)


## Bayesian Personalized Ranking Loss

In [64]:
def bpr_loss(pos_score, neg_score, delta, neg_sample_size, negative_mask, cuda, device):
    """Mean Bayesian Personalized Ranking loss ``-logsigmoid(pos - neg - mask)``.

    Scores are read per edge type, but only those of ``edge_types[0]`` (a
    notebook-level variable) contribute to the returned mean. ``delta`` is
    accepted so all loss functions share one signature; it is not used.

    Args:
        pos_score: mapping edge type -> positive edge scores.
        neg_score: mapping edge type -> negative edge scores, reshaped to
            ``(-1, neg_sample_size)``.
        delta: unused.
        neg_sample_size: number of negatives per positive edge.
        negative_mask: mapping edge type -> mask subtracted inside the
            log-sigmoid, reshaped like the negative scores.
        cuda: whether tensors are moved to ``device``.
        device: target device when ``cuda`` is true.
    """
    all_scores = torch.empty(0)
    if cuda:
        all_scores = all_scores.to(device)

    logsig = nn.LogSigmoid()

    for etype, pos_score_tensor in pos_score.items():
        neg_score_tensor = neg_score[etype].reshape(-1, neg_sample_size)
        negative_mask_tensor = negative_mask[etype].reshape(-1, neg_sample_size)
        if cuda:
            negative_mask_tensor = negative_mask_tensor.to(device)

        ranking_loss = - logsig(pos_score_tensor - neg_score_tensor - negative_mask_tensor)

        if etype == edge_types[0]:
            all_scores = torch.cat((all_scores, ranking_loss), 0)

    return torch.mean(all_scores)

## Cross Entropy Loss

In [65]:
def cross_entropy_loss(positive_score, negative_score, delta, negative_sample_size , negative_mask, cuda, device):
    """Binary cross-entropy (with logits) over positive and masked negative scores.

    Only the scores of ``edge_types[0]`` (a notebook-level variable) are used:
    positives are labelled 1, negatives (after subtracting the mask) 0, and
    all scores pass through a ReLU before the loss. ``delta`` and
    ``negative_sample_size`` are accepted for signature compatibility only.

    NOTE(review): ``labels`` is only defined when ``edge_types[0]`` is a key of
    ``positive_score``; otherwise the final call raises a NameError.
    """
    all_scores = torch.empty(0)
    if cuda:
        all_scores = all_scores.to(device)

    relu = nn.ReLU()

    for edge_type, positive_score_tensor in positive_score.items():
        negative_score_tensor = negative_score[edge_type].reshape(-1, 1)
        negative_mask_tensor = negative_mask[edge_type].reshape(-1, 1)
        if cuda:
            negative_mask_tensor = negative_mask_tensor.to(device)

        if edge_type != edge_types[0]:
            continue

        # Stack positives first, then the masked negatives, as one column.
        stacked = torch.cat((all_scores, positive_score_tensor), 0)
        negative_score_tensor = negative_score_tensor - negative_mask_tensor
        stacked = torch.cat((stacked, negative_score_tensor), 0).reshape(-1, 1)
        all_scores = relu(stacked)

        n_pos = positive_score_tensor.shape[0]
        n_neg = negative_score_tensor.shape[0]
        labels = torch.cat([torch.ones(n_pos), torch.zeros(n_neg)]).reshape(-1, 1)

        if cuda:
            all_scores = all_scores.to(device)
            labels = labels.to(device)

    return torch_nn_func.binary_cross_entropy_with_logits(all_scores, labels)

## Sampling the Training Graph as well as Node/Edge IDs for training and validation

In [66]:
def train_validation_split(valid_graph, ground_truth_test, etypes, valid_size, reverse_etype):
    """Split the edges of ``valid_graph`` into a training part and a validation part.

    For every edge type the last ``valid_size`` share of edge ids is held out
    for validation; the training graph is a clone of ``valid_graph`` with those
    edges removed. User/item ids and ground truths are derived from
    ``etypes[0]``.

    Args:
        valid_graph: full graph (training + validation edges); not modified.
        ground_truth_test: pair ``(test_user_ids, test_item_ids)``.
        etypes: edge types to split; ``etypes[0]`` is the primary relation.
        valid_size: fraction of each relation's edges held out.
        reverse_etype: accepted for interface compatibility; not used.

    Returns:
        ``(train_graph, train_eids_dict, valid_eids_dict, train_uids,
        valid_uids, test_uids, all_iids, ground_truth_train,
        ground_truth_valid, all_eids_dict)``.
    """
    # Kept from the original implementation: seeds NumPy's global RNG as a side effect.
    np.random.seed(11)

    all_eids_dict, valid_eids_dict, train_eids_dict = {}, {}, {}

    for etype in etypes:
        all_eids = np.arange(valid_graph.number_of_edges(etype))
        all_eids_dict[etype] = all_eids
        valid_eids_dict[etype] = all_eids[int(len(all_eids) * (1 - valid_size)):]

    # Use the primary relation's OWN edge ids (not whatever the loop variable
    # last held), so relations with different edge counts are handled correctly.
    primary_etype = etypes[0]
    valid_src, valid_dst = valid_graph.find_edges(valid_eids_dict[primary_etype], etype=primary_etype)
    valid_uids_all = valid_src.tolist()
    valid_iids_all = valid_dst.tolist()

    ground_truth_valid = (np.array(valid_uids_all), np.array(valid_iids_all))
    valid_uids = np.array(np.unique(valid_uids_all))

    # Training graph = full graph minus the held-out edges of every relation.
    train_graph = valid_graph.clone()
    for etype in etypes:
        train_graph.remove_edges(valid_eids_dict[etype], etype=etype)
        train_eids_dict[etype] = np.arange(train_graph.number_of_edges(etype))

    train_uids, train_iids = train_graph.find_edges(train_eids_dict[primary_etype], etype=primary_etype)
    ground_truth_train = (np.array(train_uids), np.array(train_iids))
    train_uids = np.array(np.unique(train_uids))

    test_uids, _ = ground_truth_test
    test_uids = np.unique(test_uids)
    all_iids = np.arange(valid_graph.num_nodes('movie'))

    return train_graph, train_eids_dict, valid_eids_dict, train_uids, valid_uids, test_uids, all_iids, ground_truth_train, ground_truth_valid, all_eids_dict

## Sampling --> Creating batches for train, valid & test data
Since data is large, it is fed to the model in batches. 

In [67]:
def load_data(validation_graph, train_graph, train_edge_ids_dict, validation_edge_ids_dict, train_u_ids, validation_u_ids, test_u_ids, all_i_ids, 
                          number_workers, embedding_layer, number_layers, neighbor_sampler, neg_sample_size,  edge_batch_size, 
                          node_batch_size):
    """Create the DGL edge and node data loaders for training, validation and test.

    Args:
        validation_graph: graph used for the validation and test loaders.
        train_graph: graph used for the training loaders and as the sampling
            graph of the validation edge loader.
        train_edge_ids_dict / validation_edge_ids_dict: edge ids per edge type.
        train_u_ids / validation_u_ids / test_u_ids: user node ids per split.
        all_i_ids: movie node ids, included in every node loader.
        number_workers: worker count handed to every loader.
        embedding_layer: when true, one sampling layer fewer is used.
        number_layers: nominal model depth.
        neighbor_sampler: ``'full'`` or ``'partial'``; ``'partial'`` reads the
            notebook-level ``block_sampler`` list.
        neg_sample_size: negatives drawn per positive edge.
        edge_batch_size / node_batch_size: batch sizes of the loaders.

    Returns:
        ``(edgeLoad_train, edgeLoad_validation, nodeLoad_train,
        nodeLoad_validation, nodeLoad_test)``.

    Any other ``neighbor_sampler`` value prints a message and exits via
    ``sys.exit(1)``.
    """
    sampler_depth = number_layers - 1 if embedding_layer else number_layers

    if neighbor_sampler == 'full':
        sampler = dgl.dataloading.MultiLayerFullNeighborSampler(sampler_depth)
    elif neighbor_sampler == 'partial':
        sampler = dgl.dataloading.MultiLayerNeighborSampler(block_sampler, replace=False)
    else:
        print('Neighbor sampler does not exit')
        sys.exit(1)

    sampler_n = dgl.dataloading.negative_sampler.Uniform(neg_sample_size)

    # Options shared by the node loaders and by the edge loaders respectively.
    node_loader_options = dict(batch_size=node_batch_size, shuffle=True, drop_last=False,
                               num_workers=number_workers)
    edge_loader_options = dict(negative_sampler=sampler_n, batch_size=edge_batch_size, shuffle=True,
                               drop_last=False, pin_memory=True, num_workers=number_workers)

    # loading nodes
    train_node_ids = {'user': train_u_ids, 'movie': all_i_ids}
    nodeLoad_train = dgl.dataloading.NodeDataLoader(train_graph, train_node_ids, sampler, **node_loader_options)

    validation_node_ids = {'user': validation_u_ids, 'movie': all_i_ids}
    nodeLoad_validation = dgl.dataloading.NodeDataLoader(validation_graph, validation_node_ids, sampler,
                                                         **node_loader_options)

    # loading edges
    edgeLoad_train = dgl.dataloading.EdgeDataLoader(train_graph, train_edge_ids_dict, sampler,
                                                    exclude='reverse_types',
                                                    reverse_etypes={'buys': 'watched-by', 'watched-by': 'buys'},
                                                    **edge_loader_options)

    edgeLoad_validation = dgl.dataloading.EdgeDataLoader(validation_graph, validation_edge_ids_dict, sampler,
                                                         g_sampling=train_graph, **edge_loader_options)

    test_node_ids = {'user': test_u_ids, 'movie': all_i_ids}
    nodeLoad_test = dgl.dataloading.NodeDataLoader(validation_graph, test_node_ids, sampler, **node_loader_options)

    return edgeLoad_train, edgeLoad_validation, nodeLoad_train, nodeLoad_validation, nodeLoad_test


## Design Parameters

In [68]:
# Canonical edge types of the graph. edge_types[0] is the relation whose
# scores the loss functions accumulate.
edge_types= [('user', 'buys', 'movie'), ('movie', 'watched-by', 'user')]
# Each relation mapped to its mirrored relation (passed to
# train_validation_split, which does not use it).
reverse_edge_types= {('user', 'buys', 'movie'): ('movie', 'watched-by', 'user'), 
                 ('movie', 'watched-by', 'user') : ('user', 'buys', 'movie') }
# Fraction of each relation's edges held out for validation.
validation_size = .15
cuda = torch.cuda.is_available()
number_workers = 1 if cuda else 0
device = torch.device('cuda') if cuda else torch.device('cpu')

# Model depth / sampling settings. load_data uses one sampling layer fewer
# when embedding_layer is True.
embedding_layer = True  
number_layers = 3
# 'partial' makes load_data build MultiLayerNeighborSampler(block_sampler);
# 'full' uses MultiLayerFullNeighborSampler instead.
neighbor_sampler = 'partial' 
# Handed to MultiLayerNeighborSampler — presumably the per-layer neighbour
# fan-out; confirm against the DGL version in use.
block_sampler = [50, 40 ]
# Number of negatives per positive edge (dgl negative_sampler.Uniform).
neg_sample_size = 15 
edge_batch_size = 4096 
node_batch_size = 128

# GNN Conv Layer parameters:
out_dim = 64 
hidden_dim = 256
# Alias of the full graph (same object, not a copy).
validation_graph = hetero_graph
# Scoring head name — presumably passed to GNNModel as `pred`
# ('cos' | 'nn' | 'dotprod' | 'pw'); verify at the model construction site.
prediction = 'nn' 
aggregator_hetero = 'mean' 
dropout = 0.3 

# nn PredLayer parameters:
# Read as notebook-level globals by NnSimilarityPredictingLayer.
hidden_layer_1_output_dim = 256 
hidden_layer_2_output_dim = 128 

# NOTE(review): train_model iterates `range(1, num_epochs)`, i.e. num_epochs - 1 epochs.
num_epochs = 15 
# Margin of max_margin_loss.
delta = 0.6 

optimizer=torch.optim.Adam 
lr = 0.001 
# Read as a notebook-level global inside train_model when the optimizer is created.
weight_decay = 1e-5
loss_function = bpr_loss 
# NOTE(review): patience (20) exceeds num_epochs (15) — presumably early
# stopping can never trigger with these values; confirm in train_model.
patience = 20
result_filepath = '/content/drive/My Drive/DL-PROJECT/output.txt'
k = 5 #number of recommendations


## Initialize Dataloaders - Get Training & Test IDs

In [69]:
# Split the graph edges into training / validation parts and collect the
# node ids and ground truths of every split.
(train_graph, train_edge_ids_dict, validation_edge_ids_dict,
 train_u_ids, validation_u_ids, test_u_ids, all_i_ids,
 ground_truth_train, ground_truth_validation,
 all_edge_ids_dict) = train_validation_split(hetero_graph, ground_truth_test, edge_types,
                                             validation_size, reverse_edge_types)

# Build the edge loaders (loss computation) and node loaders (embeddings / metrics).
(edgeLoad_train, edgeLoad_validation,
 nodeLoad_train, nodeLoad_validation,
 nodeLoad_test) = load_data(validation_graph, train_graph, train_edge_ids_dict,
                            validation_edge_ids_dict, train_u_ids, validation_u_ids,
                            test_u_ids, all_i_ids, number_workers,
                            embedding_layer, number_layers, neighbor_sampler,
                            neg_sample_size,
                            edge_batch_size, node_batch_size)

# The validation node loader was originally bound to the misspelt name
# `nodeLaod_validation`; keep that name as an alias so cells referring to
# either spelling work.
nodeLaod_validation = nodeLoad_validation

## Define Number of Batches for Train, Validation and Test Data

In [70]:
# Total number of training / validation edges over all edge types.
train_edge_ids_len = 0
validation_edge_ids_len = 0

for edge_type, train_eids in train_edge_ids_dict.items():
    train_edge_ids_len += len(train_eids)
    validation_edge_ids_len += len(validation_edge_ids_dict[edge_type])

# Batches per pass: edge loaders drive the loss, node loaders (users of the
# split plus all movies) drive the metrics.
num_batches_train_loss = math.ceil(train_edge_ids_len / edge_batch_size)
num_batches_validation_loss = math.ceil(validation_edge_ids_len / edge_batch_size)

num_batches_train_metrics = math.ceil((len(train_u_ids) + len(all_i_ids)) / node_batch_size)
num_batches_validation_metrics = math.ceil((len(validation_u_ids) + len(all_i_ids)) / node_batch_size)
num_batches_test = math.ceil((len(test_u_ids) + len(all_i_ids)) / node_batch_size)


## Calculate Embeddings

In [71]:
def get_embeddings(g, out_dim, trained_model, nodeLoad_test, num_batches_valid, cuda, device, embedding_layer):
    """Compute node embeddings batch by batch and gather them per node type.

    Args:
        g: graph providing the node types and node counts.
        out_dim: width of the produced embeddings.
        trained_model: model exposing ``user_embed``, ``item_embed`` and ``get_repr``.
        nodeLoad_test: iterable of ``(input_nodes, output_nodes, blocks)`` batches.
        num_batches_valid: total batch count, used only in the progress message.
        cuda: whether model, blocks and buffers are moved to ``device``.
        device: target device when ``cuda`` is true.
        embedding_layer: whether raw features are embedded before the conv layers.

    Returns:
        Mapping node type -> ``(num_nodes, out_dim)`` tensor; rows of nodes
        that never appear as output nodes stay zero.
    """
    if cuda:
        trained_model = trained_model.to(device)

    # One zero-initialised buffer per node type, on the target device.
    y = {}
    for ntype in g.ntypes:
        buffer = torch.zeros(g.num_nodes(ntype), out_dim)
        y[ntype] = buffer.to(device) if cuda else buffer

    for batch_no, (input_nodes, output_nodes, blocks) in enumerate(nodeLoad_test, start=1):
        if batch_no % 10 == 0:
            print("Computing embeddings: Batch " + str(batch_no) + " out of " + str(num_batches_valid))

        if cuda:
            blocks = [b.to(device) for b in blocks]

        input_features = blocks[0].srcdata['features']

        if embedding_layer:
            input_features['user'] = trained_model.user_embed(input_features['user'])
            input_features['movie'] = trained_model.item_embed(input_features['movie'])

        h = trained_model.get_repr(blocks, input_features)

        # Scatter the batch outputs into the rows of their node ids.
        for ntype, batch_emb in h.items():
            y[ntype][output_nodes[ntype]] = batch_emb

    return y

## Calculate Metrics 

In [72]:
def get_metrics(h, g, model, embed_dim, ground_truth, k, cuda=False, device=None, pred='cos', epoch=4):
    """Compute precision and coverage of the top-``k`` recommendations.

    Args:
        h: mapping node type -> embedding matrix.
        g: graph forwarded to the recommendation and metric helpers.
        model: trained model forwarded to ``get_recom``.
        embed_dim: embedding width.
        ground_truth: pair ``(user_ids, item_ids)`` of true interactions.
        k: number of recommendations per user.
        cuda, device, pred, epoch: forwarded unchanged to ``get_recom``.

    Returns:
        ``(precision, coverage)`` as produced by ``calculate_metrics``.
    """
    users, items = ground_truth
    user_ids = np.unique(users).tolist()

    # Group the true items by user.
    ground_truth_dict = defaultdict(list)
    user_item_pairs = np.stack((np.asarray(users), np.asarray(items)), axis=1)
    for uid, iid in user_item_pairs:
        ground_truth_dict[uid].append(iid)

    recs = get_recom(g, h, model, embed_dim, k, user_ids, cuda, device, pred, epoch)
    precision, coverage = calculate_metrics(recs, ground_truth_dict, g)

    return precision, coverage

## Calculate Recommendations

In [73]:
def calculate_metrics(recs, ground_truth_dict, g):
    """Compute precision and catalogue coverage of a set of recommendations.

    Args:
        recs: mapping user id -> iterable of recommended item ids.
        ground_truth_dict: mapping user id -> iterable of truly interacted
            item ids. Users missing from it count as having no true items;
            the mapping is not modified.
        g: graph; ``g.num_nodes('movie')`` is the catalogue size.

    Returns:
        ``(precision, coverage)``: the share of recommended items that are in
        the user's ground truth, and the share of all movies recommended to at
        least one user. Both are ``0.0`` when their denominator is zero.
    """
    k_relevant = 0
    k_total = 0
    recommended_items = set()

    for uid, iids in recs.items():
        # .get avoids inserting empty entries into a defaultdict; the set
        # makes each membership test constant-time.
        relevant_items = set(ground_truth_dict.get(uid, ()))
        k_total += len(iids)
        k_relevant += sum(1 for id_ in iids if id_ in relevant_items)
        recommended_items.update(iids)

    precision = k_relevant / k_total if k_total else 0.0

    nb_total = g.num_nodes('movie')
    coverage = len(recommended_items) / nb_total if nb_total else 0.0

    return precision, coverage
  
def get_recom(g, h, model, embed_dim, k, user_ids, cuda, device, pred: str, epoch):
    """Return the top-``k`` movie ids for every user in ``user_ids``.

    Every user embedding is scored against all movie embeddings with the
    chosen prediction function; the ``k`` highest-scoring movies are kept.

    Args:
        g: graph; ``g.num_nodes('movie')`` is the number of candidate movies.
        h: mapping with the ``'user'`` and ``'movie'`` embedding matrices.
        model: trained model; only used for ``pred='nn'`` (its
            ``pred_fn.layer_nn``) and moved to ``device`` when ``cuda`` is true.
        embed_dim: embedding width.
        k: number of recommendations per user.
        user_ids: users to produce recommendations for.
        cuda: whether the model is moved to ``device``.
        device: target device when ``cuda`` is true.
        pred: ``'cos'``, ``'nn'``, ``'dotprod'`` or ``'pw'``; anything else
            prints a message and exits via ``sys.exit(1)``.
        epoch: accepted for interface compatibility; not used.

    Returns:
        Mapping user id -> array of ``k`` movie ids, best first.
    """
    if cuda:
        model = model.to(device)

    n_movies = g.num_nodes('movie')
    movie_emb = h['movie']
    recom = {}

    for user in user_ids:
        # Broadcast the user embedding over all movies without copying it n_movies times.
        user_emb_rpt = h['user'][user].reshape(1, embed_dim).expand(n_movies, embed_dim)

        if pred == 'cos':
            cos = nn.CosineSimilarity(dim=1, eps=1e-6)
            ratings = cos(user_emb_rpt, movie_emb)
        elif pred == 'nn':
            cat_embed = torch.cat((user_emb_rpt, movie_emb), 1)
            ratings = model.pred_fn.layer_nn(cat_embed)
        elif pred == 'dotprod':
            ratings = torch.sum(user_emb_rpt * movie_emb, dim=1)
        elif pred == 'pw':
            # Compute the distances; nn.PairwiseDistance(a, b) would only
            # construct a module and return no scores.
            ratings = torch_nn_func.pairwise_distance(user_emb_rpt, movie_emb)
        else:
            print ('the prediction function not found!')
            sys.exit(1)

        ratings_formatted = ratings.cpu().detach().numpy().reshape(n_movies,)
        order = np.argsort(-ratings_formatted)

        recom[user] = order[:k]  # top k recommendations

    return recom


## Calculate AUC

In [74]:
def compute_auc(scores):
    """ROC-AUC of positive vs. negative scores for the ('user', 'buys', 'movie') relation.

    Args:
        scores: sequence whose first element maps edge type -> positive scores
            and whose second element maps edge type -> negative scores.

    Returns:
        The value of ``roc_auc_score`` with positives labelled 1 and negatives 0.
    """
    edge_type = ('user', 'buys', 'movie')
    pos_score, neg_score = scores[0][edge_type], scores[1][edge_type]

    y_score = torch.cat([pos_score, neg_score]).detach().cpu().numpy()
    positive_labels = torch.ones(pos_score.shape[0])
    negative_labels = torch.zeros(neg_score.shape[0])
    y_true = torch.cat([positive_labels, negative_labels]).detach().numpy()

    return roc_auc_score(y_true, y_score)

## Train the Model

In [75]:
def _negative_edge_mask(graph, pos_g, neg_g, cuda, device):
    """Look up which sampled negative edges are present in ``graph``.

    For every canonical edge type of ``pos_g``, the endpoints of the edges in
    ``neg_g`` are mapped through the ids stored under ``dgl.NID`` and checked
    with ``graph.has_edges_between``.

    Returns:
        dict mapping each canonical edge type to the lookup result cast to a
        float tensor (moved to ``device`` when ``cuda`` is true).
    """
    negative_mask = {}
    nids = neg_g.ndata[dgl.NID]

    for etype in pos_g.canonical_etypes:
        neg_src, neg_dst = neg_g.edges(etype=etype)
        neg_src = nids[etype[0]][neg_src]
        neg_dst = nids[etype[2]][neg_dst]
        mask = graph.has_edges_between(neg_src, neg_dst, etype=etype).type(torch.float)
        negative_mask[etype] = mask.to(device) if cuda else mask

    return negative_mask


def train_model(model,
                num_epochs,
                num_batches_train_loss,
                num_batches_validation_loss,
                num_batches_val_metrics,
                num_batches_train_metric,
                edgeLoad_train,
                edgeLoad_validation,
                nodeLoad_validation,
                nodeLoad_train,
                k,
                loss_function,
                delta,
                negative_sample_size,
                cuda=False,
                device=None,
                optimizer=torch.optim.Adam,
                lr=0.001,
                train_graph=None,
                validation_graph=None,
                out_dim=None,
                ground_truth_train=None,
                ground_truth_validation=None,
                result_filepath=None,
                patience=5,
                prediction=None,
                embedding_layer=True,
                weight_decay=None,
                ):
    """Train ``model`` with early stopping on the validation loss.

    Each epoch runs one pass over ``edgeLoad_train`` (with optimisation) and
    one pass over ``edgeLoad_validation`` (without gradients), then computes
    precision and AUC for both splits, prints a summary line and writes it to
    ``result_filepath`` via ``save_txt``.

    Args:
        model: network called as ``model(blocks, features, pos_g, neg_g, embedding_layer)``.
        num_epochs: number of epochs to run (epochs are numbered from 1).
        num_batches_train_loss, num_batches_validation_loss: totals shown in
            the progress messages of the two edge loops.
        num_batches_val_metrics, num_batches_train_metric: forwarded to
            ``get_embeddings`` for the validation / training graph.
        edgeLoad_train, edgeLoad_validation: iterables yielding
            ``(_, pos_g, neg_g, blocks)``.
        nodeLoad_validation, nodeLoad_train: forwarded to ``get_embeddings``.
        k: forwarded to ``get_metrics``.
        loss_function: called as ``loss_function(pos_score, neg_score, delta,
            negative_sample_size, negative_mask=..., cuda=..., device=...)``.
        delta, negative_sample_size: forwarded to ``loss_function``.
        cuda, device: when ``cuda`` is true, batches and masks are moved to ``device``.
        optimizer: optimizer class, built with ``lr`` and ``weight_decay``.
        lr: learning rate.
        train_graph, validation_graph: graphs used for the negative-edge
            lookup and for the metric computation.
        out_dim: forwarded to ``get_embeddings`` / ``get_metrics``.
        ground_truth_train, ground_truth_validation: forwarded to ``get_metrics``.
        result_filepath: file receiving one summary line per epoch
            (opened with mode 'w' on the first epoch, 'a' afterwards).
        patience: number of consecutive epochs without a new best validation
            loss after which training stops.
        prediction: forwarded to ``get_metrics``.
        embedding_layer: forwarded to the model and to ``get_embeddings``.
        weight_decay: optimizer weight decay. When left as ``None`` the
            notebook-level ``weight_decay`` variable is used if it exists
            (the behaviour of earlier versions), otherwise 0.0.

    Returns:
        ``(model, viz_dict, validation_score, train_score, best_metrics)``
        where the two score tuples are ``(pos_score, neg_score)`` of the LAST
        batch of the last epoch run (``None`` if no epoch ran).

    Raises:
        ValueError: if one of the edge loaders yields no batch.
    """
    if weight_decay is None:
        # Backward compatibility: this value used to be read from notebook state.
        weight_decay = globals().get('weight_decay', 0.0)

    model.train_loss_list = []
    model.train_precision_list = []
    model.train_coverage_list = []
    model.validation_loss_list = []
    model.validation_precision_list = []
    model.validation_coverage_list = []
    best_metrics = {}

    max_metric = - 0.1
    patience_counter = 0
    # Start from +inf so the first epoch always sets the baseline; a finite
    # start value would count every epoch as "no improvement" whenever the
    # loss happens to live above it.
    min_loss = float('inf')
    train_score = validation_score = None

    opt = optimizer(model.parameters(), lr=lr, weight_decay=weight_decay)

    start = time.time()
    print('Start training for '+ str(num_epochs)+ ' epochs')

    # Inclusive upper bound so that exactly `num_epochs` epochs are run.
    for epoch in range(1, num_epochs + 1):

        # The first epoch truncates the results file, later epochs append to it.
        mode = 'w' if epoch == 1 else 'a'

        # ---- training pass -------------------------------------------------
        model.train()
        i = 0
        total_loss = 0
        for _, pos_g, neg_g, blocks in edgeLoad_train:
            opt.zero_grad()

            negative_mask = _negative_edge_mask(train_graph, pos_g, neg_g, cuda, device)

            if cuda:
                blocks = [b.to(device) for b in blocks]
                pos_g = pos_g.to(device)
                neg_g = neg_g.to(device)

            i += 1

            if i % 10 == 0:
                print("Edge batch " + str(i) + " out of ", str(num_batches_train_loss))

            input_features = blocks[0].srcdata['features']

            _, pos_score, neg_score = model(blocks, input_features, pos_g, neg_g, embedding_layer,)
            train_score = (pos_score, neg_score)  # overwritten each batch: last batch wins
            loss = loss_function(pos_score, neg_score, delta, negative_sample_size,
                                 negative_mask=negative_mask, cuda=cuda, device=device,)

            loss.backward()
            opt.step()

            total_loss += loss.item()

        if i == 0:
            raise ValueError('edgeLoad_train yielded no batches')

        train_avg_loss = total_loss / i
        model.train_loss_list.append(train_avg_loss)

        # ---- validation pass and metrics (no gradients) --------------------
        model.eval()

        with torch.no_grad():
            total_loss = 0
            i = 0

            for _, pos_g, neg_g, blocks in edgeLoad_validation:
                i += 1

                if i % 10 == 0:
                    print("Edge batch {} out of {}".format(i, num_batches_validation_loss))

                negative_mask = _negative_edge_mask(validation_graph, pos_g, neg_g, cuda, device)

                if cuda:
                    blocks = [b.to(device) for b in blocks]
                    pos_g = pos_g.to(device)
                    neg_g = neg_g.to(device)

                input_features = blocks[0].srcdata['features']
                _, pos_score, neg_score = model(blocks, input_features, pos_g, neg_g, embedding_layer,)

                validation_score = (pos_score, neg_score)  # last batch wins

                validation_loss = loss_function(pos_score, neg_score, delta, negative_sample_size,
                                                negative_mask=negative_mask, cuda=cuda, device=device,)
                total_loss += validation_loss.item()

            if i == 0:
                raise ValueError('edgeLoad_validation yielded no batches')

            validation_avg_loss = total_loss / i
            model.validation_loss_list.append(validation_avg_loss)

            # training metrics
            y = get_embeddings(train_graph, out_dim, model, nodeLoad_train,
                               num_batches_train_metric, cuda, device, embedding_layer,)
            train_precision, train_coverage = get_metrics(y, train_graph, model, out_dim, ground_truth_train, k,
                                                          cuda, device, prediction, epoch)

            # validation metrics
            y = get_embeddings(validation_graph, out_dim, model, nodeLoad_validation,
                               num_batches_val_metrics, cuda, device, embedding_layer,)
            validation_precision, validation_coverage = get_metrics(y, validation_graph, model, out_dim,
                                                                    ground_truth_validation, k,
                                                                    cuda, device, prediction, epoch)

            # Both AUC values are computed on the scores of the last batch only.
            AUC_val = compute_auc(validation_score)
            AUC_train = compute_auc(train_score)
            results = ("Epoch " + str(epoch)
                       + " | Training loss " + str(round(train_avg_loss,4))
                       + " | Training Precision " + str(round(train_precision * 100,2))
                       + " | Training AUC " + str(round(AUC_train,2))
                       + ' | Validation loss ' + str(round(validation_avg_loss,4))
                       + ' | Validation Precision ' + str(round(validation_precision * 100,2))
                       + ' | AUC ' + str(round(AUC_val,2)))

            print(results)
            print('process time: ' +str(round(float(time.time()-start)/60, 2))+ ' minutes')

            save_txt(results, result_filepath, mode=mode)

            model.train_precision_list.append(train_precision * 100)
            model.validation_precision_list.append(validation_precision * 100)

            if validation_precision > max_metric:
                max_metric = validation_precision
                best_metrics = { 'precision': validation_precision}

        # ---- early stopping on the validation loss -------------------------
        if validation_avg_loss < min_loss:
            min_loss = validation_avg_loss
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter == patience:
            break

    viz_dict = {'train_loss_list': model.train_loss_list,
           'train_precision_list': model.train_precision_list,
           'val_loss_list': model.validation_loss_list,
           'validation_precision_list': model.validation_precision_list,
          }

    print('end of training!!')
    print ('process time: ' +str(round(float(time.time()-start)/60, 2))+ ' minutes')
    return model, viz_dict, validation_score, train_score, best_metrics

## Model Inference
### Apply the Trained Model to the Test Data

In [76]:
  def model_inference(valid_graph, out_dim: int, trained_model, nodeLoad_test, num_batches_test: int, cuda: bool, device, embedding_layer: bool,
                      ground_truth_test, all_eids_dict, k=None, prediction=None, result_filepath=None):
      """Evaluate the trained model on the test split and log its precision.

      Embeddings are computed with ``get_embeddings`` and scored with
      ``get_metrics``; the resulting precision is printed and appended to
      ``result_filepath`` through ``save_txt``.

      Args:
          valid_graph: graph passed to both ``get_embeddings`` and ``get_metrics``.
          out_dim: forwarded to ``get_embeddings`` / ``get_metrics``.
          trained_model: model to evaluate; switched to eval mode here.
          nodeLoad_test: loader forwarded to ``get_embeddings``.
          num_batches_test: forwarded to ``get_embeddings``.
          cuda, device, embedding_layer: forwarded unchanged.
          ground_truth_test: forwarded to ``get_metrics``.
          all_eids_dict: not used by this function; kept for caller compatibility.
          k, prediction, result_filepath: evaluation settings. When left as
              ``None`` each one is read from the notebook-level variable of
              the same name, which is how earlier versions obtained them.
      """
      # Backward compatibility: these three values used to be implicit globals.
      notebook_scope = globals()
      if k is None:
          k = notebook_scope['k']
      if prediction is None:
          prediction = notebook_scope['prediction']
      if result_filepath is None:
          result_filepath = notebook_scope['result_filepath']

      trained_model.eval()

      with torch.no_grad():
          embeddings = get_embeddings(valid_graph, out_dim, trained_model, nodeLoad_test, num_batches_test, cuda, device, embedding_layer,)
          # NOTE(review): `epoch = 4` is a fixed placeholder value here — confirm
          # whether get_metrics actually depends on it.
          precision, _ = get_metrics(embeddings, valid_graph, trained_model, out_dim, ground_truth_test, k, cuda, device, prediction, epoch = 4)

      sentence = " TEST Precision {:.3f}% ".format(precision * 100)
      print(sentence)
      save_txt(sentence, result_filepath, mode= 'a')

## The Main Script to Run and Train the Model

In [77]:

# Input feature widths are read from the validation graph, one entry per node
# type; the output and hidden widths come from the notebook settings.
dim_dict = {
    node_type: validation_graph.nodes[node_type].data['features'].shape[1]
    for node_type in ('user', 'movie')
}
dim_dict.update(out=out_dim, hidden=hidden_dim)

model = GNNModel(validation_graph, number_layers, dim_dict, dropout, prediction, aggregator_hetero, embedding_layer)
if cuda:
    model = model.to(device)

# Keyword arguments make the mapping onto train_model's long parameter list explicit.
# NOTE(review): the validation node loader is spelled `nodeLaod_validation` here —
# confirm it matches the name used where the loader is created.
trained_model, viz_dict, validation_score, train_score, best_metrics = train_model(
    model=model,
    num_epochs=num_epochs,
    num_batches_train_loss=num_batches_train_loss,
    num_batches_validation_loss=num_batches_validation_loss,
    num_batches_val_metrics=num_batches_validation_metrics,
    num_batches_train_metric=num_batches_train_metrics,
    edgeLoad_train=edgeLoad_train,
    edgeLoad_validation=edgeLoad_validation,
    nodeLoad_validation=nodeLaod_validation,
    nodeLoad_train=nodeLoad_train,
    k=k,
    loss_function=loss_function,
    delta=delta,
    negative_sample_size=neg_sample_size,
    cuda=cuda,
    device=device,
    optimizer=optimizer,
    lr=lr,
    train_graph=train_graph,
    validation_graph=validation_graph,
    out_dim=out_dim,
    ground_truth_train=ground_truth_train,
    ground_truth_validation=ground_truth_validation,
    result_filepath=result_filepath,
    patience=patience,
    prediction=prediction,
    embedding_layer=embedding_layer,
)




Start training for 15 epochs
Edge batch 10 out of  299
Edge batch 20 out of  299
Edge batch 30 out of  299
Edge batch 40 out of  299
Edge batch 50 out of  299
Edge batch 60 out of  299
Edge batch 70 out of  299
Edge batch 80 out of  299
Edge batch 90 out of  299
Edge batch 100 out of  299
Edge batch 110 out of  299
Edge batch 120 out of  299
Edge batch 130 out of  299
Edge batch 140 out of  299
Edge batch 150 out of  299
Edge batch 160 out of  299
Edge batch 170 out of  299
Edge batch 180 out of  299
Edge batch 190 out of  299
Edge batch 200 out of  299
Edge batch 210 out of  299
Edge batch 220 out of  299
Edge batch 230 out of  299
Edge batch 240 out of  299
Edge batch 250 out of  299
Edge batch 260 out of  299
Edge batch 270 out of  299
Edge batch 280 out of  299
Edge batch 290 out of  299
Edge batch 10 out of 53
Edge batch 20 out of 53
Edge batch 30 out of 53
Edge batch 40 out of 53
Edge batch 50 out of 53
Computing embeddings: Batch 10 out of 62
Computing embeddings: Batch 20 out o

## Apply the Trained Model on Test Data

In [78]:
# Score the trained model on the held-out test loader (the graph passed is the validation graph).
model_inference(
    valid_graph=validation_graph,
    out_dim=out_dim,
    trained_model=trained_model,
    nodeLoad_test=nodeLoad_test,
    num_batches_test=num_batches_test,
    cuda=cuda,
    device=device,
    embedding_layer=embedding_layer,
    ground_truth_test=ground_truth_test,
    all_eids_dict=all_edge_ids_dict,
)

Computing embeddings: Batch 10 out of 38
Computing embeddings: Batch 20 out of 38
Computing embeddings: Batch 30 out of 38
 TEST Precision 11.725% 


In [79]:
def _save_curve_plot(train_values, validation_values, title, legend_labels, ylabel, filename_suffix):
    """Plot a training curve against its validation curve and save it as a PNG.

    Args:
        train_values: per-epoch values of the training curve; its length
            defines the x axis (epochs numbered from 1).
        validation_values: per-epoch values of the validation curve.
        title: figure title (wrapped at 60 characters).
        legend_labels: labels for the training and the validation line, in that order.
        ylabel: label of the y axis.
        filename_suffix: appended to the timestamp to form the file name.
    """
    fig, ax = plt.subplots()
    epochs = list(range(1, len(train_values) + 1))

    ax.plot(epochs, train_values)
    ax.plot(epochs, validation_values)
    ax.set_title('\n'.join(textwrap.wrap(title, 60)))
    ax.legend(legend_labels, loc='upper left')
    ax.set_xlabel('Epochs')
    ax.set_ylabel(ylabel)

    # Layout is adjusted once everything is drawn; doing it on an empty figure has nothing to fit.
    fig.tight_layout()

    # Minute-resolution stamp. strftime is used instead of slicing str(datetime.now()),
    # whose length changes when the microsecond field happens to be 0.
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M')
    fig.savefig('/content/drive/My Drive/DL-PROJECT/' + timestamp + filename_suffix)
    plt.close(fig)


# Set before any figure is created so that both figures get the same title size.
plt.rcParams["axes.titlesize"] = 6

_save_curve_plot(viz_dict['train_loss_list'],
                 viz_dict['val_loss_list'],
                 'train and validation loss',
                 ['training loss', 'valid loss'],
                 'Loss',
                 'loss.png')

_save_curve_plot(viz_dict['train_precision_list'],
                 viz_dict['validation_precision_list'],
                 'train and validation precision',
                 ['training precision','valid precision'],
                 'Percentage of metrics(%)',
                 'metrics.png')