# Heterogeneous Global Graph Neural Networks for Personalized Session-based Recommendation



## Pre-process the Data

In [None]:
!pip install -qq dgl-cu110 dglgo -f https://data.dgl.ai/wheels/repo.html &>/dev/null

In [None]:
import pandas as pd
import numpy as np
import cudf
from pandas import Timedelta
import os
from tqdm.notebook import tqdm
import pickle
from torch.utils.data import Dataset, DataLoader
import math
from operator import itemgetter
import dgl
import tensorflow as tf
pd.set_option('display.max_rows', 10000)

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [None]:
last_fm_union = cudf.read_parquet('/kaggle/input/lastfm-dataset/lastfm_union.parquet')
len(last_fm_union)

19098852

In [None]:
last_fm_union.head(3)

Unnamed: 0,user_id,timestamp,artist_id,artist_name,track_id,track_name,gender,age,country,registered
0,user_000001,2009-05-04 23:08:57,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007,M,,JAPAN,2006-08-13
1,user_000001,2009-05-04 13:54:10,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,åæ¬é¾ä¸,,Composition 0919 (Live_2009_4_15),M,,JAPAN,2006-08-13
2,user_000001,2009-05-04 13:52:04,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,åæ¬é¾ä¸,,Mc2 (Live_2009_4_15),M,,JAPAN,2006-08-13


In [None]:
print(f" # Unique User : {last_fm_union['user_id'].nunique()}, # Unique Artist : {last_fm_union['artist_id'].nunique()}")

 # Unique User : 992, # Unique Artist : 107295


In [None]:
def get_session_id(df, interval):
    """Assign a running session number to every row of ``df``.

    A row opens a new session when its ``userId`` differs from the previous
    row's, or when its ``timestamp`` is more than ``interval`` after the
    previous row's. The frame is expected to be ordered by user and time
    already; this function does not sort.

    Returns a Series aligned with ``df`` holding 0-based session numbers.
    """
    previous = df.shift()
    user_changed = df.userId != previous.userId
    gap_exceeded = df.timestamp - previous.timestamp > interval
    # Each "new session" flag bumps the running counter; subtract 1 so that
    # the first session is numbered 0.
    return (user_changed | gap_exceeded).cumsum() - 1


def group_sessions(df, interval):
    """Return a copy of ``df`` with a ``sessionId`` column added.

    The session numbers come from ``get_session_id(df, interval)``.
    """
    return df.assign(sessionId=get_session_id(df, interval))


def filter_short_sessions(df, min_len=2):
    """Keep only rows whose session contains at least ``min_len`` rows.

    Session size is the number of rows sharing a ``sessionId``.
    """
    rows_per_session = df.groupby('sessionId', sort=False).size()
    is_long_enough = rows_per_session >= min_len
    kept_session_ids = rows_per_session[is_long_enough].index
    return df[df.sessionId.isin(kept_session_ids)]


def filter_infreq_items(df, min_support=5):
    """Keep only rows whose item occurs in at least ``min_support`` rows.

    Support is the number of rows sharing an ``itemId``.
    """
    rows_per_item = df.groupby('itemId', sort=False).size()
    is_frequent = rows_per_item >= min_support
    kept_item_ids = rows_per_item[is_frequent].index
    return df[df.itemId.isin(kept_item_ids)]


def filter_until_all_long_and_freq(df, min_len=2, min_support=5):
    """Alternate the short-session and infrequent-item filters to a fixed point.

    Removing infrequent items can shorten sessions and removing short sessions
    can lower item support, so both filters are re-applied until a full pass
    removes no rows. The frame from before that final no-op pass is returned.
    """
    current = df
    while True:
        candidate = filter_infreq_items(
            filter_short_sessions(current, min_len), min_support
        )
        if len(candidate) == len(current):
            return current
        current = candidate


def truncate_long_sessions(df, max_len=20, is_sorted=False):
    """Keep at most the first ``max_len`` rows of every session.

    Unless ``is_sorted`` is true, the frame is first ordered by
    ``sessionId`` then ``timestamp``; otherwise the existing row order
    decides which rows count as "first".
    """
    ordered = df if is_sorted else df.sort_values(['sessionId', 'timestamp'])
    # 0-based position of each row inside its own session.
    position_in_session = ordered.groupby('sessionId').cumcount()
    return ordered[position_in_session < max_len]


def update_id(df, field):
    """Replace column ``field`` with the integer codes from ``cudf.factorize``.

    Returns a new frame; the column keeps its name but now holds the label
    array (first element of the ``cudf.factorize`` result).
    """
    codes = cudf.factorize(df[field])[0]
    return df.assign(**{field: codes})


def remove_immediate_repeats(df):
    """Drop rows that repeat the previous row's item within the same session.

    A row is kept when its ``sessionId`` or its ``itemId`` differs from the
    row directly above it (row order is taken as given).
    """
    previous = df.shift()
    session_differs = df.sessionId != previous.sessionId
    item_differs = df.itemId != previous.itemId
    return df[session_differs | item_differs]


def reorder_sessions_by_endtime(df):
    """Renumber sessions by ascending end time and sort the rows accordingly.

    The session whose latest ``timestamp`` is earliest becomes session 0,
    the next one session 1, and so on. The returned frame is ordered by the
    new ``sessionId`` and then by ``timestamp``.
    """
    last_click = df.groupby('sessionId', sort=False).timestamp.max()
    ids_by_endtime = last_click.sort_values().index
    # old session id -> rank of its end time
    new_id_of = {old_id: rank for rank, old_id in enumerate(ids_by_endtime)}
    renumbered = df.assign(sessionId=df.sessionId.map(new_id_of))
    return renumbered.sort_values(['sessionId', 'timestamp'])


def keep_top_n_items(df, n):
    """Keep only rows whose item is among the ``n`` most frequent items.

    Frequency is the number of rows per ``itemId``; the top ``n`` are chosen
    with ``nlargest``.
    """
    rows_per_item = df.groupby('itemId', sort=False).size()
    most_popular = rows_per_item.nlargest(n).index
    return df[df.itemId.isin(most_popular)]


def split_by_time(df, timedelta):
    """Split sessions into train/test by a cutoff relative to the latest click.

    The cutoff is ``df.timestamp.max() - timedelta``. Sessions whose last
    click is strictly before the cutoff go to train; all others go to test.

    Returns ``(df_train, df_test)``.
    """
    cutoff = df.timestamp.max() - timedelta
    session_end = df.groupby('sessionId').timestamp.max()
    train_ids = session_end[session_end < cutoff].index
    in_train = df.sessionId.isin(train_ids)
    return df[in_train], df[~in_train]


def train_test_split(df, test_split=0.2):
    """Hold out the most recently ended sessions as the test set.

    Sessions are ranked by their last ``timestamp``; the final
    ``int(num_sessions * test_split)`` of them form the test set and the
    rest form the train set.

    Parameters
    ----------
    df : DataFrame with ``sessionId`` and ``timestamp`` columns.
    test_split : fraction of sessions to hold out.

    Returns
    -------
    (df_train, df_test)
    """
    endtime = df.groupby('sessionId', sort=False).timestamp.max()
    endtime = endtime.sort_values()
    num_sessions = len(endtime)
    num_tests = int(num_sessions * test_split)
    # Slice from an explicit non-negative start: the former ``[-num_tests:]``
    # turns into ``[0:]`` when num_tests == 0 and would put EVERY session
    # into the test set, leaving train empty.
    first_test_pos = max(num_sessions - num_tests, 0)
    test_session_ids = endtime.index[first_test_pos:]
    is_test = df.sessionId.isin(test_session_ids)
    df_train = df[~is_test]
    df_test = df[is_test]
    return df_train, df_test


def save_sessions(df, filepath):
    """Write one line per session to ``filepath``: comma-joined item ids.

    Sessions are first renumbered and ordered by end time via
    ``reorder_sessions_by_endtime``. The file has no header and no index
    column.
    """
    def _join_items(items):
        return ','.join(map(str, items))

    ordered = reorder_sessions_by_endtime(df)
    lines = ordered.groupby('sessionId').itemId.apply(_join_items)
    lines.to_csv(filepath, sep='\t', header=False, index=False)


def save_dataset(dataset_dir, df_train, df_test):
    """Re-index items on the train set and write train/test session files.

    Test rows whose item never occurs in train are dropped, then test
    sessions that became too short are removed. Item ids are replaced by
    the codes ``pd.factorize`` assigns on the train set, and the same
    mapping is applied to test. Writes ``train.txt``, ``test.txt`` and
    ``num_items.txt`` under ``dataset_dir`` (created if missing).
    """
    # Test may only contain items the model can see during training.
    seen_in_train = df_train.itemId.unique()
    df_test = filter_short_sessions(df_test[df_test.itemId.isin(seen_in_train)])

    print(f'No. of Clicks: {len(df_train) + len(df_test)}')
    print(f'No. of Items: {df_train.itemId.nunique()}')

    # Dense item codes defined by the train set, reused for test.
    train_codes, uniques = pd.factorize(df_train.itemId)
    code_of = dict(zip(uniques, range(len(uniques))))
    df_train = df_train.assign(itemId=train_codes)
    df_test = df_test.assign(itemId=df_test.itemId.map(code_of))

    print(f'saving dataset to {dataset_dir}')
    dataset_dir.mkdir(parents=True, exist_ok=True)
    for frame, filename in ((df_train, 'train.txt'), (df_test, 'test.txt')):
        save_sessions(frame, dataset_dir / filename)
    with open(dataset_dir / 'num_items.txt', 'w') as f:
        f.write(str(len(uniques)))

## Pre-processing steps - Break sequences into sessions (interval of 8 hours), truncate long sessions, remove infrequent items & short sessions

In [None]:
# A gap of more than `interval` between two consecutive clicks of the same
# user starts a new session; `n` is the number of most frequent items kept.
interval = Timedelta(hours=8)
n = 40000

# Reload the raw log and keep only the user, time and artist columns,
# renamed to the column names the helper functions above expect.
last_fm_union = cudf.read_parquet('/kaggle/input/lastfm-dataset/lastfm_union.parquet')
last_fm_union = last_fm_union[['user_id','timestamp','artist_id']]
last_fm_union.columns = ['userId', 'timestamp', 'itemId']

# Drop incomplete rows, replace raw ids with integer codes, and order the
# log by user and time (get_session_id compares each row with the one above).
last_fm_union = last_fm_union.dropna()
last_fm_union = update_id(last_fm_union, 'userId')
last_fm_union = update_id(last_fm_union, 'itemId')
last_fm_union = last_fm_union.sort_values(['userId', 'timestamp'])

# Sessionise, drop back-to-back repeats of the same item, cap session length
# (default max_len=20), keep the n most frequent items, then repeatedly drop
# short sessions / infrequent items until nothing more is removed.
last_fm_union = group_sessions(last_fm_union, interval)
last_fm_union = remove_immediate_repeats(last_fm_union)
# is_sorted=True: skip the re-sort and rely on the current row order.
last_fm_union = truncate_long_sessions(last_fm_union, is_sorted=True)
last_fm_union = keep_top_n_items(last_fm_union, n)
last_fm_union = filter_until_all_long_and_freq(last_fm_union)

In [None]:
last_fm_union.tail(5)

Unnamed: 0,userId,timestamp,itemId,sessionId
19080499,991,2009-05-02 19:51:15,15252,385133
19080498,991,2009-05-02 19:57:17,8519,385133
19080497,991,2009-05-03 03:48:04,59521,385133
19080485,991,2009-05-03 20:28:10,97231,385134
19080484,991,2009-05-04 00:27:31,105799,385134


In [None]:
print("After Pre-processing - ")
print(f"#Users : {last_fm_union['userId'].nunique()} #Items : {last_fm_union['itemId'].nunique()} #Sessions : {last_fm_union['sessionId'].nunique()} ")

After Pre-processing - 
#Users : 989 #Items : 39767 #Sessions : 325957 


Save all the intermediate data in the **lastfm** folder

In [None]:
!mkdir lastfm

In [None]:
saved_path = 'lastfm'

In [None]:
last_fm_union.to_csv(f'{saved_path}/'+'data.txt',sep=',',header=None,index=False)

### Split sessions into train and test set


In [None]:
def _agg_all_seq(df):
    """Collect every session of every user into one flat list and pickle it.

    Result shape: ``[[s1 items], [s2 items], ...]`` - sessions of all users
    concatenated, without the user id. The list is also written to
    ``<saved_path>/all_train_seq.txt`` (pickle format despite the extension).
    """
    all_sessions = []
    for _, user_rows in tqdm(df.groupby('userId')):
        items_per_session = user_rows.groupby('sessionId')['itemId'].agg(list)
        all_sessions.extend(items_per_session.to_arrow().to_pylist())
    out_path = os.path.join(saved_path,'all_train_seq.txt')
    with open(out_path,'wb') as f:
        pickle.dump(all_sessions,f)
    return all_sessions

In [None]:
def _agg_df(df):
    """Group a click frame into per-user session lists.

    Result shape: ``{u: [[s1 items], [s2 items], ...]}`` - for every
    ``userId`` the list of that user's sessions, each session being the list
    of its ``itemId`` values.
    """
    sessions_by_user = {}
    for user, user_rows in tqdm(df.groupby('userId')):
        items_per_session = user_rows.groupby('sessionId')['itemId'].agg(list)
        sessions_by_user[user] = items_per_session.to_arrow().to_pylist()
    return sessions_by_user

In [None]:
def _split_data(random_state=None):
    """Split ``data.txt`` into train / validation / test sets and save them.

    Steps:
      1. Read ``<saved_path>/data.txt`` (userId, timestamp, itemId, sessionId).
      2. Hold out the 20% of sessions that ended most recently as test.
      3. Drop test rows whose item is unseen in train, then drop test
         sessions that became too short.
      4. Re-code item ids densely on the train set, apply the same mapping
         to test, and shift user and item ids by +1 (so id 0 is never used).
      5. Sample 20% of the test ROWS (not whole sessions) as validation;
         the remaining rows form ``part_test``.
      6. Write CSV and pickle files under ``saved_path``.

    Parameters
    ----------
    random_state : optional seed forwarded to ``DataFrame.sample`` for the
        validation draw. The default ``None`` keeps the draw unseeded.
    """
    print('split data...')
    val_ratio = 0.2
    test_split= 0.2
    df=cudf.read_csv(os.path.join(saved_path,'data.txt'),header=None,names=['userId', 'timestamp', 'itemId','sessionId'])
    print(df['userId'].nunique())
    print(df['itemId'].nunique())
    print(df['itemId'].max(),df['itemId'].min())
    endtime = df.groupby('sessionId', sort=False).timestamp.max()
    endtime = endtime.sort_values()
    num_tests = int(len(endtime) * test_split)
    # Explicit non-negative start: ``[-num_tests:]`` degenerates to ``[0:]``
    # when num_tests == 0 and would send every session to the test set.
    first_test_pos = max(len(endtime) - num_tests, 0)
    test_session_ids = endtime.index[first_test_pos:]
    df_train = df[~df.sessionId.isin(test_session_ids)]
    df_test = df[df.sessionId.isin(test_session_ids)].reset_index(drop=True)

    # Keep only test items that also occur in train, then re-check lengths.
    df_test = df_test[df_test.itemId.isin(df_train.itemId.unique())]
    df_test = filter_short_sessions(df_test)

    # Dense item codes defined by train; the same mapping is applied to test.
    train_itemId_new, uniques = cudf.factorize(df_train.itemId)
    df_train = df_train.assign(itemId=train_itemId_new)
    oid2nid = {oid: i for i, oid in enumerate(uniques.to_pandas())}
    test_itemId_new = df_test.itemId.map(oid2nid)
    df_test = df_test.assign(itemId=test_itemId_new)
    # Shift ids by one so that 0 is never a real user/item id.
    df_train['userId']+=1
    df_train['itemId']+=1
    df_test['userId']+=1
    df_test['itemId']+=1

    _agg_all_seq(df_train)


    print(df_train['userId'].min(),df_train['userId'].max())
    print(df_train['itemId'].max(),df_train['itemId'].min())
    print(df_test['itemId'].max(),df_test['itemId'].min())

    # Validation = random sample of test rows; part_test = the rest.
    df_test=df_test.reset_index(drop=True)
    df_val= df_test.sample(frac=val_ratio, random_state=random_state)
    part_test=df_test[~df_test.index.isin(df_val.index)]

    df_train.to_csv(os.path.join(saved_path, 'df_train.csv'), header = True, index = False)
    df_val.to_csv(os.path.join(saved_path, 'df_val.csv'), header = True, index = False)
    part_test.to_csv(os.path.join(saved_path, 'part_test.csv'), header = True, index = False)
    df_test.to_csv(os.path.join(saved_path, 'df_test.csv'), header = True, index = False)

    with open(os.path.join(saved_path,'train.pkl'),'wb') as f:
        pickle.dump(_agg_df(df_train),f)

    with open(os.path.join(saved_path,'val.pkl'),'wb') as f:
        pickle.dump(_agg_df(df_val),f)

    with open(os.path.join(saved_path,'test.pkl'),'wb') as f:
        pickle.dump(_agg_df(part_test),f)

    with open(os.path.join(saved_path,'all_test.pkl'),'wb') as f:
        pickle.dump(_agg_df(df_test),f)

In [None]:
%%time
_split_data()

split data...
989
39767
107291 2


0it [00:00, ?it/s]

1 992
38569 1
38569 1


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

CPU times: user 42.2 s, sys: 869 ms, total: 43.1 s
Wall time: 43 s


In [None]:
# Read the file through saved_path, the same location _split_data() wrote it
# to, instead of a hardcoded absolute Kaggle path.
with open(os.path.join(saved_path, 'train.pkl'), 'rb') as f:
    train_pkl = pickle.load(f)

In [None]:
print(f" Number of Unique Users in Output Dictionary : {len(train_pkl) }")

 Number of Unique Users in Output Dictionary : 897


In [None]:
print(f" Number of sessions for Used ID 1 is {len(train_pkl[1])} and 1st Session of User 1 has {len(train_pkl[1][0])} items while 2nd session has {len(train_pkl[1][1])} items")

 Number of sessions for Used ID 1 is 284 and 1st Session of User 1 has 17 items while 2nd session has 16 items


## Create training samples from the session sequences and corresponding Label


In [None]:
def common_seq(data_list):
    """Expand per-user sessions into (user, prefix, next-item) samples.

    ``data_list`` maps a user id to a list of sessions (lists of item ids).
    For a session ``[v1, ..., vk]`` every proper non-empty prefix becomes one
    sample, emitted from the longest prefix to the shortest:
    ``[uid, [v1..v(k-1)], [vk]]``, ``[uid, [v1..v(k-2)], [v(k-1)]]``, ...

    Returns a list of ``[uid, prefix_items, [label_item]]`` entries.
    """
    samples = []
    for user in tqdm(data_list,desc='gen_seq...',leave=False):
        user_id = int(user)
        for session in data_list[user]:
            # cut = how many trailing items are removed from the prefix;
            # the first removed item is the prediction target.
            for cut in range(1, len(session)):
                samples.append([user_id, session[:-cut], [session[-cut]]])
    return samples

In [None]:
with open(os.path.join(saved_path,'train.pkl'),'rb') as f:
     train_data=pickle.load(f)

# Largest user id and item id present in the training sessions.
max_vid=0
max_uid=0

for u in tqdm(train_data):
    max_uid=max(max_uid,u)
    for sess in train_data[u]:
        max_vid=max(max_vid,max(sess))

# Prefer the full test set; fall back to the partial one only when the full
# file is missing. Catching FileNotFoundError (not a bare except) lets real
# problems such as a corrupt pickle or a keyboard interrupt surface.
try:
    with open(os.path.join(saved_path,'all_test.pkl'),'rb') as f:
        test_data=pickle.load(f)
except FileNotFoundError:
    with open(os.path.join(saved_path,'test.pkl'),'rb') as f:
        test_data=pickle.load(f)

# Expand every session into (user, prefix, next item) samples.
train_data=common_seq(train_data)
test_data=common_seq(test_data)

with open(os.path.join(saved_path,'test_seq.pkl'),'wb') as f:
    pickle.dump(test_data,f)

with open(os.path.join(saved_path,'train_seq.pkl'),'wb') as f:
    pickle.dump(train_data,f)

  0%|          | 0/897 [00:00<?, ?it/s]

gen_seq...:   0%|          | 0/897 [00:00<?, ?it/s]

gen_seq...:   0%|          | 0/796 [00:00<?, ?it/s]

In [None]:
# Read the file through saved_path, the same location it was written to,
# instead of a hardcoded absolute Kaggle path.
with open(os.path.join(saved_path, 'train_seq.pkl'), 'rb') as f:
    train_seq = pickle.load(f)

In [None]:
print(f"Number of Training Sequence is {len(train_seq)} ")

Number of Training Sequence is 2837544 


In [None]:
# Read the file through saved_path, the same location it was written to,
# instead of a hardcoded absolute Kaggle path.
with open(os.path.join(saved_path, 'test_seq.pkl'), 'rb') as f:
    test_seq = pickle.load(f)

In [None]:
print(f"Number of Test Sequence is {len(test_seq)} ")

Number of Test Sequence is 672394 


### Create Session Dataset Class & Dataloader.


In [None]:
SZ = 12
SEQ_LEN = 10 ## Window Size to create a training Sequence
BATCH_SIZE = 512

In [None]:
class SessionDataset(Dataset):
    """Torch dataset over ``[uid, item_sequence, [label]]`` samples.

    Each sample is padded (with zeros, on the right) or truncated (keeping
    the most recent items) to a fixed window of ``max_len`` items.
    """

    def __init__(self, data, max_len):
        """
        args:
            data(list): samples of the form [uid, [v1, v2, ...], [label]]
            max_len(int): fixed length of the returned item window
        """
        super(SessionDataset, self).__init__()
        self.data=data
        self.max_seq_len=max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """
        Return ``(uid, browsed_ids, mask, seq_len, label, pos_idx)`` for one sample.

        uid:         array of shape (1,) with the user id
        browsed_ids: the last ``max_seq_len`` items, zero-padded on the right
        mask:        1 for real items, 0 for padding
        seq_len:     0-d array, number of real items in the window
        label:       array holding the target item
        pos_idx:     reverse position per real item (last item -> 0), 0 on padding
        """
        user_id, item_seq, target = self.data[index][0], self.data[index][1], self.data[index][2]
        window_size = self.max_seq_len

        uid = np.array([user_id], dtype=int)
        browsed_ids = np.zeros((window_size), dtype=int)

        # Only the most recent `window_size` items are kept.
        recent_items = item_seq[-window_size:]
        n_valid = len(recent_items)
        n_pad = window_size - n_valid

        mask = np.array([1] * n_valid + [0] * n_pad, dtype=int)
        # Count down from n_valid-1 to 0 over the real items, 0 on the padding.
        pos_idx = np.array(list(range(n_valid - 1, -1, -1)) + [0] * n_pad, dtype=int)
        browsed_ids[:n_valid] = np.array(recent_items)

        seq_len = np.array(n_valid, dtype=int)
        label = np.array(target, dtype=int)

        return uid, browsed_ids, mask, seq_len, label, pos_idx

In [None]:
train_dataset = SessionDataset(train_seq, max_len=SEQ_LEN)
test_dataset = SessionDataset(test_seq, max_len=SEQ_LEN)

In [None]:
print(f" Length of Training Dataset is {len(train_dataset)} & Test Dataset {len(test_dataset)} ")

 Length of Training Dataset is 2837544 & Test Dataset 672394 


In [None]:
train_iter = DataLoader(dataset=train_dataset,
                            batch_size=BATCH_SIZE,
                            num_workers=0,
                            drop_last=False,
                            shuffle=True,
                            pin_memory=False)

In [None]:
test_iter = DataLoader(dataset=test_dataset,
                            batch_size=BATCH_SIZE,
                            num_workers=0,
                            drop_last=False,
                            shuffle=False,
                            pin_memory=False)

In [None]:
print(f" Length of Training DataLoader is {len(train_iter)} & Test DataLoader {len(test_iter)} ")

 Length of Training DataLoader is 5543 & Test DataLoader 1314 


### Example of DataLoader Output below

In [None]:
eg = next(iter(train_iter))

In [None]:
print(f"Number of elements in the example tuple {len(eg)} - corresponds to uid | browsed_ids | mask | seq_len | label | pos_idx ")

Number of elements in the example tuple 6 - corresponds to uid | browsed_ids | mask | seq_len | label | pos_idx 


In [None]:
print("Size of - ")
print(f" User ID Tensor - {eg[0].size()} ")
print(f" Sequence of Items - {eg[1].size()} ")
print(f" Sequence of Masks - {eg[2].size()} ")
print(f" Actual Sequence Length (before padding) - {eg[3].size()} ")
print(f" Labels - {eg[4].size()} ")
print(f" Position Index (Oldest Index = 0, Latest Index can be till 9, Padded Index = 0) - {eg[5].size()} ")

Size of - 
 User ID Tensor - torch.Size([512, 1]) 
 Sequence of Items - torch.Size([512, 10]) 
 Sequence of Masks - torch.Size([512, 10]) 
 Actual Sequence Length (before padding) - torch.Size([512]) 
 Labels - torch.Size([512, 1]) 
 Position Index (Oldest Index = 0, Latest Index can be till 9, Padded Index = 0) - torch.Size([512, 10]) 


#### Example showing actual tensor of a sample from 1st batch of Dataloader

In [None]:
# `eg` is a 6-tuple of batched tensors, so index the FIELD first and the
# sample second: eg[k][0] is field k of the first sample in the batch.
# (eg[0][k] would print the user ids of samples 1..5 instead.)
print("Actual tensor of - ")
print(f" User ID - {eg[0][0]} ")
print(f" Sequence of Items - {eg[1][0]} ")
print(f" Sequence of Masks - {eg[2][0]} ")
print(f" Actual Sequence Length (before padding) - {eg[3][0]} ")
print(f" Labels - {eg[4][0]} ")
print(f" Position Index (Oldest Index = 0, Latest Index can be till 9, Padded Index = 0) - {eg[5][0]} ")

Actual tensor of - 
 User ID - tensor([607]) 
 Sequence of Items - tensor([567]) 
 Sequence of Masks - tensor([274]) 
 Actual Sequence Length (before padding) - tensor([280]) 
 Labels - tensor([260]) 
 Position Index (Oldest Index = 0, Latest Index can be till 9, Padded Index = 0) - tensor([715]) 


##  Create heterogeneous Graph

## Create co-occurrence matrix of items based on adjacency of items in the same session


In [None]:
def sample_relations(num, sample_size=20):
    """Build per-item lists of the most frequent predecessors and successors.

    Reads ``<saved_path>/train.pkl`` (``{user: [session, ...]}``), counts
    every consecutive item pair ``(a, b)`` in every session, and for each
    item id ``t`` in ``1 .. num-1`` keeps:

    * ``adj_out[t]`` - up to ``sample_size`` items that most often follow ``t``
    * ``adj_in[t]``  - up to ``sample_size`` items that most often precede ``t``

    Both are ordered by descending count; ties keep first-seen order.
    Index 0 is left as an empty list. The pair ``[adj_in, adj_out]`` is
    pickled to ``<saved_path>/adj_{sample_size}.pkl``.

    Parameters
    ----------
    num : size of the adjacency tables; must exceed the largest item id.
    sample_size : maximum number of neighbours kept per item and direction.

    Raises
    ------
    IndexError : if a session contains an item id >= ``num``.
    """
    # successor_counts[a][b] = how often b directly follows a
    # predecessor_counts[b][a] = how often a directly precedes b
    successor_counts = [dict() for _ in range(num)]
    predecessor_counts = [dict() for _ in range(num)]

    with open(os.path.join(saved_path, 'train.pkl'), 'rb') as f:
        graph = pickle.load(f)

    # Count transitions on the fly instead of first materialising two lists
    # holding every transition of the training set.
    for u in tqdm(graph, desc='build the graph...', leave=False):
        for s in graph[u]:
            for i in range(len(s) - 1):
                cur, following = s[i], s[i + 1]
                successor_counts[cur][following] = successor_counts[cur].get(following, 0) + 1
                predecessor_counts[following][cur] = predecessor_counts[following].get(cur, 0) + 1

    def _most_frequent(counts):
        # sorted() is stable, so equal counts keep their first-seen order.
        ranked = sorted(counts.items(), reverse=True, key=lambda kv: kv[1])
        return [item for item, _ in ranked][:sample_size]

    adj_in = [[] for _ in range(num)]
    adj_out = [[] for _ in range(num)]
    # Item ids start at 1; slot 0 intentionally stays empty.
    for t in range(1, num):
        adj_out[t] = _most_frequent(successor_counts[t])
        adj_in[t] = _most_frequent(predecessor_counts[t])

    with open(os.path.join(saved_path, f'adj_{sample_size}.pkl'), 'wb') as f:
        pickle.dump([adj_in, adj_out], f)

In [None]:
sample_relations(num = 40000, sample_size=SZ)

build the graph...:   0%|          | 0/897 [00:00<?, ?it/s]

In [None]:
with open(os.path.join(saved_path, f'adj_{SZ}.pkl'), 'rb') as f:
    adj = pickle.load(f)

In [None]:
adj_in, adj_out = adj[0], adj[1]

In [None]:
print(f"Items which most frequently lies to the left (previous time step) of Item 1 : {adj_in[1]} ")
print(f"Items which most frequently lies to the right (next time step) of Item 1 : {adj_out[1]} ")

Items which most frequently lies to the left (previous time step) of Item 1 : [37151, 33669, 18558, 29129, 35336, 28294, 29082, 28263, 4905, 889, 34325, 22650] 
Items which most frequently lies to the right (next time step) of Item 1 : [34325, 33669, 6335, 17335, 36077, 29082, 36806, 889, 13645, 37151, 4304, 36082] 


### Create item and user similarity matrices: items are similar if they appear in the same session; users are similar if they interact with the same items



In [None]:
def userCF():
    """
    calculate user similarity

    Reads ``<saved_path>/train.pkl`` (``{user: [session, ...]}``). Every item
    shared by two users ``u`` and ``v`` contributes ``1 / (#users of that
    item)`` to their raw score; the score is then divided by
    ``sqrt(|items(u)| * |items(v)|)``. For each user the 100 highest-scoring
    ``(other_user, score)`` pairs are pickled to ``<saved_path>/u2u_sim.pkl``.
    """
    vid_user = {}      # item id -> set of users who interacted with it
    uid_vcount = {}    # user id -> set of items the user interacted with
    with open(os.path.join(saved_path,'train.pkl'), 'rb') as f:
        session_data = pickle.load(f)
    for uid in tqdm(session_data):
        user_items = uid_vcount.setdefault(uid, set())
        for sess in session_data[uid]:
            for vid in sess:
                vid_user.setdefault(vid, set()).add(uid)
                user_items.add(vid)

    user_sim_matrix = {}
    for vid, users in tqdm(vid_user.items()):
        # Items shared by many users carry less evidence per pair.
        contribution = 1 / len(users)
        for u in users:
            for v in users:
                if u == v:
                    continue
                user_sim_matrix.setdefault(u, {})
                user_sim_matrix[u].setdefault(v, 0)
                user_sim_matrix[u][v] += contribution
    # Normalise by the geometric mean of both users' item-set sizes.
    for u, related_users in user_sim_matrix.items():
        for v, count in related_users.items():
            user_sim_matrix[u][v] = count / math.sqrt(len(uid_vcount[u]) * len(uid_vcount[v]))
    user_topK = {}
    for user in user_sim_matrix:
        user_topK[user] = sorted(user_sim_matrix[user].items(), key=itemgetter(1), reverse=True)[0:100]
    with open(os.path.join(saved_path,'u2u_sim.pkl'), 'wb') as f:
        pickle.dump(user_topK, f)

**Item similarity between item i and item j (based on co-occurrence in the same session)**

In [None]:
def itemCFSession():
    """
    calculate item similarity

    Reads ``<saved_path>/train.pkl``. Two distinct items appearing in the
    same session each receive ``1 / (#distinct items in that session)`` for
    that pair; the summed score is divided by
    ``sqrt(#sessions(v) * #sessions(_v))``. For every item the 200 best
    ``(other_item, score)`` pairs are pickled to
    ``<saved_path>/i2i_sim_sess.pkl``.
    """
    with open(os.path.join(saved_path,'train.pkl'), 'rb') as f:
        session_data = pickle.load(f)

    items_of_session = {}   # running session number -> set of its items
    sessions_of_item = {}   # item id -> set of session numbers containing it
    session_no = 0
    for uid in tqdm(session_data):
        for sess in session_data[uid]:
            session_no += 1
            members = set()
            items_of_session[session_no] = members
            for vid in sess:
                members.add(vid)
                sessions_of_item.setdefault(vid, set()).add(session_no)

    item_sim_matrix = {}
    for _, members in tqdm(items_of_session.items()):
        for v in members:
            for other in members:
                if other != v:
                    row = item_sim_matrix.setdefault(v, {})
                    row.setdefault(other, 0)
                    row[other] += (1 / len(members))

    # Normalise by the geometric mean of the two items' session counts.
    for v, related_items in item_sim_matrix.items():
        for other, raw_score in related_items.items():
            norm = math.sqrt(len(sessions_of_item[v]) * len(sessions_of_item[other]))
            related_items[other] = raw_score / norm

    item_topK = {}
    for item, scores in item_sim_matrix.items():
        ranked = sorted(scores.items(), key=itemgetter(1), reverse=True)
        item_topK[item] = ranked[0:200]
    with open(os.path.join(saved_path,'i2i_sim_sess.pkl'), 'wb') as f:
        pickle.dump(item_topK, f)

In [None]:
userCF()

  0%|          | 0/897 [00:00<?, ?it/s]

  0%|          | 0/38569 [00:00<?, ?it/s]

In [None]:
itemCFSession()

  0%|          | 0/897 [00:00<?, ?it/s]

  0%|          | 0/260766 [00:00<?, ?it/s]

In [None]:
with open(os.path.join(saved_path,'u2u_sim.pkl'), 'rb') as f:
    u2u_sim = pickle.load(f)

In [None]:
print(f"#Users in the User Sim. Matrix {len(u2u_sim.keys())}, each having Top {len(u2u_sim[1])} Similar User ")

#Users in the User Sim. Matrix 897, each having Top 100 Similar User 


In [None]:
with open(os.path.join(saved_path,'i2i_sim_sess.pkl'), 'rb') as f:
    i2i_sim = pickle.load(f)

In [None]:
print(f"#Items in the Item Sim. Matrix {len(i2i_sim.keys())} and Item1 has similar {len(i2i_sim[1])} Items while Item2 has similar {len(i2i_sim[2])} Items ")

#Items in the Item Sim. Matrix 38569 and Item1 has similar 75 Items while Item2 has similar 39 Items 


### Create Heterogeneous Graph using the 4 types of edges mentioned earlier (prepared in Step B1 & B2)

In [None]:
def uui_graph(sample_size, topK, add_u = True, add_v = True):
    """Assemble the user/item graph from the pickled intermediate files.

    Items keep their own ids as node ids; user ``u`` becomes node
    ``u + item_num``. Every edge list is inserted in both directions and a
    self-loop is added to every node.

    Parameters
    ----------
    sample_size : selects ``adj_{sample_size}.pkl`` (written by
        ``sample_relations``).
    topK : number of most similar users / items considered per node.
    add_u : add user-user similarity edges (from ``u2u_sim.pkl``).
    add_v : add item-item similarity edges (from ``i2i_sim_sess.pkl``).

    Returns
    -------
    (G, item_num, pre, nxt, dst_u, src_v, u_src, u_dst, topv_src, topv_dst)
        ``G`` is the DGL graph; ``item_num`` is one more than the largest
        item id seen in ``pre``/``nxt``; the lists are the one-directional
        edge endpoints (user ids already offset by ``item_num``).
    """
    with open(os.path.join(saved_path,'train.pkl'), 'rb') as f:
        graph = pickle.load(f)

    with open(os.path.join(saved_path,f'adj_{sample_size}.pkl'), 'rb') as f:
        adj = pickle.load(f)

    adj_in = adj[0]
    adj_out = adj[1]

    print('adj_in:', len(adj_in))
    print('adj_out:', len(adj_out))

    # Item-item adjacency edges, part 1: each item i paired with its sampled
    # predecessors. Slot 0 is skipped.
    # NOTE(review): the previous version also built edge lists from adj_out
    # but never used them; adj_out only feeds the exclusion set below.
    pre = []
    nxt = []
    for i in range(1, len(adj_in)):
        neighbours = adj_in[i]
        pre += [i] * len(neighbours)
        nxt += neighbours

    # Item-item adjacency edges, part 2 (every raw transition of every
    # session) and user-item interaction edges.
    src_v = []
    dst_u = []
    for u in tqdm(graph, desc='build the graph...', leave=False):
        for s in graph[u]:
            pre += s[:-1]
            nxt += s[1:]
            dst_u += [u] * len(s)
            src_v += s

    with open(os.path.join(saved_path,'u2u_sim.pkl'), 'rb') as f:
        u2u_sim = pickle.load(f)

    with open(os.path.join(saved_path,'i2i_sim_sess.pkl'),'rb') as f:
        i2i_sim=pickle.load(f)

    # Item-item similarity edges: among the topK most similar items (capped
    # at the number of local neighbours), keep those that are not already a
    # sampled predecessor/successor of v.
    topv_src=[]
    topv_dst=[]
    for v in tqdm(i2i_sim,desc='gen_seq...',leave=False):
        local_neighbours=adj_in[v]+adj_out[v]
        exclusion=set(local_neighbours)  # O(1) membership tests
        for (vid,value) in i2i_sim[v][:topK][:len(local_neighbours)]:
            if vid not in exclusion:
                topv_src.append(vid)
                topv_dst.append(v)

    # User-user similarity edges: each user with its topK most similar users.
    u_src = []
    u_dst = []
    for u in tqdm(u2u_sim, desc='gen_seq...', leave=False):
        similar_users = u2u_sim[u][:topK]
        u_src += [uid for (uid, value) in similar_users]
        u_dst += [u] * len(similar_users)

    print('local ajdency-in:', sum(len(i) for i in adj_in) / len(adj_in))
    print('local ajdency-out:', sum(len(i) for i in adj_out) / len(adj_out))

    item_num = max(max(pre), max(nxt)) +1
    print('addiotn item num', item_num)
    # Users share the node id space with items: shift them past the items.
    u_src = [u + item_num for u in u_src]
    u_dst = [u + item_num for u in u_dst]
    dst_u = [u + item_num for u in dst_u]

    G = dgl.graph((pre, nxt))
    G = dgl.add_edges(G, nxt, pre)
    G = dgl.add_edges(G, dst_u, src_v)
    G = dgl.add_edges(G, src_v, dst_u)

    if add_u:
        G = dgl.add_edges(G, u_src, u_dst)
        G = dgl.add_edges(G, u_dst, u_src)

    if add_v:
        G = dgl.add_edges(G, topv_src, topv_dst)
        G = dgl.add_edges(G, topv_dst, topv_src)


    G=dgl.add_self_loop(G)

    return G,item_num, pre, nxt, dst_u, src_v, u_src, u_dst, topv_src, topv_dst

In [None]:
G,item_num, pre, nxt, dst_u, src_v, u_src, u_dst, topv_src, topv_dst = uui_graph(SZ, topK = 20, add_u = True, add_v = True)

adj_in: 40000
adj_out: 40000


build the graph...:   0%|          | 0/897 [00:00<?, ?it/s]

gen_seq...:   0%|          | 0/38569 [00:00<?, ?it/s]

gen_seq...:   0%|          | 0/897 [00:00<?, ?it/s]

local ajdency-in: 8.254125
local ajdency-out: 8.256525
addiotn item num 38570


**Edge Type A) Between 2 items if they are adjacent to each other in the item sequence of the same session. \
Edge Type B) Between 2 items if they co-occur (item similarity) in the same session. \
Edge Type C) Between 2 users if they interact with similar items. \
Edge Type D) Between a user & an item if the user interacted with the item.**

In [None]:
num_edges_typeA = len(pre)
num_edges_typeB = len(topv_src)
num_edges_typeC = len(u_src)
num_edges_typeD = len(src_v)
print(f"Number Type A Edges : {num_edges_typeA}")
print(f"Number Type B Edges : {num_edges_typeB}")
print(f"Number Type C Edges : {num_edges_typeC}")
print(f"Number Type D Edges : {num_edges_typeD}")

Number Type A Edges : 3167709
Number Type B Edges : 426471
Number Type C Edges : 17940
Number Type D Edges : 3098310


In [None]:
print(f"Total Number of Edges : { G.num_nodes() + 2*num_edges_typeA + 2*num_edges_typeB + 2*num_edges_typeC + 2*num_edges_typeD} " )

Total Number of Edges : 13460423 


In [None]:
G

Graph(num_nodes=39563, num_edges=13460423,
      ndata_schemes={}
      edata_schemes={})

## Train Model and Evaluate


#### Defining the Model Architecture -


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.nn.pytorch as dglnn
import dgl.function as FN
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
class HG_GNN(nn.Module):
    """Session-based recommender using one GraphSAGE layer over the graph ``G``.

    Every graph node owns one row of the shared embedding table ``v2e``; the
    forward pass addresses users as ``user + item_num``. A batch is scored by
    blending a position-aware session vector with a user-conditioned session
    vector through a learned sigmoid gate, then taking dot products with the
    embedding rows ``1:``.

    Relies on a module-level ``device`` being defined before instantiation.
    """

    def __init__(self, G, config, item_num, max_seq_len=10, max_sess=10):
        """
        G: DGL graph the SAGEConv layer runs on; a copy on ``device`` is stored.
        config: dict providing 'hidden_size', 'embed_size' and 'dropout'.
        item_num: offset added to user ids in ``forward`` to obtain node ids.
        max_seq_len, max_sess: stored on the instance; not read by this class.
        """
        super().__init__()
        self.G = G.to(device)
        self.max_sess = max_sess
        self.hidden_size = config['hidden_size']
        self.em_size = config['embed_size']
        # Position ids passed to forward() must be < 200.
        self.pos_embedding = nn.Embedding(200, self.em_size)
        self.v2e = nn.Embedding(G.number_of_nodes(), self.em_size).to(device)

        self.conv1 = dglnn.SAGEConv(self.em_size, self.em_size, 'mean')

        dropout = config["dropout"]
        self.emb_dropout = nn.Dropout(p=dropout)

        # NOTE: the layers from `gru` through `gru_transform` below are never
        # called by forward(); they are kept (in the original registration
        # order) so the parameter set and state_dict keys stay unchanged.
        self.gru = nn.GRU(self.em_size, self.hidden_size, 1)
        self.max_seq_len = max_seq_len
        self.W = nn.Linear(self.em_size, self.em_size)

        # node embedding
        self.linear_one = nn.Linear(self.em_size, self.em_size, bias=True)
        self.linear_two = nn.Linear(self.em_size, self.em_size, bias=True)
        self.linear_three = nn.Linear(self.em_size, 1, bias=False)

        # gru embedding
        self.a_1 = nn.Linear(self.hidden_size, self.hidden_size)
        self.a_2 = nn.Linear(self.hidden_size, self.hidden_size)
        self.v_t = nn.Linear(self.hidden_size, 1, bias=False)

        self.ct_dropout = nn.Dropout(dropout)

        self.user_transform = nn.Sequential(
            nn.Linear(self.em_size, self.em_size, bias=True)
        )

        self.gru_transform = nn.Sequential(
            nn.Linear(self.hidden_size * 2, self.em_size, bias=True)
        )

        # Gate that mixes the two session vectors in forward().
        self.sigmoid_concat = nn.Sequential(
            nn.Linear(self.em_size * 2, 1, bias=True),
            nn.Sigmoid()
        )

        # Attention parameters used by compute_hidden_vector().
        self.w_1 = nn.Parameter(torch.Tensor(2 * self.em_size, self.em_size))
        self.w_2 = nn.Parameter(torch.Tensor(self.em_size, 1))
        self.glu1 = nn.Linear(self.em_size, self.em_size)
        self.glu2 = nn.Linear(self.em_size, self.em_size, bias=False)

        # Attention parameters used by sess_user_vector().
        self.w_3 = nn.Parameter(torch.Tensor(self.em_size, self.em_size))
        self.w_4 = nn.Parameter(torch.Tensor(self.em_size, 1))
        self.glu3 = nn.Linear(self.em_size, self.em_size)
        self.glu4 = nn.Linear(self.em_size, self.em_size, bias=False)

        self.reset_parameters()

        self.item_num = item_num

    def reset_parameters(self):
        """Re-draw every parameter (weights, biases, embeddings) from
        U(-1/sqrt(em_size), 1/sqrt(em_size))."""
        stdv = 1.0 / math.sqrt(self.em_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def compute_hidden_vector(self, hidden, mask, pos_idx):
        """Position-aware attention pooling over a session.

        hidden:  (bs, L, em) node embeddings of the session items.
        mask:    (bs, L), non-zero where a position holds a real item.
        pos_idx: (bs, L) integer position ids fed to ``pos_embedding``.

        Returns ``(select, avg)``: the attention-weighted sum of ``hidden``
        and the masked mean of ``hidden``, both of shape (bs, em).
        """
        mask = mask.float().unsqueeze(-1)
        seq_length = hidden.shape[1]
        pos_emb = self.pos_embedding(pos_idx)
        # Clamp the count so a fully padded row averages to zeros instead of
        # 0/0 = NaN; rows with at least one valid position are unaffected.
        valid_count = torch.sum(mask, 1).clamp(min=1.0)
        tmp = torch.sum(hidden * mask, -2) / valid_count
        hs = tmp.unsqueeze(-2).repeat(1, seq_length, 1)
        nh = torch.matmul(torch.cat([pos_emb, hidden], -1), self.w_1)
        nh = torch.tanh(nh)
        nh = torch.sigmoid(self.glu1(nh) + self.glu2(hs))
        beta = torch.matmul(nh, self.w_2)
        beta = beta * mask  # padded positions contribute nothing
        select = torch.sum(beta * hidden, 1)

        return select, tmp

    def sess_user_vector(self, user_vec, note_embeds, mask):
        """User-conditioned attention pooling over a session.

        user_vec:    user embedding; forward() passes it as (bs, 1, em) and it
                     is tiled along dim 1 to the session length.
        note_embeds: (bs, L, em) node embeddings of the session items.
        mask:        (bs, L), non-zero where a position holds a real item.

        Returns the attention-weighted sum of ``note_embeds``, shape (bs, em).
        """
        mask = mask.float().unsqueeze(-1)
        hs = user_vec.repeat(1, mask.shape[1], 1)
        nh = torch.matmul(note_embeds, self.w_3)
        nh = torch.tanh(nh)
        nh = torch.sigmoid(self.glu3(nh) + self.glu4(hs))
        beta = torch.matmul(nh, self.w_4)
        beta = beta * mask  # padded positions contribute nothing
        select = torch.sum(beta * note_embeds, 1)

        return select

    def forward(self, user, seq, mask, seq_len, pos_idx):
        """Score candidates for a batch of sessions.

        user:    (bs, 1) user ids; shifted by ``item_num`` to get node ids.
        seq:     (bs, L) node ids of the items in each session.
        mask:    (bs, L) validity mask for ``seq``.
        seq_len: accepted for interface compatibility; not used here.
        pos_idx: (bs, L) position ids.

        Returns scores of shape (bs, num_nodes - 1); column ``j`` is the dot
        product with embedding row ``j + 1``.
        """
        user = user + self.item_num

        # HG-GNN: one SAGEConv pass over the whole graph. The id tensor is
        # created directly on the embedding's device (no CPU round-trip).
        all_nodes = torch.arange(self.G.number_of_nodes(), device=self.v2e.weight.device)
        h1 = self.conv1(self.G, self.emb_dropout(self.v2e(all_nodes)))
        h1 = F.relu(h1)

        bs = seq.size()[0]
        L = seq.size()[1]

        # Average the propagated and the raw embedding of each node.
        item_embeds = (h1[seq] + self.v2e(seq)) / 2
        user_embeds = (h1[user] + self.v2e(user)) / 2
        node_embeds = item_embeds.view((bs, L, -1))

        sess_vec, avg_sess = self.compute_hidden_vector(node_embeds, mask, pos_idx)
        sess_user = self.sess_user_vector(user_embeds, node_embeds, mask)
        alpha = self.sigmoid_concat(torch.cat([sess_vec, sess_user], 1))
        seq_embeds = (alpha * sess_vec + (1 - alpha) * sess_user)
        item_embs = self.v2e.weight[1:]  # skip embedding row 0
        scores = torch.matmul(seq_embeds, item_embs.permute(1, 0))

        return scores


In [None]:
# Hyper-parameters read by the model constructor and the training loop.
config = dict(
    embed_size=128,
    learning_rate=0.001,
    hidden_size=256,
    batch_size=512,
    epoch=10,
    gnn_layer_size=3,
    patience=5,
    save_flag=0,
    dropout=0.5,
    comment="",
    lr_dc_step=3,
    lr_dc=0.1,
)

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [None]:
# NOTE(review): the graph returned by .to() is only displayed here, not assigned;
# HG_GNN.__init__ performs its own G.to(device). Confirm whether `G = G.to(device)` was intended.
G.to(device)

Graph(num_nodes=39563, num_edges=13460423,
      ndata_schemes={}
      edata_schemes={})

In [None]:
# Instantiate the network, then move its parameters to the selected device.
model = HG_GNN(G, config, item_num, max_seq_len=SEQ_LEN)
model = model.to(device)

### Example of Model Output

In [None]:
# Pull one batch from the training loader to inspect it.
eg = next(iter(train_iter))

In [None]:
# Sizes of the six tensors that make up a batch.
tuple(eg[i].size() for i in range(6))

(torch.Size([512, 1]),
 torch.Size([512, 10]),
 torch.Size([512, 10]),
 torch.Size([512]),
 torch.Size([512, 1]),
 torch.Size([512, 10]))

In [None]:
# Name the six batch fields in the order the training loop unpacks them.
uid, browsed_ids, mask, seq_len, label, pos_idx = (eg[i] for i in range(6))

In [None]:
# Forward pass on the example batch; the label is not a model input.
outputs = model(*(t.to(device) for t in (uid, browsed_ids, mask, seq_len, pos_idx)))

In [None]:
# Compare shapes: one label per row versus one score per embedding row except
# row 0 (forward scores against v2e.weight[1:]).
print(f"Size of Label is {label.size()} & Model Output {outputs.size()} ")

Size of Label is torch.Size([512, 1]) & Model Output torch.Size([512, 39562]) 


#### Training Loop and Evaluation Function, which computes Accuracy, MRR and NDCG for the top 5, 10 and 20 items

In [None]:
def train(config, model, device, train_iter, test_iter=None):
    """Train ``model`` with Adam + StepLR, evaluating after every epoch.

    config: dict providing 'learning_rate', 'lr_dc_step', 'lr_dc', 'epoch'
        and 'patience'.
    model: module called as ``model(uid, browsed_ids, mask, seq_len, pos_idx)``
        and returning one row of class scores per sample.
    device: device the batch tensors are moved to before the forward pass.
    train_iter: iterable of ``(uid, browsed_ids, mask, seq_len, label, pos_idx)``.
    test_iter: optional iterable of the same tuples. When given, the model is
        evaluated with ``evaluate_topk`` after each epoch, a checkpoint is
        written to the module-level ``saved_path`` whenever the returned
        accuracy improves, and training stops after ``config['patience']``
        epochs without improvement. When ``None``, evaluation, checkpointing
        and early stopping are skipped.

    Returns None.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config["lr_dc_step"], gamma=config["lr_dc"])
    criterion = nn.CrossEntropyLoss()

    last_improve = 0
    best_acc = 0
    STEP_SIZE = 2000  # print the running mean loss every STEP_SIZE batches

    for epoch in range(config['epoch']):
        print('Epoch [{}/{}]'.format(epoch + 1, config['epoch']))
        loss_records = []
        # evaluate_topk() leaves the model in eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        for total_batch, (uid, browsed_ids, mask, seq_len, label, pos_idx) in enumerate(train_iter):
            outputs = model(uid.to(device),
                            browsed_ids.to(device),
                            mask.to(device),
                            seq_len.to(device),
                            pos_idx.to(device)
                            )
            # Shift labels by one to index the score columns. reshape(-1)
            # keeps a 1-D target even for a batch of one, where a bare
            # squeeze() would collapse it to a 0-dim tensor.
            target = (label - 1).to(device).reshape(-1)

            optimizer.zero_grad()
            loss = criterion(outputs, target)
            loss_records.append(loss.item())
            loss.backward()
            optimizer.step()

            if total_batch == 0:
                print(f'Initial training Loss is {round(np.mean(loss_records),2)}')

            if (total_batch+1) % STEP_SIZE == 0:
                print(f'Average training Loss till Batch {total_batch+1} is {round(np.mean(loss_records),2)}')

        scheduler.step()

        if test_iter is None:
            # Nothing to evaluate on: no checkpointing or early stopping.
            continue

        print('preformance on test set....')
        acc = evaluate_topk(config, model, test_iter)

        if acc > best_acc:
            best_acc = acc
            last_improve = 0

            torch.save({
                'model' : model.state_dict(),
                'optimizer' : optimizer.state_dict(),
                'scheduler' : scheduler.state_dict(),
                'Accuracy@20' : acc,
                'epoch' : epoch
            }, os.path.join(saved_path, f'{epoch}.pt'))

        else:
            last_improve += 1
            if last_improve >= config['patience']:
                print('Early stop: No more improvement')
                break


def metrics(res, labels):
    """Hit rate, MRR and NDCG for a collection of top-K predictions.

    res: sequence of [batch, K] arrays of predicted ids, best first.
    labels: array of true ids that broadcasts against the concatenated
        predictions (e.g. shape [N, 1]).

    Returns ``(hit_rate, mrr, ndcg)`` averaged over all rows; rows whose label
    is not among the K predictions contribute 0 to each metric.
    """
    predictions = np.concatenate(res)
    matches = (predictions == labels)  # [N, K] boolean hit matrix
    hit_count = matches.sum(-1)

    # 1-based position of the first match; argmax yields 0 for a row without
    # a match, which is harmless because hit_count is 0 there.
    position = np.argmax(matches, -1) + 1
    reciprocal_rank = hit_count / position
    discounted_gain = hit_count / np.log2(position + 1)
    return hit_count.mean(), reciprocal_rank.mean(), discounted_gain.mean()


def evaluate_topk(config, model, data_iter, K=20):
    """Evaluate ``model`` on ``data_iter`` and print hit rate / MRR / NDCG.

    config: not used; kept for call compatibility.
    model: module called as ``model(uid, browsed_ids, mask, seq_len, pos_idx)``;
        switched to eval mode and left in it.
    data_iter: sized iterable of
        ``(uid, browsed_ids, mask, seq_len, label, pos_idx)`` batches.
    K: cut-off for the primary metric. The corresponding line is always
        printed with the label "Top20", whatever K is.

    Uses the module-level ``device`` and ``metrics``. Returns the hit rate at K.
    """
    model.eval()
    res20 = []
    res10 = []
    res5 = []
    labels = []
    # topk() returns indices sorted by descending score, so one call at the
    # deepest cut-off provides every shallower ranking as a prefix.
    depth = max(K, 10)
    with torch.no_grad():
        # Iterating through tqdm advances the progress bar once per batch.
        for uid, browsed_ids, mask, seq_len, label, pos_idx in tqdm(
                data_iter, total=len(data_iter), desc='Predicting', leave=False):
            outputs = model(uid.to(device), browsed_ids.to(device), mask.to(device), seq_len.to(device),
                            pos_idx.to(device)
                            )
            ranked = outputs.topk(depth)[1].cpu()
            res20.append(ranked[:, :K])
            res10.append(ranked[:, :10])
            res5.append(ranked[:, :5])
            labels.append(label)

    labels = np.concatenate(labels)
    # Shift labels by one so they line up with the model's score columns.
    labels = labels - 1

    acc20, mrr20, ndcg20 = metrics(res20, labels)
    acc10, mrr10, ndcg10 = metrics(res10, labels)
    acc5, mrr5, ndcg5 = metrics(res5, labels)

    print("Top20 : acc {} , mrr {}, ndcg {}".format(acc20, mrr20, ndcg20))
    print("Top10 : acc {} , mrr {}, ndcg {}".format(acc10, mrr10, ndcg10))
    print("Top5 : acc {} , mrr {}, ndcg {}".format(acc5, mrr5, ndcg5))

    return acc20

In [None]:
# Train and evaluate on the test loader after every epoch.
train(config, model, device, train_iter, test_iter=test_iter)

Epoch [1/10]
Initial training Loss is 10.6
Average training Loss till Batch 2000 is 8.45
Average training Loss till Batch 4000 is 8.04
preformance on test set....


Predicting:   0%|          | 0/1314 [00:00<?, ?it/s]

Top20 : acc 0.1592845861206376 , mrr 0.0472464727216317, ndcg 0.07149472298517695
Top10 : acc 0.10633497621929999 , mrr 0.043629359340571806, ndcg 0.05817915169210204
Top5 : acc 0.06903095506503627 , mrr 0.03873180506270629, ndcg 0.046197195816210304
Epoch [2/10]
Initial training Loss is 6.92
Average training Loss till Batch 2000 is 7.01
Average training Loss till Batch 4000 is 6.95
preformance on test set....


Predicting:   0%|          | 0/1314 [00:00<?, ?it/s]

Top20 : acc 0.18267117196167723 , mrr 0.05675237284891504, ndcg 0.08408590687710855
Top10 : acc 0.12469326020160798 , mrr 0.05278619593359165, ndcg 0.069498862004407
Top5 : acc 0.08259740568773666 , mrr 0.04724952929383663, ndcg 0.05596772529726891
Epoch [3/10]
Initial training Loss is 6.66


In [None]:
import matplotlib.pyplot as plt



# Hard-coded accuracy values per cut-off (not read from the training run).
data = dict(acc5=0.1105, acc10=0.1598, acc20=0.2249)
accuracy = list(data)
values = [data[key] for key in accuracy]
fig = plt.figure(figsize=(10, 5))

# Bar chart of accuracy at each cut-off.
plt.ylabel('Accuracy', labelpad=50)
plt.bar(accuracy, values, color='maroon', width=0.4)


In [None]:

# Hard-coded MRR values per cut-off (not read from the training run).
data = dict(mrr5=0.06243, mrr10=0.06892, mrr20=0.07339)
mrr = list(data)
values = [data[key] for key in mrr]
fig = plt.figure(figsize=(10, 5))

# Bar chart of MRR at each cut-off.
plt.ylabel('MRR', labelpad=50)
plt.bar(mrr, values, color='blue', width=0.4)

In [None]:

# Hard-coded MRR@20 values (in percent) for each compared model.
# NOTE(review): 'BRP-MF' and 'NextitNext' look like typos for 'BPR-MF' and 'NextItNet' — confirm before changing the labels.
data = {'Item-KNN':4.04,'GRU4Rec':5.40,'FPMC':3.78,'BRP-MF':5.73,'NextitNext':7.08,'NARM':7.19,'HGNN':7.31}
# Named `model_names` rather than `metrics` so this cell does not overwrite the
# metrics() function that evaluate_topk() calls.
model_names = list(data.keys())
values = list(data.values())
fig = plt.figure(figsize = (10, 5))

# creating the bar plot
plt.ylabel('MRR@20', labelpad = 50)
plt.bar(model_names, values, color ='orange',
        width = 0.4)


In [None]:
import matplotlib.pyplot as plt
# Hard-coded nDCG values per cut-off (not read from the training run).
data = dict(ndcg5=0.0751, ndcg10=0.09076, ndcg20=0.10782)
mrr = list(data)
values = [data[key] for key in mrr]
fig = plt.figure(figsize=(10, 5))

# Bar chart of nDCG at each cut-off.
plt.ylabel('nDCG', labelpad=50)
plt.bar(mrr, values, color='red', width=0.4)