In [21]:
from tqdm import tqdm 
import os 

import numpy as np 
import pandas as pd

import scipy.sparse  as sp 

from sklearn.model_selection import train_test_split 

import torch 
from torch import nn, optim 
from torch.utils.data import Dataset, DataLoader 
from sklearn.metrics import accuracy_score

In [2]:
class args:
    seed = 42
    num_layers = 3
    batch_size= 512
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    SAVE_PATH = 'Parameters'

In [3]:
d_set = pd.read_csv('ctr_data_1M_sample.csv', encoding='utf-8-sig')

In [4]:
d_train, d_test = train_test_split(d_set, train_size=0.6, random_state=args.seed)
d_valid, d_test = train_test_split(d_test, train_size=0.5, random_state=args.seed)

In [5]:
d_train = d_train.astype({'user_id':'category', 'item_id':'category'})
d_valid = d_valid.astype({'user_id':'category', 'item_id':'category'})
d_test = d_test.astype({'user_id':'category', 'item_id':'category'})

In [6]:
u_cat = d_train.user_id.cat.categories
b_cat = d_train.business_id.cat.categories

In [7]:
d_valid.user_id = d_valid.user_id.cat.set_categories(u_cat)
d_valid.business_id = d_valid.business_id.cat.set_categories(b_cat)

d_test.user_id = d_test.user_id.cat.set_categories(u_cat)
d_test.business_id = d_test.business_id.cat.set_categories(b_cat)

In [8]:
d_train.user_id = d_train.user_id.cat.codes
d_train.business_id = d_train.business_id.cat.codes 

d_valid.user_id = d_valid.user_id.cat.codes
d_valid.business_id = d_valid.business_id.cat.codes 

d_test.user_id = d_test.user_id.cat.codes
d_test.business_id = d_test.business_id.cat.codes 

In [9]:
d_train = d_train.dropna()
d_valid = d_valid.dropna()
d_test = d_test.dropna()

d_train.reset_index(drop=True, inplace=True)
d_valid.reset_index(drop=True, inplace=True)
d_test.reset_index(drop=True, inplace=True)

In [10]:
d_train = d_train.astype({'user_id': int, 'item_id': int})
d_valid = d_valid.astype({'user_id': int, 'item_id': int})
d_test = d_test.astype({'user_id': int, 'item_id': int})

In [11]:
args.num_users = d_train.user_id.max() + 1
args.num_items = d_train.business_id.max() + 1
args.latent_dim = 64
args.num_epochs = 50

In [12]:
class GNNLayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(GNNLayer, self).__init__()
        self.in_feats = in_feats
        self.out_feats = out_feats 

        self.W1 = nn.Linear(in_feats, out_feats)
        self.W2 = nn.Linear(in_feats, out_feats)

    def forward(self, L, SelfLoop, feats):
        # (L+I)EW_1
        sf_L = L + SelfLoop
        L = L.cuda()
        sf_L = sf_L.cuda()
        sf_E = torch.sparse.mm(sf_L, feats)
        left_part = self.W1(sf_E) # left part

        # EL odot EW_2, odot indicates element-wise product 
        LE = torch.sparse.mm(L, feats)
        E = torch.mul(LE, feats)
        right_part = self.W2(E)

        return left_part + right_part 

class NGCF(nn.Module):
    def __init__(self, args, matrix):
        super(NGCF, self).__init__()
        self.num_users = args.num_users 
        self.num_items = args.num_items 
        self.latent_dim = args.latent_dim 
        self.device = args.device

        self.user_emb = nn.Embedding(self.num_users, self.latent_dim)
        self.item_emb = nn.Embedding(self.num_items, self.latent_dim)

        self.num_layers = args.num_layers
        self.L = self.LaplacianMatrix(matrix)
        self.I = self.SelfLoop(self.num_users + self.num_items)

        self.leakyrelu = nn.LeakyReLU()
        self.GNNLayers = nn.ModuleList()

        for i in range(self.num_layers-1):
            self.GNNLayers.append(GNNLayer(self.latent_dim, self.latent_dim))

        self.fc_layer = nn.Sequential(
            nn.Linear(self.latent_dim * self.num_layers * 2, 64), 
            nn.ReLU(), 
            nn.Linear(64, 32), 
            nn.ReLU(), 
            nn.Linear(32, 1)
        )

    def SelfLoop(self, num):
        i = torch.LongTensor([[k for k in range(0, num)], [j for j in range(0, num)]])
        val = torch.FloatTensor([1]*num)
        return torch.sparse.FloatTensor(i, val)

    def LaplacianMatrix(self, ratings):
        iids = ratings['item_id'] + self.num_users 
        matrix = sp.coo_matrix((ratings['click'], (ratings['user_id'], ratings['item_id'])))
        
        upper_matrix = sp.coo_matrix((ratings['click'], (ratings['user_id'], iids)))
        lower_matrix = matrix.transpose()
        lower_matrix.resize((self.num_items, self.num_users + self.num_items))

        A = sp.vstack([upper_matrix, lower_matrix])
        row_sum = (A > 0).sum(axis=1)
        # row_sum = np.array(row_sum).flatten()
        diag = list(np.array(row_sum.flatten())[0])
        D = np.power(diag, -0.5)
        D = sp.diags(D)
        L = D * A * D
        L = sp.coo_matrix(L)
        row = L.row 
        col = L.col
        idx = np.stack([row, col])
        idx = torch.LongTensor(idx)
        data = torch.FloatTensor(L.data)
        SparseL = torch.sparse.FloatTensor(idx, data)
        return SparseL 

    def FeatureMatrix(self):
        uids = torch.LongTensor([i for i in range(self.num_users)]).to(self.device)
        iids = torch.LongTensor([i for i in range(self.num_items)]).to(self.device)
        user_emb = self.user_emb(uids)
        item_emb = self.item_emb(iids)
        features = torch.cat([user_emb, item_emb], dim=0)
        return features

    def forward(self, uids, iids):
        iids = self.num_users + iids 

        features = self.FeatureMatrix()
        final_emb = features.clone()

        for gnn in self.GNNLayers:
            features = gnn(self.L, self.I, features)
            features = self.leakyrelu(features)
            final_emb = torch.concat([final_emb, features],dim=-1)

        user_emb = final_emb[uids]
        item_emb = final_emb[iids]

        inputs = torch.concat([user_emb, item_emb], dim=-1)
        outs = self.fc_layer(inputs)
        return outs.flatten()

In [13]:
class GraphDataset(Dataset):
    def __init__(self, dataframe):
        super(Dataset, self).__init__()
        
        self.uid = list(dataframe['user_id'])
        self.iid = list(dataframe['business_id'])
        self.ratings = list(dataframe['click'])
    
    def __len__(self):
        return len(self.uid)
    
    def __getitem__(self, idx):
        uid = self.uid[idx]
        iid = self.iid[idx]
        rating = self.ratings[idx]
        
        return (uid, iid, rating)

In [14]:
def get_loader(args, dataset, num_workers):
    d_set = GraphDataset(dataset)
    return DataLoader(d_set, batch_size=args.batch_size, num_workers=num_workers)

In [15]:
train_loader = get_loader(args, d_train, 4)
valid_loader = get_loader(args, d_valid, 4)
test_loader = get_loader(args, d_test, 4)

In [35]:
def graph_evaluate(args, model, test_loader, criterion):
    output = []
    test_loss = 0

    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='evaluating...'):
            batch = tuple(b.to(args.device) for b in batch)
            inputs = {'uids':   batch[0], 
                      'iids':   batch[1]}
            gold_y = batch[2].float()
            
            pred_y = model(**inputs)
            output.append(pred_y)
            
            loss = criterion(pred_y, gold_y)
            loss = torch.sqrt(loss)
            test_loss += loss.item()
    test_loss /= len(test_loader)
    print(f'Test Loss: {test_loss:.4f}')
    return test_loss, output

In [28]:
def graph_train(args, model, train_loader, valid_loader, optimizer, criterion):
    best_loss = float('inf')
    train_losses, valid_losses = [], []
    for epoch in range(1, args.num_epochs + 1):
        train_loss = 0.0

        model.train()
        for batch in tqdm(train_loader, desc='training...'):
            batch = tuple(b.to(args.device) for b in batch)
            inputs = {'uids':   batch[0], 
                      'iids':   batch[1]}
            
            gold_y = batch[2].float()
            

            pred_y = model(**inputs)
            
            loss = criterion(pred_y, gold_y)
            loss = torch.sqrt(loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        valid_loss , outputs = graph_evaluate(args, model, valid_loader, criterion)
        valid_losses.append(valid_loss)
        

        print(f'Epoch: [{epoch}/{args.num_epochs}]')
        print(f'Train Loss: {train_loss:.4f}\tValid Loss: {valid_loss:.4f}')

        if best_loss > valid_loss:
            best_loss = valid_loss
            if not os.path.exists(args.SAVE_PATH):
                os.makedirs(args.SAVE_PATH)
            torch.save(model.state_dict(), os.path.join(args.SAVE_PATH, f'{model._get_name()}_parameters.pt'))

    return {
        'train_loss': train_losses, 
        'valid_loss': valid_losses
    }, outputs

In [29]:
models = NGCF(args, d_train).to(args.device)

optimizer = optim.Adam(models.parameters(), lr = 1e-3)
criterion = nn.L1Loss()

In [30]:
results = graph_train(args, models, train_loader, valid_loader, optimizer, criterion)

training...: 100%|██████████| 59/59 [00:16<00:00,  3.60it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.34it/s]


Epoch: [1/50]
Train Loss: 1.0726	Valid Loss: 1.1038


training...: 100%|██████████| 59/59 [00:16<00:00,  3.58it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.41it/s]


Epoch: [2/50]
Train Loss: 0.8382	Valid Loss: 1.0829


training...: 100%|██████████| 59/59 [00:16<00:00,  3.52it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.34it/s]


Epoch: [3/50]
Train Loss: 0.8076	Valid Loss: 1.0725


training...: 100%|██████████| 59/59 [00:16<00:00,  3.57it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.34it/s]


Epoch: [4/50]
Train Loss: 0.7908	Valid Loss: 1.1175


training...: 100%|██████████| 59/59 [00:16<00:00,  3.57it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.53it/s]


Epoch: [5/50]
Train Loss: 0.7767	Valid Loss: 1.1211


training...: 100%|██████████| 59/59 [00:16<00:00,  3.65it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.53it/s]


Epoch: [6/50]
Train Loss: 0.7662	Valid Loss: 1.1060


training...: 100%|██████████| 59/59 [00:16<00:00,  3.62it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.21it/s]


Epoch: [7/50]
Train Loss: 0.7536	Valid Loss: 1.1010


training...: 100%|██████████| 59/59 [00:16<00:00,  3.57it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.38it/s]


Epoch: [8/50]
Train Loss: 0.7598	Valid Loss: 1.1177


training...: 100%|██████████| 59/59 [00:16<00:00,  3.59it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.31it/s]


Epoch: [9/50]
Train Loss: 0.7361	Valid Loss: 1.2078


training...: 100%|██████████| 59/59 [00:16<00:00,  3.66it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.34it/s]


Epoch: [10/50]
Train Loss: 0.7070	Valid Loss: 1.2295


training...: 100%|██████████| 59/59 [00:16<00:00,  3.60it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.41it/s]


Epoch: [11/50]
Train Loss: 0.6989	Valid Loss: 1.1821


training...: 100%|██████████| 59/59 [00:16<00:00,  3.63it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.37it/s]


Epoch: [12/50]
Train Loss: 0.7134	Valid Loss: 1.1720


training...: 100%|██████████| 59/59 [00:16<00:00,  3.60it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.38it/s]


Epoch: [13/50]
Train Loss: 0.7078	Valid Loss: 1.2247


training...: 100%|██████████| 59/59 [00:16<00:00,  3.66it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.34it/s]


Epoch: [14/50]
Train Loss: 0.6758	Valid Loss: 1.1600


training...: 100%|██████████| 59/59 [00:16<00:00,  3.57it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.35it/s]


Epoch: [15/50]
Train Loss: 0.6739	Valid Loss: 1.1414


training...: 100%|██████████| 59/59 [00:16<00:00,  3.60it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.27it/s]


Epoch: [16/50]
Train Loss: 0.6991	Valid Loss: 1.2335


training...: 100%|██████████| 59/59 [00:16<00:00,  3.61it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.33it/s]


Epoch: [17/50]
Train Loss: 0.6714	Valid Loss: 1.3044


training...: 100%|██████████| 59/59 [00:16<00:00,  3.55it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.41it/s]


Epoch: [18/50]
Train Loss: 0.6376	Valid Loss: 1.3136


training...: 100%|██████████| 59/59 [00:15<00:00,  3.70it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.29it/s]


Epoch: [19/50]
Train Loss: 0.6016	Valid Loss: 1.3371


training...: 100%|██████████| 59/59 [00:15<00:00,  3.73it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.47it/s]


Epoch: [20/50]
Train Loss: 0.6532	Valid Loss: 1.3117


training...: 100%|██████████| 59/59 [00:16<00:00,  3.62it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.34it/s]


Epoch: [21/50]
Train Loss: 0.6467	Valid Loss: 1.2813


training...: 100%|██████████| 59/59 [00:16<00:00,  3.62it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.45it/s]


Epoch: [22/50]
Train Loss: 0.6415	Valid Loss: 1.2959


training...: 100%|██████████| 59/59 [00:15<00:00,  3.73it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.46it/s]


Epoch: [23/50]
Train Loss: 0.6651	Valid Loss: 1.1913


training...: 100%|██████████| 59/59 [00:16<00:00,  3.64it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.39it/s]


Epoch: [24/50]
Train Loss: 0.6558	Valid Loss: 1.2088


training...: 100%|██████████| 59/59 [00:16<00:00,  3.62it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.39it/s]


Epoch: [25/50]
Train Loss: 0.6284	Valid Loss: 1.2425


training...: 100%|██████████| 59/59 [00:16<00:00,  3.67it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.45it/s]


Epoch: [26/50]
Train Loss: 0.6127	Valid Loss: 1.3183


training...: 100%|██████████| 59/59 [00:16<00:00,  3.59it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.39it/s]


Epoch: [27/50]
Train Loss: 0.6138	Valid Loss: 1.3811


training...: 100%|██████████| 59/59 [00:16<00:00,  3.65it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.27it/s]


Epoch: [28/50]
Train Loss: 0.6160	Valid Loss: 1.3640


training...: 100%|██████████| 59/59 [00:16<00:00,  3.68it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.33it/s]


Epoch: [29/50]
Train Loss: 0.6117	Valid Loss: 1.3171


training...: 100%|██████████| 59/59 [00:15<00:00,  3.69it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.58it/s]


Epoch: [30/50]
Train Loss: 0.6065	Valid Loss: 1.2582


training...: 100%|██████████| 59/59 [00:16<00:00,  3.62it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.42it/s]


Epoch: [31/50]
Train Loss: 0.5888	Valid Loss: 1.2218


training...: 100%|██████████| 59/59 [00:16<00:00,  3.69it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.36it/s]


Epoch: [32/50]
Train Loss: 0.5966	Valid Loss: 1.2557


training...: 100%|██████████| 59/59 [00:16<00:00,  3.65it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.34it/s]


Epoch: [33/50]
Train Loss: 0.5855	Valid Loss: 1.2473


training...: 100%|██████████| 59/59 [00:16<00:00,  3.51it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.35it/s]


Epoch: [34/50]
Train Loss: 0.6047	Valid Loss: 1.1887


training...: 100%|██████████| 59/59 [00:16<00:00,  3.62it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.33it/s]


Epoch: [35/50]
Train Loss: 0.5436	Valid Loss: 1.2267


training...: 100%|██████████| 59/59 [00:15<00:00,  3.84it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.17it/s]


Epoch: [36/50]
Train Loss: 0.4763	Valid Loss: 1.2444


training...: 100%|██████████| 59/59 [00:16<00:00,  3.63it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.45it/s]


Epoch: [37/50]
Train Loss: 0.4611	Valid Loss: 1.2241


training...: 100%|██████████| 59/59 [00:16<00:00,  3.63it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.33it/s]


Epoch: [38/50]
Train Loss: 0.4726	Valid Loss: 1.2062


training...: 100%|██████████| 59/59 [00:15<00:00,  3.71it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.28it/s]


Epoch: [39/50]
Train Loss: 0.5540	Valid Loss: 1.1978


training...: 100%|██████████| 59/59 [00:16<00:00,  3.67it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.44it/s]


Epoch: [40/50]
Train Loss: 0.5412	Valid Loss: 1.1936


training...: 100%|██████████| 59/59 [00:16<00:00,  3.60it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.29it/s]


Epoch: [41/50]
Train Loss: 0.5073	Valid Loss: 1.2098


training...: 100%|██████████| 59/59 [00:16<00:00,  3.62it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.33it/s]


Epoch: [42/50]
Train Loss: 0.5472	Valid Loss: 1.2192


training...: 100%|██████████| 59/59 [00:16<00:00,  3.61it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.26it/s]


Epoch: [43/50]
Train Loss: 0.4687	Valid Loss: 1.2083


training...: 100%|██████████| 59/59 [00:16<00:00,  3.63it/s]
evaluating...: 100%|██████████| 20/20 [00:06<00:00,  3.25it/s]


Epoch: [44/50]
Train Loss: 0.5626	Valid Loss: 1.2790


training...: 100%|██████████| 59/59 [00:16<00:00,  3.64it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.40it/s]


Epoch: [45/50]
Train Loss: 0.6372	Valid Loss: 1.2914


training...: 100%|██████████| 59/59 [00:16<00:00,  3.58it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.34it/s]


Epoch: [46/50]
Train Loss: 0.6244	Valid Loss: 1.2812


training...: 100%|██████████| 59/59 [00:16<00:00,  3.67it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.45it/s]


Epoch: [47/50]
Train Loss: 0.5442	Valid Loss: 1.2597


training...: 100%|██████████| 59/59 [00:16<00:00,  3.62it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.37it/s]


Epoch: [48/50]
Train Loss: 0.5128	Valid Loss: 1.2910


training...: 100%|██████████| 59/59 [00:15<00:00,  3.75it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.51it/s]


Epoch: [49/50]
Train Loss: 0.5396	Valid Loss: 1.3129


training...: 100%|██████████| 59/59 [00:16<00:00,  3.66it/s]
evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.33it/s]

Epoch: [50/50]
Train Loss: 0.5347	Valid Loss: 1.2716





In [36]:
results =graph_evaluate(args, models, test_loader, criterion)

evaluating...: 100%|██████████| 20/20 [00:05<00:00,  3.63it/s]

Test Loss: 1.2731



