In [183]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

import dgl
import dgl.function as fn
import torch as th
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
from dgl.nn.pytorch.conv import SAGEConv, GATConv
device = 'cuda' if torch.cuda.is_available() else 'cpu'

from settings import file_names

# Helper Functions

In [184]:
# Run a single optimisation step on the edges selected by `mask`
def train_step(g, features, edges, y, mask):
    """Perform one training update and return the scalar loss.

    Uses the notebook-level globals `model`, `loss_fn` and `optimizer`.

    Args:
        g: graph passed through to `model`.
        features: node feature matrix passed through to `model`.
        edges: edge list passed through to `model`.
        y: target labels, indexed by `mask`.
        mask: indices of the rows that contribute to the loss.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()

    # Forward pass over every edge; only the masked rows enter the loss
    logits = model(g, features, edges)
    step_loss = loss_fn(logits[mask], y[mask])

    # Backward pass, parameter update, then clear the gradients so the
    # next call starts from a clean slate
    step_loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    return step_loss.item()

# Compute the loss on the edges selected by `mask` without updating the model
def val_step(g, features, edges, y, mask):
    """Return the scalar loss on the masked edges with gradients disabled.

    Uses the notebook-level globals `model` and `loss_fn`. The model is left
    in evaluation mode afterwards.

    Args:
        g: graph passed through to `model`.
        features: node feature matrix passed through to `model`.
        edges: edge list passed through to `model`.
        y: target labels, indexed by `mask`.
        mask: indices of the rows that contribute to the loss.

    Returns:
        The loss as a Python float.
    """
    model.eval()
    with th.no_grad():
        logits = model(g, features, edges)
        return loss_fn(logits[mask], y[mask]).item()
    
# Evaluate the accuracy on the edges selected by `mask`
def accuracy(model, g, features, edges, y, mask):
    """Print the classification accuracy restricted to the masked edges.

    Args:
        model: network called as `model(g, features, edges)`; its output is
            arg-maxed over dim 1 to obtain the predicted class per edge.
        g: graph passed through to `model`.
        features: node feature matrix passed through to `model`.
        edges: edge list passed through to `model`.
        y: true class index per edge.
        mask: indices of the edges to score (e.g. the training or the
            validation split). Previously this argument was ignored, so every
            call reported the accuracy over all edges.

    Returns:
        None. The score is printed as ``Accuracy: <value>``.
    """
    model.eval()
    with th.no_grad():
        # Predict every edge, then keep only the requested subset
        y_pred = model(g, features, edges).argmax(dim=1)[mask].float().cpu().numpy()
        y_true = y[mask].float().cpu().numpy()

        # accuracy_score expects (y_true, y_pred)
        print('Accuracy:', accuracy_score(y_true, y_pred))
        
# Turn model outputs into rating predictions
def predict(model, g, features, edges):
    """Return the predicted rating for every edge as a NumPy array.

    The model output is arg-maxed over dim 1 and shifted by +1, undoing the
    `- 1` applied to the labels before training.

    Args:
        model: network called as `model(g, features, edges)`.
        g: graph passed through to `model`.
        features: node feature matrix passed through to `model`.
        edges: edge list passed through to `model`.

    Returns:
        Float NumPy array of predictions; a leading dimension of size 1 is
        squeezed away.
    """
    model.eval()
    with th.no_grad():
        logits = model(g, features, edges)
        class_idx = logits.argmax(dim=1).float()

    ratings = class_idx.cpu().squeeze(0).detach().numpy()
    return ratings + 1

# 1. Data preparation

## 1.1 Read data

In [185]:
# Load the review, user and business tables; the CSV paths come from
# `settings.file_names`
df, df_users, df_biz = (
    pd.read_csv(file_names[key])
    for key in (
        'toronto_reviews_without_text',
        'toronto_users',
        'toronto_businesses',
    )
)

## 1.2 Encode IDs

In [186]:
le = preprocessing.LabelEncoder()

# Tag every raw id with its entity type ('U' = user, 'B' = business) so that a
# user and a business can never collapse onto the same encoded id
for frame, column, tag in (
    (df, 'user_id', 'U'),
    (df_users, 'user_id', 'U'),
    (df, 'business_id', 'B'),
    (df_biz, 'business_id', 'B'),
):
    frame[column] = frame[column] + tag

# The encoder vocabulary is every user and business that appears in a review
le.fit([*df['user_id'].unique(), *df['business_id'].unique()])

# Review table: replace both id columns by their integer codes
for column in ('user_id', 'business_id'):
    df[column] = le.transform(df[column])

# Business table: keep only businesses known to the encoder, then encode
df_biz = (
    df_biz[df_biz['business_id'].isin(le.classes_)]
    .assign(business_id=lambda kept: le.transform(kept['business_id']))
)

# User table: one row per (user, friend) pair, with the friend id tagged like
# the user ids above
df_users = (
    df_users
    .assign(friends=df_users['friends'].str.split(', '))
    .explode('friends')
    .assign(friends=lambda pairs: pairs['friends'] + 'U')
)

# Only friendships whose two endpoints are both known to the encoder remain
known = df_users['user_id'].isin(le.classes_) & df_users['friends'].isin(le.classes_)
df_users = df_users[known].assign(
    user_id=lambda pairs: le.transform(pairs['user_id']),
    friends=lambda pairs: le.transform(pairs['friends']),
)

## 1.3 Node Features - Business

In [187]:
# Keep only the business columns that feed the node features
cols = [
    'business_id',
    'latitude', 'longitude',
    'stars', 'review_count',
    'attributes', 'categories',
]
df_biz = df_biz.loc[:, cols]

In [188]:
# The attributes column is not used as a feature yet, so it is simply dropped.
# The previous version first ran `eval` on every raw CSV string and then threw
# the result away: that was wasted work and executed arbitrary text from the
# data file. If attribute parsing is picked up again, parse the strings with
# `ast.literal_eval` rather than `eval`.
df_biz = df_biz.drop('attributes', axis=1)

In [189]:
# Turn the comma-separated categories string into one indicator column per
# category. `pop` removes the raw column from df_biz in the same step.
temp = df_biz.pop('categories').str.split(', ').explode()

# Rows: original df_biz index, columns: category, values: occurrence counts
temp = pd.crosstab(temp.index, temp)

# Keep only categories that have at least 200 samples
category_counts = temp.sum(axis=0)
mask = category_counts[category_counts >= 200].index

# Align on the index; businesses without categories get NaN here
df_biz = pd.concat([df_biz, temp[mask]], axis=1)

## 1.4 Aggregate user and business features together

In [191]:
# Business part: one row per business, flagged with biz=True
features = df_biz.rename(columns={'business_id': 'id'})
features['biz'] = True

# User part: every user needs a row, not only those with a surviving
# friendship, otherwise the feature rows no longer line up with the node ids.
# `pd.concat` replaces `Series.append`, which was removed in pandas 2.0.
user_list = pd.concat(
    [df['user_id'], df_users['user_id'], df_users['friends']]
).unique()
temp = pd.DataFrame({'id': user_list})
temp['biz'] = False

# Put everything together
features = pd.concat([features, temp], axis=0)

# Users have no business attributes (and some businesses no categories):
# fill those gaps with 0
features = features.fillna(0)

# Sort the dataframe so that the row index corresponds to the index of the DGL graph
features = features.sort_values('id')

# The graph is built with len(le.classes_) nodes; fail early with a clear
# message if the feature table does not provide exactly one row per node
if len(features) != len(le.classes_):
    raise ValueError(
        'Feature table has {} rows but the encoder knows {} nodes'.format(
            len(features), len(le.classes_)
        )
    )

features = features.set_index('id').values

# Booleans and ints become floats so the scaler sees one numeric dtype
features = features.astype(float)

# Normalization
scaler = preprocessing.StandardScaler()
features = scaler.fit_transform(features)
features = th.FloatTensor(features).to(device)

## 1.5 Training and validation split

In [192]:
# Split chronologically: order the reviews by date so that the last 20% form
# the validation set
data = df.sort_values('date')

# Ratings are shifted by -1 because CrossEntropyLoss expects class indices
# that start at 0; without the shift PyTorch would assume 6 classes
labels = th.LongTensor(data['rating'].values).to(device) - 1

# Whatever columns remain after removing date and rating describe the edge
edges = data.drop(['date', 'rating'], axis=1).values

# Positional indices into `data`: first 80% for training, the rest for validation
mask = np.arange(len(data))
split_at = int(len(mask) * 0.8)
mask_train, mask_val = np.split(mask, [split_at])

print('The size of training data:', len(mask_train))
print('The size of validation data:', len(mask_val))

The size of training data: 183052
The size of validation data: 45764


# 2. Graph Construction

In [214]:
# A user-business graph with one node per encoded id.
# NOTE(review): each review is added once, as an edge from the user to the
# business; no reverse edge is added here, so the graph is only symmetric if
# that is handled elsewhere - confirm this is intended.
# NOTE(review): validation reviews are still added as edges below (only their
# label is hidden) - confirm this is the intended evaluation protocol.
G = dgl.DGLGraph().to(device)
G.add_nodes(len(le.classes_))
G.add_edges(data['user_id'].to_list(), data['business_id'].to_list())

# Add friend-friend edges (appended after all review edges)
G.add_edges(df_users['user_id'].to_list(), df_users['friends'].to_list())

# Edge ids 0 .. n_reviews-1 are reviews, the remaining ones are friendships
n_reviews = len(data)
n_friendships = len(df_users['user_id'])

# Add edge attributes: -1 denotes a friend-friend relationship or a rating in the val set
# 1-5 denote the rating of a user given to a business
G.edata['y'] = np.array(list(data['rating'].values) + n_friendships * [-1])
G.edata['y'][mask_val]= -1

# 1 if in the val set
G.edata['is_val'] = np.zeros(G.edata['y'].shape)
G.edata['is_val'][mask_val] = 1

# 1 if it is user-to-user friendship.
# Slice from n_reviews instead of from -n_friendships: with zero friendships
# the old `[-0:]` slice selected every edge and flagged all reviews as
# friendships.
G.edata['is_fr'] = np.zeros(G.edata['y'].shape)
G.edata['is_fr'][n_reviews:] = 1

# 3. Modeling and Cross Validation

## 3.1 Define the model architecture

In [222]:
class Net(nn.Module):
    """Edge classifier: four GATConv layers followed by a three-layer MLP.

    Node embeddings are computed by the GATConv stack; each edge is then
    represented by the embeddings of its two endpoints plus one raw feature
    of the second endpoint, and classified by the MLP.

    Args:
        in_feats: width of the node feature matrix (default 40, the value
            that was previously hard-coded).
        n_classes: number of output classes (default 5).
        skip_col: column of `features` that is appended to the edge
            representation as a skip connection (default 2).
    """

    def __init__(self, in_feats=40, n_classes=5, skip_col=2):
        super().__init__()
        self.skip_col = skip_col

        # Attribute names and construction order are kept so that existing
        # state dicts and seeded initialisations still match
        self.gcn1 = GATConv(in_feats, 100, 5, residual=True, activation=F.relu)
        self.gcn2 = GATConv(100, 100, 3, residual=True, activation=F.relu)
        self.gcn3 = GATConv(100, 50, 3, residual=True, activation=F.relu)
        self.gcn4 = GATConv(50, 50, 3, residual=True, activation=F.relu)

        # Two 50-d endpoint embeddings + 1 skip feature = 101 inputs
        self.fc1 = nn.Linear(2 * 50 + 1, 50)
        self.fc2 = nn.Linear(50, 10)
        self.fc3 = nn.Linear(10, n_classes)

    def forward(self, g, features, edges):
        """Return one row of class scores per row of `edges`.

        Args:
            g: graph passed to every GATConv layer.
            features: node feature matrix, one row per node.
            edges: 2-column index array; column 0 and column 1 select the
                two endpoint nodes of each edge.
        """
        # Learning node embeddings. After every layer the maximum over dim 1
        # is taken (the attention-head axis according to the DGL docs).
        emb = features
        for conv in (self.gcn1, self.gcn2, self.gcn3, self.gcn4):
            emb = conv(g, emb).max(1)[0]

        # Encode edges from their endpoints
        emb1 = emb[edges[:, 0]]
        emb2 = emb[edges[:, 1]]
        # Skip connection: one raw feature of the second endpoint
        fea = features[edges[:, 1], self.skip_col].view(-1, 1)
        emb_edges = th.cat([emb1, emb2, fea], dim=1)

        # Classify edges
        y = th.relu(self.fc1(emb_edges))
        y = th.relu(self.fc2(y))
        y = self.fc3(y)

        return y

## 3.2 Define hyper parameters

In [223]:
# Number of passes over the training edges
n_epochs = 1000

# Instantiate the network and move it to the selected device
model = Net().to(device)

# Adam over all trainable parameters of the network
optimizer = th.optim.Adam(params=model.parameters(), lr=1e-3)

# Multi-class objective on the raw scores returned by the network
loss_fn = nn.CrossEntropyLoss()

## 3.3 Perform training

In [None]:
# Per-epoch loss histories
losses_train, losses_val = [], []

for epoch in range(n_epochs):

    # One optimisation step on the training edges
    loss_train = train_step(G, features, edges, labels, mask_train)
    losses_train.append(loss_train)

    # Loss on the validation edges, computed without tracking gradients
    with th.no_grad():
        loss_val = val_step(G, features, edges, labels, mask_val)
        losses_val.append(loss_val)

    # Only report every 50th epoch
    if epoch % 50:
        continue
    print('Epoch {} Training Loss: {}'.format(epoch, loss_train))
    print('Epoch {} Validation Loss: {}'.format(epoch, loss_val))

Epoch 0 Training Loss: 1.6897279024124146
Epoch 0 Validation Loss: 1.606776237487793
Epoch 50 Training Loss: 1.4374487400054932
Epoch 50 Validation Loss: 1.426090955734253
Epoch 100 Training Loss: 1.4047703742980957
Epoch 100 Validation Loss: 1.4044166803359985
Epoch 150 Training Loss: 1.3724913597106934
Epoch 150 Validation Loss: 1.370019555091858
Epoch 200 Training Loss: 1.3633476495742798
Epoch 200 Validation Loss: 1.3601343631744385
Epoch 250 Training Loss: 1.3602571487426758
Epoch 250 Validation Loss: 1.3581185340881348
Epoch 300 Training Loss: 1.3582323789596558
Epoch 300 Validation Loss: 1.3574138879776
Epoch 350 Training Loss: 1.3611218929290771
Epoch 350 Validation Loss: 1.3598333597183228
Epoch 400 Training Loss: 1.3563538789749146
Epoch 400 Validation Loss: 1.3564337491989136
Epoch 450 Training Loss: 1.3554631471633911
Epoch 450 Validation Loss: 1.3564153909683228
Epoch 500 Training Loss: 1.355350375175476
Epoch 500 Validation Loss: 1.3557368516921997
Epoch 550 Training Loss

In [None]:
# Report the accuracy once per split, passing that split's mask
for phase_title, phase_mask in (
    ('--Training Phase--', mask_train),
    ('--Validation Phase--', mask_val),
):
    print(phase_title)
    accuracy(model, G, features, edges, labels, phase_mask)