In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn import preprocessing

import dgl
import dgl.function as fn
import torch as th
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
from dgl.nn.pytorch.conv import SAGEConv, GATConv
device = 'cuda' if torch.cuda.is_available() else 'cpu'

from settings import file_names

Using backend: pytorch


# Helper Functions

In [30]:
# A helper function to perform each step of training
def train_step(g, features, edges, y, mask):
    # Sets model to TRAIN mode
    model.train()
    # Makes predictions
    y_hat = model(g, features, edges)
    # Computes loss
    loss = loss_fn(y_hat[mask], y[mask])
    # Computes gradients
    loss.backward()
    # Updates parameters and zeroes gradients
    optimizer.step()
    optimizer.zero_grad()
    # Returns the loss
    return loss.item()

# A helper function to perform each step of validation
def val_step(g, features, edges, y, mask):
    # Avoid to compute gradients
    with th.no_grad():
        # Switch to evaluation mode
        model.eval()
        # Makes predictions
        y_hat = model(g, features, edges)
        # Computes loss
        loss = loss_fn(y_hat[mask], y[mask])
        # Returns the loss
        return loss.item()
    
# Evaluate the accuracy on the test set
def accuracy(model, g, features, edges, y, mask):
    model.eval()
    with th.no_grad():

        # Makes predictions
        y_hat = model(g, features, edges).argmax(dim=1).float()
        y = y.float()
            
        # Calculate accuracies
        errors = th.mean(th.abs(y[mask] - y_hat[mask]))
            
        print('Accuracy:', (1 - errors).item())
        
# Generate predictions
def predict(model, g, features, edges):
    model.eval()
    with th.no_grad():

        # Makes predictions
        y_hat = model(g, features, edges).argmax(dim=1).float()
        
        return y_hat.cpu().squeeze(0).detach().numpy() + 1

# 1. Data preparation

## 1.1 Read data

In [3]:
df = pd.read_csv(file_names['toronto_reviews_without_text'])
df_users = pd.read_csv(file_names['toronto_users'])
df_biz = pd.read_csv(file_names['toronto_businesses'])

## 1.2 Encode IDs

In [4]:
le = preprocessing.LabelEncoder()

# Add unique marks so that there wont be users and businesses that share the same ids
df['user_id'] = df['user_id'] + 'U'
df_users['user_id'] = df_users['user_id'] + 'U'
df['business_id'] = df['business_id'] + 'B'
df_biz['business_id'] = df_biz['business_id'] + 'B'

# Fit the encoder
le.fit(list(df['user_id'].unique()) + list(df['business_id'].unique()))

# Encode the review table
df['user_id'] = le.transform(df['user_id'])
df['business_id'] = le.transform(df['business_id'])

# Encode the business table
df_biz = df_biz[df_biz['business_id'].isin(le.classes_)]
df_biz['business_id'] = le.transform(df_biz['business_id'])

# Encode, filter and transform the user table - only friendships between two toronto residents remain
df_users['friends'] = df_users['friends'].str.split(', ')
df_users = df_users.explode('friends')
df_users['friends'] = df_users['friends'] + 'U'
df_users = df_users[df_users['user_id'].isin(le.classes_)]
df_users = df_users[df_users['friends'].isin(le.classes_)]
df_users['user_id'] = le.transform(df_users['user_id'])
df_users['friends'] = le.transform(df_users['friends'])

## 1.3 Node Features - Business

In [5]:
# Select relevant columns
cols = ['business_id', 'latitude', 'longitude', 'stars', 'review_count', 'attributes',
       'categories']
df_biz = df_biz[cols]

In [6]:
# Clean the attributes column - Undone
df_biz['attributes'] = df_biz['attributes'].map(eval, na_action='ignore')
df_biz = df_biz.drop('attributes', axis=1)

In [7]:
# Clean the categories column
df_biz['categories'] = df_biz['categories'].str.split(', ')

# Keep only categories that have at least 100 samples
temp = df_biz.pop('categories').explode()
temp = pd.crosstab(temp.index, temp)
mask = temp.sum(axis=0)
mask = (mask[mask>=100]).index
df_biz = pd.concat([df_biz, temp[mask]], axis=1)

## 1.4 Aggregate user and business features together

In [8]:
# Business Part
features = df_biz.rename(columns={'business_id':'id'})
features['biz'] = True

# User Part
user_list = df_users['user_id'].append(df_users['friends']).unique()
temp = pd.DataFrame({'id':user_list})
temp['biz'] = False

# Put everything together
features = pd.concat([features, temp], axis=0)

# Fill Nan values
features = features.fillna(-999)

# Sort the dataframe so that the row index corresponds to the index of the DGL graph
features = features.sort_values('id')
features = features.set_index('id')

# Normalization
scaler = preprocessing.StandardScaler()
features = scaler.fit_transform(features)
features = th.FloatTensor(features).to(device)

## 1.5 Training and test split

In [9]:
# Separate chronically
data = df.sort_values('date')
edges = data.drop(['date', 'rating'], axis=1).values
labels = data['rating'].values
labels = th.LongTensor(labels).to(device) - 1 #IF I DONT REDUCE IT BY 1, PYTORCH WILL THINK I HAVE 6 CLASSES

mask = np.arange(len(data))
mask_train, mask_val = mask[:int(len(mask)*0.8)], mask[int(len(mask)*0.8):]

print('The size of training data:', len(mask_train))
print('The size of validation data:', len(mask_val))

The size of training data: 183052
The size of validation data: 45764


# 2. Graph Construction

In [10]:
# An indirected user-business graph
G = dgl.DGLGraph().to(device)
G.add_nodes(len(le.classes_))
G.add_edges(data['user_id'].to_list(), data['business_id'].to_list())

# Add friend-friend edges
G.add_edges(df_users['user_id'].to_list(), df_users['friends'].to_list())

# Add edge attributes: -1 denotes a friend-friend relationship, 1-5 denote the rating of a user given to a business
G.edata['y'] = np.array(list(data['rating'].values) + len(df_users['user_id']) * [-1])

# 3. Modeling and Cross Validation

## 3.1 Define the model architecture

In [11]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.gcn1 = GATConv(63, 100, 5, residual=True, activation=F.relu)
        self.gcn2 = GATConv(100, 20, 3, residual=True, activation=F.relu)

        self.fc1 = nn.Linear(40, 10)
        self.fc2 = nn.Linear(10, 5)

    def forward(self, g, features, edges):
        
        # Learning node embeddings
        emb = self.gcn1(g, features)
        emb = emb.max(1)[0]
        emb = self.gcn2(g, emb)
        emb = emb.max(1)[0]       
        
        # Encode nodes
        emb1 = emb[edges[:, 0]]
        emb2 = emb[edges[:, 1]]
        emb_edges = th.cat([emb1, emb2], axis=1)
        
        # Classify edges
        y = th.relu(self.fc1(emb_edges))
        y = self.fc2(y)

        return y

## 3.2 Define hyper parameters

In [12]:
# Model initialization
model = Net().to(device)

# Loss function 
loss_fn = nn.CrossEntropyLoss()

# Optimizer
optimizer = th.optim.Adam(model.parameters(), lr=0.01)

# The number of epochs
n_epochs = 20

## 3.3 Perform training

In [13]:
losses_train = []
losses_val = []

for epoch in range(n_epochs):

    # 1 step of training
    loss_train = train_step(G, features, edges, labels, mask_train)
    losses_train.append(loss_train)
    
    # Keep track of validation loss
    with th.no_grad():
        # 1 step of validation
        loss_val = val_step(G, features, edges, labels, mask_val)
        losses_val.append(loss_val)
    
    # Report losses
    if epoch % 10 == 0:
        print('Epoch {} Training Loss: {}'.format(epoch, loss_train))
        print('Epoch {} Validation Loss: {}'.format(epoch, loss_val))

Epoch 0 Training Loss: 1.8795987367630005
Epoch 0 Validation Loss: 1.7201324701309204
Epoch 10 Training Loss: 1.5969792604446411
Epoch 10 Validation Loss: 1.5950798988342285


In [27]:
print('--Training Phase--')
accuracy(model, G, features, edges, labels, mask_train)
print('--Validation Phase--')
accuracy(model, G, features, edges, labels, mask_val)

--Training Phase--
Accuracy: -0.0782017707824707
--Validation Phase--
Accuracy: -0.14625036716461182
