In [8]:
import os

import pandas as pd
import numpy as np
import networkx as nx
from sklearn import preprocessing

import dgl
import dgl.function as fn
import torch as th
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
from dgl.nn.pytorch.conv import SAGEConv, GATConv

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Helper Functions

In [9]:
# Run one optimisation step and report its loss
def train_step(g, features, edges, y, mask):
    """Perform a single gradient update using the entries selected by ``mask``.

    Uses the notebook-level ``model``, ``loss_fn`` and ``optimizer``.

    Args:
        g: Graph, passed straight through to ``model``.
        features: Node features, passed straight through to ``model``.
        edges: Node pairs, passed straight through to ``model``.
        y: Labels; cast to float and reshaped to a single column.
        mask: Index applied to both predictions and labels before the loss.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()

    targets = y.float().view(-1, 1)
    predictions = model(g, features, edges)
    step_loss = loss_fn(predictions[mask], targets[mask])

    step_loss.backward()
    optimizer.step()
    # Gradients are cleared after the update so the next call starts from zero
    optimizer.zero_grad()

    return step_loss.item()

# Compute the loss on held-out entries without touching the parameters
def val_step(g, features, edges, y, mask):
    """Evaluate the loss on the entries selected by ``mask``.

    Uses the notebook-level ``model`` and ``loss_fn``. The model is switched to
    evaluation mode and no gradients are recorded.

    Args:
        g: Graph, passed straight through to ``model``.
        features: Node features, passed straight through to ``model``.
        edges: Node pairs, passed straight through to ``model``.
        y: Labels; cast to float and reshaped to a single column.
        mask: Index applied to both predictions and labels before the loss.

    Returns:
        The loss as a Python float.
    """
    model.eval()
    targets = y.float().view(-1, 1)

    with th.no_grad():
        predictions = model(g, features, edges)
        held_out_loss = loss_fn(predictions[mask], targets[mask])

    return held_out_loss.item()

In [10]:
# Print the accuracy on the entries selected by a mask
def accuracy(model, g, features, edges, y, mask):
    """Print the classification accuracy of ``model`` on the masked entries.

    Scores strictly above 0.5 are class 1 and everything else is class 0, so a
    score of exactly 0.5 is a class-0 prediction (the same outcome ``predict``
    gives it) instead of being left at 0.5 and counted as half an error.

    Args:
        model: Module called as ``model(g, features, edges)``; put in eval mode.
        g: Graph, passed straight through to ``model``.
        features: Node features, passed straight through to ``model``.
        edges: Node pairs, passed straight through to ``model``.
        y: Labels; cast to float and reshaped to a single column.
        mask: Index applied to both predictions and labels before scoring.

    Returns:
        None. The accuracy is printed as ``Accuracy: <value>``.
    """
    model.eval()
    with th.no_grad():
        scores = model(g, features, edges)
        # Threshold into a new tensor: the model output is left untouched and
        # every score ends up as exactly 0 or 1
        y_pred = (scores > 0.5).float()

        # Mean absolute difference between labels and hard predictions
        y = y.float().view(-1, 1)
        errors = th.mean(th.abs(y[mask] - y_pred[mask]))

    print('Accuracy:', (1 - errors).item())
        
# Generate hard (boolean) predictions
def predict(model, g, features, edges):
    """Return a boolean prediction for every row of ``edges``.

    A prediction is True when the model's score is strictly greater than 0.5.
    The model output is not modified; the result is a fresh array.

    Args:
        model: Module called as ``model(g, features, edges)``; put in eval mode.
        g: Graph, passed straight through to ``model``.
        features: Node features, passed straight through to ``model``.
        edges: Node pairs, passed straight through to ``model``.

    Returns:
        A NumPy boolean array with the size-1 second dimension of the model
        output squeezed away.
    """
    model.eval()
    with th.no_grad():
        scores = model(g, features, edges)

    # One comparison replaces the in-place 0/1 overwrite followed by `== 1`
    return (scores > 0.5).squeeze(1).cpu().numpy()

# 1. Data preparation

## 1.1 Read data

In [11]:
# The pair files have no header row and use a single space as separator
pair_file_opts = dict(header=None, sep=' ')

# Training pairs carry a label column Y; test pairs do not
df_train = pd.read_csv('training_set.txt', names=['X1', 'X2', 'Y'], **pair_file_opts)
df_test = pd.read_csv('testing_set.txt', names=['X1', 'X2'], **pair_file_opts)

# Per-node feature table
df_features = pd.read_csv('df_features.csv')

## 1.2 Re-index IDs to consecutive integers starting from 0

In [12]:
# Use each node's row position in the feature table as its new ID
df_features = df_features.reset_index().rename(columns={'index': 'new_id', 'ID': 'old_id'})

# Lookup table: original ID -> new consecutive ID
old_2_new = df_features.set_index('old_id')['new_id'].to_dict()

# Translate both endpoints of every pair in the training and test sets.
# NOTE(review): an ID missing from the feature table maps to NaN here - confirm
# that every ID in the pair files appears in df_features.
for df_pairs in (df_train, df_test):
    for endpoint in ('X1', 'X2'):
        df_pairs[endpoint] = df_pairs[endpoint].map(old_2_new)

## 1.3 Split the data

In [13]:
# Separate the labels from the node pairs; the test pairs have no label column
y = df_train['Y']
X = df_train.drop(columns='Y')
X_test = df_test

# 2. Graph Construction

## 2.1 Construct a graph in DGL format

In [14]:
# Build a directed graph in which node i is row i of df_features.
# Every node is registered before any edge is added, so nodes without a
# positive pair keep their own ID; nothing depends on a hand-counted number of
# isolated nodes being appended at the end.
# NOTE(review): all positive pairs are used here, including the ones that later
# land in the validation split, so those links are visible to message passing.
pos_edges = X[y == 1]
nxg = nx.DiGraph()
nxg.add_nodes_from(range(len(df_features)))
nxg.add_edges_from(zip(pos_edges['X1'], pos_edges['X2']))

# Put into DGL format
G = dgl.DGLGraph().to(device)
G.from_networkx(nxg)

# Node features are looked up by position, so the counts must agree
assert G.number_of_nodes() == len(df_features), 'graph nodes must line up with feature rows'

## 2.2 Construct node features

In [15]:
# Node pairs as plain arrays; the labels become a single-column int tensor
edges_test = X_test.values
edges = X.values
labels = th.IntTensor(y.values).view(-1, 1).to(device)

# Fill missing values with the first mode of each column, then drop the first
# two columns (presumably new_id and old_id - verify against df_features)
features = df_features.fillna(df_features.mode().iloc[0]).values[:, 2:]

# Standardise every remaining column and move the result to the device
scaler = preprocessing.StandardScaler()
features = th.FloatTensor(scaler.fit_transform(features)).to(device)

In [16]:
# Width of the node-feature matrix (second dimension)
print('The number of features:', features.size(1))

The number of features: 51


## 2.3 Split training and validation links

In [17]:
# Fixed seed so the same train/validation split is produced on every run
SPLIT_SEED = 42
# Share of the labelled pairs used for training; the rest is validation
TRAIN_FRACTION = 0.8

# Shuffled positions of all labelled pairs
mask = np.arange(len(edges))
np.random.RandomState(SPLIT_SEED).shuffle(mask)

n_train = int(len(mask) * TRAIN_FRACTION)
mask_train, mask_val = mask[:n_train], mask[n_train:]

print('The size of training data:', len(mask_train))
print('The size of validation data:', len(mask_val))

The size of training data: 492409
The size of validation data: 123103


# 3. Modeling and Cross Validation

## 3.1 Define the model architecture

In [21]:
class Net(nn.Module):
    """Link classifier: four GAT layers for node embeddings, then an MLP on node pairs.

    Args:
        in_feats: Width of the input node-feature vectors. Defaults to 51, the
            value that was previously hard-coded, so ``Net()`` is unchanged.
    """

    def __init__(self, in_feats=51):
        super().__init__()
        # GATConv(in_feats, out_feats, num_heads). forward() max-pools the heads,
        # so each layer's input width equals the previous layer's out_feats.
        self.gcn1 = GATConv(in_feats, 300, 5, residual=True, activation=F.relu)
        self.gcn2 = GATConv(300, 20, 3, residual=True, activation=F.relu)
        self.gcn3 = GATConv(20, 20, 3, residual=True, activation=F.relu)
        self.gcn4 = GATConv(20, 10, 2, residual=True)

        # A pair is two concatenated 10-wide node embeddings, hence 20 inputs
        self.fc1 = nn.Linear(20, 50)
        self.fc2 = nn.Linear(50, 20)
        self.fc3 = nn.Linear(20, 10)
        self.fc4 = nn.Linear(10, 1)

    def forward(self, g, features, edges):
        """Score every node pair in ``edges``.

        Args:
            g: Graph the GAT layers run on.
            features: Node-feature matrix fed to the first GAT layer.
            edges: Two-column integer array; each row holds the indices of the
                two nodes forming a pair.

        Returns:
            A tensor with one sigmoid score per row of ``edges``, as a single column.
        """
        # Learn node embeddings; after each layer keep the element-wise
        # maximum over dimension 1 (the attention heads)
        emb = features
        for gat in (self.gcn1, self.gcn2, self.gcn3, self.gcn4):
            emb = gat(g, emb).max(1)[0]

        # Represent a pair by the embeddings of its two endpoints side by side
        emb_edges = th.cat([emb[edges[:, 0]], emb[edges[:, 1]]], dim=1)

        # Classify pairs
        y = th.relu(self.fc1(emb_edges))
        y = th.relu(self.fc2(y))
        y = th.relu(self.fc3(y))
        return th.sigmoid(self.fc4(y))

## 3.2 Define hyper parameters

In [22]:
# Fresh, untrained network placed on the selected device
model = Net().to(device)

# Binary cross-entropy, applied to the network's sigmoid outputs
loss_fn = nn.BCELoss()

# Adam over all parameters of the network
optimizer = th.optim.Adam(params=model.parameters(), lr=1e-3)

# How many training steps to run
n_epochs = 300

## 3.3 Perform training

In [None]:
losses_train = []
losses_val = []

# torch.save does not create missing directories; make sure the checkpoint
# folder exists before the first save
os.makedirs('models', exist_ok=True)

for epoch in range(n_epochs):

    # 1 step of training
    loss_train = train_step(G, features, edges, labels, mask_train)
    losses_train.append(loss_train)

    # 1 step of validation (val_step disables gradient tracking itself)
    loss_val = val_step(G, features, edges, labels, mask_val)
    losses_val.append(loss_val)

    # Every 10th epoch: report, checkpoint the weights and dump the loss history
    if epoch % 10 == 0:
        print('Epoch {} Training Loss: {}'.format(epoch, loss_train))
        print('Epoch {} Validation Loss: {}'.format(epoch, loss_val))

        # Save the model
        torch.save(model.state_dict(), 'models//model_{}'.format(epoch))

        # Save the loss (the file is rewritten with the full history each time)
        df_loss = pd.DataFrame({'Training': losses_train,
                                'Validation': losses_val})
        df_loss.to_csv('losses.csv')

Epoch 0 Training Loss: 0.7146669626235962
Epoch 0 Validation Loss: 0.7017821073532104


## 3.4 Print Accuracy

In [16]:
# Report accuracy separately for the training and the validation entries
for phase_title, phase_mask in (('--Training Phase--', mask_train),
                                ('--Validation Phase--', mask_val)):
    print(phase_title)
    accuracy(model, G, features, edges, labels, phase_mask)

--Training Phase--
Accuracy: 0.8332179188728333
--Validation Phase--
Accuracy: 0.8324329853057861


# 4. Modeling on the full data set

## 4.1 Define hyper parameters

In [None]:
# Start again from a fresh, untrained network on the selected device
model = Net().to(device)

# Binary cross-entropy, applied to the network's sigmoid outputs
loss_fn = nn.BCELoss()

# Adam over all parameters of the new network
optimizer = th.optim.Adam(params=model.parameters(), lr=1e-3)

# How many training steps to run on the full labelled set
n_epochs = 1000

## 4.2 Retrain the model

In [None]:
losses_train = []

for epoch in range(n_epochs):

    # 1 step of training on every labelled pair (mask holds all positions)
    loss_train = train_step(G, features, edges, labels, mask)
    losses_train.append(loss_train)

    # Report losses; the value to print is loss_train (no variable named
    # `loss` is defined at this level)
    if epoch % 10 == 0:
        print('Epoch {} Training Loss: {}'.format(epoch, loss_train))

## 4.3 Print Accuracy

In [None]:
# Accuracy of the retrained model over every labelled pair
print('--Training Phase--')
accuracy(model=model, g=G, features=features, edges=edges, y=labels, mask=mask)

## 4.4 Make predictions

In [35]:
# Boolean prediction for every test pair
pred = predict(model, G, features, edges_test)

# Output table: running row number as id, predicted class as category
prediction_columns = {'id': range(len(pred)), 'category': pred}
df_pred = pd.DataFrame(prediction_columns)
df_pred.to_csv('predictions.csv', index=None)