## Package Initialization

In [0]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import matplotlib
import matplotlib.pyplot as plt

import pandas as pd

## Parse & Reformat Dataset

read training data, save to pandas DataFrame

In [0]:
training_data = pd.read_csv('train.csv', sep=',')

In [0]:
training_data

read testing data, save to pandas DataFrame

In [0]:
test_data = pd.read_csv('test.csv', sep=',')

In [0]:
test_data

read node attribute

In [0]:
node_features = pd.read_csv('content.csv', header=None, sep='\t')

In [0]:
node_features

replace node id with attributes in train data and test data

In [0]:
# training data (labels)
labels = list(training_data["label"])
labels = [str(i) for i in labels]

with open("./processed_train_labels.csv", "w") as file:
    file.write('\n'.join(labels))

In [0]:
# training data (attributes)

edges = []
all_features = []

for i in range(training_data.shape[0]):
# for i in range(2):

    training_row = list(training_data.iloc[i])
    edge_id = training_row[0]
    from_node, to_node = training_row[1:-1]

    edges.append(edge_id)

    from_features = node_features.loc[node_features[0] == from_node]
    from_features = list(from_features.iloc[0])[1:]
    from_features = [str(j) for j in from_features]
    from_features_string = "".join(from_features)
    
    to_features = node_features.loc[node_features[0] == to_node]
    to_features = list(to_features.iloc[0])[1:]
    to_features = [str(j) for j in to_features]
    to_features_string = "".join(to_features)

    all_features.append(from_features_string + "," + to_features_string)

with open("./train_attributes.csv", "w") as file:
    file.write('\n'.join(all_features))

with open("./train_edge_list.csv", "w") as file:
    file.write('\n'.join(edges))

In [0]:
# test data (attributes)

edges = []
all_features = []

for i in range(test_data.shape[0]):

    test_row = list(test_data.iloc[i])
    edge_id = test_row[0]
    from_node, to_node = test_row[1:]

    edges.append(edge_id)

    from_features = node_features.loc[node_features[0] == from_node]
    from_features = list(from_features.iloc[0])[1:]
    from_features = [str(j) for j in from_features]
    from_features_string = "".join(from_features)
    
    to_features = node_features.loc[node_features[0] == to_node]
    to_features = list(to_features.iloc[0])[1:]
    to_features = [str(j) for j in to_features]
    to_features_string = "".join(to_features)

    all_features.append(from_features_string + "," + to_features_string)

with open("./test_attributes.csv", "w") as file:
    file.write('\n'.join(all_features))

# with open("./test_edge_list.csv", "w") as file:
#     file.write('\n'.join(edges))

## Calcualte Features for Reformatted data

## Dataset Loader

In [0]:
class NodePairDataset(Dataset):

    def __init__(self, example_file, label_file):
        """
        Args:
            csv_file (string): Path to the csv file with annotations
        """
        self.pairs = pd.read_csv(example_file, header=None, sep=',')
        self.labels = pd.read_csv(label_file, header=None, sep=',')

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        x = self.pairs.iloc[idx].to_numpy().flatten()
        y = self.labels.iloc[idx].to_numpy().flatten()

        return x, y

In [237]:
node_pair_dataset = NodePairDataset("processed_train_examples_product.csv", "processed_train_labels.csv")
loader = DataLoader(dataset=node_pair_dataset, batch_size=10, shuffle=False)
for batch in zip(loader, range(1)):
    data, index = batch
    x, y = data
    print(x)
    print(y)

tensor([[0.0606, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1304, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0930, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0513, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       dtype=torch.float64)
tensor([[0],
        [1],
        [0],
        [0],
        [1],
        [0],
        [1],
        [0],
        [1],
        [0]])


## Define Network Model

In [0]:
class LinkPredictNetworkOne(nn.Module):
    def __init__(self, input_dimension, hiddens=[0.125, 0.125, 0.0625, 0.03125], dropouts = [0.2, 0.1, 0.05, 0.025]):
        super(LinkPredictNetworkOne, self).__init__()

        self.hidden_factors = hiddens
        self.dropout_factors = dropouts
        self.input_dimension = input_dimension

        self.hidden_units = [int(factor * self.input_dimension) for factor in self.hidden_factors]

        self.linear1 = nn.Linear( self.input_dimension, self.hidden_units[0], bias=True)
        self.dropout1 = nn.Dropout(self.dropout_factors[0])

        self.linear2 = nn.Linear( self.hidden_units[0], self.hidden_units[1], bias=True)
        self.dropout2 = nn.Dropout(self.dropout_factors[1])

        self.linear3 = nn.Linear( self.hidden_units[1], self.hidden_units[2], bias=True)
        self.dropout3 = nn.Dropout(self.dropout_factors[2])

        self.linear4 = nn.Linear( self.hidden_units[2], self.hidden_units[3], bias=True)
        self.dropout4 = nn.Dropout(self.dropout_factors[3])

        self.outputLinear = nn.Linear( self.hidden_units[3], 1, bias=True)
    
    def forward(self, x):

        # Encoding
        # ============================================================================================

        # x's shape: [batch_size, input_dimension]

        x = x.to(torch.float)
        x[0] += 0.5
        x = x - 0.5

        x = torch.sigmoid(self.linear1(x))
        # x = self.dropout1(x)
        x = torch.sigmoid(self.linear2(x))
        # x = self.dropout2(x)
        x = torch.sigmoid(self.linear3(x))
        # x = self.dropout3(x)
        x = torch.sigmoid(self.linear4(x))
        # x = self.dropout4(x)
        # x = torch.sigmoid(self.linear5(x))
        # x = self.dropout5(x)
        # x = torch.sigmoid(self.linear6(x))
        # x = self.dropout6(x)
        # x = torch.sigmoid(self.linear7(x))
        # x = self.dropout7(x)

        z = torch.sigmoid(self.outputLinear(x))

        return z

In [0]:
class LinkPredictNetworkTwo(nn.Module):
    def __init__(self):
        super(LinkPredictNetworkTwo, self).__init__()

        self.linear1 = nn.Linear( 3, 9, bias=True)

        self.linear2 = nn.Linear( 9, 3, bias=True)

        self.outputLinear = nn.Linear( 3, 1, bias=True)
    
    def forward(self, x):

        # Encoding
        # ============================================================================================

        # x's shape: [batch_size, input_dimension]

        h = torch.zeros(x.size()[0], 3, dtype=torch.float).cuda()

        for i in range(x.size()[0]):

            one_number = int((x[i][1:] == 1).sum())
            zero_number = int((x[i][1:] == 0).sum())

            h[i,0] = x[i][0].item()
            h[i,1] = one_number
            h[i,2] = zero_number

        h = torch.sigmoid(self.linear1(h))
        h = torch.sigmoid(self.linear2(h))

        z = torch.sigmoid(self.outputLinear(h))

        return z

## Define Loss 

In [0]:
def BinaryCrossEntropyLoss(outputs, targets):
    return F.binary_cross_entropy(outputs, targets.to(torch.float), reduction='mean')

## Test the model

In [0]:
net = LinkPredictNetworkTwo().cuda()

In [0]:
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [272]:
node_pair_dataset = NodePairDataset("processed_train_examples_product.csv", "processed_train_labels.csv")
for i in range(100):
    loader = DataLoader(dataset=node_pair_dataset, batch_size=5000, shuffle=True)
    print("epoch {}".format(i))
    for x, y in loader:

        optimizer.zero_grad()
        z = net(x.cuda())

        loss = BinaryCrossEntropyLoss(z, y.cuda())
        print(loss)

        loss.backward()
        optimizer.step()

epoch 0
tensor(0.7135, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.7190, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
epoch 1
tensor(0.7145, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.7162, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
epoch 2
tensor(0.7122, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.7179, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
epoch 3
tensor(0.7132, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.7151, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
epoch 4
tensor(0.7116, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.7160, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
epoch 5
tensor(0.7101, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.7166, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
epoch 6
tensor(0.7131, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)
tensor(0.7113, device='cuda:0', gr