## Package Initialization

In [0]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import matplotlib
import matplotlib.pyplot as plt

In [0]:
import pandas as pd

## Parse & Reformat Dataset

Read the training edges into a pandas DataFrame.

In [0]:
# Load the labelled training edges; the file is comma-separated, which is
# read_csv's default delimiter.
training_data = pd.read_csv("train.csv")

In [124]:
# Preview the training edges (columns: id, to, from, label); pandas elides the
# middle rows of a long frame.
training_data

Unnamed: 0,id,to,from,label
0,E2143,612,59,1
1,E1615,843,189,1
2,E2103,677,416,1
3,E3131,797,506,0
4,E1257,175,856,0
...,...,...,...,...
2567,E684,612,293,1
2568,E371,645,446,1
2569,E892,688,163,0
2570,E1939,387,748,0


Read the testing edges into a pandas DataFrame.

In [0]:
# Load the unlabelled test edges; the file is comma-separated, which is
# read_csv's default delimiter.
test_data = pd.read_csv("test.csv")

In [126]:
# Preview the test edges (columns: id, to, from — no label column).
test_data

Unnamed: 0,id,to,from
0,E370,26,317
1,E667,196,323
2,E3190,739,468
3,E848,576,156
4,E2161,466,199
...,...,...,...
639,E492,355,30
640,E3055,204,668
641,E1271,625,756
642,E2199,570,342


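Read the node attributes (tab-separated, no header row; column 0 holds the node id and the remaining columns hold that node's attribute values).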

In [0]:
# Load the per-node attribute table. The file is tab-separated and has no
# header row, so the columns get integer labels (0, 1, 2, ...).
node_features = pd.read_csv("content.csv", sep="\t", header=None)

In [128]:
# Preview the attribute table; column 0 is the node id that the later cells
# match against, the remaining columns are the node's attribute values.
node_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1664,1665,1666,1667,1668,1669,1670,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682,1683,1684,1685,1686,1687,1688,1689,1690,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700,1701,1702,1703
0,345,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,752,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,672,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,509,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,500,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
872,867,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
873,627,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
874,399,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
875,845,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


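Replace the node ids of each training edge with the aggregated attributes of its two endpoint nodes (element-wise sum and element-wise product), prefixed with a custom common-interest rate feature.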

In [0]:
# Turn the label column into strings so it can be joined into a text file.
labels = [str(label) for label in training_data["label"]]

In [0]:
# Persist the training labels, one per line (no trailing newline).
label_text = "\n".join(labels)
with open("./processed_train_labels.csv", "w") as file:
    file.write(label_text)

In [0]:
# Build one feature row per training edge by combining the attribute vectors of
# its two endpoint nodes. Each row is the comma-separated string
# "<common-interest rate>,<v_1>,...,<v_k>", once for the element-wise sum and
# once for the element-wise product of the two vectors.
all_add_aggr_features = []
all_product_aggr_features = []

# Index the attribute table by node id (column 0) once instead of scanning the
# whole table with a boolean mask for every edge. Keeping only the first row
# per id mirrors the previous "first match" lookup should an id be duplicated.
features_by_node = node_features.drop_duplicates(subset=[0]).set_index(0)

# Read the endpoints by column name: the CSV stores `to` before `from`, so
# unpacking the row positionally assigns them to the wrong names.
for from_node, to_node in zip(training_data["from"], training_data["to"]):
    # A node id missing from the attribute table raises KeyError naming that id.
    from_features = torch.tensor(features_by_node.loc[from_node].tolist())
    to_features = torch.tensor(features_by_node.loc[to_node].tolist())

    # aggregation method 1: element-wise sum of the two vectors
    add_aggr_features = from_features + to_features

    # aggregation method 2: element-wise product of the two vectors
    product_aggr_features = from_features * to_features

    # common-interest rate: 2 * |shared attributes| / (|from| + |to|)
    from_interest_num = int((from_features == 1).sum())
    to_interest_num = int((to_features == 1).sum())
    common_interest_num = int((product_aggr_features == 1).sum())
    total_interest_num = from_interest_num + to_interest_num
    # Two nodes without any attribute share nothing: report 0.0 instead of
    # raising ZeroDivisionError.
    if total_interest_num:
        common_interest_rate = 2 * common_interest_num / total_interest_num
    else:
        common_interest_rate = 0.0

    rate_field = str(common_interest_rate)

    # `.tolist()` converts the whole tensor at once rather than element by element.
    add_aggr_features = [rate_field] + [str(int(value)) for value in add_aggr_features.tolist()]
    all_add_aggr_features.append(",".join(add_aggr_features))
    product_aggr_features = [rate_field] + [str(int(value)) for value in product_aggr_features.tolist()]
    all_product_aggr_features.append(",".join(product_aggr_features))

Export the processed training features to CSV files as a backup.

In [0]:
# Write each training feature set to its own file, one edge per line
# (no trailing newline).
for path, rows in (
    ("./processed_train_examples_product.csv", all_product_aggr_features),
    ("./processed_train_examples_add.csv", all_add_aggr_features),
):
    with open(path, "w") as file:
        file.write("\n".join(rows))

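Replace the node ids of each testing edge with the aggregated attributes of its two endpoint nodes (element-wise sum and element-wise product), prefixed with a custom common-interest rate feature; also collect the edge ids.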

In [0]:
# Build one feature row per test edge by combining the attribute vectors of
# its two endpoint nodes, and remember the edge ids in the same order. Each row
# is the comma-separated string "<common-interest rate>,<v_1>,...,<v_k>", once
# for the element-wise sum and once for the element-wise product.
# NOTE: these accumulators reuse the names of the training lists, so the
# training rows held in memory are replaced from this cell onwards.
all_add_aggr_features = []
all_product_aggr_features = []
edges = []

# Index the attribute table by node id (column 0) once instead of scanning the
# whole table with a boolean mask for every edge. Keeping only the first row
# per id mirrors the previous "first match" lookup should an id be duplicated.
features_by_node = node_features.drop_duplicates(subset=[0]).set_index(0)

# Read the endpoints by column name: the CSV stores `to` before `from`, so
# unpacking the row positionally assigns them to the wrong names.
for edge_id, from_node, to_node in zip(test_data["id"], test_data["from"], test_data["to"]):
    edges.append(edge_id)

    # A node id missing from the attribute table raises KeyError naming that id.
    from_features = torch.tensor(features_by_node.loc[from_node].tolist())
    to_features = torch.tensor(features_by_node.loc[to_node].tolist())

    # aggregation method 1: element-wise sum of the two vectors
    add_aggr_features = from_features + to_features

    # aggregation method 2: element-wise product of the two vectors
    product_aggr_features = from_features * to_features

    # common-interest rate: 2 * |shared attributes| / (|from| + |to|)
    from_interest_num = int((from_features == 1).sum())
    to_interest_num = int((to_features == 1).sum())
    common_interest_num = int((product_aggr_features == 1).sum())
    total_interest_num = from_interest_num + to_interest_num
    # Two nodes without any attribute share nothing: report 0.0 instead of
    # raising ZeroDivisionError.
    if total_interest_num:
        common_interest_rate = 2 * common_interest_num / total_interest_num
    else:
        common_interest_rate = 0.0

    rate_field = str(common_interest_rate)

    # `.tolist()` converts the whole tensor at once rather than element by element.
    add_aggr_features = [rate_field] + [str(int(value)) for value in add_aggr_features.tolist()]
    all_add_aggr_features.append(",".join(add_aggr_features))
    product_aggr_features = [rate_field] + [str(int(value)) for value in product_aggr_features.tolist()]
    all_product_aggr_features.append(",".join(product_aggr_features))

Export the processed testing features and the matching edge id list to CSV files as a backup.

In [0]:
# Write the test feature sets and the edge id list to their own files, one
# entry per line (no trailing newline).
for path, rows in (
    ("./processed_test_examples_product.csv", all_product_aggr_features),
    ("./processed_test_examples_add.csv", all_add_aggr_features),
    ("./test_edge_list.csv", edges),
):
    with open(path, "w") as file:
        file.write("\n".join(rows))