In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.nn as pyg_nn
import torch_geometric.utils as pyg_utils
import time
from datetime import datetime
import networkx as nx
import torch
import torch.optim as optim
import numpy as np
from torch_geometric.datasets import TUDataset
from torch_geometric.datasets import Planetoid
from torch_geometric.data import DataLoader
import torch_geometric.transforms as T
from tensorboardX import SummaryWriter
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import os
import copy
import glob
import shutil
import pandas as pd
import numpy as np

import torch
from torch_scatter import scatter_add
from torch_geometric.data import InMemoryDataset, Data, download_url, extract_zip

In [2]:
class Encoder(torch.nn.Module):
    """Two-layer GCN encoder that maps node features to node embeddings.

    This module only produces embeddings; it contains no decoder. The notebook
    wraps it in ``pyg_nn.GAE``, which supplies the decoder.

    Args:
        in_channels: Width of the input node-feature vectors.
        out_channels: Width of the returned embeddings. The hidden layer is
            twice this wide.
        dropout: Dropout probability applied to the hidden activations while
            the module is in training mode. Defaults to 0.25, the value that
            was previously hard-coded.
    """

    def __init__(self, in_channels, out_channels, dropout=0.25):
        super().__init__()
        hidden_channels = 2 * out_channels
        # NOTE(review): cached=True presumably makes each layer reuse the
        # normalised adjacency from its first call, which is only valid when
        # every call uses the same edge_index -- confirm against GCNConv docs.
        self.conv1 = pyg_nn.GCNConv(in_channels, hidden_channels, cached=True)
        self.conv2 = pyg_nn.GCNConv(hidden_channels, out_channels, cached=True)
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return the embeddings for node features ``x`` on graph ``edge_index``."""
        x = F.relu(self.conv1(x, edge_index))
        # Dropout is a no-op in eval mode because it follows self.training.
        x = F.dropout(x, p=self.dropout, training=self.training)
        return self.conv2(x, edge_index)

def train(epoch):
    """Run one optimisation step on the training edges and log the loss.

    Relies on the notebook globals ``model``, ``optimizer``, ``x``,
    ``train_pos_edge_index`` and ``writer``.

    Args:
        epoch: Step index used as the x-axis value for the logged scalar.

    Returns:
        The reconstruction loss of this step as a Python number, so callers
        can print or track it without going through TensorBoard.
    """
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, train_pos_edge_index)
    loss = model.recon_loss(z, train_pos_edge_index)    # reconstruction loss
    loss.backward()
    optimizer.step()

    # Convert once so the logged value and the returned value are identical.
    loss_value = loss.item()
    writer.add_scalar("loss", loss_value, epoch)
    return loss_value

def test(pos_edge_index, neg_edge_index):
    model.eval()
    with torch.no_grad():
        z = model.encode(x, train_pos_edge_index)
    return model.test(z, pos_edge_index, neg_edge_index)

In [3]:

#from torch_geometric.utils import one_hot


class MCDataset(InMemoryDataset):
    """MovieLens ratings packed into a single bipartite user-item graph.

    Users keep node ids ``0 .. num_users - 1`` and item ids are shifted up by
    ``num_users``. Every training rating becomes two directed edges
    (user -> item and item -> user), and the zero-based rating of each edge is
    stored in ``edge_weight``.

    Args:
        root: Root directory handed to ``InMemoryDataset``.
        name: Dataset identifier; only ``'ml-100k'`` has a download URL.
        transform: Forwarded unchanged to ``InMemoryDataset``.
        pre_transform: Forwarded unchanged to ``InMemoryDataset``.
    """

    # Download locations keyed by dataset name.
    _URLS = {
        'ml-100k': 'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
    }

    def __init__(self, root, name, transform=None, pre_transform=None):
        # `name` is set first because download() reads it and the base
        # constructor may trigger download()/process().
        self.name = name
        super().__init__(root, transform, pre_transform)

        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def num_relations(self):
        """Number of rating levels, i.e. the largest zero-based rating + 1."""
        # process() stores the ratings as `edge_weight`. `edge_type` is still
        # honoured first in case a processed file carries that attribute.
        ratings = getattr(self.data, 'edge_type', None)
        if ratings is None:
            ratings = self.data.edge_weight
        return ratings.max().item() + 1

    @property
    def num_nodes(self):
        """Number of rows of the node-feature matrix."""
        return self.data.x.shape[0]

    @property
    def raw_file_names(self):
        """Raw files required by process(): the u1 train/test split."""
        return ['u1.base', 'u1.test']

    @property
    def processed_file_names(self):
        """Name of the single processed file written by process()."""
        return 'data.pt'

    def download(self):
        """Fetch the archive for ``self.name`` and flatten it into ``raw_dir``.

        Raises:
            ValueError: If no download URL is known for ``self.name``.
        """
        try:
            url = self._URLS[self.name]
        except KeyError:
            known = ', '.join(sorted(self._URLS))
            raise ValueError(
                'No download URL for dataset {!r}; known datasets: {}'.format(
                    self.name, known)) from None

        path = download_url(url, self.root)
        # The third positional argument of extract_zip is not a sub-folder
        # name, so the dataset name is no longer passed there.
        extract_zip(path, self.raw_dir)
        os.unlink(path)

        # Move the extracted files up into raw_dir and drop the now-empty
        # folder named after the dataset.
        extracted_dir = os.path.join(self.raw_dir, self.name)
        for file in glob.glob(os.path.join(extracted_dir, '*')):
            shutil.move(file, self.raw_dir)
        os.rmdir(extracted_dir)

    def process(self):
        """Build the graph from the raw split files and save it to disk."""
        train_csv, test_csv = self.raw_paths
        train_df, train_nums = self.create_df(train_csv)
        test_df, _ = self.create_df(test_csv)

        # Both splits are flattened with the *training* item count so that a
        # given (user, item) pair maps to the same index in either split.
        train_idx, train_gt = self.create_gt_idx(train_df, train_nums)
        test_idx, test_gt = self.create_gt_idx(test_df, train_nums)

        num_users = train_nums['user']

        # One-hot node features: one row/column per user and per item.
        x = torch.eye(train_nums['node'], dtype=torch.long)

        # Prepare edges; item ids are offset so they follow the user ids.
        edge_user = torch.tensor(train_df['user_id'].values)
        edge_item = torch.tensor(train_df['item_id'].values) + num_users

        # Each rating is listed in both directions.
        edge_index = torch.stack((torch.cat((edge_user, edge_item), 0),
                                  torch.cat((edge_item, edge_user), 0)), 0)
        edge_index = edge_index.to(torch.long)

        edge_type = torch.tensor(train_df['relation'].values)
        edge_type = torch.cat((edge_type, edge_type), 0)

        # Prepare data
        data = Data(x=x, edge_index=edge_index)
        data.edge_weight = edge_type
        data.train_idx = train_idx
        data.test_idx = test_idx
        data.train_gt = train_gt
        data.test_gt = test_gt
        data.num_users = torch.tensor([num_users])
        data.num_items = torch.tensor([train_nums['item']])

        data, slices = self.collate([data])
        torch.save((data, slices), self.processed_paths[0])

    def create_df(self, csv_path):
        """Read one tab-separated ratings file.

        Args:
            csv_path: Path to a file with columns user, item, rating,
                timestamp and no header row.

        Returns:
            ``(df, nums)`` where ``df`` has zero-based ``user_id``,
            ``item_id`` and ``relation`` columns, and ``nums`` holds the
            counts ``'user'``, ``'item'``, ``'node'`` (users + items) and
            ``'edge'`` (number of rows).
        """
        col_names = ['user_id', 'item_id', 'relation', 'ts']
        df = pd.read_csv(csv_path, sep='\t', names=col_names)
        df = df.drop('ts', axis=1)

        # The file is one-based; shift every column to start at zero.
        df['user_id'] = df['user_id'] - 1
        df['item_id'] = df['item_id'] - 1
        df['relation'] = df['relation'] - 1

        # Counts are derived from the largest id seen, computed once each.
        num_users = int(df['user_id'].max()) + 1
        num_items = int(df['item_id'].max()) + 1
        nums = {'user': num_users,
                'item': num_items,
                'node': num_users + num_items,
                'edge': len(df)}
        return df, nums

    def create_gt_idx(self, df, nums):
        """Return flat (user, item) indices and their ratings as tensors.

        The flat index is ``user_id * nums['item'] + item_id``. ``df`` is
        left unmodified.
        """
        flat_idx = df['user_id'] * nums['item'] + df['item_id']
        idx = torch.tensor(flat_idx.values)
        gt = torch.tensor(df['relation'].values)
        return idx, gt

    def get(self, idx):
        """Return the graph; ``idx`` is ignored.

        process() saves exactly one graph, and it is re-read from the
        processed file on every call.
        """
        data = torch.load(self.processed_paths[0])
        return data[0]

    def __repr__(self):
        return '{}{}()'.format(self.name.upper(), self.__class__.__name__)


if __name__ == '__main__':
    # Use the GPU when torch can see one, otherwise stay on the CPU.
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # The dataset holds a single graph, so index 0 is the whole thing.
    dataset = MCDataset(root='./data/ml-100k', name='ml-100k')
    data = dataset[0]
    print(data)
    data = data.to(device)

Data(edge_index=[2, 160000], edge_weight=[160000], num_items=[1], num_users=[1], test_gt=[20000], test_idx=[20000], train_gt=[80000], train_idx=[80000], x=[2625, 2625])


[(3, 54792), (2, 43926), (4, 33488), (1, 18356), (0, 9438)]

In [4]:
# One TensorBoard run directory per execution, named by start time.
writer = SummaryWriter("./log/" + datetime.now().strftime("%Y%m%d-%H%M%S"))
channels = 16
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('CUDA availability:', torch.cuda.is_available())
# encoder: written by us; decoder: default (inner product)
# in_channels is the per-node feature width (columns of x), not the number of
# nodes (len(x) counts rows). The two only coincide here because x is square.
model = pyg_nn.GAE(Encoder(data.x.size(1), channels)).to(dev)
#labels = data.y
# Drop the rating-prediction index/target tensors; this cell only sets up
# link reconstruction.
data.test_gt = data.test_idx = data.train_gt = data.train_idx = None

# data = model.split_edges(data) # split_edges unavailable
# Construct positive/negative edge sets (for negative sampling); no validation
# edges, 20% held out for testing.
# NOTE(review): edge_weight is not split along with the edges, so afterwards
# it no longer lines up with train_pos_edge_index.
data = pyg_utils.train_test_split_edges(data, val_ratio=0, test_ratio=0.2)
x, train_pos_edge_index = data.x.float().to(dev), data.train_pos_edge_index.to(dev) # float long
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
data

CUDA availability: True


Data(edge_weight=[160000], num_items=[1], num_users=[1], test_neg_edge_index=[2, 16000], test_pos_edge_index=[2, 16000], train_neg_adj_mask=[2625, 2625], train_pos_edge_index=[2, 128000], val_neg_edge_index=[2, 0], val_pos_edge_index=[2, 0], x=[2625, 2625])

In [5]:
for epoch in range(1, 201):
    train(epoch)

    # Evaluate on the held-out edges after every epoch and log both metrics.
    auc, ap = test(data.test_pos_edge_index, data.test_neg_edge_index)
    writer.add_scalar("AUC", auc, epoch)
    writer.add_scalar("AP", ap, epoch)

    # Only every tenth epoch is echoed to the cell output.
    if epoch % 10 != 0:
        continue
    print('Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))

Epoch: 010, AUC: 0.8888, AP: 0.8691
Epoch: 020, AUC: 0.8543, AP: 0.8522
Epoch: 030, AUC: 0.8295, AP: 0.8350
Epoch: 040, AUC: 0.8392, AP: 0.8424
Epoch: 050, AUC: 0.8517, AP: 0.8505
Epoch: 060, AUC: 0.8577, AP: 0.8537
Epoch: 070, AUC: 0.8668, AP: 0.8608
Epoch: 080, AUC: 0.8737, AP: 0.8694
Epoch: 090, AUC: 0.8804, AP: 0.8741
Epoch: 100, AUC: 0.8757, AP: 0.8701
Epoch: 110, AUC: 0.8787, AP: 0.8727
Epoch: 120, AUC: 0.8821, AP: 0.8752
Epoch: 130, AUC: 0.8819, AP: 0.8744
Epoch: 140, AUC: 0.8798, AP: 0.8727
Epoch: 150, AUC: 0.8795, AP: 0.8724
Epoch: 160, AUC: 0.8756, AP: 0.8695
Epoch: 170, AUC: 0.8778, AP: 0.8713
Epoch: 180, AUC: 0.8838, AP: 0.8753
Epoch: 190, AUC: 0.8808, AP: 0.8733
Epoch: 200, AUC: 0.8807, AP: 0.8732
