In [1]:
import os.path as osp
import random 
import os

import torch
import torch.nn.functional as F
from torch import optim, nn

import numpy as np
import networkx as nx
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import roc_auc_score

from torch_geometric import data as DATA
from torch_geometric.data import Data, DataLoader

from torch_geometric.utils import negative_sampling

In [2]:
# Run tag: it is embedded in the names of this experiment's log files and
# saved model checkpoints.
effort = "MLP_DOC2Vec"
# Everything is pinned to the CPU. Uncomment the line below (and drop the
# assignment after it) to pick CUDA automatically when it is available.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

In [3]:
# Seed every random number generator the notebook relies on (NumPy, the
# Python stdlib and PyTorch on CPU and CUDA) with the same value, 0.
for _seed_fn in (
    np.random.seed,
    random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(0)

# cuDNN is switched off entirely; the two remaining flags additionally request
# deterministic kernels and disable the auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

In [4]:
# Load the per-edge fold assignment and keep only the rows whose Label is 1.
#
# The 'Fold' column mixes numeric fold ids with the string 'Test'. Letting
# pandas infer its type can leave a blend of ints and strings in the column
# (the previous code patched only 'Test', '3' and '4' by hand), so any other
# string-typed id would never match the integer fold comparison used later.
# Reading the column as text and converting it once makes every fold id an int.
edge_folds = pd.read_csv("./data_split.csv", header=0, dtype={'Fold': str})

# .copy() so that the assignment below writes to an independent frame rather
# than to a filtered view of the raw table.
edge_folds = edge_folds[edge_folds['Label'] == 1].copy()

# The held-out 'Test' split is treated as fold 5.
# NOTE(review): assumes every remaining row has a non-missing, numeric fold id;
# a missing value will raise here instead of passing through silently.
edge_folds['Fold'] = pd.to_numeric(
    edge_folds['Fold'].str.strip().replace('Test', '5')
).astype(int)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
class Net(torch.nn.Module):
    """Three-layer MLP that scores node pairs from their raw feature vectors.

    There is no message passing: ``encode`` hands the node features back
    untouched, and ``decode`` classifies the concatenation of the two endpoint
    feature vectors of every candidate edge.

    Args:
        in_channels: Number of features per node (the first layer takes twice
            this many inputs, one copy per endpoint).
        hidden_channels: Width of the first hidden layer.
        out_channels: Width of the second hidden layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()

        pair_width = in_channels * 2
        self.fc1 = nn.Linear(pair_width, hidden_channels)
        self.fc2 = nn.Linear(hidden_channels, out_channels)
        self.fc3 = nn.Linear(out_channels, 1)

    def encode(self, x, edge_index):
        """Return ``x`` unchanged; ``edge_index`` is accepted but not used."""
        return x

    def decode(self, z, edge_label_index):
        """Return one raw score (logit) per column of ``edge_label_index``.

        Row 0 and row 1 of ``edge_label_index`` select the two endpoints of
        each pair from ``z``; their rows are concatenated along dim 1 and fed
        through the MLP. The trailing singleton dimension is squeezed away.
        """
        first_endpoint = z[edge_label_index[0]]
        second_endpoint = z[edge_label_index[1]]
        pair_features = torch.cat((first_endpoint, second_endpoint), 1)

        hidden = F.relu(self.fc1(pair_features))
        hidden = F.relu(self.fc2(hidden))
        scores = self.fc3(hidden)
        return scores.squeeze(1)

In [6]:
def train(train_data):
    """Perform one optimisation step on ``train_data`` and return the loss.

    Relies on the module-level ``model``, ``optimizer`` and ``criterion``.
    The edges in ``train_data.edge_label_index`` (with their
    ``train_data.edge_label`` targets) are combined with an equal number of
    freshly sampled negative edges labelled 0, scored by the model, and the
    resulting loss is back-propagated.

    Returns:
        The loss tensor produced by ``criterion`` for this step.
    """
    model.train()
    optimizer.zero_grad()
    node_repr = model.encode(train_data.x, train_data.edge_index)

    pos_edges = train_data.edge_label_index
    pos_targets = train_data.edge_label

    # Negatives are re-drawn on every call, so each epoch sees a new set.
    neg_edges = negative_sampling(
        edge_index=train_data.edge_index,
        num_nodes=train_data.num_nodes,
        num_neg_samples=pos_edges.size(1),
        method='sparse',
    )
    neg_targets = pos_targets.new_zeros(neg_edges.size(1))

    # Supervised pairs first, sampled negatives after; targets follow the
    # same order.
    all_edges = torch.cat([pos_edges, neg_edges], dim=-1)
    all_targets = torch.cat([pos_targets, neg_targets], dim=0)

    logits = model.decode(node_repr, all_edges).view(-1)
    loss = criterion(logits.cpu(), all_targets)
    loss.backward()
    optimizer.step()
    return loss

In [7]:
@torch.no_grad()
def test(data):
    """Return the ROC AUC of the module-level ``model`` on ``data``.

    The edges in ``data.edge_label_index`` (with their ``data.edge_label``
    targets) are scored together with an equal number of freshly sampled
    negative edges labelled 0. Scores are passed through a sigmoid before the
    AUC is computed.

    NOTE(review): negatives are drawn relative to ``data.edge_index`` and are
    re-sampled on every call, so the metric can vary between calls on the same
    model unless the RNG state is identical.
    """
    model.eval()
    node_repr = model.encode(data.x, data.edge_index)

    pos_edges = data.edge_label_index
    pos_targets = data.edge_label

    # A new batch of negatives is sampled for every evaluation.
    neg_edges = negative_sampling(
        edge_index=data.edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=pos_edges.size(1),
        method='sparse',
    )
    neg_targets = pos_targets.new_zeros(neg_edges.size(1))

    all_edges = torch.cat([pos_edges, neg_edges], dim=-1)
    all_targets = torch.cat([pos_targets, neg_targets], dim=0)

    probabilities = model.decode(node_repr, all_edges).view(-1).sigmoid()
    y_true = all_targets.cpu().numpy()
    y_score = probabilities.cpu().numpy()
    return roc_auc_score(y_true, y_score)

In [8]:
# Text type features: one row per node, ordered by node id.
features = pd.read_csv("./DOC2Vec_features.csv")
features = features.sort_values(by=['id'])

# PCA ONLY on TF_IDF

# Page type features: replace the categorical 'page_type' column with its
# one-hot encoding.
page_type = pd.read_csv("./Training/node_classification.csv")
# The encoder is left on its default (sparse) output and densified with
# .toarray(): the keyword that requested dense output was renamed between
# scikit-learn releases, whereas this form works on all of them.
myEncoder = OneHotEncoder(handle_unknown='ignore')
page_type_values = page_type['page_type'].to_numpy().reshape(-1, 1)
myEncoder.fit(page_type_values)
page_type_onehot = myEncoder.transform(page_type_values).toarray()

# drop(columns=...) instead of a positional axis argument, which newer pandas
# releases no longer accept.
page_type = pd.concat(
    [page_type.drop(columns='page_type'), pd.DataFrame(page_type_onehot)],
    axis=1,
)

# Node Id Embedding features: a randomly initialised 32-dimensional vector per
# node. The weights are copied out once here and are not trained afterwards.
# NOTE(review): ids are assigned as 0..N-1 in row order — confirm this matches
# the 'id' values used by the other feature tables.
node_embedding = torch.nn.Embedding(page_type.shape[0], 32)
embedding = pd.DataFrame(node_embedding.weight.detach().numpy())
embedding['id'] = range(page_type.shape[0])

# Finalize Feature Embedding: join the three tables on 'id', then drop the key
# so only feature columns remain.
features = (
    features
    .merge(page_type, how='inner', on='id')
    .merge(embedding, how='inner', on='id')
)
features = features.drop(columns=['id'])

features = torch.Tensor(features.to_numpy()).to(device)

In [9]:
def _append_log(path, message):
    """Append ``message`` to the text file at ``path``."""
    with open(path, 'a') as f:
        f.write(message)


# The loop writes into these directories; create them up front so a fresh
# checkout does not fail on the first write.
os.makedirs('./logs', exist_ok=True)
os.makedirs('./models', exist_ok=True)

# Five-fold cross-validation: fold r is held out for validation, the remaining
# folds form the training graph. A fresh model is trained for every fold.
for r in range(1, 6):
    log_path = './logs/' + effort + '_fold' + str(r) + '.txt'
    checkpoint_path = './models/' + effort + '_fold' + str(r) + '.pt'

    # Mode 'w' truncates the log left by any previous run of this fold.
    with open(log_path, 'w') as f:
        f.write("New training bitches!!\n")

    print("Split #", r)
    _append_log(log_path, "Split # {}\n".format(r))

    edge_folds_train = edge_folds[edge_folds['Fold'] != r]
    edge_folds_val = edge_folds[edge_folds['Fold'] == r]

    # Edge lists are taken from networkx graphs built from the fold's rows.
    G = nx.from_pandas_edgelist(edge_folds_train, 'Node1', 'Node2')
    edge_index = [[e1, e2] for e1, e2 in G.edges]

    G_val = nx.from_pandas_edgelist(edge_folds_val, 'Node1', 'Node2')
    edge_label_index_val = [[e1, e2] for e1, e2 in G_val.edges]

    # Shape (2, num_edges). Built once and shared: both graphs use the
    # training edges as their structure.
    train_edge_index = torch.LongTensor(edge_index).transpose(1, 0).to(device)
    val_edge_label_index = (
        torch.LongTensor(edge_label_index_val).transpose(1, 0).to(device)
    )

    train_graph = DATA.Data(
        x=features.to(device),
        edge_index=train_edge_index,
        edge_label_index=train_edge_index,
        edge_label=torch.ones(len(edge_index)),
    )

    # Validation supervises the held-out edges; edge_index still holds the
    # training edges.
    val_graph = DATA.Data(
        x=features.to(device),
        edge_index=train_edge_index,
        edge_label_index=val_edge_label_index,
        edge_label=torch.ones(len(edge_label_index_val)),
    )

    # Input width is read from the feature matrix so it cannot drift out of
    # sync with the feature-building cell.
    model = Net(features.shape[1], 128, 64).to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
    criterion = torch.nn.BCEWithLogitsLoss()

    best_val_auc = 0
    for epoch in range(1, 1001):
        loss = train(train_graph)
        print('Train epoch: {}, Total Loss: {:.4f}'.format(epoch, loss))
        _append_log(
            log_path,
            'Train epoch: {}, Total Loss: {:.4f}\n'.format(epoch, loss),
        )

        val_auc = test(val_graph)
        print('Total Validation AUC: {:.4f}'.format(val_auc))
        # Trailing newline added so consecutive log entries do not run together.
        _append_log(log_path, 'Total Validation AUC: {:.4f}\n'.format(val_auc))

        # Checkpoint only on a strict improvement. The running best must be
        # updated here, otherwise every epoch would overwrite the checkpoint.
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            torch.save(model.state_dict(), checkpoint_path)

Split # 1


KeyboardInterrupt: 