In [2]:
from IPython.display import display, HTML
# Widen the notebook's main container to 90% of the page width.
display(HTML("<style>.container { width:90% !important; }</style>"))

import os
# Set before the TensorFlow imports below. "-1" matches no GPU, so TensorFlow
# runs on CPU only (see the "no CUDA-capable device is detected" line in this
# cell's output).
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="-1"
import pandas as pd
import numpy as np
from tqdm import tqdm

import networkx as nx
import stellargraph as sg
from stellargraph.mapper import PaddedGraphGenerator
from stellargraph.layer import DeepGraphCNN

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

from tensorflow.keras.callbacks import Callback
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Conv1D, MaxPool1D, Dropout, Flatten
from tensorflow.keras.losses import binary_crossentropy, categorical_crossentropy

import wandb

2022-03-31 00:10:48.616922: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-03-31 00:10:48.616977: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: mossad-xps
2022-03-31 00:10:48.616988: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: mossad-xps
2022-03-31 00:10:48.617157: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.86.0
2022-03-31 00:10:48.617205: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.86.0
2022-03-31 00:10:48.617216: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.86.0
2022-03-31 00:10:48.617672: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions

# Read Graphs: TSV triples --> NetworkX --> StellarGraph

In [3]:
def _graph_base_path(uri, graphs_dir):
    """Return the extension-less path of the graph files for ``uri`` inside ``graphs_dir``."""
    uri_prefix = 'http://kglids.org/resource/kaggle/'
    # str.lstrip() treats its argument as a *set of characters*, not a prefix,
    # so uri.lstrip(uri_prefix) also eats leading characters of the pipeline
    # name (e.g. '.../kaggle/titanic' -> 'nic'). Remove the literal prefix.
    relative_name = uri[len(uri_prefix):] if uri.startswith(uri_prefix) else uri
    base_path = os.path.join(graphs_dir, relative_name)

    # Backward compatibility: graph files may already exist under the name the
    # old lstrip() logic produced. Use that name only when the correctly named
    # file is missing and the legacy one is present.
    legacy_base_path = os.path.join(graphs_dir, uri.lstrip(uri_prefix))
    if not os.path.exists(base_path + '.tsv') and os.path.exists(legacy_base_path + '.tsv'):
        return legacy_base_path
    return base_path


def load_graphs_to_stellargraph(graph_uris, graphs_dir, embedding_type='transE'):
    """Load one graph per URI from ``graphs_dir`` and convert it to a StellarDiGraph.

    For every URI two files sharing the same base name are read:

    * ``<name>.tsv``    - tab-separated triples with columns ``s``, ``p``, ``o``;
      every row becomes a directed edge ``s -> o`` whose ``type`` attribute is ``p``.
      All values are cast to ``str``.
    * ``<name>.pickle`` - a mapping ``node -> {embedding_type: features}`` used to
      fill the ``features`` attribute of every node in the graph.

    Args:
        graph_uris: iterable of graph URIs (expected to start with
            ``http://kglids.org/resource/kaggle/``).
        graphs_dir: directory containing the ``.tsv`` / ``.pickle`` files.
        embedding_type: key of the embedding to use as node features
            (default ``'transE'``, the value that was previously hard-coded).

    Returns:
        dict mapping each URI to its ``sg.StellarDiGraph``.

    Raises:
        FileNotFoundError: if a graph file cannot be found.
        KeyError: if a node (or ``embedding_type``) is missing from the embeddings.
    """
    uri_to_sg_graph = {}
    for uri in tqdm(graph_uris):
        base_path = _graph_base_path(uri, graphs_dir)
        df_spo = pd.read_csv(base_path + '.tsv', delimiter='\t').astype(str)
        # NOTE: unpickling can execute arbitrary code - only load embedding
        # files that come from a trusted source.
        node_embeddings = pd.read_pickle(base_path + '.pickle')

        g = nx.DiGraph()
        # Plain iteration instead of DataFrame.apply(): the rows are only used
        # for the side effect of adding edges.
        for s, p, o in zip(df_spo['s'], df_spo['p'], df_spo['o']):
            g.add_edge(s, o, type=p)

        for node in g.nodes():
            g.nodes[node]['features'] = node_embeddings[node][embedding_type]

        uri_to_sg_graph[uri] = sg.StellarDiGraph.from_networkx(
            g, edge_type_attr='type', node_features='features')

    return uri_to_sg_graph

## Graph Classification model

In [4]:

def train_and_evaluate_classification_model(
        train_graphs, test_graphs, train_labels, test_labels, num_classes,
        epochs=100, batch_size=50, sysname='KGLiDS'):
    """Train a DGCNN graph classifier, log its metrics to wandb and plot the history.

    Architecture hyperparameters (``k``, ``gcn_layers``, ``gcn_layer_size``,
    ``fc_size``, ``dropout``, ``lr``) are read from ``wandb.config``, so
    ``wandb.init`` must have been called with those keys beforehand.

    Args:
        train_graphs: list of graphs used for fitting.
        test_graphs: list of graphs passed to ``fit`` as validation data.
        train_labels: targets for ``train_graphs``.
        test_labels: targets for ``test_graphs``.
            (Presumably one-hot rows with ``num_classes`` columns, given the
            softmax output and categorical cross-entropy loss - confirm with caller.)
        num_classes: number of units of the softmax output layer.
        epochs: number of training epochs.
        batch_size: batch size of the training generator (the test generator
            always uses a batch size of 1).
        sysname: prefix used for the metric names logged to wandb.

    Returns:
        None. Metrics are logged to wandb and the training history is plotted.
    """
    config = wandb.config
    generator = PaddedGraphGenerator(graphs=train_graphs + test_graphs)

    # Graph-convolution stack: (gcn_layers - 1) layers of gcn_layer_size units,
    # followed by a last layer of size 1. `k` is the number of rows of the
    # output tensor.
    gcn_layer_sizes = [config.gcn_layer_size] * (config.gcn_layers - 1) + [1]
    graph_encoder = DeepGraphCNN(
        layer_sizes=gcn_layer_sizes,
        activations=["tanh"] * config.gcn_layers,
        k=config.k,
        bias=False,
        generator=generator,
    )
    model_inputs, hidden = graph_encoder.in_out_tensors()

    # Convolutional head; the first convolution's kernel and stride both equal
    # the summed GCN layer sizes.
    total_gcn_width = sum(gcn_layer_sizes)
    hidden = Conv1D(filters=16, kernel_size=total_gcn_width, strides=total_gcn_width)(hidden)
    hidden = MaxPool1D(pool_size=2)(hidden)
    hidden = Conv1D(filters=32, kernel_size=5, strides=1)(hidden)
    hidden = Flatten()(hidden)

    # Fully connected classifier.
    hidden = Dense(units=config.fc_size, activation="relu")(hidden)
    hidden = Dropout(rate=config.dropout)(hidden)
    class_probabilities = Dense(units=num_classes, activation="softmax")(hidden)

    model = Model(inputs=model_inputs, outputs=class_probabilities)
    model.compile(
        optimizer=Adam(learning_rate=config.lr),
        loss=categorical_crossentropy,
        metrics=["acc"],
    )

    train_flow = generator.flow(
        graphs=train_graphs,
        targets=train_labels,
        batch_size=batch_size,
        symmetric_normalization=False,
    )
    test_flow = generator.flow(
        graphs=test_graphs,
        targets=test_labels,
        batch_size=1,
        symmetric_normalization=False,
    )

    history = model.fit(train_flow, epochs=epochs, verbose=1, validation_data=test_flow,
                        shuffle=True)

    # Replay the per-epoch metrics to wandb once training has finished.
    metrics = history.history
    per_epoch = zip(metrics['loss'], metrics['acc'], metrics['val_loss'], metrics['val_acc'])
    for epoch_number, (loss, acc, val_loss, val_acc) in enumerate(per_epoch, start=1):
        wandb.log({"Epoch": epoch_number, f"{sysname} Train Loss": loss,
                   f"{sysname} Train Acc": acc,
                   f"{sysname} Valid Loss": val_loss,
                   f"{sysname} Valid Acc": val_acc})

    # "Best" epoch = the one with the highest validation accuracy.
    best_epoch = np.argmax(metrics['val_acc'])
    wandb.log({f"{sysname} Best Train Acc": metrics['acc'][best_epoch],
               f"{sysname} Best Valid Acc": metrics['val_acc'][best_epoch]})

    sg.utils.plot_history(history)

# Hyperparameters

In [5]:
# Start the Weights & Biases run. The values below are read back through
# wandb.config when the model is built and trained.
wandb.init(
    project="task4-pipeline-domains",
    config={
        'epochs': 30,
        'dropout': 0.25,
        'lr': 0.0001,
        'gcn_layers': 5,
        'gcn_layer_size': 32,
        'k': 35,
        'fc_size': 32,
    },
)

[34m[1mwandb[0m: Currently logged in as: [33mmossadhelali[0m (use `wandb login --relogin` to force relogin)


# Task Name and Pipeline Labels

In [6]:
task = 'task4'


# get graph names and classes
uris_labels = pd.read_csv(f'{task}_uris_labels.csv')

# Keep only the pipelines that have both a .tsv and a .pickle file for both
# systems (KGLiDS and GraphGen4Code).
# NOTE(review): str.lstrip() removes a *set of characters*, not the literal
# prefix, so it can also eat leading characters of the pipeline name. It is
# left unchanged here so the derived file names stay exactly the same.
for graphs_dir in (f'{task}_kglids_graphs/', f'{task}_graph4code_graphs/'):
    for extension in ('.tsv', '.pickle'):
        has_file = uris_labels['uri'].apply(
            lambda x: os.path.exists(
                graphs_dir + x.lstrip('http://kglids.org/resource/kaggle/') + extension))
        uris_labels = uris_labels[has_file]

# Replace the label values by their integer category codes.
uris_labels = uris_labels.assign(label=uris_labels['label'].astype('category').cat.codes)

pips = uris_labels['uri'].tolist()
labels = uris_labels['label'].tolist()
num_pipeline_classes = uris_labels['label'].nunique()
print(len(pips), 'Pipelines')
print(uris_labels['label'].value_counts())

543 Pipelines
3    150
1    148
2    140
0    105
Name: label, dtype: int64


wandb: Network error (ConnectionError), entering retry loop.
wandb: Network error (ConnectionError), entering retry loop.


In [None]:
# URI -> StellarDiGraph for the KGLiDS graphs of the selected pipelines.
kglids_stellargraph = load_graphs_to_stellargraph(
    graph_uris=pips,
    graphs_dir=f'{task}_kglids_graphs',
)

In [None]:
# URI -> StellarDiGraph for the GraphGen4Code graphs of the selected pipelines.
graph4code_stellargraph = load_graphs_to_stellargraph(
    graph_uris=pips,
    graphs_dir=f'{task}_graph4code_graphs',
)

# Training a GCN Classifier

### Split to Train / Test

In [None]:
# Stratified 80/20 split of the pipeline URIs and their integer labels.
train_names, test_names, train_labels, test_labels = train_test_split(
    pips, labels, train_size=0.8, stratify=labels, random_state=3)

# One-hot encode the labels. The encoder is fitted on the training labels only
# and the *same* fitted mapping is applied to the test labels. Re-fitting on
# the test labels (fit_transform) could yield a different column layout
# whenever the set of classes in the two splits differs.
encoder = LabelBinarizer()
train_labels = encoder.fit_transform(train_labels)
test_labels = encoder.transform(test_labels)

## KGLiDS:

In [None]:
# Train and evaluate the classifier on the KGLiDS graphs.
train_and_evaluate_classification_model(
    train_graphs=[kglids_stellargraph[name] for name in train_names],
    test_graphs=[kglids_stellargraph[name] for name in test_names],
    train_labels=train_labels,
    test_labels=test_labels,
    num_classes=num_pipeline_classes,
    epochs=wandb.config.epochs,
    batch_size=50,
    sysname='KGLiDS',
)

## GraphGen4Code:

In [None]:
# Train and evaluate the classifier on the GraphGen4Code graphs.
# NOTE(review): batch_size is 5 here but 50 for the KGLiDS run - confirm the
# difference is intentional, since it affects how comparable the runs are.
train_and_evaluate_classification_model(
    train_graphs=[graph4code_stellargraph[name] for name in train_names],
    test_graphs=[graph4code_stellargraph[name] for name in test_names],
    train_labels=train_labels,
    test_labels=test_labels,
    num_classes=num_pipeline_classes,
    epochs=wandb.config.epochs,
    batch_size=5,
    sysname='GraphGen4Code',
)