In [1]:
import pandas as pd
import numpy as np
from numpy import nan
from main_util_func import *
import stellargraph as sg
from stellargraph import IndexedArray
from stellargraph.mapper import PaddedGraphGenerator
from stellargraph.layer import GCNSupervisedGraphClassification
from stellargraph import StellarGraph

from sklearn import model_selection
#from IPython.display import display, HTML

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
%config Completer.use_jedi = False #autocomplete

In [3]:
def input_read(data_loc='./inter_files/input-output-data1.csv', year_offset=15, cutoff_year=2021):
    """Load the input/output CSV, parse its list-valued columns and filter by year.

    Parameters
    ----------
    data_loc : str
        Path of the CSV file to read. Defaults to the location the notebook
        has always used.
    year_offset : int
        Number of years added to ``first_year`` before comparing with
        ``cutoff_year``.
    cutoff_year : int
        Rows are kept only when ``first_year + year_offset <= cutoff_year``.

    Returns
    -------
    (pandas.DataFrame, int)
        The filtered frame (with the added ``output``, ``first_year`` and
        ``flag`` columns) and the length of the longest
        ``input_nodes_sequence`` among the kept rows. The length is 0 when
        no row survives the filter.
    """
    def _parse(cell):
        # SECURITY NOTE: eval() executes whatever expression the CSV cell holds.
        # It is kept (rather than ast.literal_eval) because cells may rely on
        # names visible at module level, e.g. the imported `nan` -- TODO confirm.
        # Only use this on trusted input files.
        return eval(cell)

    data = pd.read_csv(data_loc, sep=',', lineterminator="\n", low_memory=False)
    data['input_nodes_sequence'] = data['input_nodes_sequence'].apply(_parse)
    data['output_seq'] = data['output_seq'].apply(_parse)
    # The regression target is the last element of the output sequence.
    data['output'] = data['output_seq'].apply(lambda seq: seq[-1])
    # NOTE(review): takes the element at index 1 of the years sequence even
    # though the column is called "first_year" -- confirm index 0 is not intended.
    data['first_year'] = data['input_years_sequence'].apply(lambda cell: _parse(cell)[1])
    data['flag'] = data['first_year'] + year_offset <= cutoff_year
    data = data[data['flag']].copy()
    # default=0 keeps the function usable when the filter removes every row
    # (a bare max() over an empty sequence raises ValueError).
    max_len = max((len(seq) for seq in data['input_nodes_sequence'].values), default=0)
    return data, max_len

In [3]:
#data["output_seq"]

In [4]:
def graphs_data(embeddings, data):
    """Build one directed StellarGraph per row of ``data``.

    Parameters
    ----------
    embeddings : mapping
        Maps each node id appearing in ``input_nodes_sequence`` to its
        feature vector; a missing id raises ``KeyError``.
    data : pandas.DataFrame
        Must provide the columns ``Id``, ``input_nodes_sequence``,
        ``input_edgelist`` and ``output_seq``. The node sequence and the
        edge list may each be either an already-parsed sequence or its
        string representation.

    Returns
    -------
    (list, list)
        The graphs and, in the same order, the ``output_seq`` value of the
        row each graph was built from.
    """
    graph_labels = []
    graphs = []
    columns = ["Id", "input_nodes_sequence", "input_edgelist", "output_seq"]
    for _head, family, family_edge, output in data[columns].values:
        # SECURITY NOTE: eval() executes arbitrary expressions stored in the
        # frame; only feed this function trusted data.
        if isinstance(family, str):
            family = eval(family)
        # Same guard as for the node list: an edge list that has already been
        # parsed must not be passed to eval(), which only accepts strings.
        if isinstance(family_edge, str):
            family_edge = eval(family_edge)

        # Node features, one row per node, in the order of `family`.
        node_feat = np.array([embeddings[node] for node in family])
        node_feat_inx = IndexedArray(node_feat, index=family)

        # Each edge contributes its first element as source and its second as target.
        family_edges = pd.DataFrame({
            "source": [edge[0] for edge in family_edge],
            "target": [edge[1] for edge in family_edge],
        })

        graphs.append(sg.StellarDiGraph(node_feat_inx, family_edges))
        graph_labels.append(output)
    return graphs, graph_labels

In [5]:
def validate(graphs, data):
    """Spot-check one randomly chosen graph against the row it was built from.

    Asserts that there is exactly one graph per row of ``data``, then prints
    the edge list, node sequence and ``Id`` of a random row followed by the
    ``info()`` summary of the graph at the same position. Returns ``None``.
    """
    n_graphs = len(graphs)
    assert n_graphs== data.shape[0]

    pick = np.random.randint(0, n_graphs)

    # The edge list is stored as a string in the frame, hence the eval().
    edges = eval(data['input_edgelist'].values[pick])
    print(f"Edges: {edges}")

    nodes = data['input_nodes_sequence'].values[pick]
    print(f"Nodes: {nodes}")

    head = data['Id'].values[pick]
    print(f"Head: {head}")

    print(graphs[pick].info())
    return None

In [6]:
def train_test_data(data, sy, ey, tsy, tey):
    """Split ``data`` into train and test frames by its ``Year`` column.

    Each window excludes its lower bound and includes its upper bound:
    training rows satisfy ``sy < Year <= ey`` and test rows satisfy
    ``tsy < Year <= tey``. Both results are independent copies.
    """
    years = data['Year']

    def _window(after, until):
        # Rows whose year falls in the half-open interval (after, until].
        in_window = (years > after) & (years <= until)
        return data[in_window].copy()

    return _window(sy, ey), _window(tsy, tey)

In [24]:
def create_graph_regression_model(generator, lr=0.001):
    """Assemble and compile the graph-level regression network.

    A ``GCNSupervisedGraphClassification`` stack (layer sizes 64 and 64, relu,
    dropout 0.2) built on ``generator`` feeds a dense head of 32 and 16 relu
    units followed by a single output unit with no activation argument.
    The model is compiled with Adam at learning rate ``lr``, mean squared
    error as the loss and "mse" as the reported metric.

    Returns the compiled Keras ``Model``.
    """
    backbone = GCNSupervisedGraphClassification(
        layer_sizes=[64, 64],
        activations=["relu", "relu"],
        generator=generator,
        dropout=0.2,
    )
    graph_inputs, head = backbone.in_out_tensors()

    # Hidden part of the regression head.
    for width in (32, 16):
        head = Dense(units=width, activation="relu")(head)
    # Single-unit output producing the regression value.
    head = Dense(units=1)(head)

    regressor = Model(inputs=graph_inputs, outputs=head)
    regressor.compile(optimizer=Adam(lr), loss=MeanSquaredError(), metrics=["mse"])
    return regressor

In [8]:
def train_model(model, train_gen, val_gen, es, epochs):
    """Fit ``model`` and report its MSE on the validation generator.

    Training runs for at most ``epochs`` epochs on ``train_gen``, validating
    on ``val_gen`` with ``es`` as the only callback. Afterwards the model is
    evaluated once more on ``val_gen`` and the value of its "mse" metric is
    extracted.

    Returns ``(history, val_mse)``.
    """
    fit_options = dict(epochs=epochs, validation_data=val_gen, callbacks=[es])
    history = model.fit(train_gen, **fit_options)

    # evaluate() returns one value per entry of model.metrics_names;
    # look up the position of "mse" rather than assuming it.
    scores = model.evaluate(val_gen, verbose=0)
    mse_position = model.metrics_names.index("mse")

    return history, scores[mse_position]

In [14]:
def get_generators(generator, train_index, val_index, graph_labels, batch_size):
    """Create the training and validation flows from one shared generator.

    For each index set, ``generator.flow`` is called with those indices, the
    matching entries of ``graph_labels`` as targets, and ``batch_size``.

    Returns ``(train_gen, val_gen)``.
    """
    def _flow_for(indices):
        # graph_labels is indexed with the whole index collection at once.
        return generator.flow(
            indices, targets=graph_labels[indices], batch_size=batch_size
        )

    return _flow_for(train_index), _flow_for(val_index)

In [22]:
def test_generator(test_graphs, test_graph_labels, batch_size=32):
    test_gen_pad =  PaddedGraphGenerator(graphs=test_graphs)
    test_index = range(len(test_graphs))
    test_gen   = test_gen_pad.flow(test_index, targets=test_graph_labels[test_index], batch_size=batch_size)
    return test_gen

In [25]:
# Driver cell: loads the embeddings and the dataset, builds one graph per row,
# trains the GCN regression model and prints the validation and test MSE.
# NOTE(review): no random seed is set anywhere in this cell, so the reported
# numbers are not expected to be identical from run to run.
if __name__ == "__main__":
    print("started")
    # Command-line parsing is disabled; the year windows are hard-coded below.
#     parser = argparse.ArgumentParser(description='input output for m1 and m2')
#     parser.add_argument('-sy','--start_date', help='train start year',required=True)
#     parser.add_argument('-ey','--end_date', help='train end year', required=True)
#     parser.add_argument('-tsy','--test_start_date', help='test start year', required=True)
#     parser.add_argument('-tey','--test_end_date', help='test end year', required=True)
    
#     args = vars(parser.parse_args())
#     start_date = int(args['start_date'])
#     end_date   = int(args['end_date'])
#     test_start_date = int(args['test_start_date'])
#     test_end_date   = int(args['test_end_date'])
    # train_test_data keeps rows with start < Year <= end, so these values
    # select 1951-1980 for training and 1981-1985 for testing.
    start_date = 1950
    end_date   = 1980
    test_start_date = 1980
    test_end_date   = 1985
    
    # Training hyper-parameters.
    epochs = 200
    batch_size = 32
    lr= 0.001
    
    # Stop once val_loss has not improved for 25 epochs and restore the
    # weights of the best epoch.
    es = EarlyStopping(monitor="val_loss", min_delta=0, patience=25, restore_best_weights=True)
    
    # load_obj is not defined in this notebook; presumably it is provided by
    # the star import from main_util_func -- TODO confirm.
    secibert_embeddings  = load_obj('combined_reduced_tsne_embed')
    
    # max_len is returned by input_read but is not used further in this cell.
    data, max_len = input_read()
    train_data, test_data = train_test_data(data, start_date, end_date, test_start_date, test_end_date)
    print(f"train data: {train_data.shape[0]}, test data: {test_data.shape[0]}")
    train_graphs, train_graph_labels = graphs_data(secibert_embeddings, train_data)
    test_graphs, test_graph_labels = graphs_data(secibert_embeddings, test_data)
    
    # Regression target: log2 of the last element of each output sequence.
    # A zero or negative value would turn into -inf / nan here.
    train_graph_labels = np.log2(np.array([g[-1] for g in train_graph_labels]))
    test_graph_labels = np.log2(np.array([g[-1] for g in test_graph_labels]))
    print(f"train output: {train_graph_labels[0:5]}")
    print(f"test output: {test_graph_labels[0:5]}")
    
    train_generator = PaddedGraphGenerator(graphs=train_graphs)
    
    # Positional 70/30 split without shuffling: the first 70% of the training
    # graphs are used for fitting, the remainder for validation.
    split_index = int(0.70*len(train_data))
    train_index = range(0, split_index)
    val_index  = range(split_index, len(train_data))

    train_gen, val_gen = get_generators(train_generator, train_index, val_index, train_graph_labels, batch_size=batch_size)
    # The test flow is built from its own PaddedGraphGenerator (see test_generator).
    test_gen= test_generator(test_graphs, test_graph_labels, batch_size)

    model = create_graph_regression_model(train_generator, lr)


    history, mse = train_model(model, train_gen, val_gen, es, epochs)
    print(f"Testing")
    # [-1] takes the last value returned by evaluate(); given how the model is
    # compiled (MSE loss plus the "mse" metric) that is the "mse" metric.
    test_error = model.evaluate(test_gen)[-1]
    print(f"val:{mse}, test error:{test_error}")

started
train data: 19988, test data: 4501
train output: [4.45943162 1.5849625  1.         3.169925   2.80735492]
test output: [4.         3.32192809 1.         2.32192809 3.45943162]
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Testing
val:0.30460596084594727, test error:0.3265707790851593


In [None]:
#validate(graphs, data)