In [1]:
import csv
import pickle
import random
import matplotlib
import numpy as np
import pandas as pd
from glob import glob
from numpy import nan
import networkx as nx
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras.models import Model
from keras.utils.vis_utils import plot_model
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.sequence import pad_sequences
from main_util_func import *
from dataprep.clean import clean_df

In [2]:
import functools
# map(functools.partial(add, y=2), a)

In [3]:
# import os
# os.environ['LD_LIBRARY_PATH'] = "/usr/local/cuda-10.1/lib64/"

In [4]:
def multi_parent_detect(filtered_researchers):
    """Find the rows (families) whose edge list contains a multi-parent node.

    A node is "multi-parent" when more than one distinct source node has an
    edge pointing to it.

    Parameters
    ----------
    filtered_researchers : pandas.DataFrame
        Must provide an ``input_edgelist`` column whose cells are strings that
        evaluate to an iterable of edges. The first two items of every edge
        are read as ``(parent, child)``; any further items are ignored.

    Returns
    -------
    tuple of (int, list)
        The number of matching rows, and their index labels in iteration
        order. Each label is listed exactly once, even when the row holds
        several multi-parent nodes, so the list can be handed to ``.loc``
        without duplicating rows.
    """
    multi_index = []
    for i, row in filtered_researchers.iterrows():
        # NOTE(review): eval() executes whatever code the CSV cell contains.
        # Kept because the cells may rely on names from this module's
        # namespace; switch to ast.literal_eval once the cells are confirmed
        # to be plain literals.
        edges = eval(row['input_edgelist'])

        # child -> set of distinct parents; the set collapses repeated edges.
        parents_of = {}
        for edge in edges:
            parents_of.setdefault(edge[1], set()).add(edge[0])

        if any(len(parents) > 1 for parents in parents_of.values()):
            multi_index.append(i)
    return len(multi_index), multi_index

In [11]:
def data_load(data_loc='./inter_files/path_model_with_tree_newer_rel_kept_input_output1.csv',
              embedding_name='combined_reduced_tsne_embed',
              min_followup_years=15,
              last_data_year=2021):
    """Load the node embeddings and build the multi-parent test set.

    Parameters
    ----------
    data_loc : str
        CSV holding the path-model inputs and outputs. The all-relations
        variant used previously is './inter_files/path_model_input_output1.csv'.
        NOTE(review): the original inline comment labelled the default file
        "older relation kept" while its name says "newer_rel_kept" -- confirm
        which one is intended.
    embedding_name : str
        Object name passed to ``load_obj`` to fetch the embedding mapping.
    min_followup_years : int
        A row is kept only if ``first_year + min_followup_years`` does not
        exceed ``last_data_year``.
    last_data_year : int
        Last year used in the follow-up filter above.

    Returns
    -------
    tuple
        ``(embeddings, test_input, test_output, max_path_len, max_num_paths)``
        where ``test_input`` / ``test_output`` are arrays built from the
        ``padded_paths`` / ``output_seq`` columns of the selected rows.
    """
    embeddings = load_obj(embedding_name)

    print(f"embedding nodes : {len(embeddings)}")
    print(f"embedding shape : {embeddings[1].shape}")
    # Key 0 gets an all-zero vector of the same length as the other
    # embeddings (presumably the padding id produced by pad_sequence -- confirm).
    embeddings[0] = np.zeros(embeddings[1].shape[0], dtype=int)

    data = pd.read_csv(data_loc, sep=',', lineterminator="\n", low_memory=False)
    _, cleaned_data = clean_df(data)
    del data

    # NOTE(review): the eval() calls below execute text read from the CSV.
    # They are kept inside lambdas so names are resolved in this module's
    # namespace; replace with ast.literal_eval once the cells are confirmed to
    # be plain literals.
    cleaned_data['first_year'] = cleaned_data['input_years_sequence'].apply(lambda x: eval(x)[1])
    #new after 05-05-2022

    cleaned_data['flag'] = cleaned_data['first_year'] + min_followup_years <= last_data_year #new
    cleaned_data = cleaned_data[cleaned_data['flag']].copy() #new
    cleaned_data['output_seq'] = cleaned_data['output_seq'].apply(lambda x: eval(x))
    cleaned_data['paths'] = cleaned_data['paths'].apply(lambda x: eval(x))

    max_num_paths = max(len(paths) for paths in cleaned_data['paths'].values)
    print(f"Max no. paths : {max_num_paths}")
    max_path_len = max(len(path) for paths in cleaned_data['paths'].values for path in paths)
    print(f"Max path len : {max_path_len}")

    # Pad every family to the same (max_num_paths, max_path_len) layout.
    cleaned_data['padded_paths'] = [
        pad_sequence(paths, num_seq=max_num_paths, seq_length=max_path_len)
        for paths in tqdm(cleaned_data['paths'], total=cleaned_data.shape[0])
    ]

    _, multi_index = multi_parent_detect(cleaned_data)
    # A repeated label would make .loc return the same row several times and
    # overweight that family in the evaluation, so keep each label once
    # (first occurrence, original order).
    test_index = list(dict.fromkeys(multi_index))
    print(f"Family having researcher with multi parents: {len(test_index)}")

    test_data = cleaned_data.loc[test_index].copy()
    print(test_data.shape)
    del cleaned_data

    test_input = np.array([np.array(list1) for list1 in test_data['padded_paths'].values])
    test_output = np.array([np.array(list1) for list1 in test_data['output_seq'].values])

    return embeddings, test_input, test_output, max_path_len, max_num_paths

In [12]:
def mapping(sequences, embeddings, max_num_paths, max_path_len):
    """Replace every node id in ``sequences`` by its embedding vector.

    ``sequences`` is walked path by path and node by node; each node is cast
    to ``int`` and looked up in ``embeddings``. The collected vectors are
    returned as an array reshaped to ``(max_num_paths, max_path_len, -1)``.
    """
    vectors = []
    for path in sequences:
        for node in path:
            vectors.append(embeddings[int(node)])
    stacked = np.array(vectors)
    return stacked.reshape(max_num_paths, max_path_len, -1)

In [13]:
def load_model3(model_file, enc_file, dec_file):
    """Load the three saved models and return them as one tuple.

    Each file is read with ``load_model(..., compile=False)``. The result is
    ``(model, model_enc, model_dec)``, in the same order as the arguments.
    """
    paths = (model_file, enc_file, dec_file)
    return tuple(load_model(path, compile=False) for path in paths)

In [14]:
def decode_sequence(encoder_model, decoder_model, input_seq, input_length, output_size,
                    max_num_seq, max_seq_len, embedding_size, num_steps=None):
    """Decode an output sequence step by step with an encoder/decoder pair.

    The encoder is run once on ``input_seq``; the decoder is then called
    repeatedly, each call receiving the previous output and the previous
    states.

    Parameters
    ----------
    encoder_model : object with ``predict(input_seq)``
        Must return the initial decoder states as a sequence.
    decoder_model : object with ``predict([target_seq, *states])``
        Must return ``(output_tokens, h, c)``.
    input_seq : numpy.ndarray
        Reshaped to ``(-1, max_num_seq, max_seq_len, embedding_size)``.
    input_length : array-like
        First decoder input, reshaped to ``(-1, 1, output_size)``.
    output_size : int
        Size of the last axis of the decoder input and output.
    max_num_seq, max_seq_len, embedding_size : int
        Dimensions used to reshape ``input_seq``.
    num_steps : int, optional
        Number of decoder steps to run. Defaults to ``max_seq_len``, which is
        what this function always used; pass it explicitly when the output
        horizon differs from the input path length.

    Returns
    -------
    numpy.ndarray
        Decoder outputs reshaped to ``(-1, num_steps, output_size)``.

    Raises
    ------
    ValueError
        If ``num_steps`` is smaller than 1.
    """
    if num_steps is None:
        num_steps = max_seq_len
    if num_steps < 1:
        raise ValueError("num_steps must be at least 1")

    input_seq = input_seq.reshape(-1, max_num_seq, max_seq_len, embedding_size)
    states_value = encoder_model.predict(input_seq)

    target_seq = np.array(input_length).reshape(-1, 1, output_size)  # (N * 1 * output_size)

    decoded_steps = []
    for _ in range(num_steps):
        # list() lets the encoder return its states as a tuple as well.
        output_tokens, h, c = decoder_model.predict([target_seq] + list(states_value))
        decoded_steps.append(output_tokens)

        # Feed the prediction and the new states into the next step.
        target_seq = output_tokens
        states_value = [h, c]

    return np.hstack(decoded_steps).reshape(-1, num_steps, output_size)

In [17]:
if __name__=="__main__":
    # Evaluate every saved encoder/decoder pair on the multi-parent test set
    # and report the MSE per training split and overall.
    #print(f"Without relation removed(all relation kept):")
    print(f"With relation removed (older relation kept):")
    embeddings, test_input, test_output, max_path_len, max_num_paths = data_load()
    mapping_other = functools.partial(mapping, embeddings=embeddings,max_num_paths=max_num_paths,max_path_len=max_path_len)
    # Swap node ids for embedding vectors, one family at a time.
    test_input = np.array([mapping_other(family_paths) for family_paths in test_input])
    print(f"test input shape: {test_input.shape}")
    test_output = np.log2(test_output)
    print(f"test output shape: {test_output.shape}")
    max_num_seq = test_input.shape[1]
    max_seq_len = test_input.shape[2]
    embedding_size = test_input.shape[3]
    output_size = 1

    test_data = test_input
    # Column 0 seeds the decoder; the remaining columns are the targets.
    test_input_length = test_output[:,0]
    test_actual_op = test_output[:,1:].reshape(-1, test_output.shape[1] - 1, 1)
    error_list = []

    # Number of saved runs evaluated for every training split.
    runs_per_split = 6

    years = ["1950_1980_1980_1985","1950_1985_1985_1990", "1950_1990_1990_1995","1950_1995_1995_2000","1950_1980_1995_2000"]
    #path_dec_32_512_0.0001_25_1950_1980_1995_2000_0.04324817657470703_0.1624655984963131_True_0.01
    dataset = {}
    for year in years:
        temp=[]
        # glob() returns matches in arbitrary order, so the three lists are
        # sorted before being paired by position. This assumes the files of
        # one run share the same name suffix -- confirm against ./exp_models.
        model_file   = sorted(glob(f"./exp_models/path_32_512_0.0001_25_{year}*True_0.01_nrk"))
        encoder_file = sorted(glob(f"./exp_models/path_enc_32_512_0.0001_25_{year}*True_0.01_nrk"))
        decoder_file = sorted(glob(f"./exp_models/path_dec_32_512_0.0001_25_{year}*True_0.01_nrk"))
#         model_file   = glob(f"./exp_models/path_32_512_0.0001_25_{year}*True_0.01") #With relation kept
#         encoder_file = glob(f"./exp_models/path_enc_32_512_0.0001_25_{year}*True_0.01")
#         decoder_file = glob(f"./exp_models/path_dec_32_512_0.0001_25_{year}*True_0.01")
        found = min(len(model_file), len(encoder_file), len(decoder_file))
        if found < runs_per_split:
            # Fail with a clear message instead of an IndexError further down.
            raise FileNotFoundError(
                f"expected {runs_per_split} saved runs for {year}, found {found}")
        for file_inx in range(runs_per_split):
            model, encoder_model, decoder_model = load_model3(model_file[file_inx], encoder_file[file_inx], decoder_file[file_inx])
            test_predicted_op = decode_sequence(encoder_model, decoder_model, test_data, test_input_length, output_size, max_num_seq, max_seq_len, embedding_size)
            # sklearn's signature is (y_true, y_pred).
            error = mse(test_actual_op.reshape(-1,), test_predicted_op.reshape(-1,))
            error_list.append(error)
            temp.append(error)
        dataset[str(year)]=(np.mean(temp), np.std(temp))
    avg_error =  np.mean(error_list)
    std = np.std(error_list)
    print(f"Error with relation removed-> Mean:{avg_error}, Std:{std}")
    #print(f"Error without relation removed-> Mean:{avg_error}, Std:{std}")

With relation removed (older relation kept):
embedding nodes : 268653
embedding shape : (400,)
Data Type Detection Report:
	These data types are supported by DataPrep to clean: ['country']
Column Headers Cleaning Report:
No Headers Cleaned.
Downcast Memory Report:
	Memory reducted from 81052290 to 79681494. New size: (98.31%)


  2%|▏         | 824/43164 [00:00<00:05, 8239.14it/s]

Max no. paths : 52
Max path len : 5


100%|██████████| 43164/43164 [00:04<00:00, 9121.36it/s]


Family having researcher with multi parents: 1128
(1128, 18)
test input shape: (1128, 52, 5, 400)
test output shape: (1128, 6)
Error with relation removed-> Mean:1.1992622955833, Std:0.9352781169416587


In [14]:
#test_predicted_op

In [15]:
#test_actual_op

In [1]:
#glob(f"./exp_models/path_dec_32_32_0.0001_25_{year}*True_0.01_oldrk")

In [16]:
dataset

{'1950_1980_1980_1985': (11.84475482220445, 5.904536041657321),
 '1950_1985_1985_1990': (12.007767090805586, 3.5552801344161105),
 '1950_1990_1990_1995': (3.4040206701080944, 3.6804298760042427),
 '1950_1995_1995_2000': (4.018599642239562, 3.853051603010467),
 '1950_1980_1995_2000': (10.863088815688778, 7.6704058316229995)}

In [18]:
dataset

{'1950_1980_1980_1985': (1.5562725812366895, 1.0259584432534765),
 '1950_1985_1985_1990': (1.3307620452563567, 0.8018707570009993),
 '1950_1990_1990_1995': (0.5570353752792092, 0.4909129718384233),
 '1950_1995_1995_2000': (1.1258617373619033, 1.0288783745320078),
 '1950_1980_1995_2000': (1.4263797387823416, 0.8742872331746663)}

In [10]:
#  with relation removed
# {'1950_1980_1980_1985': (1.314820141018356, 0.8301380089761211),
#  '1950_1985_1985_1990': (0.838448149452543, 0.8131235937916369),
#  '1950_1990_1990_1995': (0.5734713805111756, 0.28322045831154463),
#  '1950_1995_1995_2000': (0.8145801653747169, 0.9274146325576711),
#  '1950_1980_1995_2000': (1.1505943449076284, 1.0636817965990732)}

In [12]:
# Without relation removed (results)

# {'1950_1980_1980_1985': (1.9235642143992513, 0.7415566749356955),
#  '1950_1985_1985_1990': (0.8826815717419967, 0.8292638474599289),
#  '1950_1990_1990_1995': (1.3671353133720894, 1.4374834860291181),
#  '1950_1995_1995_2000': (1.9457733611593966, 1.4761518765433876),
#  '1950_1980_1995_2000': (1.9256257579640799, 1.3454863929371412)}

In [None]:
#Error without relation removed-> Mean:1.6089553557902456, Std:1.2804592642682693

In [None]:
#Error with relation removed-> Mean:0.9385267230940997, Std:0.8685251846360821

In [12]:
#len(error_list)

In [101]:
#test_actual_op

In [102]:
#test_predicted_op

In [10]:
# model, encoder_model, decoder_model = load_model3(model_file[file_inx], encoder_file[file_inx], decoder_file[file_inx])
            
#predicted_op = decode_sequence(enc, dec, encoder_input, decoder_input, max_seq_len=5)
#predicted_op_1 = predicted_op.reshape(predicted_op.shape[0], predicted_op.shape[1])


In [11]:
# res_m2 = np.sum(np.square(test_predicted_op-test_actual_op))/(test_predicted_op.shape[0]*test_predicted_op.shape[1])

In [15]:
[]+[1,2,3]+[4,5,6]

[1, 2, 3, 4, 5, 6]