In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE
from main_util_func import *

In [2]:
# Register tqdm with pandas so that `progress_apply` is available on DataFrames;
# `add_additional_info` below relies on it, so this must run before that cell.
tqdm.pandas()

In [3]:
def init(node_data_loc='../new_data/mgpnodeList_with_year_completion.csv',
         edge_data_loc='../new_data/mgpedgeList_wodup.csv'):
    """Load the node/edge tables and build the directed advisor -> advisee graph.

    Parameters
    ----------
    node_data_loc : str
        Path to the node CSV. It must provide at least the ``Id`` and ``Year``
        columns.
    edge_data_loc : str
        Path to the edge CSV. It must provide the ``advisor`` and ``advisee``
        columns.

    Returns
    -------
    tuple
        ``(nodes, edges, graph)`` where ``nodes`` has ``Year`` cast to float,
        ``edges`` gains ``advisee_year`` / ``advisor_year`` columns looked up
        from ``nodes``, and ``graph`` is an ``nx.DiGraph`` containing every
        node ``Id`` with a ``year`` node attribute.

    Raises
    ------
    ValueError
        If a ``Year`` value cannot be converted to float.
    """
    nodes = pd.read_csv(node_data_loc, sep=',', lineterminator="\n", low_memory=False)
    # Cast directly to float. The previous pd.to_datetime(..., errors='ignore')
    # round trip was only harmless when parsing failed; when it succeeded the
    # column became datetime64 and the float cast no longer produced a year.
    nodes['Year'] = nodes['Year'].astype(float)
    edges = pd.read_csv(edge_data_loc, sep=',', low_memory=False, lineterminator="\n")
    print(nodes.shape)
    print(edges.shape)

    # Build the Id -> Year lookup once and reuse it for both edge endpoints.
    year_by_id = nodes.set_index('Id')['Year']
    edges['advisee_year'] = edges['advisee'].map(year_by_id)
    edges['advisor_year'] = edges['advisor'].map(year_by_id)

    nodes_att = dict(zip(nodes['Id'], nodes['Year']))
    graph = nx.from_pandas_edgelist(edges, 'advisor', 'advisee', create_using=nx.DiGraph())
    # Nodes that never appear in an edge would otherwise be missing from the graph.
    graph.add_nodes_from(nodes_att.keys())
    nx.set_node_attributes(graph, values=nodes_att, name='year')
    return nodes, edges, graph

In [4]:
def add_additional_info(edges, nodes, graph):
    """Attach per-node input/output features to ``nodes`` and write both tables to CSV.

    ``nodes`` is modified in place: twelve columns are added, derived from
    ``find_family_graph``, ``find_output_sequence`` and
    ``convert_directed_family_graph_to_tree_keep_old`` (helpers that are not
    defined in this notebook; presumably they come from ``main_util_func`` --
    TODO confirm). ``edges`` is written out unchanged. Nothing is returned.

    Parameters
    ----------
    edges : pandas.DataFrame
        Edge table; only serialised here.
    nodes : pandas.DataFrame
        Node table with an ``Id`` column; receives the new columns.
    graph : networkx graph
        Passed through to the helper functions together with each node ``Id``.
    """
    # Input side: the helper's result is indexed at positions 0..3 below.
    nodes['combined_input'] = nodes.progress_apply(
        lambda row: find_family_graph(graph, row['Id']), axis=1
    )
    input_fields = (
        ('nodeid', 0),
        ('input_connected', 1),
        ('input_edgelist', 2),
        ('input_node_years', 3),
    )
    for column, position in input_fields:
        nodes[column] = nodes['combined_input'].apply(
            lambda parts, position=position: parts[position]
        )
    nodes['input_length'] = nodes['input_node_years'].apply(len)

    # Output side: position 0 is used as the sequence, position 1 as the years.
    nodes['combined_output'] = nodes.progress_apply(
        lambda row: find_output_sequence(graph, row['Id']), axis=1
    )
    nodes['output_seq'] = nodes['combined_output'].apply(lambda parts: parts[0])
    nodes['output_uniq_seq'] = nodes['combined_output'].apply(lambda parts: set(parts[0]))
    nodes['output_years'] = nodes['combined_output'].apply(lambda parts: parts[1])
    nodes['output_uniq_len'] = nodes['output_uniq_seq'].apply(len)

    tree_inputs = nodes[['input_edgelist', 'input_node_years']]
    nodes['tree_edges'] = tree_inputs.apply(
        convert_directed_family_graph_to_tree_keep_old, axis=1
    )

    # previous file in new_data folder
    nodes.to_csv('./data/mgpnodeList_with_year_completion_updated_old_kept.csv', index=False)
    edges.to_csv('./data/mgpedgeList_wodup_updated_old_kept.csv', index=False)

In [5]:
if __name__ == "__main__":
    nodes, edges, graph = init()

    # Drop self-loops first; the edge list is materialised before the graph is mutated.
    graph.remove_edges_from(list(nx.selfloop_edges(graph)))

    # nx.simple_cycles yields each cycle as a list of nodes, not as edges.
    # Remove the edge between the first two nodes of every cycle, passed as an
    # explicit (u, v) tuple. This is the same edge the previous call removed
    # implicitly, because remove_edges_from only reads the first two items of
    # each element. Collecting the edges into a list first means the cycle
    # enumeration has finished before any edge is deleted.
    graph.remove_edges_from(
        [(cycle[0], cycle[1]) for cycle in nx.simple_cycles(graph)]
    )

    add_additional_info(edges, nodes, graph)

(268653, 9)
(292586, 2)


100%|██████████| 268653/268653 [05:57<00:00, 751.12it/s]  
100%|██████████| 268653/268653 [17:38<00:00, 253.70it/s]  


In [6]:
# Shape (rows, columns) of the subset of nodes whose `input_connected` value equals True.
nodes.loc[nodes['input_connected'].eq(True)].shape

(65276, 21)