In [1]:
import os
import networkx as nx
import pandas as pd

import json

from IPython.display import display, HTML

import itertools

# Cora

In [None]:
# Cora dataset directory. The path is assembled from separate components so
# that no backslash escape sequence (the original literal contained "\c") is
# involved and the correct separator is used on every operating system.
data_dir = os.path.join(os.getcwd(), "data", "cora")

In [None]:
# Load the citation edge list: a tab-separated file without a header row whose
# two columns are read as "target" and "source" (in that order).
cites_path = os.path.join(data_dir, "cora.cites")
edgelist = pd.read_csv(cites_path, sep="\t", header=None, names=["target", "source"])
# Every edge gets the same relationship label.
edgelist = edgelist.assign(label="cites")

In [None]:
# Preview the first five edges as a rich table.
edgelist_preview = edgelist.head(n=5)
display(edgelist_preview)

In [None]:
# Build the graph from the edge list; the endpoint columns are spelled out
# explicitly and each edge keeps its "label" column as an edge attribute.
G = nx.from_pandas_edgelist(
    edgelist,
    source="source",
    target="target",
    edge_attr="label",
)

In [None]:
# Show one node id together with its Python type (at most one node is printed).
for node in itertools.islice(G.nodes(), 1):
    print(node, type(node))

In [None]:
# Give every node the attribute "label" with the constant value "paper".
nx.set_node_attributes(G, values="paper", name="label")

In [None]:
# Inspect one node's attribute dictionary before the feature attributes are attached.
sample_node_id = 1103985
G.nodes[sample_node_id]

In [None]:
# Number of "w_<i>" feature columns generated for the Cora node table.
_NUM_FEATURES = 1433

In [None]:
# Column labels: one "w_<i>" name per feature, followed by the subject column.
feature_names = [f"w_{idx}" for idx in range(_NUM_FEATURES)]
column_names = [*feature_names, "subject"]
# Read the tab-separated node table (no header row) using those labels.
content_path = os.path.join(data_dir, "cora.content")
node_list = pd.read_csv(content_path, sep="\t", header=None, names=column_names)

In [None]:
# gather features and save scalar per attribute
# The frame is restricted to `feature_names` once, so the "subject" column is
# left out without testing every column against the feature list for every
# row (the previous list-membership test made this loop quadratic in the
# number of features).
for node_id, feature_row in node_list[feature_names].iterrows():
    # The row label is the node id; an id that is not a node of G raises KeyError.
    G.nodes[node_id].update(feature_row.to_dict())

In [None]:
# Attach each row's "subject" value to the node with the same id.
subject_by_node = node_list["subject"].to_dict()
nx.set_node_attributes(G, subject_by_node, name="subject")

In [None]:
# Look up the subject stored on one node now that attributes are attached.
sample_node_id = 1103985
G.nodes[sample_node_id]["subject"]

In [None]:
# DO NOT RUN THIS CELL
# NOTE(review): the reason is not stated here. The cell rebinds `node_list`
# without its per-feature columns, so running it a second time fails on the
# first statement — confirm whether it should stay in the notebook.

# gather all features into lists under 'features' column.
node_list["features"] = node_list[feature_names].values.tolist()

# Drop the individual feature columns now that they are packed into "features".
node_list = node_list.drop(columns=feature_names)
# Expose the index values as a regular "id" column as well.
node_list["id"] = node_list.index
# Preview the reshaped frame.
node_list.head(5)

In [None]:
# Serialise the attributed graph to a `.graphml` file inside the data directory.
graphml_path = os.path.join(data_dir, "cora.graphml")
nx.write_graphml(G, graphml_path)

In [None]:
# Reload the graph from the GraphML file in the data directory.
graphml_path = os.path.join(data_dir, "cora.graphml")
G = nx.read_graphml(graphml_path)

In [None]:
# Print the one-line graph summary (graph type, node count, edge count).
# `nx.info` was deprecated in the NetworkX 2.x series and removed in 3.0;
# printing the graph itself yields the same summary text.
print(G)

In [None]:
# After the GraphML round trip the sample node is looked up by a string id.
sample_attributes = G.nodes["1103985"]
print(sample_attributes)

# Citeseer

In [None]:
# Citeseer dataset directory. Built from separate path components so that no
# backslash escape sequence (the original literal contained "\c") is involved
# and the path is valid on every operating system.
data_dir = os.path.join(os.getcwd(), "data", "citeseer")

In [None]:
# Load the citation edge list: a tab-separated file without a header row whose
# two columns are read as "target" and "source" (in that order).
cites_path = os.path.join(data_dir, "citeseer.cites")
edgelist = pd.read_csv(cites_path, sep="\t", header=None, names=["target", "source"])
# Every edge gets the same relationship label.
edgelist = edgelist.assign(label="cites")

In [None]:
# Plain-text preview of the first five edges.
edgelist_preview = edgelist.head(n=5)
print(edgelist_preview)

In [None]:
# Number of "w_<i>" feature columns generated for the Citeseer node table.
_NUM_FEATURES = 3703

In [None]:
# add node attributes
# Column labels: one "w_<i>" name per feature, followed by the subject column.
feature_names = [f"w_{idx}" for idx in range(_NUM_FEATURES)]
column_names = [*feature_names, "subject"]
column_dtypes = {
    0: str,          # node id (string)
    "subject": str,  # node subject (string)
}
content_path = os.path.join(data_dir, "citeseer.content")
node_list = pd.read_csv(
    content_path,
    sep="\t",
    header=None,
    names=column_names,
    dtype=column_dtypes,
)

In [None]:
# Keep only the edges whose two endpoints both occur in the node table's index.
# `get_indexer` yields -1 for a label it cannot find, hence the `>= 0` test.
node_index = node_list.index
valid_source = node_index.get_indexer(edgelist["source"]) >= 0
valid_target = node_index.get_indexer(edgelist["target"]) >= 0
cleaned_edgelist = edgelist[valid_source & valid_target]

In [None]:
# Report the (rows, columns) shape before and after removing edges with
# endpoints that are missing from the node table.
print(f"Original edgelist size: {edgelist.shape}")
print(f"Cleaned edgelist size: {cleaned_edgelist.shape}")

In [None]:
# Build the graph from the cleaned edge list; the endpoint columns are spelled
# out explicitly and each edge keeps its "label" column as an edge attribute.
G = nx.from_pandas_edgelist(
    cleaned_edgelist,
    source="source",
    target="target",
    edge_attr="label",
)

In [None]:
# Show one node id together with its Python type (at most one node is printed).
for node in itertools.islice(G.nodes(), 1):
    print(node, type(node))

In [None]:
# convert node ids to integers

# Not possible here: some node ids combine letters and digits, so the ids must stay strings.

In [None]:
# print a sample node
for node in itertools.islice(G.nodes(data=True), 1):
    # (node id, attribute dict) as it looks before the features are attached
    print(node)

In [None]:
# gather features and save scalar values per attribute
# The frame is restricted to `feature_names` once, so the "subject" column is
# left out without testing every column against the feature list for every
# row (the previous list-membership test made this loop quadratic in the
# number of features).
for node_id, feature_row in node_list[feature_names].iterrows():
    # The row label is the node id; an id that is not a node of G raises KeyError.
    G.nodes[node_id].update(feature_row.to_dict())

In [None]:
# Attach each row's "subject" value to the node with the same id.
subject_by_node = node_list["subject"].to_dict()
nx.set_node_attributes(G, subject_by_node, name="subject")

In [None]:
# print a sample node
for node in itertools.islice(G.nodes(data=True), 1):
    # (node id, attribute dict) once the attributes have been attached
    print(node)

In [None]:
# Serialise the attributed graph to a GraphML file inside the data directory.
graphml_path = os.path.join(data_dir, "citeseer.graphml")
nx.write_graphml(G, graphml_path)

In [None]:
# Reload the graph from the GraphML file in the data directory.
graphml_path = os.path.join(data_dir, "citeseer.graphml")
G = nx.read_graphml(graphml_path)

In [None]:
# print graph info
# `nx.info` was deprecated in the NetworkX 2.x series and removed in 3.0;
# printing the graph itself yields the same one-line summary
# (graph type, node count, edge count).
print(G)

# PubMed

In [2]:
# PubMed dataset directory. Built from separate path components so that no
# backslash escape sequence (the original literal contained "\p") is involved
# and the path is valid on every operating system.
data_dir = os.path.join(os.getcwd(), "data", "pubmed-diabetes")

In [3]:
# import edge list
# The tab-separated file is read without its first two lines; its four columns
# are named below and only the two endpoint columns are kept.
cites_path = os.path.join(data_dir, "Pubmed-Diabetes.DIRECTED.cites.tab")
edge_columns = ["id", "source", "pipe", "target"]
edgelist = pd.read_csv(
    cites_path,
    sep="\t",
    skiprows=2,
    header=None,
    names=edge_columns,
    usecols=["source", "target"],
)

In [4]:
# Report the edge list's (rows, columns) shape.
edgelist_shape = edgelist.shape
print(edgelist_shape)

(44338, 2)


In [5]:
# Convert "paper:<pid>" strings into integer paper ids.
# `str.lstrip("paper:")` removes any leading run of the characters p, a, e, r
# and ":" rather than the literal prefix, so an anchored pattern is used to
# drop exactly one "paper:" prefix. Casting to str first keeps this cell
# re-runnable after the columns have already been converted to integers.
for endpoint in ("source", "target"):
    edgelist[endpoint] = (
        edgelist[endpoint]
        .astype(str)
        .str.replace(r"^paper:", "", regex=True)
        .astype(int)
    )

In [6]:
# Plain-text preview of the first five edges after the id conversion.
edgelist_preview = edgelist.head(n=5)
print(edgelist_preview)

     source    target
0  19127292  17363749
1  19668377  17293876
2   1313726   3002783
3  19110882  14578298
4  18606979  10333910


In [7]:
def parse_feature(feat: str):
    """Split one ``name=value`` token into its name and numeric value.

    Parameters
    ----------
    feat : str
        Text containing exactly one ``=`` separator.

    Returns
    -------
    tuple
        ``(name, value)`` where ``value`` has been converted to ``float``.

    Raises
    ------
    ValueError
        If ``feat`` does not split into exactly two parts on ``=``, or if the
        value part cannot be converted to ``float``.
    """
    key, raw_value = feat.split("=")
    parsed = (key, float(raw_value))
    return parsed

In [8]:
def parse_line(line: str) -> dict:
    """Turn one tab-separated node record into a flat dictionary.

    The first field is the paper id, the second is a ``name=value`` label
    token, the last field is discarded, and every field in between is parsed
    as a ``name=value`` feature token.

    Parameters
    ----------
    line : str
        One raw line of the node file.

    Returns
    -------
    dict
        The parsed features keyed by name, plus ``"pid"`` (int) and
        ``"label"`` (int).

    Raises
    ------
    ValueError
        If the line has fewer than three tab-separated fields, or a field
        cannot be parsed.
    """
    pid, raw_label, *raw_features, _summary = line.split("\t")
    record = {name: value for name, value in map(parse_feature, raw_features)}
    record.update(pid=int(pid))
    # Only the numeric part of the label token is kept.
    record.update(label=int(parse_feature(raw_label)[1]))
    return record

In [9]:
# Relative path of the PubMed node file. It is assembled from components so
# that no backslash escape sequence (the original literal contained "\p") is
# involved and the correct separator is used on every operating system.
# NOTE(review): the edge file in this notebook is spelled "Pubmed-Diabetes..."
# while this file name is all lower-case; on a case-sensitive filesystem
# confirm the on-disk spelling.
node = os.path.join("data", "pubmed-diabetes", "pubmed-diabetes.NODE.paper.tab")

In [10]:
# Parse every line after the first two into one dictionary per paper, then
# build the node table from those records.
with open(node) as fp:
    records = [parse_line(line) for line in itertools.islice(fp, 2, None)]
node_data = pd.DataFrame(records)

In [11]:
# Features missing from a paper's record become 0, and the paper id becomes
# the row index.
node_data = node_data.fillna(0).set_index("pid")

In [12]:
# The "label" column of the node table, indexed like the table itself.
labels = node_data.loc[:, "label"]

In [13]:
# Feature table: every column of the node table except "label".
nodes = node_data.drop("label", axis="columns")

In [14]:
# Build a directed multigraph from the edge list by handing a fresh, empty
# MultiDiGraph to the constructor helper.
G = nx.from_pandas_edgelist(edgelist, create_using=nx.MultiDiGraph())

In [18]:
# Show one (node id, attribute dict) pair before any attributes are attached.
for node in G.nodes( data=True ):
    print(node)        # nodes without attributes
    break

# `nx.info` was deprecated in the NetworkX 2.x series and removed in 3.0;
# printing the graph itself yields the same one-line summary.
print( G )

(19127292, {})
MultiDiGraph with 19717 nodes and 44338 edges



  print( nx.info(G) )


In [19]:
# gather features into attributes
# One dictionary update per node replaces the per-column Python loop with its
# label-based Series lookups; every column of `nodes` is still copied.
for node_id, feature_row in nodes.iterrows():
    # The row label is the paper id; an id that is not a node of G raises KeyError.
    G.nodes[node_id].update(feature_row.to_dict())

In [20]:
# Attach each paper's label value to the node with the same id.
label_by_node = labels.to_dict()
nx.set_node_attributes(G, label_by_node, name="label")

In [21]:
# Show one (node id, attribute dict) pair now that attributes are attached.
for node in itertools.islice(G.nodes(data=True), 1):
    print(node)

(19127292, {'w-rat': 0.0, 'w-common': 0.0, 'w-use': 0.0, 'w-examin': 0.02889892370027422, 'w-pathogenesi': 0.0, 'w-retinopathi': 0.0, 'w-mous': 0.0, 'w-studi': 0.03927472173927795, 'w-anim': 0.0, 'w-model': 0.0, 'w-metabol': 0.0, 'w-abnorm': 0.0, 'w-contribut': 0.0, 'w-develop': 0.0, 'w-investig': 0.0, 'w-mice': 0.0, 'w-2': 0.0051138308462225415, 'w-month': 0.0, 'w-compar': 0.017653254069202904, 'w-obtain': 0.0, 'w-method': 0.0, 'w-induc': 0.0, 'w-6': 0.0, 'w-inject': 0.0, 'w-experiment': 0.0, 'w-normal': 0.0, 'w-diet': 0.0, 'w-30': 0.0, 'w-hyperglycemia': 0.0, 'w-level': 0.0, 'w-lipid': 0.0, 'w-oxid': 0.0, 'w-activ': 0.0, 'w-protein': 0.0, 'w-kinas': 0.0, 'w-c': 0.0, 'w-measur': 0.011734827227009909, 'w-result': 0.005626481865792995, 'w-increas': 0.013080628861391524, 'w-retin': 0.0, 'w-stress': 0.0, 'w-3': 0.009410652924576506, 'w-similar': 0.0, 'w-observ': 0.013638082547296626, 'w-conclus': 0.009595651021257695, 'w-play': 0.0, 'w-import': 0.0, 'w-role': 0.0, 'w-present': 0.0, 'w-p':

In [22]:
# Serialise the attributed graph to a GraphML file inside the data directory.
graphml_path = os.path.join(data_dir, "pubmed-diabetes.graphml")
nx.write_graphml(G, graphml_path)

In [23]:
# Reload the graph from the GraphML file in the data directory.
graphml_path = os.path.join(data_dir, "pubmed-diabetes.graphml")
G = nx.read_graphml(graphml_path)

In [24]:
# print graph info
# `nx.info` was deprecated in the NetworkX 2.x series and removed in 3.0;
# printing the graph itself yields the same one-line summary
# (graph type, node count, edge count).
print(G)

DiGraph with 19717 nodes and 44338 edges



  print(nx.info(G))


In [25]:
# print a sample node
# Shows one (node id, attribute dict) pair of the graph loaded from GraphML.
for node in itertools.islice(G.nodes(data=True), 1):
    print(node)

('19127292', {'w-rat': 0.0, 'w-common': 0.0, 'w-use': 0.0, 'w-examin': 0.02889892370027422, 'w-pathogenesi': 0.0, 'w-retinopathi': 0.0, 'w-mous': 0.0, 'w-studi': 0.03927472173927795, 'w-anim': 0.0, 'w-model': 0.0, 'w-metabol': 0.0, 'w-abnorm': 0.0, 'w-contribut': 0.0, 'w-develop': 0.0, 'w-investig': 0.0, 'w-mice': 0.0, 'w-2': 0.0051138308462225415, 'w-month': 0.0, 'w-compar': 0.017653254069202904, 'w-obtain': 0.0, 'w-method': 0.0, 'w-induc': 0.0, 'w-6': 0.0, 'w-inject': 0.0, 'w-experiment': 0.0, 'w-normal': 0.0, 'w-diet': 0.0, 'w-30': 0.0, 'w-hyperglycemia': 0.0, 'w-level': 0.0, 'w-lipid': 0.0, 'w-oxid': 0.0, 'w-activ': 0.0, 'w-protein': 0.0, 'w-kinas': 0.0, 'w-c': 0.0, 'w-measur': 0.011734827227009909, 'w-result': 0.005626481865792995, 'w-increas': 0.013080628861391524, 'w-retin': 0.0, 'w-stress': 0.0, 'w-3': 0.009410652924576506, 'w-similar': 0.0, 'w-observ': 0.013638082547296626, 'w-conclus': 0.009595651021257695, 'w-play': 0.0, 'w-import': 0.0, 'w-role': 0.0, 'w-present': 0.0, 'w-p