In [1]:
# Name of the CSV column that is used as the DataFrame index when the date file is read.
ID_COL = 'Id'

In [2]:
import pandas as pd
from collections import Counter
import networkx as nx
import pygraphviz as pgv

In [3]:
# Count, over every row of the date file, how often one station is directly
# followed by another. Keys are (from_station, to_station) tuples, values are
# the number of rows in which that transition occurs.
edge_counter = Counter()
# Read in chunks so the whole file never has to be held in memory at once.
dates = pd.read_csv('./Bosch/train_date.csv', chunksize=10000, index_col=ID_COL)
for chunk in dates:
    # Keep only the first two '_'-separated parts of each column name; several
    # columns may collapse onto the same station label.
    stations = ['_'.join(col.split('_')[:2]) for col in chunk.columns]
    # Iterate the boolean mask positionally instead of transposing the frame
    # and looking each row up by label: no in-place mutation of the chunk, and
    # no dependence on the index values being unique.
    for row_mask in chunk.notnull().values:
        # Stations with at least one non-null value, in column order, de-duplicated.
        visited = list(dict.fromkeys(
            station for station, present in zip(stations, row_mask) if present
        ))
        # Consecutive pairs form the edges; rows with fewer than two stations add nothing.
        edge_counter.update(zip(visited, visited[1:]))

In [4]:
# Directed Graphviz graph that nodes and edges are added to in the following cells.
# NOTE(review): strict=False presumably allows parallel edges / self-loops (see pygraphviz AGraph docs) — confirm.
G = pgv.AGraph(strict=False, directed=True)

In [7]:
# Flatten the counter into (source, target, count) triples.
ebunch = [(src, dst, count) for (src, dst), count in edge_counter.items()]

# Counter.items() yields edges in first-seen order, NOT sorted by count, so the
# extremes must be computed; reading them from the first/last entry is wrong.
# default=0 keeps the cell runnable when no edges were collected.
edge_counts = [count for _, _, count in ebunch]
ebunch_max = max(edge_counts, default=0)
ebunch_min = min(edge_counts, default=0)

# Unweighted (source, target) pairs.
reg_ebunch = [(src, dst) for src, dst, _ in ebunch]

# Stations with no outgoing edge are sinks; stations with no incoming edge are origins.
out_nodes = {src for src, _ in reg_ebunch}
in_nodes = {dst for _, dst in reg_ebunch}
sink_nodes = in_nodes - out_nodes
origin_nodes = out_nodes - in_nodes

# Min-max scale every count onto [0, 5]. When all counts are equal the span is
# zero, so fall back to 0.0 instead of dividing by zero.
count_span = ebunch_max - ebunch_min
norm_ebunch = [
    (src, dst, ((count - ebunch_min) / count_span * 5) if count_span else 0.0)
    for src, dst, count in ebunch
]

In [8]:
# Display the stations that only receive edges (sinks) and those that only emit edges (origins).
sink_nodes, origin_nodes

({'L3_S38', 'L3_S51'}, {'L0_S0'})

In [9]:
# Production lines and the colour used to draw each line's stations.
lines = ['L0', 'L1', 'L2', 'L3']
colors = ['black', 'red', 'green', 'blue']

# Every station that appears on either end of an edge, once each, in
# first-seen order. Collecting only edge sources would leave stations that
# have no outgoing edge (the sinks) without a line colour.
all_nodes = list(dict.fromkeys(node for edge in edge_counter for node in edge))

# Map each line to its stations and add them to the graph with the line's colour.
node_line = {}
for color, line in zip(colors, lines):
    # Compare the line prefix exactly rather than by substring.
    node_line[line] = [node for node in all_nodes if node.split('_')[0] == line]
    G.add_nodes_from(node_line[line], color=color)

In [10]:
# Add the plain (source, target) pairs; neither the raw counts nor norm_ebunch are attached to the edges here.
G.add_edges_from(reg_ebunch)

In [21]:
# Compute node positions with the Graphviz "dot" program; this must happen before drawing.
G.layout(prog="dot")
# Render the laid-out graph to file.png, a path relative to the current working directory.
G.draw("file.png")