# Transform the JDK dependency dataset
The dataset we use is the *class dependency network of JDK 1.6.0.7* framework downloaded from the [KOBLENZ data repository](http://konect.uni-koblenz.de/networks/subelj_jdk). 

The original dataset contains a large number of nodes and edges between them. For this reason, in this script we select only the first 50 nodes and the edges between them. Then we transform this subset into a format compatible with Cytoscape JS.

In [1]:
import json

## 1. Create the nodes
First we have to create the list of nodes, together with some options for each of them.

In [2]:
NUM_NODES = 50  # how many nodes to subselect
nodes = []  # the final subset of nodes
packages = []  # the package of every selected node (deduplicated below)

# Each line of the file holds the fully qualified name of one class (one node).
# Nodes get 1-based IDs following their line order in the file.
with open('raw_data/ent.subelj_jdk_jdk.class.name', 'r') as f:
    for i, line in enumerate(f):
        # stop when the limit is reached
        if i == NUM_NODES:
            break

        # Remove only the line terminator. Slicing with [:-1] would also eat
        # the last character of a final line that has no trailing newline.
        full_name = line.rstrip('\n')  # the full (qualified) name
        # Split on the last dot: the package is on the left, the class name on
        # the right. A name without any dot yields an empty package.
        package, _dot, class_name = full_name.rpartition('.')
        packages.append(package)
        node = {
            "data": {
                "id": str(i + 1),  # the string representation of the unique node ID
                "idInt": i + 1,  # the numeric representation of the unique node ID
                "name": 'cls: ' + class_name + "; pkg: " + package,  # the name of the node used for printing
                "query": True,
                # the package is stored as the node's class so nodes can be grouped by it
                # NOTE(review): Cytoscape JS reads element classes from a top-level
                # "classes" field; under "data" this is a plain data attribute —
                # confirm that this placement is intended.
                "classes": package
            },
            "group": "nodes",  # it belongs in the group of nodes
            "removed": False,
            "selected": False,  # the node is not selected
            "selectable": True,  # we can select the node
            "locked": False,  # the node position is not immutable
            "grabbable": True  # we can grab and move the node
        }
        nodes.append(node)

# get all the unique package names
packages = set(packages)
print(packages)

{'java.awt', 'java.applet', 'java.net', 'java.awt.peer', 'java.util', 'javax.accessibility', 'java.awt.dnd', 'java.beans', 'java.io', 'java.awt.image', 'java.lang', 'java.awt.event'}


## 2. Create the edges
Once we have selected our subset of nodes, we need to select the corresponding edges.

In [3]:
edges = []  # the final subset of edges

# Each line represents an edge between two nodes. The nodes are represented by their id.
with open('raw_data/out.subelj_jdk_jdk', 'r') as f:
    for i, line in enumerate(f):
        # skip the first two lines, they contain some info instead of edges
        if i < 2:
            continue

        # Split on any run of whitespace so that space- and tab-separated files
        # both work; a blank line (e.g. at the end of the file) is skipped
        # instead of failing on the missing second field.
        node_ids = line.split()
        if not node_ids:
            continue

        # get the source node and the target node
        source, target = node_ids[0], node_ids[1]
        # keep the edge only when both endpoints belong to the selected nodes
        if int(source) <= NUM_NODES and int(target) <= NUM_NODES:
            edge = {
                "data": {
                    "source": source,  # the source node id (edge comes from this node)
                    "target": target,  # the target node id (edge goes to this node)
                    "directed": True,
                    "intn": True,
                    # edge IDs are derived from the line index in the file, so
                    # they are unique but not consecutive within the subset
                    "rIntnId": i - 1,
                    "id": "e" + str(i - 1)
                },
                "position": {},  # the initial position is not known
                "group": "edges",  # it belongs in the group of edges
                "removed": False,
                "selected": False,  # the edge is not selected
                "selectable": True,  # we can select the edge
                "locked": False,  # the edge position is not immutable
                "grabbable": True,  # we can grab and move the edge
                "directed": True  # the edge is directed
            }
            edges.append(edge)

### 2.1. Calculate the indegree for each node
We calculate the *indegree* of each node, i.e. the number of its incoming edges. Then we use the normalized *indegree* (the count divided by the total number of selected edges) as the score of the node. The score lets us draw the more important nodes bigger.

In [4]:
# initial dictionary mapping each node id to its normalized indegree
# Every selected node (ids 1..NUM_NODES) starts at zero; each edge then adds an
# equal share 1/N to its target node, N being the number of selected edges.
nodes_indegree = {node_id: 0 for node_id in range(1, NUM_NODES + 1)}
N = len(edges)
for link in edges:
    target_id = int(link["data"]["target"])
    nodes_indegree[target_id] = nodes_indegree[target_id] + 1.0/N

# show the largest normalized indegree among the selected nodes
print(max(nodes_indegree.values()))

0.21903959561920813


In [5]:
# store on every node the normalized indegree computed for its numeric id
for node in nodes:
    node_data = node["data"]
    node_data["score"] = nodes_indegree[node_data["idInt"]]

## 3. Dump the data in a JSON file

In [6]:
# one flat list of elements: all the nodes first, followed by all the edges
data = [*nodes, *edges]

In [7]:
# write the combined list of nodes and edges to the output JSON file
with open('datasets/custom.json', 'w') as out_file:
    json.dump(data, out_file)