# Learning from networks - Stonks

In [1]:
import networkx as nx
import extended_networkx as ex
import scipy
import numpy as np

## Load graph and compute market capitalization

First, let's load the graph file and compute the market capitalization of every node.

In [2]:
# Load the graph from its GML file (the path is relative to the working directory).
G = nx.read_gml("out_graph.gml")

def compute_capitalization(G: nx.Graph):
    """
    Annotate every node of G with a 'capitalization' attribute.

    A node's capitalization is the sum of the 'weight' attribute over all of
    its incoming edges (0 when it has none). G is modified in place and
    nothing is returned. Since ``in_edges`` is used, G has to be a directed
    graph.
    """
    for node in G.nodes():
        # Weights of the edges pointing at this node, in in_edges() order.
        incoming_weights = [
            G.get_edge_data(source, target)["weight"]
            for source, target in G.in_edges(node)
        ]
        G.nodes[node]["capitalization"] = sum(incoming_weights)

# Mutates G in place: every node gains a 'capitalization' attribute.
compute_capitalization(G)

Now let's print the 20 nodes with the highest capitalization.

In [3]:
# Number of entries shown in every "top k" listing of this notebook.
k = 20
top_capitalized = ex.max_k_nodes(G, k, 'capitalization')
print(f"Top {k} nodes with highest capitalization: {top_capitalized}")

Top 20 nodes with highest capitalization: ['CPIN', 'AAPL', 'MSFT', 'AMZN', 'ADRO', 'FB', 'GOOGL', 'GOOG', 'TSLA', 'NVDA', 'JPM', 'JNJ', 'UNVR', 'V', 'UNH', 'PG', 'HD', 'PYPL', 'ADBE', 'BAC']


In [4]:
sub_G = ex.connected_random_subgraph(G, k)
print(sub_G)
# Report, for each sampled node, how many edges of the subgraph point at it.
for node, incoming_count in sub_G.in_degree():
    print(f"{node}: {incoming_count}")

There are 2 components with more than 20 nodes.
DiGraph with 20 nodes and 38 edges
VONV: 0
OPEN: 3
NUMG: 0
XRX: 3
BRK: 4
PINC: 3
RNMC: 0
VOO: 0
FTCS: 0
DPZ: 4
MMM: 7
VOOV: 0
UGE: 0
SUSA: 0
IWS: 0
DUSL: 0
VTV: 0
BWA: 7
SLB: 5
CHNG: 2


In [5]:
b_centralities = ex.betweenness_centrality_percent(G, percentage=0.02)
# Rank (node, centrality) pairs from highest to lowest score and show the first k.
ranked_betweenness = sorted(
    b_centralities.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
print(ranked_betweenness[:k])

[('IEMG', 0.0006646842104818762), ('HNDL', 0.0004765202732830068), ('IDEV', 0.0004735350248990882), ('CPI', 0.00038295139424955816), ('ITOT', 0.00038183192610558866), ('VWO', 0.0002395661828094678), ('VEA', 0.00021913588918202485), ('SMCP', 0.00015224766757984868), ('SCZ', 0.0001517812225198614), ('GEM', 0.00010914814403702388), ('IWM', 0.00010896156601302897), ('MCRO', 8.470642289369032e-05), ('FM', 8.078828438979717e-05), ('VTI', 5.933181163038221e-05), ('IJR', 5.242842474257045e-05), ('UPRO', 4.5804904890751045e-05), ('JKH', 3.8994807014936735e-05), ('IGM', 3.6755870726997783e-05), ('CRBN', 3.656929270300287e-05), ('EEM', 3.2557865187112254e-05)]


In [6]:
def local_clustering_coefficient(G):
    """
    Return the clustering coefficient of each node of G.

    Thin wrapper around ``nx.clustering`` that uses the edge attribute
    "weight" as the edge weight; the library result is returned unchanged.
    """
    per_node_coefficients = nx.clustering(G, weight="weight")
    return per_node_coefficients

def global_clustering_coefficient(G):
    """
    Return the global clustering coefficient of G.

    Thin wrapper around ``nx.transitivity``; no weight attribute is passed,
    and the library result is returned unchanged.
    """
    graph_transitivity = nx.transitivity(G)
    return graph_transitivity

nodes_clustering_coeff = local_clustering_coefficient(G)
print(f"Top {k} nodes by clustering coefficient")
# Rank (node, coefficient) pairs from highest to lowest and show the first k.
ranked_clustering = sorted(
    nodes_clustering_coeff.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
print(ranked_clustering[:k])
global_coefficient = global_clustering_coefficient(G)
print(f"Global clustering coefficient: {global_coefficient}")


Top 20 nodes by clustering coefficient
[('RELIANCEP1', 0.00016589196490068256), ('TATAMTRDVR', 4.4570382724807e-05), ('WHIRLPOOL', 3.127923241268114e-05), ('GLAXO', 3.102642871368198e-05), ('CADILAHC', 2.6203799127153454e-05), ('RWVG', 2.3440949852530024e-05), ('BAJAJHLDNG', 2.327737443598335e-05), ('CHT', 2.276084431203389e-05), ('BOSCHLTD', 2.2435231450985962e-05), ('TVSMOTOR', 2.178549609819416e-05), ('UNA', 2.0883890679729935e-05), ('HONAUT', 1.925677269269553e-05), ('AUOTY', 1.9130916486344096e-05), ('PFIZER', 1.882216126163122e-05), ('KANSAINER', 1.761364406751079e-05), ('BANKBARODA', 1.720363579034824e-05), ('KNIP11', 1.5558204365693597e-05), ('SSI', 1.5216670357133018e-05), ('4161', 1.4634795460600534e-05), ('9USDUSD953', 1.4543323738002963e-05)]
Global clustering coefficient: 0.00011406233747100072


In [7]:
def closeness_centrality_matrix(G):
    """
    Compute a closeness centrality score for every node of G from an
    all-pairs shortest-path distance matrix.

    The adjacency matrix of G is passed to SciPy's Floyd-Warshall routine
    with ``directed=False`` and ``unweighted=False``, i.e. edge direction is
    ignored and the stored matrix values are used as edge lengths. For each
    row of the resulting distance matrix the score is

        ((r - 1) / total) * ((r - 1) / (n - 1))

    where ``n`` is the number of rows, ``r`` is the number of entries in the
    row that are not infinite (the row's own diagonal entry included) and
    ``total`` is the sum of those entries.

    Returns:
        dict mapping each row index of the adjacency matrix (an int in
        ``range(n)``, NOT the node label) to its score. The score is 0.0
        when the row's total distance is not positive or when ``n <= 1``.
    """
    # Import the submodule explicitly: a bare `import scipy` is not guaranteed
    # to make `scipy.sparse.csgraph` reachable as an attribute.
    from scipy.sparse.csgraph import floyd_warshall

    A = nx.adjacency_matrix(G).tolil()  # matrix converted into list of lists
    D = floyd_warshall(A, directed=False, unweighted=False)

    n = D.shape[0]
    centralities = {}
    for r in range(n):
        # Keep every distance that is not +inf, in column order. The builtin
        # sum adds them sequentially, which keeps the scores (and therefore
        # any ranking built on them) stable down to the last bit.
        reachable = [dist for dist in D[r, :] if dist != np.inf]
        total = sum(reachable)
        n_shortest_paths = len(reachable) - 1.0  # leave out the row's own entry

        cc = 0.0
        if total > 0.0 and n > 1:
            reachable_fraction = n_shortest_paths / (n - 1)
            cc = (n_shortest_paths / total) * reachable_fraction
        centralities[r] = cc
    return centralities

c_centralities = closeness_centrality_matrix(sub_G)
# Rank (row index, centrality) pairs from highest to lowest and show the first k.
ranked_closeness = sorted(
    c_centralities.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
print(ranked_closeness[:k])

[(3, 5.5350745255534096e-08), (6, 5.535074525553409e-08), (17, 5.530941232012373e-08), (12, 5.522193423636232e-08), (5, 5.461359993357783e-08), (0, 5.459762709356159e-08), (11, 5.373731117387403e-08), (1, 5.159819956369824e-08), (19, 4.9837236989454576e-08), (9, 4.961422232334373e-08), (2, 4.88813735913777e-08), (18, 4.409144360800137e-08), (14, 4.3490127301770964e-08), (10, 3.46487222884248e-08), (15, 3.284736550945351e-08), (13, 3.159240436577959e-08), (16, 2.6431083987830447e-08), (4, 2.3706323507168625e-08), (7, 1.4772790544884634e-08), (8, 6.541687243729823e-09)]
