# **Network Analysis on Post Trade data:**

## **variation in topology and stability of the Scale-free Networks over time**

The aim of the study is the construction of Social Networks from Settlement Instructions of T2S:

- The topological structure of the Network may change over time due to disruptive events.
- Two case studies are conducted on disruptive events: COVID-19 and the BTP Italia / BTP Futura issuances.
- Scale-free behavior is identified over time, and a ranking of the most central nodes is computed.
- Moreover, a network resilience analysis is performed using random and targeted attacks.

**Library requirements**

- Powerlaw
- Jsonpickle
- Pyvis
- igraph

In [2]:
import os

import io

import random

random.seed(123456789)

from datetime import datetime, timedelta

import time

import math

import json

import gc

import pickle

import collections



import numpy as np

import pandas as pd

from pandas.tseries.offsets import BDay

import matplotlib.pyplot as plt

import matplotlib.cm as cm

import matplotlib.colors as colors

from matplotlib.ticker import MultipleLocator, FixedFormatter, FixedLocator



import seaborn as sns



import pickle

import boto3

from s3 import S3



pd.set_option('display.max_rows', None)

pd.set_option('display.max_columns', None)

In [None]:
import utils

from matplotlib.backends.backend_pdf import PdfPages

In [None]:
from statsmodels.distributions.empirical_distribution import ECDF

import networkx as nx

from networkx.algorithms import community

from scipy.stats import kstest

import powerlaw

import pyvis

from pyvis.network import Network

import igraph as ig

import leidenalg as la

In [None]:
# Load the name pool that create_company_name() samples (through `last_names`)
# for 'LawFirm' style names. Only the text before the first space of each
# line is kept.
names = []

# The context manager closes the handle, which the original left open.
with open("anonnames.txt") as file:
    for i in file:
        names.append(i.split(" ")[0])







# Vocabulary sampled by create_company_name() to build synthetic company names.
# Order and repeated entries ("Net", "Star", "Solutions", "Vision") are kept
# exactly as they were: both influence what random.choice() returns.
company_names = [
    "Telecom", "Software", "Technology", "Hardware", "Electronics", "Consulting",
    "General", "Frontier", "Alpha", "Industries", "Net", "People",
    "Star", "Bell", "Research", "Architecture", "Building", "Construction",
    "Medicine", "Hill", "Graphics", "Analysis", "Vision", "Contract",
    "Solutions", "Advanced", "Venture", "Innovation", "Systems", "Solutions",
    "Provider", "Design", "Internet", "Virtual", "Vision", "Application",
    "Signal", "Network", "Net", "Data", "Electronic", "Max",
    "Adventure", "Atlantic", "Pacific", "North", "East", "South",
    "West", "Speed", "Universal", "Galaxy", "Future", "Digital",
    "Studio", "Interactive", "Source", "Omega", "Direct", "Resource",
    "Power", "Federated", "Star",
]



# Naming schemes understood by create_company_name(). The same tuple is also
# sampled as the trailing token of 'Generic' names.
company_types = ('LawFirm', 'Generic', 'Short')
last_names = names


def create_company_name(biz_type=None):
    """Build a random, human-readable company name.

    Args:
        biz_type: One of ``company_types``. A falsy value picks one at random.

    Returns:
        * ``'LawFirm'``: ``"<name>, <name> & <name> LLP"`` drawn from
          ``last_names``.
        * otherwise: one to three distinct words drawn from ``company_names``;
          ``'Generic'`` then appends a random entry of ``company_types``, any
          other type appends one more ``company_names`` word when fewer than
          three words were collected (this extra word may repeat an earlier
          one).
    """
    kind = biz_type if biz_type else random.choice(company_types)

    if kind == "LawFirm":
        partners = [random.choice(last_names) for _ in range(3)]
        return partners[0] + ", " + partners[1] + " & " + partners[2] + " " + 'LLP'

    words = []
    # randint(2, 4) - 1 draws, i.e. between one and three attempts.
    for _ in range(random.randint(2, 4) - 1):
        candidate = random.choice(company_names)
        if candidate not in words:
            words.append(candidate)

    if kind == 'Generic':
        words.append(random.choice(company_types))
    elif len(words) < 3:
        words.append(random.choice(company_names))

    return " ".join(words)

# **Data Retrieval**

The result plots and statistics will be stored in the dirs **/home/ec2-user/SageMaker/Tesi_results/**

- monthly subdir for monthly data
- daily subdir for daily data

In [None]:
# Shared S3 helper bound to the project bucket; the data-retrieval functions
# use it to list bucket objects and to read the parquet input.
s3_helper = S3(bucket_name='mt-res-prod-ml-bucket')

In [None]:
# The path to the directory where the input data is
bucket_name = 'mt-res-prod-ml-bucket'

#output_path = f's3://{bucket_name}/tmp-andrea-spark'

# Local root under which every plot / statistics file is written.
ROOT_PATH = '/home/ec2-user/SageMaker/Tesi_results/'
os.makedirs(ROOT_PATH, exist_ok=True)

# S3 key prefixes: raw parquet input and derived CSV datasets.
PATH_TO_INPUT_DIR = 'TESI/Tesi-Andrea/RAW'
PATH_TO_DATASET = 'TESI/Tesi-Andrea/DATA'
path_to_root_directory = f's3://{bucket_name}/{PATH_TO_DATASET}'

# Define the partition column of the input directory
PARTITION_COLS = ['dt_business']
START_DATE = '2018-04-01'
#END_DATE = '2018-05-04'
END_DATE = '2021-07-30'

anonymize_data = True
if anonymize_data:
    ANONYMIZE_DICT = dict()
    ROOT_PATH = ROOT_PATH + 'anonymized/'
    # Location the original cell looked at; kept as a fallback only.
    _legacy_mapping_path = ROOT_PATH + 'anonymize_companies.pkl'

ROOT_PATH = ROOT_PATH + 'monthly/'
# check_anonymized() and the report cells write below ROOT_PATH, so the
# directory has to exist before the first graph is built.
os.makedirs(ROOT_PATH, exist_ok=True)

if anonymize_data:
    # check_anonymized() persists the alias mapping to the *final* ROOT_PATH
    # (".../anonymized/monthly/"). The original looked for it one level up,
    # so a saved mapping was never reloaded. Prefer the file that is actually
    # written and fall back to the old location.
    for _mapping_path in (ROOT_PATH + 'anonymize_companies.pkl', _legacy_mapping_path):
        if os.path.isfile(_mapping_path):
            with open(_mapping_path, 'rb') as handle:
                ANONYMIZE_DICT = pickle.load(handle)
            break

In [None]:
def check_anonymized(node_list):
    """Translate participant names into their anonymized aliases.

    Unknown names get a fresh, unique alias from ``create_company_name("Short")``
    which is stored in the module-level ``ANONYMIZE_DICT``. The two names
    'MONTE TITOLI' and 'C.COMP.GARANZIA' are deliberately mapped to
    themselves. When the mapping changed it is pickled to
    ``ROOT_PATH + 'anonymize_companies.pkl'``.

    Compared with the previous version, an alias is only generated for names
    that are not mapped yet, uniqueness is checked against a set instead of
    scanning ``ANONYMIZE_DICT.values()`` for every row, and the pickle is
    rewritten only when something was added.

    Args:
        node_list: Iterable of participant names (may contain repetitions).

    Returns:
        list: The alias of every element of *node_list*, in the same order.
    """
    passthrough = ('MONTE TITOLI', 'C.COMP.GARANZIA')
    used_aliases = set(ANONYMIZE_DICT.values())
    changed = False

    for node in node_list:
        if node in passthrough:
            if ANONYMIZE_DICT.get(node) != node:
                ANONYMIZE_DICT[node] = node
                used_aliases.add(node)
                changed = True
            continue

        if node in ANONYMIZE_DICT:
            continue

        alias = create_company_name("Short")
        # Redraw until the alias is not taken, so the mapping stays one-to-one.
        while alias in used_aliases:
            alias = create_company_name("Short")

        ANONYMIZE_DICT[node] = alias
        used_aliases.add(alias)
        changed = True

    if changed:
        # The target directory is not guaranteed to exist on a fresh machine.
        os.makedirs(ROOT_PATH, exist_ok=True)
        with open(ROOT_PATH+'anonymize_companies.pkl', 'wb') as handle:
            pickle.dump(ANONYMIZE_DICT, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return [ANONYMIZE_DICT[node] for node in node_list]

In [None]:
def save_single_plot(plot, path, name):
    """Save the figure owning *plot* as ``<ROOT_PATH>/<path>/<name>.png``.

    Args:
        plot: Object exposing ``get_figure()`` (e.g. a matplotlib Axes).
        path: Sub-directory of ``ROOT_PATH``; created when missing.
        name: File name without extension.
    """
    os.makedirs(ROOT_PATH+path, exist_ok=True)
    target = f'{ROOT_PATH}/{path}/{name}.png'

    fig = plot.get_figure()
    fig.savefig(target, bbox_inches='tight')
    # Close the figure that was just saved. A bare plt.close() closes the
    # *current* figure, which is not necessarily this one.
    plt.close(fig)

    print("plot saved: ", target)

In [None]:
def retrieve_data(forced = False):
    """Return the aggregated daily dataset.

    Args:
        forced: When truthy, skip the cached CSV and rebuild through
            ``get_data(forced)``. Errors raised by that rebuild now propagate;
            previously they were swallowed by a bare ``except`` and followed
            by a second, unforced ``get_data()`` call.

    Returns:
        pandas.DataFrame: Content of ``aggregated_daily.csv`` under
        ``path_to_root_directory``, or the result of ``get_data()`` when that
        file cannot be read.
    """
    if forced:
        return get_data(forced)

    try:
        # Only the read is guarded: this is the step expected to fail when the
        # cached file does not exist yet.
        alldata_df = pd.read_csv(os.path.join(path_to_root_directory,'aggregated_daily.csv'), sep=',')
    except Exception as err:  # remote filesystems raise a variety of error types
        print("NOT found")
        print("Cause:", repr(err))
        return get_data()

    print("Data retrieved successfully: shape",alldata_df.shape)
    return alldata_df

In [None]:
def get_data(forced=False):
    """Assemble the aggregated dataset, from the per-month CSV cache or from raw data.

    If *forced* is falsy and at least one object below ``PATH_TO_DATASET`` has
    ``MONTHLY`` as its parent "directory", every object listed under
    ``PATH_TO_DATASET + '/MONTHLY'`` is read as CSV and the frames are
    concatenated.

    Otherwise each ``(start, end)`` window of
    ``get_datelist(START_DATE, END_DATE)`` is read through
    ``read_input_dataset``, summed per
    (dt_business, ds_deli_pty1, ds_rece_pty1, cd_sett, cd_sec_at, ind_etf_mkt),
    uploaded as ``MONTHLY/<start>_<end>.csv`` and finally uploaded once more,
    concatenated, as ``aggregated_daily.csv``.

    Frames are collected in a list and concatenated once, instead of calling
    ``pd.concat`` inside the loops (which re-copies all rows every iteration).

    Args:
        forced: Truthy to ignore the per-month cache and rebuild from raw data.

    Returns:
        pandas.DataFrame: The concatenated data (empty when nothing was read).
    """
    datelist = get_datelist(START_DATE,END_DATE)

    # `not forced` is tested first so that a forced rebuild does not list the bucket.
    if not forced and 'MONTHLY' in [i.key.split("/")[-2] for i in s3_helper.bucket.objects.filter(Prefix=PATH_TO_DATASET)]:
        csv_list = [i.key.split("/")[-1] for i in s3_helper.bucket.objects.filter(Prefix=PATH_TO_DATASET+'/MONTHLY')]
        frames = [pd.read_csv(os.path.join(path_to_root_directory+'/MONTHLY',f)) for f in csv_list]
        if not frames:
            return pd.DataFrame()
        # The previous implementation prepended every new file, so the row
        # order was "last listed file first"; keep that order.
        return pd.concat(frames[::-1])

    monthly_frames = []
    for start,end in datelist:
        print("\n=============\nDATE:",start,"--",end)

        start_time = time.time()
        print("Start reading dataset...")
        raw_df = read_input_dataset(start,end)
        end_time = time.time()
        print("Process time: ",round((end_time - start_time),2)," seconds")

        raw_group = raw_df.groupby(['dt_business','ds_deli_pty1','ds_rece_pty1','cd_sett','cd_sec_at','ind_etf_mkt'])[['am_pend','am_amt']].sum()
        raw_group = raw_group.reset_index()
        monthly_frames.append(raw_group)
        print(raw_group.shape)

        utils.upload_dataset_to_aws_s3_v2(df=raw_group,
                                bucket='mt-res-prod-ml-bucket',
                                prefix=PATH_TO_DATASET+'/MONTHLY',
                                output_file_name=f'{start}_{end}.csv',
                                index=False, header=True, sep=',', decimal='.')
        print("UPLOADED to s3", f'{start}_{end}.csv')

    # pd.concat([]) raises, so an empty date list yields an empty frame as before.
    alldata_df = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

    utils.upload_dataset_to_aws_s3_v2(df=alldata_df,
                            bucket='mt-res-prod-ml-bucket',
                            prefix=PATH_TO_DATASET,
                            output_file_name='aggregated_daily.csv',
                            index=False, header=True, sep=',', decimal='.')

    return alldata_df



    

def read_input_dataset(START_DATE, END_DATE):
    """Read the raw parquet input for one date window.

    The data is fetched with ``s3_helper.read_parquet`` from
    ``PATH_TO_INPUT_DIR`` (partitioned by ``PARTITION_COLS``). The result gets
    a fresh 0..n-1 index and lower-cased column names.

    Args:
        START_DATE: Passed to ``read_parquet`` as ``start_date``.
        END_DATE: Passed to ``read_parquet`` as ``end_date``.

    Returns:
        The frame returned by ``read_parquet`` after index reset and column
        renaming.
    """
    frame = s3_helper.read_parquet(
        remote_dir=PATH_TO_INPUT_DIR,
        partition_cols=PARTITION_COLS,
        start_date=START_DATE,
        end_date=END_DATE,
    ).reset_index(drop=True)

    # Column names are normalised to lowercase.
    frame.columns = [label.lower() for label in frame.columns]
    return frame



def get_datelist(START_DATE, END_DATE):
    """List month windows between two ``YYYY-MM-DD`` dates.

    Starting from START_DATE, the cursor is moved one month forward (same
    day-of-month) *before* each window is recorded, and the last recorded
    window is dropped at the end. Each window is
    ``(cursor, cursor + 1 month - 1 day)`` as ISO date strings.

    NOTE(review): because the cursor is advanced before the first window is
    stored, the month containing START_DATE itself is never part of the
    result (e.g. '2018-04-01' yields a first window starting '2018-05-01').
    Confirm this is intended.

    Args:
        START_DATE: First date, ``'%Y-%m-%d'``.
        END_DATE: Upper bound, ``'%Y-%m-%d'``.

    Returns:
        list[tuple[str, str]]: ``(first_day, last_day)`` pairs.

    Raises:
        IndexError: If START_DATE is not earlier than END_DATE (nothing to drop).
        ValueError: If a date string is malformed, or START_DATE's day does
            not exist in a following month.
    """
    def _one_month_later(moment):
        # December rolls over to January of the next year.
        if moment.month >= 12:
            return moment.replace(year=moment.year+1, month=1)
        return moment.replace(month=moment.month+1)

    cursor = datetime.strptime(START_DATE, '%Y-%m-%d')
    windows = []

    while cursor < datetime.strptime(END_DATE, '%Y-%m-%d'):
        cursor = _one_month_later(cursor)
        last_day = _one_month_later(cursor) - timedelta(days=1)
        windows.append((cursor.date().isoformat(), last_day.date().isoformat()))

    # The loop always records one window past END_DATE; discard it.
    windows.pop()
    return windows

In [2]:
# Load the aggregated dataset: the cached CSV when readable, otherwise rebuilt via get_data().
alldata_df = retrieve_data()

NameError: name 'retrieve_data' is not defined

In [None]:
# Display the (rows, columns) shape of the loaded dataset.
alldata_df.shape

In [None]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# the index assignment below also changes the index of `alldata_df`.
alldata_df_agg_monthly = alldata_df

# Index the rows by the parsed business date so they can be bucketed by calendar period.
alldata_df_agg_monthly.index = pd.to_datetime(alldata_df_agg_monthly['dt_business'],format='%Y-%m-%d')

In [None]:
# Sum am_pend / am_amt per (deliverer, receiver, settlement code, security type,
# ETF flag) and per 'M' bucket of the datetime index, then restore the group
# keys as columns and order the rows by the bucket date.
# NOTE(review): the 'M' frequency alias is deprecated in recent pandas releases - confirm the installed version.
alldata_df_agg_monthly = alldata_df_agg_monthly.groupby(by=['ds_deli_pty1','ds_rece_pty1','cd_sett','cd_sec_at','ind_etf_mkt',pd.Grouper(freq='M')])[['am_pend','am_amt']].sum().reset_index().sort_values(by='dt_business')

In [None]:
# One DataFrame per distinct bucket date of the monthly aggregate.
df_monthly = dict()

for dt in alldata_df_agg_monthly.dt_business.unique():

    # Keep the text before 'T' of the value's string form
    # (presumably 'YYYY-MM-DD' for numpy datetime64 values - TODO confirm).
    key = str(dt).split('T')[0]

    df_monthly[key] = alldata_df_agg_monthly[alldata_df_agg_monthly.dt_business==dt]

In [None]:
# Display the month keys that were generated.
df_monthly.keys()

In [None]:
def df_generate_dict(df,financial_instrument_agg, sett_agg):
    """Split *df* into named sub-frames according to the aggregation flags.

    Args:
        df: Frame with ``cd_sec_at`` and/or ``cd_sett`` columns (whichever the
            chosen split needs).
        financial_instrument_agg: Truthy when security types are aggregated,
            i.e. the result must not be split by ``cd_sec_at``.
        sett_agg: Truthy when settlement codes are aggregated, i.e. the result
            must not be split by ``cd_sett``.

    Returns:
        dict: ``{'all': df}`` when both flags are set; one entry per
        ``cd_sett`` value, one per ``cd_sec_at`` value, or one per
        ``"<cd_sec_at>_<cd_sett>"`` combination otherwise. Combinations
        without rows are present as empty frames.
    """
    if financial_instrument_agg and sett_agg:
        return {'all': df}

    if financial_instrument_agg:
        return {code: df[(df['cd_sett'] == code)] for code in df['cd_sett'].unique()}

    if sett_agg:
        return {kind: df[(df['cd_sec_at'] == kind)] for kind in df['cd_sec_at'].unique()}

    instruments = df['cd_sec_at'].unique()
    settlements = df['cd_sett'].unique()
    return {
        kind+"_"+code: df[(df['cd_sec_at'] == kind) & (df['cd_sett'] == code)]
        for kind in instruments
        for code in settlements
    }

In [None]:
def preprocess_to_graph(df):
    """Return the rows of *df* that can become graph edges.

    ``am_pend`` and ``am_amt`` are cast to ``int``; rows where both are zero
    after the cast are dropped, as are rows whose ``cd_sett`` is neither
    ``"S"`` nor ``"N"``.

    The caller's frame is no longer modified: the previous version assigned
    the converted columns back onto *df* itself.

    Args:
        df: Frame with ``am_pend``, ``am_amt`` and ``cd_sett`` columns.

    Returns:
        pandas.DataFrame: New frame with integer amounts, original index
        labels and original row order.
    """
    converted = df.assign(
        am_pend=df['am_pend'].astype(int),
        am_amt=df['am_amt'].astype(int),
    )

    has_amount = (converted['am_amt'] != 0) | (converted['am_pend'] != 0)
    known_code = converted['cd_sett'].isin(["S", "N"])
    return converted[has_amount & known_code]

In [None]:
def create_graph(df, date_range='all', financial_instrument_agg = False,sett_agg=False,  filter_on_sett= False,direction= 'D'):
    """Build one weighted networkx graph per (security type, settlement code) slice.

    *df* is summed per group of key columns, cleaned with
    ``preprocess_to_graph`` and split with ``df_generate_dict``. For every
    slice an edge ``deliverer -> receiver`` is added per row, weighted with
    ``am_amt`` when the slice name contains 'S' or 'R' and with ``am_pend``
    otherwise. Names are replaced through ``check_anonymized`` when the
    module-level ``anonymize_data`` flag is set.

    NOTE(review): the weight column is chosen with a substring test on the
    whole slice name ("<cd_sec_at>_<cd_sett>"), so a security-type code that
    itself contains 'S' or 'R' selects ``am_amt`` even for the "N" slice.
    Verify against the actual codes.
    NOTE(review): ``filter_on_sett`` is accepted but never read.
    NOTE(review): with ``sett_agg=True`` the ``cd_sett`` column is grouped
    away although ``preprocess_to_graph`` filters on it.

    Args:
        df: Frame with ds_deli_pty1, ds_rece_pty1, cd_sett, cd_sec_at,
            am_pend, am_amt (and dt_business unless date_range is 'all').
        date_range: 'all' ignores the business date. 'Y', 'M', 'W' and 'D'
            are not implemented and make the function return ``None``.
        financial_instrument_agg: Aggregate over security types.
        sett_agg: Aggregate over settlement codes.
        filter_on_sett: Unused.
        direction: 'D' builds ``nx.DiGraph``; anything else ``nx.Graph``.

    Returns:
        dict | None: slice name -> graph.

    Raises:
        AssertionError: If the node or edge count of a graph does not match
            its slice.
    """
    group_cols = ['dt_business','ds_deli_pty1','ds_rece_pty1','cd_sett','cd_sec_at']
    if date_range == 'all':
        group_cols.remove('dt_business')
    elif date_range in ('Y', 'M', 'W', 'D'):
        # Placeholders: these granularities are not implemented.
        return

    if financial_instrument_agg:
        group_cols.remove('cd_sec_at')
    if sett_agg:
        group_cols.remove('cd_sett')

    summed = df.groupby(group_cols)[['am_pend','am_amt']].sum().reset_index()
    slices = df_generate_dict(preprocess_to_graph(summed), financial_instrument_agg, sett_agg)

    graph_dict = dict()
    for slice_name, frame in slices.items():
        senders = list(frame['ds_deli_pty1'])
        receivers = list(frame['ds_rece_pty1'])
        if anonymize_data:
            senders = check_anonymized(senders)
            receivers = check_anonymized(receivers)

        weight_column = 'am_amt' if ('S' in slice_name or 'R' in slice_name) else 'am_pend'
        weights = list(frame[weight_column])

        G = nx.DiGraph() if direction == 'D' else nx.Graph()

        assert(len(senders) == len(receivers) == len(weights))
        for src, dst, w in zip(senders, receivers, weights):
            # Rows without an endpoint or with a zero weight add no edge.
            if src is None or dst is None or w ==0:
                continue
            G.add_weighted_edges_from([(src,dst,int(w))])

        expected_nodes = set(frame['ds_deli_pty1'].unique()) | set(frame['ds_rece_pty1'].unique())
        assert len(G.nodes()) ==len(list(expected_nodes)), "Number of nodes and company names must be equal"
        assert len(G.edges())== len(frame) , "Number of edges and number of dataframe rows must be equal"

        graph_dict[slice_name] = G

    return graph_dict

In [None]:
# Graphs over the whole period, one per "<cd_sec_at>_<cd_sett>" slice.
G_dict = create_graph(alldata_df,financial_instrument_agg=False,sett_agg = False, filter_on_sett= True)

In [None]:
# Graphs restricted to rows flagged ind_etf_mkt == 1, aggregated over security
# types, so the slices are keyed by settlement code only.
G_dict_ETF = create_graph(alldata_df[alldata_df['ind_etf_mkt'] == 1],financial_instrument_agg=True,sett_agg = False, filter_on_sett= True)

# Rename the two slices; pop() raises KeyError when a settlement code has no rows.
G_dict_ETF['ETF_N'] = G_dict_ETF.pop("N")

G_dict_ETF['ETF_S'] = G_dict_ETF.pop("S")

# Merge; on a key clash the per-instrument graph wins because it is unpacked last.
G_dict = {**G_dict_ETF, **G_dict}

In [None]:
# month key -> {slice name -> graph}, each built from that month's rows only.
G_dict_monthly= dict()

# NOTE(review): df_concat is not used in this cell.
df_concat = pd.DataFrame()

for key in df_monthly:

    df = df_monthly[key]    

    G_tmp = create_graph(df,financial_instrument_agg=False,sett_agg = False, filter_on_sett= True)

    # Same ETF-only construction and renaming as for the whole-period graphs.
    G_tmp_ETF = create_graph(df[df['ind_etf_mkt'] == 1],financial_instrument_agg=True,sett_agg = False, filter_on_sett= True)

    G_tmp_ETF['ETF_N'] = G_tmp_ETF.pop("N")

    G_tmp_ETF['ETF_S'] = G_tmp_ETF.pop("S")

    G_dict_monthly[key] = {**G_tmp_ETF, **G_tmp}

In [None]:
# month key -> {slice name -> graph}, each built from all rows up to and
# including that month (in df_monthly iteration order).
G_dict_monthly_cumulative= dict()

df_concat = pd.DataFrame()

for key in df_monthly:

    df = df_monthly[key]

    # Grow the running dataset by the current month before rebuilding the graphs.
    df_concat = pd.concat([df_concat,df])

    

    G_tmp = create_graph(df_concat,financial_instrument_agg=False,sett_agg = False, filter_on_sett= True)

#    print(df_concat.shape)

    G_tmp_ETF = create_graph(df_concat[df_concat['ind_etf_mkt'] == 1],financial_instrument_agg=True,sett_agg = False, filter_on_sett= True)

    G_tmp_ETF['ETF_N'] = G_tmp_ETF.pop("N")

    G_tmp_ETF['ETF_S'] = G_tmp_ETF.pop("S")

    G_dict_monthly_cumulative[key] = {**G_tmp_ETF, **G_tmp}





In [None]:
# CHECK ORDER AND SIZE of Final DF
# Sanity check: the cumulative graphs of the last month must have the same
# number of nodes (order) and edges (size) as the whole-period graphs.



for key in G_dict_monthly_cumulative:   

    for instr in  G_dict_monthly_cumulative[key].keys():

        # Only the last month key is actually compared.
        if key == list(G_dict_monthly_cumulative.keys())[-1]:

            assert G_dict_monthly_cumulative[key][instr].order() == G_dict[instr].order(), "Order between time graph at last step is different from Graph of aggregated for all dates"

            assert G_dict_monthly_cumulative[key][instr].size() == G_dict[instr].size(), "Size between time graph at last step is different from Graph of aggregated for all dates"

In [None]:
def compute_power_law_exponent(degree_list):
    """Fit a power law to *degree_list* and run a Kolmogorov-Smirnov test.

    The fit is done with ``powerlaw.Fit``; its ``alpha`` and ``xmin`` are then
    passed as distribution arguments to ``scipy.stats.kstest`` against the
    distribution named ``"powerlaw"``.

    NOTE(review): scipy's "powerlaw" distribution may not be parametrised the
    same way as the law fitted by the ``powerlaw`` package (here ``alpha`` is
    passed as its shape and ``xmin`` as its ``loc``). Confirm that the
    resulting statistic / p-value mean what the analysis expects.

    Args:
        degree_list: Sequence of degree values.

    Returns:
        dict: ``{'p-value': ..., 'test': <KS statistic>, 'exp': <alpha>}``.
    """
    fitted = powerlaw.Fit(degree_list).power_law
    exponent = fitted.alpha
    lower_bound = fitted.xmin

    statistic, p_value = kstest(
        degree_list,
        "powerlaw",
        args=(exponent, lower_bound),
        N=len(degree_list),
    )
    return {'p-value': p_value, 'test': statistic, 'exp': exponent}

In [None]:
def tenth_centrality(G, centr_type ,w = None):
    """Return the ten highest-scoring nodes for one centrality measure.

    Args:
        G: networkx graph.
        centr_type: 'degree', 'closeness', 'betweenness', 'eigenvector' or
            'pagerank'.
        w: Edge attribute used as weight by betweenness, eigenvector and
            pagerank. Degree and closeness ignore it.

    Returns:
        list[tuple]: Up to ten ``(node, score)`` pairs, best first; or
        ``float('nan')`` when the eigenvector centrality cannot be computed.

    Raises:
        ValueError: For an unknown *centr_type* (previously a bare
            ``Exception``; ``ValueError`` is still caught by
            ``except Exception``).
    """
    if centr_type == 'degree':
        centr = nx.degree_centrality(G)
    elif centr_type == 'closeness':
        centr = nx.closeness_centrality(G)
    elif centr_type == 'betweenness':
        centr = nx.betweenness_centrality(G, weight= w)
    elif centr_type == 'eigenvector':
        try:
            centr = nx.eigenvector_centrality(G, weight= w)
        except nx.NetworkXException:
            # e.g. the power iteration did not converge: report "no ranking".
            # Only networkx errors are handled, so KeyboardInterrupt and
            # programming errors are no longer swallowed.
            return float('nan')
    elif centr_type == 'pagerank':
        centr = nx.pagerank(G, weight= w)
    else:
        raise ValueError("Centrality type not found [try degree, betweenness, closeness, eigenvector, pagerank]")

    ranked = sorted(centr.items(), key=lambda item: item[1], reverse=True)
    return ranked[:10]

In [None]:
def compute_graph_stats(G, name="",str_to_write = ""):

    

    df = dict()

    str_to_write = ""

    order = G.order()

    size = G.size()

    if order <= 0: 

        return {"string_to_write":"Not applicable, size or order are equal to 0"}

    

    if order <=3:

        return {"string_to_write":"Not applicable, size or order lesser than 3"}

    try:

        order_size_ratio = size/order

      

    except:

          order_size_ratio = "NA"

            





    

    str_to_write+=f'\n▸Number of nodes: {order} - Number of links:{size} - Size/Order ratio: {order_size_ratio}'

    degree= list(dict(G.degree()).values())

    

    str_to_write+=f'\n▸Standard deviation: {np.std(degree)}'

    str_to_write+= f'\n▸Mean: {np.mean(degree)}'

    str_to_write+= f'\n▸Median: {np.median(degree)}'

    str_to_write+= f'\n▸Min: {np.min(degree)}'

    str_to_write+= f'\n▸Max: {np.max(degree)}'

    

    in_degree = list(dict(G.in_degree()).values())

    str_to_write+=f'\n▸Standard deviation in_degree: {np.std(in_degree)}'

    str_to_write+=f'\n▸Mean in_degree: {np.mean(in_degree)}'

    str_to_write+=f'\n▸Median in_degree: {np.median(in_degree)}'

    str_to_write+=f'\n▸Min in_degree: {np.min(in_degree)}'

    str_to_write+=f'\n▸Max in_degree: {np.max(in_degree)}'

    

    

    out_degree = list(dict(G.out_degree()).values())

    str_to_write+=f'\n▸Standard deviation out_degree: {np.std(out_degree)}'

    str_to_write+=f'\n▸Mean out_degree: {np.mean(out_degree)}'

    str_to_write+=f'\n▸Median out_degree: {np.median(out_degree)}'

    str_to_write+=f'\n▸Min out_degree: {np.min(out_degree)}'

    str_to_write+=f'\n▸Max out_degree: {np.max(out_degree)}'

    

    

    degree_weighted= list(dict(G.degree(weight="weight")).values())

    str_to_write+=f'\n▸Standard deviation weighted: {np.std(degree_weighted)}'

    str_to_write+=f'\n▸Mean weighted: {np.mean(degree_weighted)}'

    str_to_write+=f'\n▸Median weighted: {np.median(degree_weighted)}'

    str_to_write+=f'\n▸Min weighted: {np.min(degree_weighted)}'

    str_to_write+=f'\n▸Max weighted: {np.max(degree_weighted)}'

    

    in_degree_weighted = list(dict(G.in_degree(weight="weight")).values())

    str_to_write+=f'\n▸Standard deviation in_degree weighted: {np.std(in_degree_weighted)}'

    str_to_write+=f'\n▸Mean in_degree weighted: {np.mean(in_degree_weighted)}'

    str_to_write+=f'\n▸Median in_degree weighted: {np.median(in_degree_weighted)}'

    str_to_write+=f'\n▸Min in_degree weighted: {np.min(in_degree_weighted)}'

    str_to_write+=f'\n▸Max in_degree weighted: {np.max(in_degree_weighted)}'

    

    

    out_degree_weighted = list(dict(G.out_degree(weight="weight")).values())

    str_to_write+=f'\n▸Standard deviation out_degree weighted: {np.std(out_degree_weighted)}'

    str_to_write+=f'\n▸Mean out_degree weighted: {np.mean(out_degree_weighted)}'

    str_to_write+=f'\n▸Median out_degree weighted: {np.median(out_degree_weighted)}'

    str_to_write+=f'\n▸Min out_degree weighted: {np.min(out_degree_weighted)}'

    str_to_write+=f'\n▸Max out_degree weighted: {np.max(out_degree_weighted)}'

  







    density = nx.density(G)

    str_to_write+=f'\n▸Density: {density}'



    avg_clustering = nx.average_clustering(G)

    str_to_write+=f'\n▸Avg. Clustering coeff: {avg_clustering}'

    transitivity = nx.transitivity(G)

    str_to_write+=f'\n▸Transitivity: {transitivity}'

    assortativity =  str(nx.degree_assortativity_coefficient(G))

    str_to_write+=f'\n▸Assortativity coefficient: {assortativity}'

    

    assortativity_w =  str(nx.degree_assortativity_coefficient(G,weight='weigth'))

    str_to_write+=f'\n▸Assortativity weighted coefficient: {assortativity_w}'

    

    pearson_assortativity = nx.degree_pearson_correlation_coefficient(G)

    str_to_write+=f'\n▸Pearson Assortativity coefficient: {pearson_assortativity}' 



    try:

        avg_shortest_path_length = nx.average_shortest_path_length(G)

    except nx.NetworkXError:

        avg_shortest_path_length = "is weakly connected"

        

    dag = nx.is_directed_acyclic_graph(G)



    try:

        diameter = nx.algorithms.distance_measures.diameter(G)

    except:

        diameter = float("inf")



    wk_comps = [len(c) for c in sorted(nx.weakly_connected_components(G),key=len, reverse=True)]

    is_weak =  nx.is_strongly_connected(G)

    sg_comps = [len(c) for c in sorted(nx.strongly_connected_components(G),key=len, reverse=True)]

    is_strong = nx.is_weakly_connected(G)

    

    str_to_write+=f'\n▸Average Shortest Path Length: {avg_shortest_path_length}'

    str_to_write+= f'\n▸Diameter: {diameter}'

    str_to_write+= f'\n▸Is DAG?: {dag}'



    str_to_write+= f'\n▸Weakly Connected Components: {wk_comps}'

    str_to_write+= f'\n▸Is Weakly connected?:  {is_weak}'

    

    str_to_write+= f'\n▸Strongly Connected Components: {sg_comps}'

    str_to_write+= f'\n▸Is Strongly connected?:  {is_strong}'   

    

    

    

    deg_centr = nx.degree_centrality(G)

    sort_orders_dc = sorted(deg_centr.items(), key=lambda x: x[1], reverse=True)



    #for i in range(10):

        #print(sort_orders_dc[i])

    

    

    degree_Centrality = tenth_centrality(G, centr_type="degree")

    betweenesCentrality = tenth_centrality(G, centr_type="betweenness")

    closenessCentrality = tenth_centrality(G, centr_type="closeness")

    eigenCentrality = tenth_centrality(G, centr_type="eigenvector")

    pagerankCentrality = tenth_centrality(G, centr_type="pagerank")

    

    betweenesCentrality_w = tenth_centrality(G, centr_type="betweenness", w='weight')

    eigenCentrality_w = tenth_centrality(G, centr_type="eigenvector", w='weight')

    pagerankCentrality_w = tenth_centrality(G, centr_type="pagerank", w='weight')



    str_to_write+=f'\n▸10 most important nodes for Degree Centrality:\n{degree_Centrality}'

    str_to_write+=f'\n▸10 most important nodes for Betweennes Centrality:\n{betweenesCentrality}'

    str_to_write+=f'\n▸10 most important nodes for Closeness Centrality:\n{closenessCentrality}'

    str_to_write+=f'\n▸10 most important nodes for Eigenvector Centrality:\n{eigenCentrality}'

    str_to_write+=f'\n▸10 most important nodes for Page Rank:\n{pagerankCentrality}'

    

    

    str_to_write+=f'\n▸10 most important nodes for Betweennes Centrality Weighted:\n{betweenesCentrality}'

    str_to_write+=f'\n▸10 most important nodes for Eigenvector Centrality Weighted:\n{eigenCentrality}'

    str_to_write+=f'\n▸10 most important nodes for Page Rank Weighted:\n{pagerankCentrality}'



    

    percentile_90 = np.percentile(degree,90)

    str_to_write+=f'\n▸90-percentile degree: {percentile_90}'

    hub_nodi = [k for k,v in dict(G.degree()).items() if v>= percentile_90]

    str_to_write+=f'\n▸Number of nodes in HUBs: {len(hub_nodi)}'

    str_to_write+=f'\n▸List of nodes in HUBs:\n{list(hub_nodi)}'



    percentile_90_in = np.percentile(in_degree,90)

    str_to_write+=f'\n▸90-percentile degree: {percentile_90_in}'

    hub_nodi_in = [k for k,v in dict(G.degree()).items() if v>= percentile_90_in]

    str_to_write+=f'\n▸Number of nodes in HUBs: {len(hub_nodi_in)}'

    str_to_write+=f'\n▸List of nodes in HUBs:\n{list(hub_nodi_in)}'



    percentile_90_out = np.percentile(out_degree,90)

    str_to_write+=f'\n▸90-percentile degree: {percentile_90_out}'

    hub_nodi_out = [k for k,v in dict(G.degree()).items() if v>= percentile_90_out]

    str_to_write+=f'\n▸Number of nodes in HUBs: {len(hub_nodi_out)}'

    str_to_write+=f'\n▸List of nodes in HUBs:\n{list(hub_nodi_out)}'

    

    

    isolates = list(nx.isolates(G))

    str_to_write+=f'\n▸Isolated nodes:{isolates}'



    

    

    

    

    # Not working on directed Graphs

    #print("Network connected?",nx.is_connected(G))

    #print("# Connected components",nx.number_connected_components(G))

    #triangles = len(nx.triangles(G))

    #print("Number of triangles:",triangles)

   

    deg_PL = compute_power_law_exponent(degree)

    deg_W_PL = compute_power_law_exponent(degree_weighted)

    deg_in_PL = compute_power_law_exponent(in_degree)

    deg_in_W_PL= compute_power_law_exponent(in_degree_weighted)

    deg_out_PL= compute_power_law_exponent(out_degree)

    deg_out_W_PL = compute_power_law_exponent(out_degree_weighted)

    

  #  str_to_write+=f'\n▸K-test for PowerLaw distribution'



    str_to_write+=f'\n▸Power Law K-test on Degree: {deg_PL}'

    str_to_write+=f'\n▸Power Law K-test on Weighted Degree: {deg_W_PL}'

    str_to_write+=f'\n▸Power Law K-test on In-Degree: {deg_in_PL}'

    str_to_write+=f'\n▸Power Law K-test on Weighted In-Degree: {deg_in_W_PL}'

    str_to_write+=f'\n▸Power Law K-test on Out-Degree: {deg_out_PL}'

    str_to_write+=f'\n▸Power Law K-test on Weighted Out-Degree: {deg_out_W_PL}'



    



    

    return {'string_to_write':str_to_write,'order':order,'size':size, 'order_size_ratio': order_size_ratio, 'avg_shortest_path_length': avg_shortest_path_length,\

            'mean':np.mean(degree),'std':np.std(degree),'median':np.median(degree),'min_deg':np.min(degree),'max_deg':np.max(degree),\

            \

            'mean_in':np.mean(in_degree),'std_in':np.std(in_degree),\

            'median_in':np.median(in_degree),'min_deg_in':np.min(in_degree),'max_deg_in':np.max(in_degree),\

            \

            'mean_out':np.mean(out_degree),'std_out':np.std(out_degree),\

            'median_out':np.median(out_degree),'min_deg_out':np.min(out_degree),'max_deg_out':np.max(out_degree),\

            \

            'mean_weighted':np.mean(degree_weighted),'std_weighted':np.std(degree_weighted),\

            'median_weighted':np.median(degree_weighted),'min_deg_weighted':np.min(degree_weighted),'max_deg_weighted':np.max(degree_weighted),\

            \

            'mean_in_weighted':np.mean(in_degree_weighted),'std_in_weighted':np.std(in_degree_weighted),\

            'median_in_weighted':np.median(in_degree_weighted),'min_deg_in_weighted':np.min(in_degree_weighted),'max_deg_in_weighted':np.max(in_degree_weighted),\

            \

            'mean_out_weighted':np.mean(out_degree_weighted),'std_out_weighted':np.std(out_degree_weighted),\

            'median_out_weighted':np.median(out_degree_weighted),'min_deg_out_weighted':np.min(out_degree_weighted),'max_deg_out_weighted':np.max(out_degree_weighted),\

            \

            'assortativity':assortativity, 'pearson_assortativity':pearson_assortativity, 'assortativity_weigthed':assortativity_w, \

            'transitivity':transitivity,'avg_clustering':avg_clustering,'density':density,\

            'diameter':diameter, 'is_dag':dag, 'wk_comps':wk_comps, 'is_weak':is_weak, 'sg_comps':sg_comps, 'is_strong':is_strong,\

            'degree_centrality':degree_Centrality,'betweennes_centrality':betweenesCentrality, 'closeness_centrality':closenessCentrality, \

            'eigen_centrality':eigenCentrality,'pagerank_centrality':pagerankCentrality, \

            'betweennes_centrality_weighted':betweenesCentrality_w,'eigen_centrality_weighted':eigenCentrality_w,\

            'pagerank_centrality_weighted':pagerankCentrality_w,\

            '90-percentile_degree':percentile_90,'hubs':list(hub_nodi),\

            'hubs_number':len(hub_nodi),'isolated_nodes':isolates, \

            'PL_degree_p':deg_PL['p-value'], 'PL_degree_t':deg_PL['test'],'PL_degree_exp':deg_PL['exp'], \

            'PL_degree_weighted_p': deg_W_PL['p-value'],'PL_degree_weighted_t': deg_W_PL['test'], 'PL_degree_weighted_exp': deg_W_PL['exp'],  \

            'PL_in_degree_p': deg_in_PL['p-value'], 'PL_in_degree_t': deg_in_PL['test'], 'PL_in_degree_exp': deg_in_PL['exp'],\

            'PL_in_degree_weighted_p': deg_in_W_PL['p-value'],'PL_in_degree_weighted_t': deg_in_W_PL['test'],'PL_in_degree_weighted_exp': deg_in_W_PL['exp'],\

            'PL_out_degree_p': deg_out_PL['p-value'],  'PL_out_degree_t': deg_out_PL['test'], 'PL_out_degree_exp': deg_out_PL['exp'],\

            'PL_out_degree_weighted_p': deg_out_W_PL['p-value'], 'PL_out_degree_weighted_t': deg_out_W_PL['test'],'PL_out_degree_weighted_exp': deg_out_W_PL['exp'] \

           }

In [None]:
# Statistics of the whole-period graphs: graph name -> result of compute_graph_stats.
df_stats = dict()
os.makedirs(ROOT_PATH+"stats/",exist_ok=True)

str_to_write_merged = ""
for g_name in G_dict:
    str_to_write_merged += "\n\n============ "+g_name+" ============\n"
    df_stats[g_name] = compute_graph_stats(G_dict[g_name], name=g_name, str_to_write="")
    str_to_write_merged+=  df_stats[g_name]['string_to_write']
    str_to_write_merged+="\n"

# The report is opened only once the text is complete, so a failure in the
# (long) computation no longer leaves an open, truncated file behind. The
# encoding is explicit because the report contains the non-ASCII "▸" bullet.
with open(ROOT_PATH+"stats/aggregated_stats.txt", "w", encoding="utf-8") as file:
    file.write(str_to_write_merged)

In [None]:
# Statistics of the per-month graphs: "<graph name>_<month>" -> result of compute_graph_stats.
stats_month_dict = dict()
os.makedirs(ROOT_PATH+"stats/",exist_ok=True)

str_to_write_merged = ""
for month in G_dict_monthly:
    for g_name in G_dict_monthly[month]:
        str_to_write_merged += "\n\n============ "+g_name+ "  -  "+month+" ============\n"
        stats_month_dict[g_name+"_"+month] = compute_graph_stats(G_dict_monthly[month][g_name], name=month+"_"+g_name, str_to_write="")
        str_to_write_merged+=  stats_month_dict[g_name+"_"+month]['string_to_write']
        str_to_write_merged+="\n"

# The report is opened only once the text is complete, so a failure in the
# computation no longer leaves an open, truncated file behind. The encoding is
# explicit because the report contains the non-ASCII "▸" bullet.
with open(ROOT_PATH+"stats/stats_monthly.txt", "w", encoding="utf-8") as file:
    file.write(str_to_write_merged)

In [None]:
# Statistics of the cumulative graphs: "<graph name>_<month>" -> result of compute_graph_stats.
stats_month_dict_cumulative = dict()
os.makedirs(ROOT_PATH+"stats/",exist_ok=True)

str_to_write_merged = ""
for month in G_dict_monthly_cumulative:
    for g_name in G_dict_monthly_cumulative[month]:
        str_to_write_merged += "\n\n============ "+g_name+ "  -  "+month+" ============\n"
        stats_month_dict_cumulative[g_name+"_"+month] = compute_graph_stats(G_dict_monthly_cumulative[month][g_name], name=month+"_"+g_name, str_to_write="")
        str_to_write_merged+=  stats_month_dict_cumulative[g_name+"_"+month]['string_to_write']
        str_to_write_merged+="\n"

# Written into the "stats/" directory created above, like the other two
# reports (the previous path placed this file directly under ROOT_PATH). The
# file is opened only once the text is complete, with an explicit encoding
# because the report contains the non-ASCII "▸" bullet.
with open(ROOT_PATH+"stats/stats_monthly_cumulative.txt", "w", encoding="utf-8") as file:
    file.write(str_to_write_merged)

In [None]:
# One row per whole-period graph, one column per key returned by compute_graph_stats.
stats_df = pd.DataFrame.from_dict(df_stats, orient='index')

# Drop the free-text report column; only the tabular statistics are kept.
stats_df=stats_df.drop('string_to_write',axis=1)

In [None]:
# One row per "<graph name>_<month>" key of the per-month statistics.
df_stats_month = pd.DataFrame.from_dict(stats_month_dict, orient='index')

# Drop the free-text report column; only the tabular statistics are kept.
df_stats_month=df_stats_month.drop('string_to_write',axis=1)

In [None]:
# Display the per-month statistics ordered by index (the result is not stored).
df_stats_month.sort_index()

In [None]:
# One row per "<graph name>_<month>" key of the cumulative statistics.
df_stats_month_cumulative = pd.DataFrame.from_dict(stats_month_dict_cumulative, orient='index')

# Drop the free-text report column; only the tabular statistics are kept.
df_stats_month_cumulative=df_stats_month_cumulative.drop('string_to_write',axis=1)

In [None]:
# Display the cumulative statistics ordered by index (the result is not stored).
df_stats_month_cumulative.sort_index()

# **Centrality Analysis**

The idea is to detect the most central nodes in the network each month. Several techniques are applied to compute node centrality: Degree Centrality, Betweenness Centrality, Closeness Centrality, Eigenvector Centrality and PageRank Centrality. Each technique yields a monthly ranking of the nodes. In particular, the objective is to identify the 10 most central nodes and to track how their positions change over time. If a node keeps its position, or at least is not dropped from the ranking, the Preferential Attachment property is confirmed.

In [None]:
# Names of the centrality columns produced by compute_graph_stats that are ranked over time.
# NOTE(review): 'closeness_centrality' is computed by compute_graph_stats but is not listed here - confirm this is intended.
centrality_algos = ['degree_centrality','betweennes_centrality','eigen_centrality',

   'pagerank_centrality','betweennes_centrality_weighted','eigen_centrality_weighted','pagerank_centrality_weighted']

In [None]:
def get_hub_month_dict(type_dict = 'cumulative', algo='pagerank_centrality_weighted' ):
    """Build the per-instrument, per-month rank of every central node.

    The statistics frame is read from the notebook globals
    ``df_stats_month_cumulative`` (``type_dict='cumulative'``) or
    ``df_stats_month`` (``type_dict='month'``). Its index labels are split
    on ``"_"``: the first two parts form the instrument key and the third
    part, minus its last three characters, is used as the date key.

    Parameters
    ----------
    type_dict : str
        ``'cumulative'`` or ``'month'``.
    algo : str
        Name of the column holding the ranking: a list whose elements are
        indexable, element ``[0]`` being the node name. A NaN cell means
        that no ranking is available.

    Returns
    -------
    dict
        ``{instrument: {date: {node_name: rank}}}`` with 1-based ranks.
        Dates without a ranking map to an empty dict and are printed in
        the "EMPTY RANKING" list.

    Raises
    ------
    ValueError
        If ``type_dict`` is neither ``'cumulative'`` nor ``'month'``
        (previously this surfaced as an UnboundLocalError).

    Notes
    -----
    Every row is visited exactly once and is grouped by the instrument key
    derived from its own index label. The previous version took the
    instrument list from the cumulative frame even in ``'month'`` mode,
    selected rows with a regex substring match and re-ran the whole scan
    once per matching row, which also duplicated the entries of the
    printed "EMPTY RANKING" list.
    """
    if type_dict == 'cumulative':
        df_stats = df_stats_month_cumulative
    elif type_dict == 'month':
        df_stats = df_stats_month
    else:
        raise ValueError(f"Unknown type_dict {type_dict!r}: expected 'cumulative' or 'month'")

    empty = []
    hubs_month_dict = dict()
    for index, ranking in df_stats[algo].items():
        split_index = index.split("_")
        instr = split_index[0]+"_"+split_index[1]
        date = split_index[2][:-3]
        month_ranks = dict()
        hubs_month_dict.setdefault(instr, dict())[date] = month_ranks
        try:
            if not isinstance(ranking, list) and math.isnan(ranking):
                empty.append(instr+"_"+date)
            else:
                # The position in the ranking list is the rank (1 = most central).
                for position, el in enumerate(ranking, start=1):
                    month_ranks[el[0]] = position
        except TypeError:
            # The cell is neither a list nor a number: report it and go on.
            print("\n\nTYPEERROR")
            print(ranking)
            print(algo)

    print("EMPTY RANKING for:")
    print(empty)
    return hubs_month_dict

In [None]:
def get_distribution_hubs_plot(dictionary):
    """Bar plot of how many (instrument, date) rankings each node appears in.

    ``dictionary`` is nested as ``{instrument: {date: {node: ...}}}``; only
    the node keys of the innermost level are counted. Returns the seaborn
    Axes, with bars ordered from the most to the least frequent node.
    """
    occurrences = collections.Counter(
        comp
        for by_date in dictionary.values()
        for ranked in by_date.values()
        for comp in ranked
    )
    # most_common() is a stable descending sort on the count, so nodes with
    # the same count keep their first-seen order.
    ordered = dict(occurrences.most_common())

    fig = plt.figure(figsize=(15,12))
    sns.set(style="white", font_scale=1.5)
    ax = sns.barplot(x=list(ordered.keys()), y=list(ordered.values()))
    ax.tick_params(axis='x', rotation=90)
    ax.set_title("Most common Central nodes")
    return ax

In [None]:
# Node ranks for the cumulative networks, using the function's default algo ('pagerank_centrality_weighted').
hub_month_dict_cumulative = get_hub_month_dict(type_dict = 'cumulative')



In [None]:
# Draw the node-frequency bar plot for the cumulative rankings and pass it to save_single_plot (defined elsewhere in the notebook).
save_single_plot(get_distribution_hubs_plot(hub_month_dict_cumulative), path="centrality_frequency/", name ="centrality_freq_cumulative")

In [None]:
# Node ranks for the single-month networks, using the function's default algo ('pagerank_centrality_weighted').
hub_month_dict = get_hub_month_dict(type_dict = 'month')

In [None]:
# Same frequency plot for the single-month rankings; the Axes is kept in `p`
# so it can be passed to save_single_plot and shown as the cell output.
p = get_distribution_hubs_plot(hub_month_dict)

save_single_plot(p, path="centrality_frequency/", name ="centrality_freq_monthly")

p

In [None]:
def central_node_ranking_plot(df, save =False, name = "", type_df ='cumulative'):
    """Plot how the rank of each node evolves over time.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per x-axis position (the index is used as x) and one column
        per node; the cells hold the rank of the node.
    save : bool
        When true, the figure is written to
        ``<ROOT_PATH>/centrality_ranking/<type_df>/<name>_ranking.png`` and
        then closed.
    name : str
        Prefix of the plot title and of the output file name.
    type_df : str
        Name of the sub-folder used when saving.

    Returns
    -------
    matplotlib.axes.Axes
        The Axes holding the plot. The y axis is limited to ranks 1-10 and
        inverted, so rank 1 is drawn at the top.
    """
    n_top_ranked = 10

    fig, ax = plt.subplots(figsize=(15, 10), subplot_kw=dict(ylim=(0.5, 0.5 + n_top_ranked)))
    ax.xaxis.set_major_locator(MultipleLocator(1))
    ax.yaxis.set_major_locator(MultipleLocator(1))

    # One colour per node. plt.get_cmap is used because cm.get_cmap was
    # deprecated in Matplotlib 3.7 and removed in 3.9.
    n_nodes = len(df.columns)
    cmap = plt.get_cmap('tab20c', n_nodes)
    scalarMap = cm.ScalarMappable(norm=colors.Normalize(vmin=0, vmax=n_nodes), cmap=cmap)

    for position, node in enumerate(df.columns):
        ax.plot(df.index, df[node], "o-", mfc="w", label=node, color=scalarMap.to_rgba(position))

    ax.invert_yaxis()
    ax.grid(axis="x")
    ax.set_xlabel('Month', fontsize=18)
    ax.set_ylabel('Rank', fontsize=16)
    ax.set_title(name+" - Central Nodes ranking over time", fontsize=20)
    # Legend outside the plot area, to the right of the axes.
    ax.legend(loc='center left', bbox_to_anchor=(1.05, 0.5),
              ncol=1, fancybox=True, shadow=True )
    ax.tick_params(axis='x', rotation=90)

    if save:
        # makedirs(exist_ok=True) replaces the listdir/mkdir checks and
        # matches how the rest of the notebook creates its output folders.
        out_dir = os.path.join(ROOT_PATH, 'centrality_ranking', type_df)
        os.makedirs(out_dir, exist_ok=True)
        fig.savefig(os.path.join(out_dir, f'{name}_ranking.png'))
        # Close this figure explicitly instead of "whichever is current".
        plt.close(fig)
    return ax

    



In [None]:
# Cumulative networks: one PDF per centrality algorithm, one page per instrument.
for algorithm in centrality_algos:
    type_dict = 'cumulative'
    hub_month_dict_cumulative = get_hub_month_dict(type_dict=type_dict, algo=algorithm)
    print(algorithm)

    path = ROOT_PATH+'centrality_ranking/'+type_dict+'/'
    os.makedirs(path, exist_ok=True)

    # Build every ranking plot first; each figure is closed right away and
    # only its Axes is kept for the PDF export below.
    plots = []
    for instr, ranking_by_month in hub_month_dict_cumulative.items():
        df = pd.DataFrame.from_dict(ranking_by_month, orient='index')
        plots.append(central_node_ranking_plot(df, save=False, name=algorithm+"_"+instr, type_df=type_dict))
        plt.close()

    with PdfPages(f'{path}/{type_dict}_{algorithm}.pdf') as pdf:
        for p in plots:
            fig = p.get_figure()
            pdf.savefig(fig, bbox_inches='tight')

In [None]:
# Single-month networks: one PDF per centrality algorithm, one page per instrument.
for algorithm in centrality_algos:
    type_dict = 'month'
    # NOTE(review): despite its name this variable holds the *monthly*
    # rankings here; the name is kept so that the notebook state left by
    # this cell is unchanged.
    hub_month_dict_cumulative = get_hub_month_dict(type_dict = type_dict, algo= algorithm)
    print(algorithm)

    path = ROOT_PATH+'centrality_ranking/'+type_dict+'/'
    os.makedirs(path, exist_ok=True)

    plots = []
    for instr in hub_month_dict_cumulative:
        df = pd.DataFrame.from_dict(hub_month_dict_cumulative[instr], orient='index')
        plots.append(central_node_ranking_plot(df, save =False, name = algorithm+"_"+instr, type_df =type_dict))
        plt.close()

    # File name follows the same "<type>_<algorithm>.pdf" pattern as the
    # cumulative cell; the previous "{type_dict}cumulative_" was a
    # copy-paste slip that produced "monthcumulative_<algorithm>.pdf".
    with PdfPages(f'{path}/{type_dict}_{algorithm}.pdf') as pdf:
        for p in plots:
            fig=p.get_figure()
            pdf.savefig(fig,bbox_inches='tight')

In [None]:


# Iterating over a DataFrame yields its column labels, so this prints the
# names of the selected columns for the rows whose index contains 'ETF_N'.
_etf_n_columns = [
    'hubs',
    'degree_centrality',
    'betweennes_centrality',
    'eigen_centrality',
    'closeness_centrality',
    'pagerank_centrality',
    'betweennes_centrality_weighted',
    'eigen_centrality_weighted',
    'pagerank_centrality_weighted',
]
_etf_n_rows = df_stats_month[df_stats_month.index.str.contains('ETF_N')]
for i in _etf_n_rows[_etf_n_columns]:
    print(i)

In [None]:
def get_hubname(lis):
    """Return the node names of a centrality ranking.

    ``lis`` is either a list of indexable entries whose first element is
    the node name, or NaN when the measure could not be computed; in the
    latter case an explanatory string is returned instead of a list.

    ``isinstance`` is used so that list subclasses are treated as rankings
    too (the previous ``type(lis) is not list`` test passed them to
    ``math.isnan``, which raises TypeError).
    """
    if not isinstance(lis, list) and math.isnan(lis):
        return "Centrality measure is not computable"
    return [i[0] for i in lis]

In [None]:
def save_central_nodes(df, name=""):
    """Write one text report of central nodes per instrument.

    For every key of the notebook global ``G_dict`` a file
    ``<ROOT_PATH>hubs/<name>/<instrument>.txt`` is written. It lists, for
    each row of ``df`` whose index label contains the instrument key, the
    hubs and the node names of every centrality ranking, followed by a
    second section with the raw ranking values.

    Parameters
    ----------
    df : pandas.DataFrame
        Statistics frame with a string index and the columns ``hubs`` and
        the seven centrality columns listed in ``measures`` below.
    name : str
        Sub-folder of ``hubs/`` that receives the files.

    Notes
    -----
    * The report is assembled in memory and written inside a ``with``
      block, so an error while formatting a row no longer leaks the file
      handle or leaves a truncated file.
    * The instrument key is matched as a literal substring
      (``regex=False``) rather than as a regular expression.
    * ``closeness_centrality`` is no longer required: it was selected but
      never written.
    """
    # (column, label) pairs; the label spacing is part of the file format.
    measures = [
        ('degree_centrality', 'Degree Centrality: '),
        ('betweennes_centrality', 'Betweennes Centrality:  '),
        ('eigen_centrality', 'Eigenvector Centrality:  '),
        ('pagerank_centrality', 'Pagerank Centrality:  '),
        ('betweennes_centrality_weighted', 'Weighted Betweennes Centrality:  '),
        ('eigen_centrality_weighted', 'Weighted Eigenvector Centrality:  '),
        ('pagerank_centrality_weighted', 'Weighted Pagerank Centrality:  '),
    ]

    path = ROOT_PATH+'hubs/'+name+'/'
    os.makedirs(path, exist_ok=True)

    for instr in G_dict.keys():
        names_parts = []
        score_parts = ["\n\n######### SCORES ###########"]
        for index, row in df[df.index.str.contains(instr, regex=False)].iterrows():
            names_parts.append(f"================ {str(index)} ================\n\n")
            names_parts.append(f"HUBS with degree in 90-percentile: {row['hubs']}\n\n")
            for column, label in measures:
                names_parts.append(f"{label}{get_hubname(row[column])}\n\n")
            names_parts.append("\n\n")

            score_parts.append(f"================ Centrality SCORES: {str(index)} ================\n\n")
            for column, label in measures:
                score_parts.append(f"{label}{row[column]}\n\n")

        with open(path+instr+".txt", "w") as file:
            file.write("".join(names_parts)+'\n\n\n'+"".join(score_parts))

In [None]:
# Write the central-node reports of the single-month statistics under hubs/monthly/.
save_central_nodes(df_stats_month  , name='monthly')

In [None]:
# Write the central-node reports of the cumulative statistics under hubs/cumulative/.
save_central_nodes(df_stats_month_cumulative  , name='cumulative')

# **Distribution plots**

In [None]:
def _eccdf(values):
    """Return the distinct *values* and 1 - ECDF evaluated at each of them."""
    x = np.unique(values)
    return x, 1 - ECDF(values)(x)


def _eccdf_comparison_figure(curves, title):
    """Draw several ECCDF curves on one log-log figure and return it closed.

    ``curves`` is a list of ``((x, y), kwargs)`` pairs; ``kwargs`` are the
    per-curve arguments forwarded to ``Axes.loglog``.
    """
    fig = plt.figure(figsize=(12,8))
    axes = fig.gca()
    axes.set_xscale('log')
    axes.set_yscale('log')
    for (x, y), style in curves:
        axes.loglog(x, y, linestyle='--', **style)
    axes.set_title(title)
    axes.set_xlabel('Degree',size=20)
    axes.set_ylabel('ECCDF', size = 20)
    handles, labels = axes.get_legend_handles_labels()
    axes.legend(handles, labels)
    plt.close(fig)
    return fig


def compute_distribution_plots(G,g_name, graph_type):
    """Build the degree-distribution figures of a directed graph.

    Parameters
    ----------
    G : networkx directed graph
        Must expose ``degree``, ``in_degree`` and ``out_degree``; the
        weighted variants read the ``"weight"`` edge attribute.
    g_name, graph_type : str
        Only used to label the figures (``"<g_name>_<graph_type>"``).

    Returns
    -------
    list of matplotlib.figure.Figure
        Thirteen figures, in this order: sorted degree curve, degree
        histogram, degree histogram with mean line, degree/count curve,
        log-log degree/count, ECDF (linear), ECDF (log-log), ECCDF, ECCDF
        of a generated scale-free graph, ECCDF comparison (graph vs random
        vs scale-free), the same with in/out degree, the same with weighted
        degrees, and a text page with the random-graph statistics.

    Notes
    -----
    Fixes with respect to the previous version:

    * the scale-free reference used ``list(dict(SF.degree()))``, i.e. the
      node ids, instead of the node degrees;
    * the third figure passed the counts as x and the degrees as heights,
      contradicting its own axis labels; its dashed line now marks the mean
      node degree (it used to be the mean of the distinct degree values);
    * the fourth figure's descriptive title was overwritten by a second
      ``plt.title('Network')`` call;
    * every figure is closed once built (five used to stay registered in
      pyplot on every call). The Figure objects remain usable, e.g. for
      ``PdfPages.savefig``.
    """
    label = g_name+"_"+graph_type
    plots = []

    degree = list(dict(G.degree).values())
    in_degree = list(dict(G.in_degree).values())
    out_degree = list(dict(G.out_degree).values())

    # 1. degrees sorted from the largest to the smallest
    fig = plt.figure(figsize=(12,8))
    plt.title("Degree Distribution")
    txt = 'Degree distribution visualisations: '+label
    plt.text(0.5,0.95,txt, transform=fig.transFigure, size=24, ha='center')
    plt.plot(sorted(degree,reverse=True))
    plots.append(fig)
    plt.close(fig)

    # number of nodes per degree value, in descending degree order
    degreeCount = collections.Counter(sorted(degree, reverse=True))
    deg, cnt = zip(*degreeCount.items())

    # 2. degree histogram
    fig = plt.figure(figsize=(12,8))
    plt.bar(deg, cnt, width=0.8, color="b")
    plt.title("Degree Histogram")
    plt.ylabel("Nodes")
    plt.xlabel("Degree")
    plots.append(fig)
    plt.close(fig)

    # 3. degree histogram with the mean node degree marked
    fig = plt.figure(figsize=(12,8))
    plt.bar(x=deg, height=cnt)
    plt.axvline(np.mean(degree), color='k', linestyle='dashed', linewidth=2)
    plt.title("Degree Histogram "+label)
    plt.xlabel('Number of connections per node')
    plt.ylabel('Frequency')
    plots.append(fig)
    plt.close(fig)

    # 4. degree/count curve, linear axes
    fig = plt.figure(figsize=(12,8))
    plt.title("Degree Distribution "+label)
    plt.plot(deg,cnt,"ro-")
    plt.legend(['Degree'])
    plt.xlabel('Degree')
    plt.ylabel('Number of nodes')
    plots.append(fig)
    plt.close(fig)

    # 5. degree/count points, log-log axes
    fig = plt.figure(figsize=(12,8))
    plt.loglog(deg,cnt,"ro")
    plt.legend(['Degree'])
    plt.xlabel('Degree')
    plt.ylabel('Number of nodes')
    plt.title('Log Network '+label)
    plots.append(fig)
    plt.close(fig)

    # empirical CDF of the degree, evaluated at the distinct degree values
    x = np.unique(degree)
    y = ECDF(degree)(x)

    # 6. ECDF, linear axes
    fig_cdf = plt.figure(figsize=(12,8))
    axes = fig_cdf.gca()
    axes.set_title("ECDF Linear Scale "+label)
    axes.plot(x,y,marker='o',ms=6, linestyle='None')
    axes.set_xlabel('Degree',size=20)
    axes.set_ylabel('ECDF', size = 20)
    plots.append(fig_cdf)
    plt.close(fig_cdf)

    # 7. ECDF, log-log axes
    fig_cdf = plt.figure(figsize=(12,8))
    axes = fig_cdf.gca()
    axes.set_title("ECDF Log-Log Scale "+label)
    axes.loglog(x,y,marker='o',ms=8, linestyle='--')
    axes.set_xlabel('Degree',size=20)
    axes.set_ylabel('ECDF', size = 20)
    plots.append(fig_cdf)
    plt.close(fig_cdf)

    # 8. complementary ECDF, log-log axes
    fig_cdf = plt.figure(figsize=(12,8))
    axes = fig_cdf.gca()
    axes.set_title("ECCDF "+label)
    axes.loglog(x,1-y,marker='o',ms=8, linestyle='--')
    axes.set_xlabel('Degree',size=20)
    axes.set_ylabel('ECCDF', size = 20)
    plots.append(fig_cdf)
    plt.close(fig_cdf)

    # Reference graphs. The scale-free graph is generated before the random
    # one, as before, so the sequence of random draws is unchanged.
    SF = nx.generators.directed.scale_free_graph(len(G.nodes))
    sf_deg = list(dict(SF.degree()).values())
    x_sf, eccdf_sf = _eccdf(sf_deg)

    # 9. ECCDF of the generated scale-free graph
    fig_cdf = plt.figure(figsize=(12,8))
    axes = fig_cdf.gca()
    axes.loglog(x_sf,eccdf_sf,marker='o',ms=8, linestyle='--')
    axes.set_xlabel('Degree',size=20)
    axes.set_ylabel('ECCDF', size = 20)
    axes.set_title('ECCDF Scale Free Network '+label)
    plots.append(fig_cdf)
    plt.close(fig_cdf)

    # random graph with the same number of nodes and the same density as G
    random_graph = nx.fast_gnp_random_graph(G.order(), nx.density(G), directed=True)
    random_degree = list(dict(random_graph.degree()).values())

    degree_curve = ((x, 1-y), dict(marker='o', label='degree', ms=8))
    reference_curves = [
        (_eccdf(random_degree), dict(marker='+', label='random_net_degree', ms=10)),
        ((x_sf, eccdf_sf), dict(marker='+', color='r', label='scale_free_degree', ms=10)),
    ]
    comparison_title = "ECCDF comparison Network vs. Random Network vs. Scale Free Network "

    # 10. graph vs random vs scale-free
    plots.append(_eccdf_comparison_figure([degree_curve] + reference_curves, comparison_title+label))

    # 11. same, plus in-degree and out-degree
    plots.append(_eccdf_comparison_figure(
        [
            degree_curve,
            (_eccdf(in_degree), dict(marker='o', label='in_degree', color='blue', ms=8)),
            (_eccdf(out_degree), dict(marker='o', label='out_degree', color='g', ms=8)),
        ] + reference_curves,
        comparison_title+label,
    ))

    # 12. weighted degrees vs the (unweighted) reference graphs
    degree_weighted = list(dict(G.degree(weight="weight")).values())
    in_degree_weighted = list(dict(G.in_degree(weight="weight")).values())
    out_degree_weighted = list(dict(G.out_degree(weight="weight")).values())
    plots.append(_eccdf_comparison_figure(
        [
            (_eccdf(degree_weighted), dict(marker='o', label='degree_weighted', ms=8)),
            (_eccdf(in_degree_weighted), dict(marker='o', label='in_degree_weighted', color='blue', ms=8)),
            (_eccdf(out_degree_weighted), dict(marker='o', label='out_degree_weighted', color='g', ms=8)),
        ] + reference_curves,
        "ECCDF comparison Network vs. Random Network vs. Scale Free Network - WEIGHTED Degrees "+label,
    ))

    # 13. text page; all figures, including the node and link counts, are
    # read from the generated random graph.
    fig = plt.figure(figsize=(12,8))
    plt.title("Infos "+label)
    txt = '\n'.join([
        'Number of nodes: {}'.format(random_graph.order()),
        'Number of links: {}'.format(random_graph.size()),
        'Random Net Standard deviation: {}'.format(np.std(random_degree)),
        'Random Net Mean: {}'.format(np.mean(random_degree)),
        'Random Net Median: {}'.format(np.median(random_degree)),
        'Random Net Min: {}'.format(np.min(random_degree)),
        'Random Net Max: {}'.format(np.max(random_degree)),
    ])
    plt.text(0.2,0.7,txt, size=12, transform=fig.transFigure,va='center',ha='left')
    frame1 = plt.gca()
    frame1.axes.get_xaxis().set_visible(False)
    frame1.axes.get_yaxis().set_visible(False)
    plots.append(fig)
    plt.close(fig)

    return plots

    #pp.savefig()

    

    #pp.close()



In [None]:
# NOTE(review): within this part of the notebook plots_dict is first assigned
# in the next cell, so this cell appears to rely on that one having been run.
plots_dict.keys()

In [None]:
graph_type = 'cumulative'

# Create <ROOT_PATH>graphplots/<graph_type>/ in one step; exist_ok makes the
# call safe to repeat and matches the makedirs calls used elsewhere.
os.makedirs(ROOT_PATH+'graphplots/'+graph_type, exist_ok=True)

# plots_dict[g_name] collects the figures of that network for every date.
plots_dict = dict()
for date in G_dict_monthly_cumulative:
    for g_name in G_dict_monthly_cumulative[date]:
        print("======\n"+g_name+"\n")
        # setdefault also covers a network that first appears in a later
        # month (seeding the dict from the first month only raised KeyError).
        plots_dict.setdefault(g_name, []).extend(
            compute_distribution_plots(G_dict_monthly_cumulative[date][g_name],date+"_"+g_name, graph_type = graph_type)
        )
        # compute_distribution_plots leaves several figures registered in
        # pyplot; close all of them, not just the current one. The Figure
        # objects stored in plots_dict remain usable.
        plt.close('all')
        print()



In [None]:
graph_type = 'monthly'

# Create <ROOT_PATH>graphplots/<graph_type>/ in one step; exist_ok makes the
# call safe to repeat and matches the makedirs calls used elsewhere.
os.makedirs(ROOT_PATH+'graphplots/'+graph_type, exist_ok=True)

for date in G_dict_monthly:
    for g_name in G_dict_monthly[date]:
        print("======\n"+g_name+"\n")
        # NOTE(review): the returned figures are discarded here, so nothing
        # is saved for the monthly networks - confirm this is intended.
        compute_distribution_plots(G_dict_monthly[date][g_name],date+"_"+g_name, graph_type = graph_type)
        # Release the figures the call left registered in pyplot; without
        # this they accumulate across every date and network.
        plt.close('all')
        print()



# **Scale-free analysis**

### **Definition**

A network is called scale-free if its characteristics are independent of its size (i.e. the number of nodes). This means that when the network grows, the underlying structure remains the same. A scale-free network is defined by the number of edges per node following a so-called power-law distribution. An example of central concern for macroeconomics are production networks, whose scale-free nature has recently been put forward as a potentially major driver of macroeconomic fluctuations.

### **Small World Theory**

Small world theory is based on the idea that any two individuals are connected through a series of intermediaries. In the 1960s, Stanley Milgram tested this theory [TM69], according to which all nodes are separated from each other by a short path. The "Six degrees of separation" is the idea that all people are, on average, six or fewer social connections away from each other. As a result, a chain of "friend of a friend" statements can be made to connect any two people in a maximum of six steps.

### **Properties**

- Growth: a growth process where, over an extended period of time, new nodes join an already existing system.
- Preferential Attachment: In real networks new nodes prefer to link to the more connected nodes.
- Scale-free Resiliency: Scale-free networks are more resistant to the random disconnection of nodes. A considerable number of nodes can be eliminated at random while the network's structure is preserved and does not break into disconnected clusters. When the most connected nodes are targeted, instead, the diameter of a scale-free network increases and the network breaks into isolated clusters. This occurs because removing these nodes damages the heart of the system, whereas a random attack most likely does not.

In [None]:
import sys

import pandas as pd

sys.path.append('SFanalysis/')

import sfanalysis as sf



import visualisations as vz

In [None]:
def create_gml(df, df_type):
    """Export every graph of a nested ``{date: {instrument: graph}}`` dict as GML.

    The files are written to ``<ROOT_PATH>/SF_results/gmls`` for
    ``df_type='monthly_cumulative'`` and to
    ``<ROOT_PATH>/SF_results/gmls_month`` for ``df_type='monthly'``, each
    named ``<date>_<instrument>.gml``. The matching degree-sequence folder
    (``degseqs/gmls`` or ``degseqs_month/gmls_month``) is created as well.

    Raises
    ------
    ValueError
        If ``df_type`` is not one of the two supported values. ValueError
        is a subclass of the ``Exception`` raised before, and the message
        is unchanged.

    Notes
    -----
    The previous version ran ``os.mkdir(ROOT_PATH/'gmls')`` on first use,
    which raises TypeError because ROOT_PATH is a string, and required
    ``SF_results`` to exist already. Folders are now created with
    ``os.makedirs(..., exist_ok=True)`` only.
    """
    # df_type -> (GML sub-folder, degree-sequence sub-folder) under SF_results
    layouts = {
        'monthly_cumulative': ('gmls', os.path.join('degseqs', 'gmls')),
        'monthly': ('gmls_month', os.path.join('degseqs_month', 'gmls_month')),
    }
    if df_type not in layouts:
        raise ValueError("Specify the type of dataset: [monthly, monthly_cumulative]")

    gml_subdir, deg_subdir = layouts[df_type]
    results_dir = os.path.join(ROOT_PATH, 'SF_results')
    folder = os.path.join(results_dir, gml_subdir)
    os.makedirs(folder, exist_ok=True)
    os.makedirs(os.path.join(results_dir, deg_subdir), exist_ok=True)

    for date in df:
        for instr in df[date]:
            nx.write_gml(df[date][instr], f"{folder}/{date}_{instr}.gml")

In [None]:
# Export the cumulative and the single-month graphs as GML files.
create_gml(G_dict_monthly_cumulative, df_type='monthly_cumulative')

create_gml(G_dict_monthly, df_type='monthly')

In [None]:
def _cached_frame(csv_path, compute, force):
    """Return the frame cached at *csv_path*, computing and saving it when missing or forced."""
    if force or not os.path.exists(csv_path):
        frame = compute()
        frame.to_csv(csv_path)
        return frame
    return pd.read_csv(csv_path, index_col=[0])


def get_scale_free(df_type, force =False):
    """Run the three steps of the scale-free analysis, reusing cached CSVs.

    Each step (degree sequences, degree-sequence analysis, network
    categorisation) is computed through the ``sf`` module and stored under
    ``<ROOT_PATH>SF_results/`` as ``export_degree``, ``export_analysis``
    and ``export_hypothesis`` CSV files (suffix ``_monthly`` for the
    monthly data set). A step whose CSV already exists is loaded from it
    unless ``force`` is true.

    Parameters
    ----------
    df_type : str
        ``'cumulative'`` or ``'monthly'``.
    force : bool
        Recompute every step even when its CSV exists.

    Returns
    -------
    dict
        ``{'deg': ..., 'analysis': ..., 'hyps': ...}`` with one DataFrame
        per step.

    Raises
    ------
    ValueError
        If ``df_type`` is not supported. The message now lists the values
        this function really accepts (it used to mention
        ``monthly_cumulative``), and the check runs before any folder is
        created.
    """
    # df_type -> (gml folder, degree-sequence folder, CSV name suffix).
    # NOTE(review): the two folders are relative paths handed to the sf
    # module as they are - confirm the working directory they are meant to
    # be resolved against.
    layouts = {
        'cumulative': ('gmls/', 'degseqs/', ""),
        'monthly': ('gmls_month/', 'degseqs_month/', "_monthly"),
    }
    if df_type not in layouts:
        raise ValueError("Specify the type of dataset: [cumulative, monthly]")
    gml_dir, deg_dir, name_csv_file = layouts[df_type]

    path = ROOT_PATH+'SF_results/'
    os.makedirs(path, exist_ok=True)

    # make catalog of gmls and write degree sequence files
    # each row of deg_df is a degree sequence file
    deg_df = _cached_frame(
        f'{path}export_degree{name_csv_file}.csv',
        lambda: sf.write_degree_sequences(gml_dir, deg_dir),
        force,
    )
    # analyze all degree sequences (this will take a while for many or large data sets)
    analysis_df = _cached_frame(
        f'{path}export_analysis{name_csv_file}.csv',
        lambda: sf.analyze_degree_sequences(deg_dir, deg_df),
        force,
    )
    # categorize networks (by unique gml file) into scale-free categories
    hyps_df = _cached_frame(
        f'{path}export_hypothesis{name_csv_file}.csv',
        lambda: sf.categorize_networks(analysis_df),
        force,
    )

    return {'deg':deg_df, 'analysis':analysis_df, 'hyps':hyps_df}

In [None]:
# Scale-free analysis of the cumulative networks (cached CSVs are reused when present).
cumulative_sf_dict = get_scale_free(df_type='cumulative')

In [None]:
hyps = cumulative_sf_dict['hyps']

# Networks for which every scale-free category flag equals False.
_sf_categories = ['Strongest', 'Strong', 'Super_Weak', 'Weakest', 'Weak']
not_sf = hyps[hyps[_sf_categories].eq(False).all(axis=1)]



In [None]:
# Names of the networks in not_sf: the path component after the first '/',
# cut at '.gml'.
[i.split('/')[1].split('.gml')[0] for i in list(not_sf.index)]

In [None]:
# Scale-free analysis of the single-month networks (cached CSVs are reused when present).
monthly_sf_dict = get_scale_free(df_type='monthly')

In [None]:
 def get_scalefree_plots(sf_dict):
    """Plot where the networks that match no scale-free category come from.

    ``sf_dict`` needs the keys 'deg', 'analysis' and 'hyps'; only the
    'hyps' frame is used. Its index entries are expected to look like
    ``<folder>/<date>_<instrument>_<settlement>....gml``.

    Returns four Axes: the count of non-scale-free networks by third name
    part, by second name part and by date, followed by the domain plot
    drawn by ``vz.make_domain_ploth`` over all data sets.
    """
    deg_df, analysis_df, hyps_df = sf_dict['deg'], sf_dict['analysis'], sf_dict['hyps']

    # rows for which every scale-free category flag equals False
    categories = ['Strongest', 'Strong', 'Super_Weak', 'Weakest', 'Weak']
    not_sf = hyps_df[hyps_df[categories].eq(False).all(axis=1)]
    not_sf_all = [i.split('/')[1].split('.gml')[0] for i in list(not_sf.index)]

    # occurrences of each date / instrument / settlement name part
    name_parts = [entry.split('_') for entry in not_sf_all]
    per_date = collections.Counter(parts[0] for parts in name_parts)
    per_instr = collections.Counter(parts[1] for parts in name_parts)
    per_sett = collections.Counter(parts[2] for parts in name_parts)

    not_sf_dates_df = pd.DataFrame.from_dict(dict(per_date), orient='index', columns=['num'])
    not_sf_instr_df = pd.DataFrame.from_dict(dict(per_instr), orient='index', columns=['num'])
    not_sf_sett_df = pd.DataFrame.from_dict(dict(per_sett), orient='index', columns=['num'])

    plots = []
    for counts in (not_sf_sett_df, not_sf_instr_df):
        plt.figure(figsize=(15,12))
        plots.append(sns.barplot(x=counts.index, y=counts['num']))

    # the date labels are long, so they are drawn vertically
    plt.figure(figsize=(15,12))
    ax = sns.barplot(x=not_sf_dates_df.index, y=not_sf_dates_df['num'])
    ax.tick_params(axis='x', rotation=90)
    plots.append(ax)

    fig, ax = plt.subplots(figsize=(10,7))
    vz.make_domain_ploth(ax, hyps_df, True)
    ax.text(-0.4, 10, 'All data sets' , fontsize=15, rotation=90, va='bottom')
    plots.append(ax)

    plt.show()
    return plots

In [None]:
# Scale-free summary plots for the cumulative networks.
sf_plots_cum = get_scalefree_plots(cumulative_sf_dict)

In [None]:
# Save every cumulative scale-free plot as one page of a single PDF.
path = ROOT_PATH+'SF_results/'

with PdfPages(f'{path}/cumulative_scale_free.pdf') as pdf:

    for p in sf_plots_cum:      

        # each entry is an Axes; the page is the figure that owns it
        fig=p.get_figure()

        pdf.savefig(fig)

In [None]:
# Scale-free summary plots for the single-month networks.
sf_plots_month = get_scalefree_plots(monthly_sf_dict)

In [None]:
# Save every monthly scale-free plot as one page of a single PDF.
path = ROOT_PATH+'SF_results/'

with PdfPages(f'{path}/monthly_scale_free.pdf') as pdf:

    for p in sf_plots_month:      

        # each entry is an Axes; the page is the figure that owns it
        fig=p.get_figure()

        pdf.savefig(fig)

# **Node Deletion**

A Network Resilience Analysis is performed for the monthly aggregated networks. In a network, a node deletion means that a company may go bankrupt or, more commonly, that the company decides to exit the system. This deletion may alter the structure of the network. To verify structural alterations, it is possible to detect any changes in the Connected Component (CC) of a network. If the CC changes after a deletion, the node is a vulnerable one and the network has been damaged and compromised. If the CC does not change after a deletion, the network has not been compromised. Moreover, the scale-free property strongly correlates with the network's robustness to failure. The major hubs are closely followed by smaller ones. These smaller hubs, in turn, are followed by other nodes with an even smaller degree, and so on. Different node deletion approaches are applied:

- Random Node deletion: a node is deleted randomly from the Network.
- Localized Node deletion: the deletion of a precisely selected node. If failures occur at random and the vast majority of nodes have a small degree, the likelihood that a hub is affected is almost negligible. Localized attacks make a scale-free network more vulnerable than random attacks do. In this case the targets are the most central nodes, with a high number of connections and a high counter-value delivered. Payment networks have shown scale-free properties in the literature and they are resilient to random damage. This means that it is barely possible to destroy a payment network by random removal, but if a specific set of carefully selected nodes is removed, the network breaks completely.

In [None]:
def random_deletion_breakpoint(G_init, n_trials=1000):
    """Average number of random node removals needed to disconnect a graph.

    In each trial a copy of ``G_init`` loses one randomly chosen node at a
    time until the remaining graph is no longer weakly connected; the
    number of removals made is recorded. Trials in which the graph runs
    out of nodes before disconnecting record nothing.

    Parameters
    ----------
    G_init : networkx directed graph
        Left unmodified; every trial works on a copy.
    n_trials : int
        Number of independent trials (default 1000, the value that was
        previously hard-coded).

    Returns
    -------
    float
        Mean of the recorded removal counts, or NaN when no trial
        disconnected the graph (returned explicitly instead of taking the
        mean of an empty list, which emits a NumPy warning).
    """
    count_cnt = []
    for _ in range(n_trials):
        G = G_init.copy()
        cnt = 0
        while len(G) > 0:
            node_list = list(G.nodes())
            G.remove_node(node_list[random.randint(0, len(node_list)-1)])
            cnt += 1
            # connectivity is only checked on a non-empty graph
            if len(G) > 0 and not nx.is_weakly_connected(G):
                count_cnt.append(cnt)
                break

    if not count_cnt:
        return float('nan')
    return np.mean(count_cnt)

            



In [None]:
# Average random-removal breakpoint of every cumulative network,
# keyed by month and then by network name.
breakpoint_dict_cumulative = {}
for month, graphs_of_month in G_dict_monthly_cumulative.items():
    print(month)
    breakpoint_dict_cumulative[month] = {}
    for g_name in graphs_of_month:
        breakpoint_dict_cumulative[month][g_name] = random_deletion_breakpoint(graphs_of_month[g_name])

In [None]:
# Show the cumulative breakpoints as a table: one row per month, one column per network.
pd.DataFrame.from_dict(breakpoint_dict_cumulative, orient='index')

In [None]:
# Average random-removal breakpoint of every single-month network,
# keyed by month and then by network name.
breakpoint_dict_monthly = {}
for month, graphs_of_month in G_dict_monthly.items():
    breakpoint_dict_monthly[month] = {}
    for g_name in graphs_of_month:
        breakpoint_dict_monthly[month][g_name] = random_deletion_breakpoint(graphs_of_month[g_name])

In [None]:
# Show the monthly breakpoints as a table: one row per month, one column per network.
pd.DataFrame.from_dict(breakpoint_dict_monthly, orient='index')

In [None]:
def plot_deletion_distr(deletion_list,feature, name = ""):
    """Line plot of one feature across the steps of a node-deletion run.

    Parameters
    ----------
    deletion_list : dict
        ``{step: {feature_name: value}}``; the steps are sorted and shown
        as strings on the x axis.
    feature : str
        Key of the value plotted on the y axis.
    name : str
        Prefix of the plot title.

    Returns
    -------
    matplotlib.axes.Axes
        The seaborn line plot.

    Notes
    -----
    x and y are both read in sorted key order. Previously x used the
    sorted keys while y followed the insertion order of the dict, so the
    two could be misaligned when the keys were not inserted in order.
    """
    ordered_steps = sorted(deletion_list)
    x = [str(step) for step in ordered_steps]
    y = [deletion_list[step][feature] for step in ordered_steps]

    sns.set(rc={'figure.figsize':(25,10)}, style="white", font_scale=1.5)
    plot  = sns.lineplot(x=x,y=y, linewidth = 3)
    plot.set_title(name+"- Node deleting distribution - "+feature)
    plot.set_ylabel(feature)
    plot.set_xlabel("Node % deletion")
    # NOTE(review): two legend labels are set although this function draws
    # a single line - presumably a second series is meant to share the axes.
    plt.legend(labels=['random', 'hub'])

    return plot

In [None]:
def node_deletion_distribution(G_init, save_name="", mode="random", plot_mode='avg_path_length', save=False):
    """Record graph metrics while nodes are progressively removed.

    Starting from a copy of ``G_init``, the graph is measured and then a batch
    of nodes is removed, repeatedly, until the graph is empty, the removal
    budget is used up, or the step label exceeds 100. Step labels advance by
    5 (0, 5, 10, ...) and are used as dictionary keys.

    Args:
        G_init: NetworkX directed graph; it is copied, never modified.
        save_name: Suffix of the PNG file name used when ``save`` is true.
        mode: ``"random"`` removes uniformly chosen nodes; ``"hub"`` removes
            nodes in decreasing order of weighted PageRank.
        plot_mode: Metric passed to ``plot_deletion_distr`` when ``save`` is
            true.
        save: When true, the run is plotted on the current axes. The figure
            is written to ``{ROOT_PATH}/deletion_distr/`` only for
            ``mode == "hub"``, so a preceding ``"random"`` run drawn on the
            same axes ends up in the same image.

    Returns:
        dict mapping each step label to a dict with keys ``size`` (edge
        count), ``order`` (node count), ``GWCC`` (weakly connected component
        sizes, largest first), ``num_GWCC`` (number of components) and
        ``avg_path_length`` (average shortest path length of the largest
        weakly connected component).

    Raises:
        ValueError: If ``mode`` is neither ``"random"`` nor ``"hub"``.
    """
    if mode not in ("random", "hub"):
        raise ValueError(f"mode must be 'random' or 'hub', got {mode!r}")

    G = G_init.copy()
    if mode == "hub":
        centr = nx.pagerank(G, weight="weight")
        sort_orders = sorted(centr.items(), key=lambda x: x[1], reverse=True)
    else:
        sort_orders = list(G.nodes())

    perc_dict_random = dict()
    five_perc = round(len(G_init)/100*5)

    cnt = 0   # step label, advanced by 5 per measured step
    hub = 0   # removal slots consumed so far; also indexes sort_orders in hub mode
    while hub < len(sort_orders):
        # The batch size is lowered by one when cnt is a multiple of 10 and
        # raised by one otherwise, so consecutive steps alternate between
        # (five_perc - 1) and five_perc removals.
        if cnt % 10 == 0:
            five_perc -= 1
        else:
            five_perc += 1
        if len(G) <= 0 or cnt > 100:
            break

        # Compute the components once and derive every component metric from
        # them. sorted() is stable, so components[0] is the same component
        # max(..., key=len) would select.
        components = sorted(nx.weakly_connected_components(G), key=len, reverse=True)
        component_sizes = [len(c) for c in components]
        perc_dict_random[cnt] = {
            'size': G.size(),
            'order': G.order(),
            'GWCC': component_sizes,
            'num_GWCC': len(component_sizes),
            'avg_path_length': nx.average_shortest_path_length(G.subgraph(components[0]).copy()),
        }

        for _ in range(five_perc):
            if len(G) > 0:
                if mode == "random":
                    G.remove_node(random.choice(list(G.nodes())))
                else:
                    G.remove_node(sort_orders[hub][0])
            # A slot is consumed even when the graph is already empty.
            hub += 1

        cnt += 5

    if save:
        plot_obj = plot_deletion_distr(perc_dict_random, plot_mode)
        if mode == 'hub':
            out_dir = f'{ROOT_PATH}/deletion_distr'
            # Make sure the target directory exists before writing the image.
            os.makedirs(out_dir, exist_ok=True)
            plot_obj.get_figure().savefig(f'{out_dir}/{plot_mode}-{save_name}.png')

    return perc_dict_random

In [None]:
# Node-deletion curves for every monthly graph. For each graph and each metric
# a 'random' and a 'hub' run are drawn on the same axes; the axes are
# collected per graph name and exported as one PDF per graph name.
distr_deletion_monthly = dict()
plots = dict()

# Define the output directory BEFORE creating it: the previous order called
# os.makedirs(path) ahead of the assignment, which fails on a fresh kernel.
path = ROOT_PATH+'/deletion_distr/monthly/'
os.makedirs(path, exist_ok=True)

for month in G_dict_monthly:
    distr_deletion_monthly[month] = dict()
    print(month)

    for g_name in G_dict_monthly[month]:
        for f in ['avg_path_length', 'size']:
            for m in ['random', 'hub']:
                # Each run overwrites the stored entry, so only the result of
                # the last run (mode 'hub') is kept per graph.
                distr_deletion_monthly[month][g_name] = node_deletion_distribution(G_dict_monthly[month][g_name], save_name=month+"_"+g_name+"_month", plot_mode=f, mode=m, save=False)
                plot = plot_deletion_distr(distr_deletion_monthly[month][g_name], f, name=month+"_"+g_name)

            plots.setdefault(g_name, []).append(plot)
            plt.close()

for key in plots:
    with PdfPages(f'{path}{key}_avgpath_size.pdf') as pdf:
        for p in plots[key]:
            fig = p.get_figure()
            pdf.savefig(fig)





In [None]:
# Display the stored monthly node-deletion results.
distr_deletion_monthly

In [None]:
# Display the graph names for which plots were collected.
plots.keys()

In [None]:
# Node-deletion curves for every cumulative monthly graph. Both removal modes
# are drawn on the same axes per metric; the axes are grouped by graph name
# and written to one PDF per graph name.
distr_deletion_cumulative = {}
plots = {}

for month, graphs_by_name in G_dict_monthly_cumulative.items():
    distr_deletion_cumulative[month] = {}
    print(month)

    path = f'{ROOT_PATH}/deletion_distr/cumulative/'
    os.makedirs(path, exist_ok=True)

    for g_name, graph in graphs_by_name.items():
        for f in ('avg_path_length', 'size'):
            for m in ('random', 'hub'):
                # The stored entry is overwritten by each run; the last one wins.
                distr_deletion_cumulative[month][g_name] = node_deletion_distribution(
                    graph,
                    save_name=f"{month}_{g_name}_cumulative",
                    plot_mode=f,
                    mode=m,
                    save=False,
                )
                plot = plot_deletion_distr(
                    distr_deletion_cumulative[month][g_name],
                    f,
                    name=f"{month}_{g_name}",
                )

            plots.setdefault(g_name, []).append(plot)
            plt.close()

for key, axes_list in plots.items():
    with PdfPages(f'{path}{key}_avgpath_size.pdf') as pdf:
        for p in axes_list:
            fig = p.get_figure()
            pdf.savefig(fig)

In [None]:
# Repeat the cumulative node-deletion runs with save=True so that
# node_deletion_distribution plots them. The figure is cleared once per
# metric, before its 'random' and 'hub' runs.
distr_deletion_cumulative = {}

for month, graphs_by_name in G_dict_monthly_cumulative.items():
    distr_deletion_cumulative[month] = {}
    print(month)
    for g_name, graph in graphs_by_name.items():
        for f in ('avg_path_length', 'size'):
            plt.clf()
            for m in ('random', 'hub'):
                distr_deletion_cumulative[month][g_name] = node_deletion_distribution(
                    graph,
                    save_name=f"{month}_{g_name}_cum",
                    plot_mode=f,
                    mode=m,
                    save=True,
                )

In [None]:
# Display the stored cumulative node-deletion results.
distr_deletion_cumulative

In [None]:
def hub_deletion_importance(G_init, name):
    """Measure the effect of removing each top-PageRank node on its own.

    The ten nodes with the highest weighted PageRank are removed one at a
    time, each from a fresh copy of ``G_init``, and the resulting graph is
    summarised.

    Args:
        G_init: NetworkX directed graph; it is never modified.
        name: Unused; kept so existing callers keep working.

    Returns:
        dict keyed by the ``(node, pagerank)`` tuple of each removed node.
        Each value is a dict with ``size`` (edge count), ``order`` (node
        count), ``GWCC`` (weakly connected component sizes, largest first)
        and ``num_GWCC`` (number of components). ``avg_path_length`` is
        present only when the graph is still weakly connected after the
        removal.
    """
    # PageRank does not modify the graph, so it runs on the input directly.
    centr = nx.pagerank(G_init, weight="weight")
    sort_orders = sorted(centr.items(), key=lambda x: x[1], reverse=True)

    perc_dict_hub_sing = dict()
    for hub in sort_orders[:10]:
        G = G_init.copy()
        G.remove_node(hub[0])

        # One traversal provides both the component sizes and the count.
        component_sizes = sorted(
            (len(c) for c in nx.weakly_connected_components(G)), reverse=True
        )
        stats = {
            'size': G.size(),
            'order': G.order(),
            'GWCC': component_sizes,
            'num_GWCC': len(component_sizes),
        }
        # Exactly one component means "non-empty and weakly connected", so no
        # extra connectivity traversal is needed.
        if G.is_directed() and len(component_sizes) == 1:
            stats['avg_path_length'] = nx.average_shortest_path_length(G)
        perc_dict_hub_sing[hub] = stats

    return perc_dict_hub_sing

In [None]:
# Single-hub removal impact for every monthly graph, stored as
# hub_del_importance_month[date][graph name].
hub_del_importance_month = {}

for date, graphs_by_name in G_dict_monthly.items():
    hub_del_importance_month[date] = {}
    for g_name, graph in graphs_by_name.items():
        hub_del_importance_month[date][g_name] = hub_deletion_importance(
            graph, name=f"{date}_{g_name}"
        )



In [None]:
# Display the monthly single-hub removal results.
hub_del_importance_month

In [None]:
# Single-hub removal impact for every cumulative monthly graph, stored as
# hub_del_importance_cumulative[date][graph name].
hub_del_importance_cumulative = {}

for date, graphs_by_name in G_dict_monthly_cumulative.items():
    hub_del_importance_cumulative[date] = {}
    for g_name, graph in graphs_by_name.items():
        hub_del_importance_cumulative[date][g_name] = hub_deletion_importance(
            graph, name=f"{date}_{g_name}"
        )



In [None]:
def plot_del_companies_importance(df):
    """Count and plot how often removing a node splits a graph.

    ``df`` is a nested mapping ``date -> graph name -> (node, score) -> stats``.
    Every entry whose ``stats['num_GWCC']`` exceeds 1 adds one to the tally of
    that node (the first element of the key tuple). The tallies are drawn as
    a bar chart, highest first, and returned.

    Returns:
        dict mapping node to its tally, ordered by decreasing tally.
    """
    tally = {}
    for graphs_by_name in df.values():
        for hubs in graphs_by_name.values():
            for comp, stats in hubs.items():
                if stats['num_GWCC'] > 1:
                    company = comp[0]
                    tally[company] = tally.get(company, 0) + 1

    ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
    count_dict = dict(ranked)

    plt.figure(figsize=(15, 12))
    ax = sns.barplot(x=list(count_dict.keys()), y=list(count_dict.values()))
    ax.tick_params(axis='x', rotation=90)

    return count_dict

In [None]:
# Bar chart and tallies of graph-splitting removals over the cumulative graphs.
plot_del_companies_importance(hub_del_importance_cumulative)

In [None]:
# Bar chart and tallies of graph-splitting removals over the monthly graphs.
plot_del_companies_importance(hub_del_importance_month)

# **Plot functions**

In [None]:
def compute_graph_plot_nx(G, layout='spring_layout'):
    """Draw a graph with NetworkX/matplotlib, sizing nodes by degree.

    Args:
        G: NetworkX graph to draw.
        layout: One of ``'kamada_kawai_layout'``, ``'circular_layout'``,
            ``'spring_layout'`` or ``'random_layout'``.

    Raises:
        ValueError: If ``layout`` is not one of the supported names
            (previously this surfaced as an unbound-variable error).
    """
    # Lazily evaluated so that only the requested layout is computed.
    layouts = {
        'kamada_kawai_layout': lambda: nx.kamada_kawai_layout(G),
        'circular_layout': lambda: nx.circular_layout(G),
        'spring_layout': lambda: nx.spring_layout(G, k=100000, iterations=5000),
        'random_layout': lambda: nx.random_layout(G),
    }
    if layout not in layouts:
        raise ValueError(
            f"unknown layout {layout!r}; expected one of {sorted(layouts)}"
        )
    pos = layouts[layout]()

    degree = list(dict(G.degree()).values())

    plt.figure(figsize=(15, 15))
    # Every node gets the same colour: RGB (128, 167, 240) plus the fixed
    # alpha suffix 'B3'.
    node_hex = '#%02x%02x%02xB3' % (128, 167, 240)
    nx.draw_networkx_nodes(
        G,
        pos,
        node_size=[i*20+100 for i in degree],
        node_color=[node_hex for _ in G.nodes()],
    )
    ax = plt.gca()
    # collections[0] is the node collection that was just drawn.
    ax.collections[0].set_edgecolor("black")
    nx.draw_networkx_edges(
        G,
        pos,
        width=2,
        alpha=0.25,
        arrowstyle="->",
        arrowsize=10,
        edge_color='#93e685',
    )

    # Shift the positions down by 0.1 so the labels sit below their nodes.
    for y in pos:
        pos[y][1] = pos[y][1]-0.1
    nx.draw_networkx_labels(G, pos)

    plt.show()



def compute_graph_plot_ig(G, show_hubs=True, hubs_type="degree", save=False, name=None):
    """Convert a NetworkX graph to igraph, style it and optionally save a PNG.

    Args:
        G: NetworkX graph to draw.
        show_hubs: When true, nodes whose selected degree is at or above the
            90th percentile are recoloured and sized by that degree.
        hubs_type: Degree used for hub detection: ``'degree'``, ``'in'``,
            ``'out'`` or their weighted variants ``'degree_w'``, ``'in_w'``,
            ``'out_w'`` (edge attribute ``"weight"``).
        save: When true, the drawing is written to
            ``f'{ROOT_PATH}igplots/{name}.png'``.
        name: File stem used when saving.

    Returns:
        The styled igraph graph, or ``None`` when ``G`` has no nodes or no
        edges.

    Raises:
        ValueError: If ``show_hubs`` is true and ``hubs_type`` is unknown
            (previously this surfaced as a NameError).
    """
    g = ig.Graph.from_networkx(G)
    g.vs["label"] = list(G.nodes())
    if len(G.nodes()) == 0 or len(G.edges()) == 0:
        return None

    g.vs["label_size"] = 10
    g.vs["label_dist"] = 1.2
    g.vs["label_color"] = "rgba(219, 10, 91, 0.8)"
    g.vs['size'] = [i/5+10 for i in list(dict(G.degree()).values())]
    g.es['width'] = 0.5
    g.es['arrow_size'] = 0.3
    g.es['color'] = 'gray'
    g.vs['color'] = "rgba(128,167,240,0.6)"

    if show_hubs:
        # hubs_type -> (degree accessor on G, edge weight attribute or None)
        degree_specs = {
            'degree': ('degree', None),
            'in': ('in_degree', None),
            'out': ('out_degree', None),
            'degree_w': ('degree', "weight"),
            'in_w': ('in_degree', "weight"),
            'out_w': ('out_degree', "weight"),
        }
        if hubs_type not in degree_specs:
            raise ValueError(
                f"unknown hubs_type {hubs_type!r}; expected one of {sorted(degree_specs)}"
            )
        accessor, weight = degree_specs[hubs_type]
        # The accessor is resolved only for the requested type, so asking for
        # plain 'degree' never touches in_degree/out_degree.
        degree_by_node = dict(getattr(G, accessor)(weight=weight))

        percentile_90 = np.percentile(list(degree_by_node.values()), 90)
        # The degree mapping iterates in the same node order as the labels set
        # above, so the enumeration index is the vertex index; this replaces a
        # per-hub linear label search.
        for i, value in enumerate(degree_by_node.values()):
            if value >= percentile_90:
                g.vs[i]['color'] = "rgba(31, 58, 147, 0.8)"
                g.vs[i]['size'] = value

    # NOTE(review): the layout is computed even when save is False, exactly as
    # before; confirm whether it can be skipped in that case.
    coords = ig.Graph.layout_graphopt(g, spring_length=50, node_mass=50, node_charge=10)
    if save:
        out = ig.plot(g, layout=coords, bbox=(1024, 1024), margin=150)
        out.save(f'{ROOT_PATH}igplots/{name}.png')

    return g



def plot_part(g):
    """Apply the notebook's edge and label styling to ``g.graph`` and plot ``g``.

    ``g`` is expected to expose a ``graph`` attribute holding an igraph graph
    (e.g. a clustering object). Returns the object produced by ``ig.plot``.
    """
    graph = g.graph

    edge_style = {'width': 0.3, 'color': 'gray', 'arrow_size': 0.3}
    for attribute, value in edge_style.items():
        graph.es[attribute] = value

    label_style = {
        "label_dist": 1.2,
        "label_color": "rgba(219, 10, 91, 0.8)",
        "label_size": 6,
    }
    for attribute, value in label_style.items():
        graph.vs[attribute] = value

    return ig.plot(g)





def compute_graph_plot_partition(G, g, partition, partition_name, save=False, mark_group=False, graph_name=None):
    """Draw a community partition with igraph, one colour per cluster.

    The styling is written onto ``partition.graph`` in place.

    Args:
        G: NetworkX graph the partition refers to; supplies the vertex labels
            and the degree-based vertex sizes.
        g: Unused; kept so existing callers keep working.
        partition: igraph clustering. A ``VertexDendrogram`` is first
            converted with ``as_clustering()``.
        partition_name: Label of the partition, used in the saved file name.
        save: When true, the drawing is written to
            ``{ROOT_PATH}communitiesplots/{graph_name}_comm_{partition_name}.png``.
        mark_group: Forwarded to ``ig.plot`` as ``mark_groups``.
        graph_name: Graph label used in the saved file name. When omitted,
            the notebook-level variable ``g_name`` is used, as before.

    Returns:
        The plot object returned by ``ig.plot``.
    """
    if isinstance(partition, ig.clustering.VertexDendrogram):
        partition = partition.as_clustering()

    graph = partition.graph
    graph.vs["label"] = list(G.nodes())
    graph.vs["label_size"] = 10
    graph.vs["label_dist"] = 2
    graph.vs["label_color"] = "rgba(219, 10, 91, 0.8)"
    graph.vs['size'] = [i/5+10 for i in list(dict(G.degree()).values())]
    graph.es['width'] = 0.5
    graph.es['arrow_size'] = 0.3
    graph.es['color'] = 'gray'

    # One palette entry per cluster, assigned through the membership vector.
    pal = ig.drawing.colors.ClusterColoringPalette(len(partition))
    graph.vs['color'] = pal.get_many(partition.membership)

    coords = ig.Graph.layout_graphopt(graph, spring_length=50, node_mass=50, node_charge=10)
    out = ig.plot(graph, layout=coords, bbox=(1024, 1024), margin=150, mark_groups=mark_group)

    if save:
        if graph_name is None:
            # Backward compatibility: fall back to the notebook-level loop
            # variable that the file name used to be built from.
            graph_name = g_name
        # Create exactly the directory that is written to. The previous
        # listdir/mkdir pair inspected ROOT_PATH itself, which is a different
        # location when ROOT_PATH has no trailing separator.
        out_dir = ROOT_PATH+'communitiesplots'
        os.makedirs(out_dir, exist_ok=True)
        out.save(f'{out_dir}/{graph_name}_comm_{partition_name}.png')

    return out



def compute_graph__plot_pyvis(G):
    """Render ``G`` as an interactive pyvis page written to ``nx.html``.

    Nodes are sized by their degree; nodes whose degree is at or above the
    99th percentile are coloured green.

    Args:
        G: NetworkX graph to render.
    """
    # Build the degree mapping once; it used to be rebuilt for every node.
    degree_by_node = dict(G.degree())

    percentile_99 = np.percentile(list(degree_by_node.values()), 99)
    # Set membership keeps the per-node hub test constant-time.
    hub_nodi = {k for k, v in degree_by_node.items() if v >= percentile_99}

    nt = Network(height='100%', width='100%', bgcolor='#222222', font_color='white')
    nt.force_atlas_2based(gravity=-300, spring_length=300)
    nt.from_nx(G)
    for node in nt.nodes:
        node['size'] = degree_by_node[node['id']]
        if node['id'] in hub_nodi:
            node['color'] = '#93e685'

    nt.show('nx.html')



In [None]:
# Render every monthly graph with igraph and save it as a PNG under
# igplots/monthly (the saving is done inside compute_graph_plot_ig).
plots = []
os.makedirs(f'{ROOT_PATH}igplots/monthly', exist_ok=True)

for date, graphs_by_name in G_dict_monthly.items():
    for g_name, graph in graphs_by_name.items():
        p = compute_graph_plot_ig(
            graph,
            save=True,
            name=f'monthly/monthly_plot_{date}{g_name}',
        )





In [None]:
# Render every monthly graph with igraph and save it as a PNG under
# igplots/monthly (the saving is done inside compute_graph_plot_ig).
# NOTE(review): this cell performs the same work as the preceding cell;
# confirm whether it was meant to process the cumulative graphs instead.
plots = []
os.makedirs(f'{ROOT_PATH}igplots/monthly', exist_ok=True)

for date, graphs_by_name in G_dict_monthly.items():
    for g_name, graph in graphs_by_name.items():
        p = compute_graph_plot_ig(
            graph,
            save=True,
            name=f'monthly/monthly_plot_{date}{g_name}',
        )



