Import relevant libraries for network and data handling / transformation

In [16]:
import networkx as nx
import pandas as pd
import os
from networkx import PowerIterationFailedConvergence

Read all files in the shareholder file directory

In [17]:
# Folder that holds the shareholder workbooks (.xlsx) that are read below
shareholder_dir = "transformed_data/base/shareholders"

# os.listdir returns entries in arbitrary order, so sort them to make the
# concatenation order (and everything derived from it) reproducible.
# Names starting with "~$" are lock files Excel creates while a workbook is
# open; they end in .xlsx but cannot be parsed, so they are skipped.
files = sorted(
    f for f in os.listdir(shareholder_dir)
    if f.endswith(".xlsx") and not f.startswith("~$")
)

# Print number of files
print(f"Number of files in specified directory: {len(files)}")

# Create a list for all dataframes
dfs = []

# Read every workbook into its own dataframe and collect it
for f in files:
    df = pd.read_excel(os.path.join(shareholder_dir, f))
    dfs.append(df)

print("---Finished---")

Number of files in specified directory: 61
---Finished---


Find the most common investors in the shareholder files (over all years)

In [18]:
# print number of dataframes, should be equal to number of files
print(f"Number of dataframes: {len(dfs)}")

# Merge all dataframes into one
df_merged = pd.concat(dfs)

investor_names = df_merged['investor_name'].unique()
cartel_names = df_merged['company_name'].unique()

# print amount of uniques
print(f"Number of unique investors: {len(investor_names)}")
print(f"Number of unique cartels: {len(cartel_names)}")

# Count the rows per investor. The result is named 'count' and its index
# 'investor_name' explicitly, so the sort below does not depend on how the
# installed pandas version labels the output of value_counts().
# NOTE(review): this counts rows, not distinct companies - it only equals the
# number of companies if each (investor, company) pair occurs once; confirm.
investor_counts = (
    df_merged['investor_name']
    .value_counts()
    .rename('count')
    .rename_axis('investor_name')
    .to_frame()
)

# Concatenate all companies of each investor into one comma-separated string.
# astype(str) keeps the join from raising on a non-string company name.
investor_companies = (
    df_merged
    .groupby('investor_name')['company_name']
    .agg(lambda names: ', '.join(names.astype(str)))
)

# Attach the company list to the counts (aligned on the investor name index)
investor_counts['companies'] = investor_companies

investor_counts = investor_counts.sort_values(by='count', ascending=False)

# save investor_counts to excel (create the target folder if it is missing)
investor_counts_df = investor_counts
os.makedirs("./transformed_data/connected_shareholder_data", exist_ok=True)
investor_counts_df.to_excel("./transformed_data/connected_shareholder_data/investor_counts.xlsx", index=True)

Number of dataframes: 61
Number of unique investors: 8246
Number of unique cartels: 61


Create a graph for each year for the unioned shareholder files

In [19]:
# Folder the yearly networks are written to; created up front so the cell
# also works when the folder does not exist yet
network_dir = "./transformed_data/connected_shareholder_networks"
os.makedirs(network_dir, exist_ok=True)

# get all perc_os columns in the dataframe
perc_os_columns = [col for col in df_merged.columns if 'perc_os' in col]

# Build and save one company-investor graph per perc_os column
for col in perc_os_columns:

    # The last four characters of the column name are used as the year label
    year = col[-4:]

    # Define new Graph
    G = nx.Graph()

    # Keep only the rows that have a value in this column. Filtering once and
    # walking the three needed columns avoids iterrows() over the whole frame.
    holdings = df_merged[['company_name', 'investor_name', col]].dropna(subset=[col])

    for company_name, investor_name, perc_os in zip(
        holdings['company_name'], holdings['investor_name'], holdings[col]
    ):
        # Add the company and investor to the graph.
        # NOTE(review): a name that occurs both as company and as investor ends
        # up as a single node whose 'type' is whatever was written last.
        G.add_node(company_name, type='Company')
        G.add_node(investor_name, type='Investor')

        # Add an edge between the company and investor with the perc_os value
        # as weight; a repeated (company, investor) pair overwrites the weight
        G.add_edge(company_name, investor_name, weight=perc_os)

    # Get the number of nodes and edges in the graph
    num_nodes = len(G.nodes)
    num_edges = len(G.edges)

    # Print the number of nodes and edges
    print(f"Year: {year}, Number of nodes: {num_nodes}, Number of edges: {num_edges}")

    nx.write_graphml(G, f"{network_dir}/connected_shareholder_network_{year}.graphml", named_key_ids=True, infer_numeric_types=True)

    print(f"Graph saved as graphml file for year {year}")

print("---Finished---")


Year: 2011, Number of nodes: 4531, Number of edges: 23494
Graph saved as graphml file for year 2011
Year: 2010, Number of nodes: 4756, Number of edges: 25037
Graph saved as graphml file for year 2010
Year: 2009, Number of nodes: 4743, Number of edges: 25607
Graph saved as graphml file for year 2009
Year: 2008, Number of nodes: 4538, Number of edges: 26401
Graph saved as graphml file for year 2008
Year: 2007, Number of nodes: 4423, Number of edges: 26198
Graph saved as graphml file for year 2007
Year: 2006, Number of nodes: 4353, Number of edges: 22368
Graph saved as graphml file for year 2006
Year: 2005, Number of nodes: 4124, Number of edges: 21055
Graph saved as graphml file for year 2005
Year: 2004, Number of nodes: 3602, Number of edges: 19783
Graph saved as graphml file for year 2004
Year: 2003, Number of nodes: 3351, Number of edges: 19690
Graph saved as graphml file for year 2003
Year: 2002, Number of nodes: 3286, Number of edges: 19786
Graph saved as graphml file for year 2002


Read the created graphml files back into a list

In [20]:
network_dir = "./transformed_data/connected_shareholder_networks"

connected_shareholders_graph_list = [] # list of graphs
connected_shareholders_year_list = [] # list of years -> to see which graph belongs to which year

# os.listdir returns entries in arbitrary order; sorting the file names keeps
# the two lists in a reproducible order (the files differ only in the trailing
# year, so this is ascending by year label).
# NOTE(review): every .graphml file in the folder is loaded, including files
# left over from earlier runs - clear the folder if only the current run's
# years are wanted.
for file in sorted(os.listdir(network_dir)):
    if file.endswith(".graphml"):
        # create new graph from .graphml file
        G = nx.read_graphml(f"{network_dir}/{file}")
        # add graph to list
        connected_shareholders_graph_list.append(G)
        # add year to list (text between the last "_" and the first following ".")
        connected_shareholders_year_list.append(file.split("_")[-1].split(".")[0])

print(f"Number of graphs read: {len(connected_shareholders_graph_list)}")

Number of graphs read: 15


Calculate stats over the whole network for each year

In [11]:
# Edge attribute used for BOTH community detection and the modularity score.
# greedy_modularity_communities defaults to weight=None while modularity
# defaults to weight='weight'; leaving the defaults scores a partition with a
# different objective than the one it was optimised for.
# Set to None to treat the network as unweighted in both steps.
MODULARITY_WEIGHT = 'weight'

# Prepare list to collect rows (one dict per year)
network_stats_list = []

# Walk the graphs together with their year labels
for year, graph in zip(connected_shareholders_year_list, connected_shareholders_graph_list):
    # Get the number of nodes and edges in the graph
    num_nodes = len(graph.nodes)
    num_edges = len(graph.edges)
    # Calculate the density of the graph (How connected is the network)
    density = nx.density(graph)

    # Detect communities and score that partition with the same edge weighting
    partition = nx.community.greedy_modularity_communities(graph, weight=MODULARITY_WEIGHT)
    modularity = nx.community.modularity(graph, partition, weight=MODULARITY_WEIGHT)

    # Average path length (unweighted, i.e. number of hops)
    try:
        avg_path_length = nx.average_shortest_path_length(graph)
    except nx.NetworkXError:
        avg_path_length = float('inf')  # Handle disconnected graphs

    # Add this row to the list
    network_stats_list.append({
        'Year': year,
        'Number of nodes': num_nodes,
        'Number of edges': num_edges,
        'Density': density,
        'Modularity': modularity,
        'Average path length': avg_path_length
    })

    print(f"Year: {year} finished")

# Convert to DataFrame at the end
network_stats_df = pd.DataFrame(network_stats_list)

# Save to Excel
network_stats_df.to_excel("./transformed_data/connected_shareholder_data/shareholder_network_stats.xlsx", index=False)

print("---Finished---")

Year: 1997 finished
Year: 1998 finished
Year: 1999 finished
Year: 2000 finished
Year: 2001 finished
Year: 2002 finished
Year: 2003 finished
Year: 2004 finished
Year: 2005 finished
Year: 2006 finished
Year: 2007 finished
Year: 2008 finished
Year: 2009 finished
Year: 2010 finished
Year: 2011 finished
---Finished---


Calculate node stats for the investors (since there are so many investor nodes, only a few inexpensive metrics are calculated)

In [21]:
# One dict per (year, investor) is collected here and turned into a frame at the end
investor_measure_list = []

# Walk the graphs together with their year labels
for year, graph in zip(connected_shareholders_year_list, connected_shareholders_graph_list):

    for node_name, attrs in graph.nodes(data=True):

        node_type = attrs['type']

        # Only investor nodes are measured in this cell
        if node_type != 'Investor':
            continue

        # Number of companies the investor is connected to
        degree = graph.degree(node_name)

        # Weights of all edges attached to this investor
        edge_weights = [graph[node_name][neighbor]['weight'] for neighbor in graph.neighbors(node_name)]

        # Sum, mean and maximum of those weights (mean and maximum fall back to 0
        # for a node without edges)
        weight_sum = sum(edge_weights)
        if degree > 0:
            avg_weight = weight_sum / degree
            max_weight = max(edge_weights)
        else:
            avg_weight = 0
            max_weight = 0

        # Centralities are not calculated for investors in this cell (too slow).

        investor_measure_list.append({
            'year': year,
            'node_name': node_name,
            'node_type': node_type,
            'degree': degree,
            'weight_sum': weight_sum,
            'avg_weight': avg_weight,
            'max_weight': max_weight,
        })

    print(f"Year: {year} finished")

# Build the result frame in one go
df = pd.DataFrame(investor_measure_list)

# Save to Excel
df.to_excel("./transformed_data/connected_shareholder_data/shareholder_node_stats.xlsx", index=False)


Year: 1997 finished
Year: 1998 finished
Year: 1999 finished
Year: 2000 finished
Year: 2001 finished
Year: 2002 finished
Year: 2003 finished
Year: 2004 finished
Year: 2005 finished
Year: 2006 finished
Year: 2007 finished
Year: 2008 finished
Year: 2009 finished
Year: 2010 finished
Year: 2011 finished


Calculate the stats for the company nodes. There are only 61 companies in total, so more metrics can be calculated for them; the centrality calculations on the full graphs still take a long time.

In [24]:
def _centrality_or_none(centrality_func, graph):
    """Return centrality_func(graph), or None if its power iteration does not converge."""
    try:
        return centrality_func(graph)
    except PowerIterationFailedConvergence:
        return None


# Define a list to collect measures (one dict per year and company)
company_measure_list = []

# Walk the graphs together with their year labels
for year, graph in zip(connected_shareholders_year_list, connected_shareholders_graph_list):

    # Company nodes of this year's graph, in node order
    company_nodes = [node for node in graph.nodes if graph.nodes[node]['type'] == 'Company']

    if not company_nodes:
        # Nothing to measure for this year
        print(f"Year: {year} finished")
        continue

    # The whole-graph centralities do not depend on the node being reported,
    # so they are calculated ONCE per graph instead of once per company node.
    degree_centrality_by_node = nx.degree_centrality(graph)
    betweenness_by_node = nx.betweenness_centrality(graph)
    pagerank_by_node = nx.pagerank(graph)

    # Harmonic centrality is only needed for the company nodes
    harmonic_by_node = nx.harmonic_centrality(graph, nbunch=company_nodes)

    # Eigenvector and Katz centrality are attempted independently, so a
    # convergence failure of one no longer discards the result of the other.
    eigenvector_by_node = _centrality_or_none(nx.eigenvector_centrality, graph)
    katz_by_node = _centrality_or_none(nx.katz_centrality, graph)

    for node in company_nodes:

        node_name = node
        node_type = 'Company'

        # Get the number of investors connected to the company
        degree = graph.degree(node)

        # Weights of all edges attached to the company
        edge_weights = [graph[node][neighbor]['weight'] for neighbor in graph.neighbors(node)]

        # Sum, average and maximum of those weights (0 for a node without edges)
        weight_sum = sum(edge_weights)
        avg_weight = weight_sum / degree if degree > 0 else 0
        max_weight = max(edge_weights) if degree > 0 else 0

        # Closeness is requested for this single node only
        closeness_centrality = nx.closeness_centrality(graph, u=node)

        # add measures to dictionary
        measures = {
            'year': year,
            'node_name': node_name,
            'node_type': node_type,
            'degree': degree,
            'weight_sum': weight_sum,
            'avg_weight': avg_weight,
            'max_weight': max_weight,
            'degree_centrality': degree_centrality_by_node[node],
            'closeness_centrality': closeness_centrality,
            'betweenness_centrality': betweenness_by_node[node],
            'eigenvector_centrality': eigenvector_by_node[node] if eigenvector_by_node is not None else None,
            'harmonic_centrality': harmonic_by_node[node],
            'katz_centrality': katz_by_node[node] if katz_by_node is not None else None,
            'pagerank': pagerank_by_node[node]
        }

        company_measure_list.append(measures)

    print(f"Year: {year} finished")

df = pd.DataFrame(company_measure_list)

# Save to Excel
df.to_excel("./transformed_data/connected_shareholder_data/company_node_stats.xlsx", index=False)

KeyboardInterrupt: 