In [1]:
import networkx as nx
from pyvis.network import Network
import pandas as pd
import math
import matplotlib.pyplot as plt
import os

In [2]:
# Load every .xlsx workbook in raw_data/company_shareholders into one cleaned
# DataFrame per company (collected in `dfs`).
SHAREHOLDER_DIR = "raw_data/company_shareholders"

# os.listdir() returns entries in arbitrary order, so sort the names
# (case-insensitively, with the raw name as tie-breaker) to make the order of
# `dfs` -- and therefore `dfs[0]` -- reproducible across machines and runs.
files = sorted(
    (f for f in os.listdir(SHAREHOLDER_DIR) if f.endswith(".xlsx")),
    key=lambda name: (name.lower(), name),
)

# Print number of files
print(len(files))


def _rename_shareholder_column(col):
    """Return the snake_case column name for a raw workbook header.

    Headers containing '% O/S' become 'perc_os_<last space-separated token>',
    headers containing 'Filing Type' become 'filing_type_<last token>', and any
    other header is lower-cased with spaces replaced by underscores.
    """
    if '% O/S' in col:
        return f"perc_os_{col.split(' ')[-1]}"
    if 'Filing Type' in col:
        return f"filing_type_{col.split(' ')[-1]}"
    return col.lower().replace(' ', '_')


# Create a list for all dataframes
dfs = []

# for every file in the folder
for f in files:
    # The company name is the file name without its extension; splitext keeps
    # names that themselves contain a dot intact.
    company_name = os.path.splitext(f)[0]

    # Read the file
    df = pd.read_excel(os.path.join(SHAREHOLDER_DIR, f))

    # drop last 3 rows (total, empty), then drop rows that are entirely empty
    df_cleaned = df[:-3].dropna(how='all', axis=0)

    # Get relevant columns with % O/S in the name and filing types
    relevant_columns_percent = df_cleaned.columns[df_cleaned.columns.str.contains('% O/S')]
    relevant_columns_filing = df_cleaned.columns[df_cleaned.columns.str.contains('Filing Type')]

    # Keep only relevant columns. The explicit .copy() makes this an independent
    # frame, so the column assignment below cannot trigger SettingWithCopyWarning.
    df_cleaned = df_cleaned[
        ['Investor Name', 'Investor Sub-Type']
        + list(relevant_columns_percent)
        + list(relevant_columns_filing)
    ].copy()

    # Rename the columns to their readable snake_case form
    df_cleaned.columns = [_rename_shareholder_column(col) for col in df_cleaned.columns]

    # add the company name to the dataframe
    df_cleaned["company_name"] = company_name

    # print the company name to see progress
    print(company_name)

    # append the dataframe to the list of dataframes
    dfs.append(df_cleaned)

print("---Finished---")

dfs[0].head(10)

61
Aalberts
ABB
Akzo Nobel
Alstom SA
Areva
Asahi
AU Optronics
BAM
Bayern
Boliden
British Airways
Cathay Pacific
Chemtura
Chimei
chiquita
Chungwa
Commerzbank
del monte
Dow
Elpida
ENI
EON
Exxon Mobil
Fuji Electric
Fujifilm
GDF suez
Hannstar Display
Henkel
hitachi ltd
Hitachi Maxell
ICI
IMI PLC
Infineon
LG Display
Micron
Mitsubishi
Mueller Industries
Nanya Tech
NEC
Nippon electric glass
Panasonic
Pilkington
procter gamble
Qantas
rautaruukki
Repsol YPF
samsung
SAS AB
Siemens
Singapore Airlines
SKW Stahl
Sony
Toshiba
Total
Unilever NV
Unilever PLC
Unipetrol
United technologies corp
Uralita
Whirlpool
Zeon
---Finished---


Unnamed: 0,investor_name,investor_sub-type,perc_os_31-Dec-2011,perc_os_31-Dec-2010,perc_os_31-Dec-2009,perc_os_31-Dec-2008,perc_os_31-Dec-2007,perc_os_31-Dec-2006,perc_os_31-Dec-2005,perc_os_31-Dec-2004,...,filing_type_31-Dec-2005,filing_type_31-Dec-2004,filing_type_31-Dec-2003,filing_type_31-Dec-2002,filing_type_31-Dec-2001,filing_type_31-Dec-2000,filing_type_31-Dec-1999,filing_type_31-Dec-1998,filing_type_31-Dec-1997,company_name
0,Étoile Gestion,Investment Advisor,,,,,,,,0.0,...,,Aggregate MFs,,,,,,,,Aalberts
1,WestLB Mellon Asset Management Kapitalanlagege...,Investment Advisor,0.03,0.05,0.03,0.14,0.17,0.05,0.06,0.07,...,Aggregate MFs,Aggregate MFs,,,Aggregate MFs,Aggregate MFs,,Aggregate MFs,Aggregate MFs,Aalberts
2,Wealth Management Partners NV,Investment Advisor,,,,,,,,,...,,,,,Other Substantial/Declarable,Other Substantial/Declarable,,,,Aalberts
3,Waddell & Reed Investment Management Company,Investment Advisor,,,,0.0,0.22,,,,...,,,,,,,,,,Aalberts
4,W & W Asset Management GmbH,Investment Advisor,0.0,,,,,,,,...,,,,,,,,,,Aalberts
5,"Vontobel Asset Management, Inc.",Investment Advisor/Hedge Fund,,,,0.0,0.28,,,,...,,,,,,,,,,Aalberts
6,"Vanguard Group, Inc.",Investment Advisor,0.41,0.25,0.02,,,,,,...,,,,,,,,,,Aalberts
7,VPB Finance S.A.,Investment Advisor,,,0.0,0.0,,,,,...,,,,,,,,,,Aalberts
8,Universal-Investment-Gesellschaft mbH,Investment Advisor/Hedge Fund,0.01,,0.0,0.34,0.29,0.57,0.63,0.15,...,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,,Aggregate MFs,Aggregate MFs,,,Aalberts
9,Union Investment Group,Investment Advisor,0.0,0.04,0.11,,,,,,...,,,,,,,,,,Aalberts


In [3]:
# Merge the per-company frames and summarise, per investor, how many rows
# (company holdings) they appear in and which companies those are.

# print number of dataframes, should be equal to number of files
print(f"Number of dataframes: {len(dfs)}")

# Merge all dataframes into one. ignore_index=True gives every row a unique
# label instead of repeating each file's own 0..n-1 index.
df_merged = pd.concat(dfs, ignore_index=True)

investor_names = df_merged['investor_name'].unique()
cartel_names = df_merged['company_name'].unique()

# print amount of uniques
print(f"Number of unique investors: {len(investor_names)}")
print(f"Number of unique cartels: {len(cartel_names)}")

# Count the rows per investor. The series and its index are named explicitly so
# the 'count' column exists regardless of how the installed pandas version
# names the result of value_counts().
investor_counts = (
    df_merged['investor_name']
    .value_counts()
    .rename('count')
    .rename_axis('investor_name')
    .to_frame()
)

# get all companies for each investor, concatenated into one string
investor_companies = df_merged.groupby('investor_name')['company_name'].agg(', '.join)

# add the company list to the counts (aligned on the investor_name index)
investor_counts['companies'] = investor_companies

# order investors by number of companies they are invested in
investor_counts = investor_counts.sort_values(by='count', ascending=False)

print(investor_counts)

# save investor_counts to excel (create the target folder if it is missing)
investor_counts_df = investor_counts
os.makedirs("./transformed_data/shareholder_data", exist_ok=True)
investor_counts_df.to_excel("./transformed_data/shareholder_data/investor_counts.xlsx", index=True)

Number of dataframes: 61
Number of unique investors: 8246
Number of unique cartels: 61
                                          count  \
investor_name                                     
Universal-Investment-Gesellschaft mbH        58   
Norges Bank Investment Management (NBIM)     56   
Vanguard Group, Inc.                         56   
Dimensional Fund Advisors, LP                56   
TIAA-CREF                                    56   
...                                         ...   
C I Kasei Company Ltd.                        1   
Asahi Kasei Corp                              1   
Asahi Kasei Chemicals Corporation             1   
First Union National Bank                     1   
Euclid Advisors LLC                           1   

                                                                                  companies  
investor_name                                                                                
Universal-Investment-Gesellschaft mbH     Aalberts, ABB, Akzo

In [4]:
# Normalise column dtypes of the merged frame: ownership percentages become
# numeric and filing types become strings.

# get types of columns (before conversion)
types = df_merged.dtypes
# print types
print(types)

# transform all perc_os columns to numeric; values that cannot be parsed
# become NaN (errors='coerce')
perc_os_columns = df_merged.columns[df_merged.columns.str.contains('perc_os')]
for col in perc_os_columns:
    df_merged[col] = pd.to_numeric(df_merged[col], errors='coerce')

# transform all filing_type columns to string. A plain astype(str) would turn
# missing values into the literal text 'nan', so only the non-missing entries
# are replaced by their string form and missing ones stay missing.
filing_type_columns = df_merged.columns[df_merged.columns.str.contains('filing_type')]
for col in filing_type_columns:
    filing_values = df_merged[col]
    df_merged[col] = filing_values.where(filing_values.isna(), filing_values.astype(str))

# print the first 10 rows of the merged dataframe
df_merged.head(10)


investor_name               object
investor_sub-type           object
perc_os_31-Dec-2011         object
perc_os_31-Dec-2010        float64
perc_os_31-Dec-2009        float64
perc_os_31-Dec-2008        float64
perc_os_31-Dec-2007        float64
perc_os_31-Dec-2006        float64
perc_os_31-Dec-2005        float64
perc_os_31-Dec-2004        float64
perc_os_31-Dec-2003        float64
perc_os_31-Dec-2002        float64
perc_os_31-Dec-2001        float64
perc_os_31-Dec-2000        float64
perc_os_31-Dec-1999        float64
perc_os_31-Dec-1998        float64
perc_os_31-Dec-1997        float64
filing_type_31-Dec-2011     object
filing_type_31-Dec-2010     object
filing_type_31-Dec-2009     object
filing_type_31-Dec-2008     object
filing_type_31-Dec-2007     object
filing_type_31-Dec-2006     object
filing_type_31-Dec-2005     object
filing_type_31-Dec-2004     object
filing_type_31-Dec-2003     object
filing_type_31-Dec-2002     object
filing_type_31-Dec-2001     object
filing_type_31-Dec-2

Unnamed: 0,investor_name,investor_sub-type,perc_os_31-Dec-2011,perc_os_31-Dec-2010,perc_os_31-Dec-2009,perc_os_31-Dec-2008,perc_os_31-Dec-2007,perc_os_31-Dec-2006,perc_os_31-Dec-2005,perc_os_31-Dec-2004,...,filing_type_31-Dec-2005,filing_type_31-Dec-2004,filing_type_31-Dec-2003,filing_type_31-Dec-2002,filing_type_31-Dec-2001,filing_type_31-Dec-2000,filing_type_31-Dec-1999,filing_type_31-Dec-1998,filing_type_31-Dec-1997,company_name
0,Étoile Gestion,Investment Advisor,,,,,,,,0.0,...,,Aggregate MFs,,,,,,,,Aalberts
1,WestLB Mellon Asset Management Kapitalanlagege...,Investment Advisor,0.03,0.05,0.03,0.14,0.17,0.05,0.06,0.07,...,Aggregate MFs,Aggregate MFs,,,Aggregate MFs,Aggregate MFs,,Aggregate MFs,Aggregate MFs,Aalberts
2,Wealth Management Partners NV,Investment Advisor,,,,,,,,,...,,,,,Other Substantial/Declarable,Other Substantial/Declarable,,,,Aalberts
3,Waddell & Reed Investment Management Company,Investment Advisor,,,,0.0,0.22,,,,...,,,,,,,,,,Aalberts
4,W & W Asset Management GmbH,Investment Advisor,0.0,,,,,,,,...,,,,,,,,,,Aalberts
5,"Vontobel Asset Management, Inc.",Investment Advisor/Hedge Fund,,,,0.0,0.28,,,,...,,,,,,,,,,Aalberts
6,"Vanguard Group, Inc.",Investment Advisor,0.41,0.25,0.02,,,,,,...,,,,,,,,,,Aalberts
7,VPB Finance S.A.,Investment Advisor,,,0.0,0.0,,,,,...,,,,,,,,,,Aalberts
8,Universal-Investment-Gesellschaft mbH,Investment Advisor/Hedge Fund,0.01,,0.0,0.34,0.29,0.57,0.63,0.15,...,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,,Aggregate MFs,Aggregate MFs,,,Aalberts
9,Union Investment Group,Investment Advisor,0.0,0.04,0.11,,,,,,...,,,,,,,,,,Aalberts


In [5]:
# Build one company-investor graph per perc_os column (one per reporting year)
# and store each graph as a GraphML file.
NETWORK_DIR = "./transformed_data/connected_shareholder_networks"

# Make sure the output folder exists before the first graph is written
os.makedirs(NETWORK_DIR, exist_ok=True)

# get all perc_os columns in the dataframe
perc_os_columns = [col for col in df_merged.columns if 'perc_os' in col]

# for each column in perc_os_columns
for col in perc_os_columns:

    # Get year from the column name (its last four characters)
    year = col[-4:]

    # Define new Graph
    G = nx.Graph()

    # Select the rows that have a value in this column once, instead of
    # testing every row of the frame inside a Python-level iterrows() loop.
    # Row order is unchanged, so nodes and edges are added in the same order.
    holdings = df_merged.loc[df_merged[col].notna(), ['company_name', 'investor_name', col]]

    for company_name, investor_name, perc_os in holdings.itertuples(index=False, name=None):
        # Add the company and investor to the graph.
        # NOTE(review): nodes are keyed by name only, so an investor whose name
        # equals a company name would share that node and overwrite its 'type'
        # -- confirm this cannot happen in the data.
        G.add_node(company_name, type='company')
        G.add_node(investor_name, type='investor')

        # Add an edge between the company and investor with the perc_os value
        # as weight (a repeated company/investor pair keeps the last weight)
        G.add_edge(company_name, investor_name, weight=perc_os)

    # Get the number of nodes and edges in the graph
    num_nodes = len(G.nodes)
    num_edges = len(G.edges)

    # Print the number of nodes and edges
    print(f"Year: {year}, Number of nodes: {num_nodes}, Number of edges: {num_edges}")

    nx.write_graphml(G, f"{NETWORK_DIR}/connected_shareholder_network_{year}.graphml", named_key_ids=True, infer_numeric_types=True)

    print(f"Graph saved as graphml file for year {year}")

print("---Finished---")


Year: 2011, Number of nodes: 4531, Number of edges: 23494
Graph saved as graphml file for year 2011
Year: 2010, Number of nodes: 4756, Number of edges: 25037
Graph saved as graphml file for year 2010
Year: 2009, Number of nodes: 4743, Number of edges: 25607
Graph saved as graphml file for year 2009
Year: 2008, Number of nodes: 4538, Number of edges: 26401
Graph saved as graphml file for year 2008
Year: 2007, Number of nodes: 4423, Number of edges: 26198
Graph saved as graphml file for year 2007
Year: 2006, Number of nodes: 4353, Number of edges: 22368
Graph saved as graphml file for year 2006
Year: 2005, Number of nodes: 4124, Number of edges: 21055
Graph saved as graphml file for year 2005
Year: 2004, Number of nodes: 3602, Number of edges: 19783
Graph saved as graphml file for year 2004
Year: 2003, Number of nodes: 3351, Number of edges: 19690
Graph saved as graphml file for year 2003
Year: 2002, Number of nodes: 3286, Number of edges: 19786
Graph saved as graphml file for year 2002


In [6]:
# Read every stored GraphML network back into memory. The two lists are
# parallel: position i of the year list belongs to position i of the graph list.
NETWORK_DIR = "./transformed_data/connected_shareholder_networks"

connected_shareholders_graph_list = []
connected_shareholders_year_list = []

# os.listdir() returns entries in arbitrary order; sorting the file names makes
# the order of both lists (and of everything derived from them) reproducible.
for file in sorted(os.listdir(NETWORK_DIR)):
    if not file.endswith(".graphml"):
        continue

    # create new graph from .graphml file
    G = nx.read_graphml(f"{NETWORK_DIR}/{file}")
    # add graph to list
    connected_shareholders_graph_list.append(G)
    # add year to list: the text between the last underscore and the first
    # following dot of the file name
    connected_shareholders_year_list.append(file.split("_")[-1].split(".")[0])

print(f"Number of graphs read: {len(connected_shareholders_graph_list)}")

Number of graphs read: 15


In [7]:
# Compute whole-network statistics for every yearly graph and save them to Excel.
STATS_DIR = "./transformed_data/connected_shareholder_data"

# Make sure the output folder exists before writing
os.makedirs(STATS_DIR, exist_ok=True)

# Edge attribute used for BOTH community detection and modularity scoring.
# The two networkx functions have different defaults (None vs. 'weight'), so
# leaving them implicit scores an unweighted partition with weighted edges.
# None = treat the network as unweighted, like density and path length below.
MODULARITY_WEIGHT = None

# Prepare list to collect rows
network_stats_list = []

# Walk the parallel year/graph lists together instead of looking the year up
# with list.index() for every graph
for year, graph in zip(connected_shareholders_year_list, connected_shareholders_graph_list):
    # Get the number of nodes in the graph
    num_nodes = len(graph.nodes)
    # Get the number of edges in the graph
    num_edges = len(graph.edges)
    # Calculate the density of the graph (How connected is the network)
    density = nx.density(graph)

    # Calculate modularity of the greedy-modularity partition
    partition = nx.community.greedy_modularity_communities(graph, weight=MODULARITY_WEIGHT)
    modularity = nx.community.modularity(graph, partition, weight=MODULARITY_WEIGHT)

    # Average path length
    try:
        avg_path_length = nx.average_shortest_path_length(graph)
    except nx.NetworkXError:
        avg_path_length = float('inf')  # Handle disconnected graphs

    # Add this row to the list
    network_stats_list.append({
        'Year': year,
        'Number of nodes': num_nodes,
        'Number of edges': num_edges,
        'Density': density,
        'Modularity': modularity,
        'Average path length': avg_path_length
    })

    print(f"Year: {year} finished")

# Convert to DataFrame at the end
network_stats_df = pd.DataFrame(network_stats_list)

# Save to Excel
network_stats_df.to_excel(f"{STATS_DIR}/shareholder_network_stats.xlsx", index=False)

print("---Finished---")

Year: 1997 finished
Year: 1998 finished
Year: 1999 finished
Year: 2000 finished
Year: 2001 finished
Year: 2002 finished
Year: 2003 finished
Year: 2004 finished
Year: 2005 finished
Year: 2006 finished
Year: 2007 finished
Year: 2008 finished
Year: 2009 finished
Year: 2010 finished
Year: 2011 finished
---Finished---


In [13]:
def _investor_measures(graph, year):
    """Yield one record of edge-weight statistics per investor node of `graph`.

    Every node must carry a 'type' attribute and every edge of an investor a
    'weight' attribute; nodes whose type is not 'investor' are skipped.
    """
    for node_name, attributes in graph.nodes(data=True):
        node_type = attributes['type']
        if node_type != 'investor':
            continue

        # Number of companies the investor is invested in
        degree = graph.degree(node_name)

        # Weights of all edges attached to this investor, in neighbor order
        edge_weights = [edge_data['weight'] for edge_data in graph[node_name].values()]

        # Sum, average and maximum of those weights (0 for an isolated node)
        weight_sum = sum(edge_weights)
        if degree > 0:
            avg_weight = weight_sum / degree
            max_weight = max(edge_weights)
        else:
            avg_weight = 0
            max_weight = 0

        # Centrality measures (degree, closeness, betweenness, eigenvector,
        # harmonic, Katz) are not computed here: the per-node calls that did so
        # took way too long and were disabled.
        yield {
            'year': year,
            'node_name': node_name,
            'node_type': node_type,
            'degree': degree,
            'weight_sum': weight_sum,
            'avg_weight': avg_weight,
            'max_weight': max_weight,
        }


# Collect the investor statistics of every yearly graph
measure_list = []

for year, graph in zip(connected_shareholders_year_list, connected_shareholders_graph_list):
    measure_list.extend(_investor_measures(graph, year))

    print(f"Year: {year} finished")

df = pd.DataFrame(measure_list)

# Save to Excel
df.to_excel("./transformed_data/connected_shareholder_data/shareholder_node_stats.xlsx", index=False)


Year: 1997 finished
Year: 1998 finished
Year: 1999 finished
Year: 2000 finished
Year: 2001 finished
Year: 2002 finished
Year: 2003 finished
Year: 2004 finished
Year: 2005 finished
Year: 2006 finished
Year: 2007 finished
Year: 2008 finished
Year: 2009 finished
Year: 2010 finished
Year: 2011 finished
