Import relevant libraries for network and data handling / transformation

In [1]:
import networkx as nx
import pandas as pd
import os
from networkx import PowerIterationFailedConvergence

Read all transformed shareholder files

In [2]:
# Directory that holds one transformed shareholder workbook per company
shareholder_dir = './transformed_data/base/shareholders'

# Create list to hold shareholder dataframes
shareholder_dfs = []

# os.listdir returns entries in arbitrary order; sorting (case-insensitively)
# makes the order of shareholder_dfs - and of everything derived from it -
# reproducible across machines and runs.
for filename in sorted(os.listdir(shareholder_dir), key=str.lower):
    # Keep only real workbooks: skip non-.xlsx entries and the "~$..." lock
    # files Excel leaves next to a workbook while it is open.
    if not filename.endswith('.xlsx') or filename.startswith('~$'):
        continue

    # read the file into a dataframe and add it to the list
    shareholder_dfs.append(pd.read_excel(f'{shareholder_dir}/{filename}'))

# print number of dataframes, should be equal to number of files -> should be 61
print(f"Number of dataframes: {len(shareholder_dfs)}")

Number of dataframes: 61


Create a graph for every year in every file

In [3]:
# Directory the per-company, per-year graphs are written to.
network_dir = './transformed_data/shareholder_networks'

# write_graphml does not create missing directories, so make sure it exists.
os.makedirs(network_dir, exist_ok=True)

for sh_df in shareholder_dfs:
    # An empty workbook has no company name and no edges: nothing to write.
    if sh_df.empty:
        continue

    # Get all columns with perc_os in the name (one column per year)
    perc_os_columns = sh_df.columns[sh_df.columns.str.contains('perc_os')]

    # Name of the company, e.g. ABB. There is only one company per file, so the
    # first row is representative. Use positional access (.iloc) so this does
    # not depend on the frame having a row labelled 0.
    company_name = sh_df["company_name"].iloc[0]

    # Print the name of the company to see progress
    print(company_name)

    # Create a graph for each column with perc_os in the name
    for column in perc_os_columns:
        G = nx.Graph()

        # Coerce to numbers on a local copy (sh_df itself is not modified here),
        # the same way the top-investor cell does. Non-numeric entries become
        # NaN and are skipped, so every edge weight written to the file is numeric.
        weights = pd.to_numeric(sh_df[column], errors='coerce')
        has_weight = weights.notna()

        # Only rows with an ownership percentage contribute nodes and edges.
        edge_rows = zip(
            sh_df.loc[has_weight, "investor_name"],
            sh_df.loc[has_weight, "company_name"],
            weights[has_weight],
        )
        for investor, company, weight in edge_rows:
            G.add_node(company, type="Company")
            G.add_node(investor, type="Investor")
            G.add_edge(investor, company, weight=weight)

        # Save the graph as a graphml file. A year without any ownership data
        # still produces an (empty) graph file, as before.
        nx.write_graphml(G, f"{network_dir}/shareholder_network_{company_name}_{str.replace(column, 'perc_os_', '')}.graphml", named_key_ids=True, infer_numeric_types=True)

print("---Finished---")

Aalberts
ABB
Akzo Nobel
Alstom SA
Areva
Asahi
AU Optronics
BAM
Bayern
Boliden
British Airways
Cathay Pacific
Chemtura
Chimei
chiquita
Chungwa
Commerzbank
del monte
Dow
Elpida
ENI
EON
Exxon Mobil
Fuji Electric
Fujifilm
GDF suez
Hannstar Display
Henkel
hitachi ltd
Hitachi Maxell
ICI
IMI PLC
Infineon
LG Display
Micron
Mitsubishi
Mueller Industries
Nanya Tech
NEC
Nippon electric glass
Panasonic
Pilkington
procter gamble
Qantas
rautaruukki
Repsol YPF
samsung
SAS AB
Siemens
Singapore Airlines
SKW Stahl
Sony
Toshiba
Total
Unilever NV
Unilever PLC
Unipetrol
United technologies corp
Uralita
Whirlpool
Zeon
---Finished---


Calculate the Herfindahl-Hirschman index and the centrality metrics for each file and each year

In [4]:
def _company_score(centrality_func, graph, node):
    """Return ``centrality_func(graph)[node]``, or None if the solver does not converge.

    eigenvector_centrality, katz_centrality and pagerank are computed by power
    iteration and raise PowerIterationFailedConvergence when they do not
    converge. Guarding each call on its own means one failing measure no longer
    discards the others or aborts the whole run.
    """
    try:
        return centrality_func(graph)[node]
    except PowerIterationFailedConvergence:
        return None


network_dir = './transformed_data/shareholder_networks/'
metrics_path = './transformed_data/shareholder_data/company_metrics.xlsx'

# Graph files are named "shareholder_network_<company>_<year>.graphml"
file_prefix = 'shareholder_network_'
file_suffix = '.graphml'

company_metrics = []

# for each graph-file in directory (sorted: os.listdir order is arbitrary)
for filename in sorted(os.listdir(network_dir), key=str.lower):

    # only handle files that follow the naming scheme used when writing the graphs
    if not (filename.startswith(file_prefix) and filename.endswith(file_suffix)):
        continue

    # read the file into a graph
    G = nx.read_graphml(f'{network_dir}{filename}')

    # Strip the fixed prefix/suffix and split on the LAST underscore, so company
    # names containing "_" or "." are recovered intact.
    stem = filename[len(file_prefix):-len(file_suffix)]
    company_name = stem.rsplit('_', 1)[0]

    # the year is the last four characters of the name (kept as a string, as before)
    year = stem[-4:]

    print(f"Calculating metrics for {company_name} in {year}")

    # get the weights of the edges
    weights = [data['weight'] for _, _, data in G.edges(data=True)]

    # calculate the herfindahl index (sum of squared weights), rounded to 2 decimal places
    herfindahl_index = round(sum(w**2 for w in weights), 2)

    # split the nodes by the "type" attribute stored in the graph file
    company_nodes = [n for n, d in G.nodes(data=True) if d.get('type') == 'Company']
    num_investors = sum(1 for _, d in G.nodes(data=True) if d.get('type') == 'Investor')

    if company_nodes:
        # calculate the centralities for the company node
        company_node = company_nodes[0]
        degree_centrality = nx.degree_centrality(G)[company_node]
        closeness_centrality = nx.closeness_centrality(G)[company_node]
        betweenness_centrality = nx.betweenness_centrality(G)[company_node]
        eigenvector_centrality = _company_score(nx.eigenvector_centrality, G, company_node)
        katz_centrality = _company_score(nx.katz_centrality, G, company_node)
        pagerank_centrality = _company_score(nx.pagerank, G, company_node)
    else:
        # no company node (e.g. a year without ownership data): no centralities
        degree_centrality = None
        closeness_centrality = None
        betweenness_centrality = None
        eigenvector_centrality = None
        katz_centrality = None
        pagerank_centrality = None

    # collect the metrics for this company and year
    company_metrics.append({
        'company_name': company_name,
        'year': year,
        'num_investors': num_investors,
        'herfindahl_index': herfindahl_index,
        'degree_centrality': degree_centrality,
        'closeness_centrality': closeness_centrality,
        'betweenness_centrality': betweenness_centrality,
        'eigenvector_centrality': eigenvector_centrality,
        'katz_centrality': katz_centrality,
        'pagerank_centrality': pagerank_centrality
    })


# create a dataframe from the list of metrics
company_metrics_df = pd.DataFrame(company_metrics)

# save the dataframe to a xlsx file (create the target directory if it is missing)
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
company_metrics_df.to_excel(metrics_path, index=False)

Calculating metrics for Aalberts in 1997
Calculating metrics for Aalberts in 1998
Calculating metrics for Aalberts in 1999
Calculating metrics for Aalberts in 2000
Calculating metrics for Aalberts in 2001
Calculating metrics for Aalberts in 2002
Calculating metrics for Aalberts in 2003
Calculating metrics for Aalberts in 2004
Calculating metrics for Aalberts in 2005
Calculating metrics for Aalberts in 2006
Calculating metrics for Aalberts in 2007
Calculating metrics for Aalberts in 2008
Calculating metrics for Aalberts in 2009
Calculating metrics for Aalberts in 2010
Calculating metrics for Aalberts in 2011
Calculating metrics for ABB in 1997
Calculating metrics for ABB in 1998
Calculating metrics for ABB in 1999
Calculating metrics for ABB in 2000
Calculating metrics for ABB in 2001
Calculating metrics for ABB in 2002
Calculating metrics for ABB in 2003
Calculating metrics for ABB in 2004
Calculating metrics for ABB in 2005
Calculating metrics for ABB in 2006
Calculating metrics for A

KeyboardInterrupt: 

Calculate the top 5 investors for each year and each file

In [5]:
# How many investors to keep per company and year
top_n = 5
top_investors_path = "transformed_data/shareholder_data/top_investors.xlsx"

# Column layout of the result; passed explicitly so the frame has these columns
# (and can be sorted) even when no investor was found at all.
result_columns = ["company_name", "year", "rank", "investor_name", "perc_os"]

# List for the top investors
top_investors_list = []

for sh_df in shareholder_dfs:
    # An empty workbook has no company name and no investors to rank.
    if sh_df.empty:
        continue

    # Get all columns with perc_os in the name
    perc_os_columns = sh_df.columns[sh_df.columns.str.contains('perc_os')]

    # Convert all perc_os columns to numeric (non-numeric entries become NaN).
    # NOTE: this updates the frames in shareholder_dfs in place, as before;
    # repeating the conversion is a no-op, so re-running the cell is safe.
    sh_df[perc_os_columns] = sh_df[perc_os_columns].apply(pd.to_numeric, errors='coerce')

    # Name of the company, e.g. ABB (one company per file). Positional access
    # (.iloc) does not depend on the frame having a row labelled 0.
    company_name = sh_df["company_name"].iloc[0]

    for column in perc_os_columns:
        year = int(column[-4:])

        # Drop rows without a value first, then take the largest holdings.
        # Iterating over the rows that actually exist handles companies with
        # fewer than top_n investors; looking up a missing rank with .loc raises
        # KeyError, which the previous IndexError guard did not catch.
        top_holders = sh_df.dropna(subset=[column]).nlargest(top_n, column)

        ranked = zip(top_holders["investor_name"], top_holders[column])
        for rank, (investor, perc) in enumerate(ranked, start=1):
            top_investors_list.append({
                "company_name": company_name,
                "year": year,
                "rank": rank,
                "investor_name": investor,
                "perc_os": perc
            })

# Transform the list into a DataFrame
top_investors_df = pd.DataFrame(top_investors_list, columns=result_columns)

# Sort the DataFrame by company_name, year, and rank
top_investors_df = top_investors_df.sort_values(by=["company_name", "year", "rank"])

# Save the result (create the target directory if it is missing)
os.makedirs(os.path.dirname(top_investors_path), exist_ok=True)
top_investors_df.to_excel(top_investors_path, index=False)