In [1]:
import networkx as nx
from pyvis.network import Network
import pandas as pd
import math
import matplotlib.pyplot as plt
import os

Read all transformed shareholder files

In [2]:
# Create list to hold shareholder dataframes
shareholder_dfs = []

# for each file in directory
for filename in os.listdir('./transformed_data/base/shareholders'):
    # check if file is a .xlsx file
    if filename.endswith('.xlsx'):
        # read the file into a dataframe
        df = pd.read_excel(f'./transformed_data/base/shareholders/{filename}')
        # add the dataframe to the list
        shareholder_dfs.append(df)

# print number of dataframes, should be equal to number of files -> should be 61
print(f"Number of dataframes: {len(shareholder_dfs)}")

Number of dataframes: 61


Create a graph for every year in every file

In [3]:
for sh_df in shareholder_dfs:
    # Get all columns with perc_os in the name
    perc_os_columns = sh_df.columns[sh_df.columns.str.contains('perc_os')]
    
    # Get the name of the company e.g. ABB (Take 0 index because we only have one company per file -> its the same for all rows)
    company_name = sh_df["company_name"][0]
    
    # Print the name of the company to see progress
    print(company_name)

    # Create a graph for each column with perc_os in the name
    for column in perc_os_columns:
        G = nx.Graph()

        for _, row in df.iterrows():

            investor = row["investor_name"] # name of the investor
            company = row["company_name"] # redundant, but for clarity
            weight = row[column] # percentage ownership

            # Check if weight (perc_os) is not null
            if not pd.isna(weight):  
                G.add_node(company, type="Company") 
                G.add_node(investor, type="Investor")
                G.add_edge(investor, company, weight=weight)
        
        # Save the graph as a graphml file
        nx.write_graphml(G, f"./transformed_data/shareholder_networks/shareholder_network_{company_name}_{str.replace(column, 'perc_os_', '')}.graphml", named_key_ids=True, infer_numeric_types=True)

print("---Finished---")

Aalberts
ABB
Akzo Nobel
Alstom SA
Areva
Asahi
AU Optronics
BAM
Bayern
Boliden
British Airways
Cathay Pacific
Chemtura
Chimei
chiquita
Chungwa
Commerzbank
del monte
Dow
Elpida
ENI
EON
Exxon Mobil
Fuji Electric
Fujifilm
GDF suez
Hannstar Display
Henkel
hitachi ltd
Hitachi Maxell
ICI
IMI PLC
Infineon
LG Display
Micron
Mitsubishi
Mueller Industries
Nanya Tech
NEC
Nippon electric glass
Panasonic
Pilkington
procter gamble
Qantas
rautaruukki
Repsol YPF
samsung
SAS AB
Siemens
Singapore Airlines
SKW Stahl
Sony
Toshiba
Total
Unilever NV
Unilever PLC
Unipetrol
United technologies corp
Uralita
Whirlpool
Zeon
---Finished---


Calculate the herfindal-hirschmann index and the top inverstors for every year in every file 

In [4]:
# for each graph-file in directory
for filename in os.listdir('./transformed_data/shareholder_networks/'):

    # check if file is a .graphml file
    if filename.endswith('.graphml'):

        # read the file into a graph
        G = nx.read_graphml(f'./transformed_data/shareholder_networks/{filename}')

        # get the company name from the filename
        company_name = filename.split('_')[2]

        # get the year from the filename
        year = filename.split('.')[0][-4:]
        
        print(f"Calculating metrics for {company_name} in {year}")

        # get the weights of the edges
        weights = [data['weight'] for _, _, data in G.edges(data=True)]

        # calculate the herfindahl index and round it to 2 decimal places
        herfindahl_index = round(sum([w**2 for w in weights]), 2)

        # calculate the centralities for the company node
        company_node = [n for n, d in G.nodes(data=True) if d['type'] == 'Company'][0]
        degree_centrality = nx.degree_centrality(G)[company_node]
        closeness_centrality = nx.closeness_centrality(G)[company_node]
        betweenness_centrality = nx.betweenness_centrality(G)[company_node]
        eigenvector_centrality = nx.eigenvector_centrality(G)[company_node]
        pagerank_centrality = nx.pagerank(G)[company_node]
        # number of investors
        num_investors = len([n for n, d in G.nodes(data=True) if d['type'] == 'Investor'])

        


Calculating metrics for Aalberts in 1997
Calculating metrics for Aalberts in 1998
Calculating metrics for Aalberts in 1999
Calculating metrics for Aalberts in 2000
Calculating metrics for Aalberts in 2001
Calculating metrics for Aalberts in 2002
Calculating metrics for Aalberts in 2003
Calculating metrics for Aalberts in 2004
Calculating metrics for Aalberts in 2005
Calculating metrics for Aalberts in 2006
Calculating metrics for Aalberts in 2007
Calculating metrics for Aalberts in 2008
Calculating metrics for Aalberts in 2009
Calculating metrics for Aalberts in 2010
Calculating metrics for Aalberts in 2011
Calculating metrics for ABB in 1997
Calculating metrics for ABB in 1998
Calculating metrics for ABB in 1999
Calculating metrics for ABB in 2000
Calculating metrics for ABB in 2001
Calculating metrics for ABB in 2002
Calculating metrics for ABB in 2003
Calculating metrics for ABB in 2004
Calculating metrics for ABB in 2005
Calculating metrics for ABB in 2006
Calculating metrics for A

In [5]:
# Liste für die finalen Ergebnisse
top_investors_list = []

herfindal_list = []

for sh_df in shareholder_dfs:
    # Alle Spalten mit 'perc_os' (also die Ownership-Anteile)
    perc_os_columns = sh_df.columns[sh_df.columns.str.contains('perc_os')]

    # Convert all perc_os columns to numeric
    sh_df[perc_os_columns] = sh_df[perc_os_columns].apply(pd.to_numeric, errors='coerce')

    # Get the name of the company e.g. ABB (Take 0 index because we only have one company per file -> its the same for all rows)
    company_name = sh_df["company_name"][0]

    for column in perc_os_columns:
        year = column[-4:]

        
        ###################################### Top 5 Investors per year ######################################
        # Top 5 Investoren für das Jahr
        top5 = df.nlargest(5, column)[["investor_name", column]].reset_index(drop=True)

        for rank in range(5):
            try:
                investor = top5.loc[rank, "investor_name"]
                perc = top5.loc[rank, column]

                if pd.isna(perc):
                    continue  # Falls kein Wert vorhanden ist, überspringen

                # Ergebniszeile
                result = {
                    "company_name": company_name,
                    "year": int(year),
                    "rank": rank + 1,
                    "investor_name": investor,
                    "perc_os": perc
                }

                top_investors_list.append(result)

            except IndexError:
                # Weniger als 5 Investoren
                continue

        ###################################### Top 5 Investors per year ######################################

        ###################################### Herfindahl Index ######################################

        # Get all values in column
        values = df[column].values

        # calculate herfindahl index
        herfindahl = sum([math.pow(value, 2) for value in values if not math.isnan(value)])

        # get amount of investors for this year
        amount_investors = len(values) - sum([math.isnan(value) for value in values])

        result_herfindahl = {
            "company_name": company_name,
            "year": int(year),
            "herfindahl_index": herfindahl,
            "amount_investors": amount_investors
        }

        herfindal_list.append(result_herfindahl)

        ###################################### Herfindahl Index ######################################


###################################### Top 5 Investors per year ######################################
# In DataFrame umwandeln
top_investors_df = pd.DataFrame(top_investors_list)

# Sortieren nach Unternehmen, Jahr, Rang
top_investors_df = top_investors_df.sort_values(by=["company_name", "year", "rank"])

top_investors_df.to_excel("transformed_data/shareholder_data/top_investors.xlsx", index=False)
###################################### Top 5 Investors per year ######################################

###################################### Herfindahl Index ######################################

herfindahl_df = pd.DataFrame(herfindal_list)

# Sortieren nach Unternehmen, Jahr
herfindahl_df = herfindahl_df.sort_values(by=["company_name", "year"])

herfindahl_df.to_excel("transformed_data/shareholder_data/herfindahl_index.xlsx", index=False)

###################################### Herfindahl Index ######################################


Combine Shareholder Files with cartel file (ignoring the years) ((Experimentell))

In [6]:
# read mapping file
mapping_df = pd.read_excel("raw_data/mapping/filename_to_entity_mapping.xlsx")

# define new column, if entity_name_direct_match is null then use entity_name_indirect_match_1
mapping_df['company_name'] = mapping_df['entity_name_direct_match'].combine_first(mapping_df['entity_name_indirect_match_1'])

# read cartel file
cartel_df = pd.read_excel("raw_data/cartel_connections/Cartels.xls", sheet_name='Data_on_all_cartelfirms')

# Drop columns that are not needed
cartel_df = cartel_df[['Case:', 'Cartel:', 'Entity Name', 'Start:', 'End:', 'Duration', 'Sector', 'Subsector', 'CartelClassification']]
# rename columns
cartel_df.columns = ['case', 'cartel_name', 'entity_name', 'start', 'end', 'duration', 'sector', 'subsector', 'cartel_classification']

# Merge cartel_df with mapping_df on cartel_name
cartel_df = cartel_df.merge(mapping_df, left_on="entity_name", right_on="company_name", how="left")

# Drop columns that are not needed
cartel_df = cartel_df[['case', 'cartel_name', 'entity_name', 'company_name', 'file_name', 'start', 'end', 'duration', 'sector', 'subsector', 'cartel_classification']]

cartel_df.head(10)

FileNotFoundError: [Errno 2] No such file or directory: 'raw_data/cartel_connections/Cartels.xls'

In [None]:
cartel_df = cartel_df.merge(df_merged, left_on="company_name", right_on="company_name", how="left")

cartel_df.head(10)

Unnamed: 0,case,cartel_name,entity_name,company_name,file_name,start,end,duration,sector,subsector,...,filing_type_31-Dec-2006,filing_type_31-Dec-2005,filing_type_31-Dec-2004,filing_type_31-Dec-2003,filing_type_31-Dec-2002,filing_type_31-Dec-2001,filing_type_31-Dec-2000,filing_type_31-Dec-1999,filing_type_31-Dec-1998,filing_type_31-Dec-1997
0,31865,PVC (II),Elf Aquitaine SA,,,1981,1994,14,3,3,...,,,,,,,,,,
1,31865,PVC (II),BASF SE,,,1981,1994,14,3,3,...,,,,,,,,,,
2,31865,PVC (II),Koninklijke DSM,,,1981,1994,14,3,3,...,,,,,,,,,,
3,31865,PVC (II),ENI,ENI,ENI,1981,1994,14,3,3,...,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs
4,31865,PVC (II),ENI,ENI,ENI,1981,1994,14,3,3,...,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,,,,,
5,31865,PVC (II),ENI,ENI,ENI,1981,1994,14,3,3,...,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,,,,
6,31865,PVC (II),ENI,ENI,ENI,1981,1994,14,3,3,...,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,Aggregate MFs,,,,,
7,31865,PVC (II),ENI,ENI,ENI,1981,1994,14,3,3,...,Aggregate MFs,,,,,,,,,
8,31865,PVC (II),ENI,ENI,ENI,1981,1994,14,3,3,...,Aggregate MFs,Aggregate MFs,,,,,,,,
9,31865,PVC (II),ENI,ENI,ENI,1981,1994,14,3,3,...,,,,,,,,,,


In [None]:
# count rows in df_cartel
print(f"Number of rows in cartel_df: {len(cartel_df)}")

Number of rows in cartel_df: 39745


In [None]:
# build a network ignoring the years
G_complete = nx.Graph()

# add nodes and edges
for _, row in cartel_df.iterrows():
    cartel_name = row['cartel_name']
    entity_name = row['entity_name']
    investor_name = row['investor_name']

    G_complete.add_node(cartel_name, type="Cartel")
    G_complete.add_node(entity_name, type="Entity")
    G_complete.add_node(investor_name, type="Investor")

    G_complete.add_edge(cartel_name, entity_name)
    G_complete.add_edge(entity_name, investor_name)

# Save the graph as a graphml file
nx.write_graphml(G_complete, f"./transformed_data/tests/complete_cartel_network.graphml", named_key_ids=True, infer_numeric_types=True)

In [None]:
# get all columns with perc_os in the name
perc_os_columns = cartel_df.columns[cartel_df.columns.str.contains('perc_os')]

# for each column in perc_os_columns
for column in perc_os_columns:
    # Create a graph for each column with perc_os in the name
    G = nx.Graph()

    for _, row in cartel_df.iterrows():
        cartel_name = row["cartel_name"]
        entity_name = row["entity_name"]
        investor_name = row["investor_name"]
        weight = row[column]

        # Check if weight is not null
        if not pd.isna(weight):  
            G.add_node(cartel_name, type="Cartel")
            G.add_node(entity_name, type="Entity")
            G.add_node(investor_name, type="Investor")
            
            G.add_edge(cartel_name, entity_name)
            G.add_edge(entity_name, investor_name, weight=weight)
    
    # Save the graph as a graphml file
    nx.write_graphml(G, f"./transformed_data/tests/complete_cartel_network_{str.replace(column, 'perc_os_', '')}.graphml", named_key_ids=True, infer_numeric_types=True)



In [None]:
#get degrees for complete_cartel_network
G_complete = nx.read_graphml(f"./transformed_data/tests/complete_cartel_network.graphml")
degrees = G_complete.degree()
degrees = dict(degrees)
degrees_df = pd.DataFrame(degrees.items(), columns=['node', 'degree'])
degrees_df['type'] = degrees_df['node'].apply(lambda x: G_complete.nodes[x]['type'] if x in G_complete.nodes else None)

# order by degree
degrees_df = degrees_df.sort_values(by=['degree'], ascending=False)

# filter by type
degrees_df = degrees_df[degrees_df['type'].isin(['Investor'])]

degrees_df.head(100)

Unnamed: 0,node,degree,type
2,,201,Investor
201,State Street Global Advisors (France) S.A.,27,Investor
77,Universal-Investment-Gesellschaft mbH,27,Investor
1414,BNP Paribas Investment Partners (France),26,Investor
546,Mellon Capital Management Corporation,26,Investor
...,...,...,...
308,Robeco Institutional Asset Management B.V.,22,Investor
985,Fidelity Management & Research Company,22,Investor
667,Kleinwort Benson Investors Dublin Ltd,22,Investor
380,Pioneer Investment Management Ltd.,22,Investor
