In [15]:
import math
import os

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from pyvis.network import Network

In [16]:
# Exploratory pass on a single workbook (ABB) to settle the cleaning steps.
# The last 3 rows are sliced off right after loading (presumably footer rows
# rather than investor records - confirm against the raw sheet).
df = pd.read_excel('raw_data/company_shareholders/ABB.xlsx').iloc[:-3]

# Remove rows, then columns, that contain no data at all.
df_cleaned = df.dropna(how='all').dropna(axis=1, how='all')

# Headers of the ownership-percentage columns and of the filing-type columns.
is_percent_header = df_cleaned.columns.str.contains('% O/S')
is_filing_header = df_cleaned.columns.str.contains('Filing Type')
relevant_columns_percent = df_cleaned.columns[is_percent_header]
relevant_columns_filing = df_cleaned.columns[is_filing_header]

# Keep the investor identity columns followed by the two column groups above.
df_cleaned = df_cleaned[
    ['Investor Name', 'Investor Sub-Type', *relevant_columns_percent, *relevant_columns_filing]
]


def _snake_case_header(header):
    """Map a raw worksheet header to its readable column name.

    '% O/S ...' becomes 'perc_os_<last token>' and 'Filing Type ...' becomes
    'filing_type_<last token>', where the last token is whatever follows the
    final space of the header. Any other header is lower-cased with spaces
    replaced by underscores.
    """
    if '% O/S' in header:
        return f"perc_os_{header.split(' ')[-1]}"
    if 'Filing Type' in header:
        return f"filing_type_{header.split(' ')[-1]}"
    return header.replace(' ', '_').lower()


# Apply the readable names in one go.
col_list = [_snake_case_header(header) for header in df_cleaned.columns]
df_cleaned.columns = col_list

df_cleaned.head(10)

Unnamed: 0,investor_name,investor_sub-type,perc_os_31-Dec-2011,perc_os_31-Dec-2010,perc_os_31-Dec-2009,perc_os_31-Dec-2008,perc_os_31-Dec-2007,perc_os_31-Dec-2006,perc_os_31-Dec-2005,perc_os_31-Dec-2004,...,filing_type_31-Dec-2009,filing_type_31-Dec-2008,filing_type_31-Dec-2007,filing_type_31-Dec-2006,filing_type_31-Dec-2005,filing_type_31-Dec-2004,filing_type_31-Dec-2003,filing_type_31-Dec-2002,filing_type_31-Dec-2001,filing_type_31-Dec-2000
0,Ålandsbanken Asset Management Ab,Investment Advisor,,,0.0,,,,,,...,Aggregate MFs,,,,,,,,,
1,Wilmington Trust Investment Management LLC,Bank and Trust,0.0,0.0,0.0,,,,,,...,Aggregate MFs,,,,,,,,,
2,"William Blair & Company, L.L.C.",Investment Advisor/Hedge Fund,,,,,0.0,0.57,,,...,,,13F,13F,,,,,,
3,Wells Capital Management Inc.,Investment Advisor/Hedge Fund,0.0,0.07,0.07,,,,,,...,Aggregate MFs,,,,,,,,,
4,Warburg Invest Kapitalanlagegesellschaft mbH,Investment Advisor,,,0.0,0.0,,,,,...,Aggregate MFs,Aggregate MFs,,,,,,,,
5,"Vontobel Asset Management, Inc.",Investment Advisor/Hedge Fund,,,,0.0,,,,,...,,Aggregate MFs,,,,,,,,
6,Veritas Investment Trust GmbH,Investment Advisor,,,,0.17,0.17,,,,...,,Aggregate MFs,Aggregate MFs,,,,,,,
7,"Vanguard Group, Inc.",Investment Advisor,0.01,0.0,0.25,0.2,0.17,,,,...,Aggregate MFs,Aggregate MFs,Aggregate MFs,,,,,,,
8,Universal-Investment-Gesellschaft mbH,Investment Advisor/Hedge Fund,,,,0.0,0.01,,,,...,,Aggregate MFs,Aggregate MFs,,,,,,,
9,Udwadia (D E),Individual Investor,,0.0,0.0,,,,,,...,Other Substantial/Declarable,,,,,,,,,


In [18]:
# Folder holding one shareholder workbook (.xlsx) per company.
SHAREHOLDER_DIR = "raw_data/company_shareholders"


def _readable_column_name(col):
    """Return the readable name for a raw worksheet header.

    '% O/S ...' headers become 'perc_os_<date>' and 'Filing Type ...' headers
    become 'filing_type_<date>', where <date> is the last space-separated
    token of the header (e.g. '31-Dec-2011'). Every other header is
    lower-cased with spaces replaced by underscores.
    """
    if '% O/S' in col:
        return f"perc_os_{col.split(' ')[-1]}"
    if 'Filing Type' in col:
        return f"filing_type_{col.split(' ')[-1]}"
    return col.replace(' ', '_').lower()


# Get all .xlsx files in the company_shareholders folder.
# - os.listdir gives no ordering guarantee, so sort (case-insensitively) to make
#   the processing order reproducible across machines.
# - Files starting with "~$" are lock files Excel creates while a workbook is
#   open; they are not data and cannot be parsed.
files = sorted(
    (f for f in os.listdir(SHAREHOLDER_DIR) if f.endswith(".xlsx") and not f.startswith("~$")),
    key=str.lower,
)

# Print number of files
print(len(files))

# One cleaned dataframe per company
dfs = []

for f in files:
    # Company name = file name without its extension. splitext strips only the
    # trailing ".xlsx", so a company name containing a dot is kept whole
    # (f.split(".")[0] would cut it at the first dot).
    company_name = os.path.splitext(f)[0]

    # Read the file
    df = pd.read_excel(os.path.join(SHAREHOLDER_DIR, f))

    # drop last 3 rows (total, empty)
    df = df[:-3]

    # Drop rows that are entirely empty
    df_cleaned = df.dropna(how='all', axis=0)

    # Get relevant columns with % O/S in the name and filing types
    relevant_columns_percent = df_cleaned.columns[df_cleaned.columns.str.contains('% O/S')]
    relevant_columns_filing = df_cleaned.columns[df_cleaned.columns.str.contains('Filing Type')]

    # Keep only the relevant columns. The explicit copy detaches the result from
    # `df`, so adding a column below does not raise SettingWithCopyWarning, and
    # the reset index guarantees positions and labels agree (label 0 exists even
    # if the first raw row was empty and got dropped).
    df_cleaned = (
        df_cleaned[['Investor Name', 'Investor Sub-Type'] + list(relevant_columns_percent) + list(relevant_columns_filing)]
        .copy()
        .reset_index(drop=True)
    )

    # Rename % O/S -> perc_os_{date}, Filing Type -> filing_type_{date},
    # everything else -> lower snake case
    df_cleaned.columns = [_readable_column_name(col) for col in df_cleaned.columns]

    # add the company name to the dataframe
    df_cleaned["company_name"] = company_name

    # print the company name to see progress
    print(company_name)

    # append the dataframe to the list of dataframes
    dfs.append(df_cleaned)

print("---Finished---")

61
Aalberts
ABB
Akzo Nobel
Alstom SA
Areva
Asahi
AU Optronics
BAM
Bayern
Boliden
British Airways
Cathay Pacific
Chemtura
Chimei
chiquita
Chungwa
Commerzbank
del monte
Dow
Elpida
ENI
EON
Exxon Mobil
Fuji Electric
Fujifilm
GDF suez
Hannstar Display
Henkel
hitachi ltd
Hitachi Maxell
ICI
IMI PLC
Infineon
LG Display
Micron
Mitsubishi
Mueller Industries
Nanya Tech
NEC
Nippon electric glass
Panasonic
Pilkington
procter gamble
Qantas
rautaruukki
Repsol YPF
samsung
SAS AB
Siemens
Singapore Airlines
SKW Stahl
Sony
Toshiba
Total
Unilever NV
Unilever PLC
Unipetrol
United technologies corp
Uralita
Whirlpool
Zeon
---Finished---


In [19]:
# Sanity check: there should be exactly one dataframe per input file
print(f"Number of dataframes: {len(dfs)}")

# Stack the per-company frames into one long table
df_merged = pd.concat(dfs)

# Distinct investor and company names found in the merged table
investor_names, cartel_names = (
    df_merged[key].unique() for key in ('investor_name', 'company_name')
)

# Report how many distinct names there are
print(f"Number of unique investors: {len(investor_names)}")
print(f"Number of unique cartels: {len(cartel_names)}")

# Rows per investor in the merged table, largest first. Each row is one
# investor entry in one company file, so this counts an investor's entries
# across companies.
investor_counts = (
    df_merged['investor_name']
    .value_counts()
    .sort_values(ascending=False)
)

print(investor_counts)

Number of dataframes: 61
Number of unique investors: 8246
Number of unique cartels: 61
investor_name
Universal-Investment-Gesellschaft mbH                58
Dimensional Fund Advisors, LP                        56
TIAA-CREF                                            56
State Street Global Advisors (France) S.A.           56
Vanguard Group, Inc.                                 56
                                                     ..
Geisler (James E)                                     1
Garnier (Jean-Pierre)                                 1
Faraci (John V)                                       1
Euroamérica Administradora General de Fondos S.A.     1
David (George A L)                                    1
Name: count, Length: 8246, dtype: int64


In [20]:
# Folder that receives one GraphML file per company and filing date.
# Created up front so the export also works on a fresh checkout.
network_dir = "./transformed_data/shareholder_networks"
os.makedirs(network_dir, exist_ok=True)

for df in dfs:
    # A frame without rows has no company name and nothing to export.
    if df.empty:
        continue

    # Get all columns with perc_os in the name
    perc_os_columns = df.columns[df.columns.str.contains('perc_os')]

    # Get the name of the company e.g. ABB.
    # Positional access (.iloc) instead of the label lookup [0]: the row labelled
    # 0 is missing whenever the first raw row was dropped during cleaning, which
    # would raise a KeyError.
    name = df["company_name"].iloc[0]

    # Print the name of the company to see progress
    print(name)

    # Create one investor-company graph per perc_os column (i.e. per filing date)
    for column in perc_os_columns:
        G = nx.Graph()

        # Only rows with a value in this column produce an edge. Iterating over
        # just the three needed columns avoids building a full row Series for
        # every investor on every date (what iterrows() did).
        holdings = df[["investor_name", "company_name", column]].dropna(subset=[column])

        for investor, firm, weight in holdings.itertuples(index=False, name=None):
            G.add_node(firm, type="Company")
            G.add_node(investor, type="Investor")
            G.add_edge(investor, firm, weight=weight)

        # Save the graph as a graphml file named after the company and the date
        # part of the column name
        date_label = column.replace('perc_os_', '')
        nx.write_graphml(
            G,
            f"{network_dir}/shareholder_network_{name}_{date_label}.graphml",
            named_key_ids=True,
            infer_numeric_types=True,
        )

print("---Finished---")

Aalberts
ABB
Akzo Nobel
Alstom SA
Areva
Asahi
AU Optronics
BAM
Bayern
Boliden
British Airways
Cathay Pacific
Chemtura
Chimei
chiquita
Chungwa
Commerzbank
del monte
Dow
Elpida
ENI
EON
Exxon Mobil
Fuji Electric
Fujifilm
GDF suez
Hannstar Display
Henkel
hitachi ltd
Hitachi Maxell
ICI
IMI PLC
Infineon
LG Display
Micron
Mitsubishi
Mueller Industries
Nanya Tech
NEC
Nippon electric glass
Panasonic
Pilkington
procter gamble
Qantas
rautaruukki
Repsol YPF
samsung
SAS AB
Siemens
Singapore Airlines
SKW Stahl
Sony
Toshiba
Total
Unilever NV
Unilever PLC
Unipetrol
United technologies corp
Uralita
Whirlpool
Zeon


In [21]:
# Result rows (one dict per row) for the two output tables
top_investors_list = []

herfindal_list = []

# Column layout of the two output tables. Passing it to pd.DataFrame keeps the
# columns present even when no rows were collected, so sorting cannot fail on
# a missing column.
TOP_INVESTOR_COLUMNS = ["company_name", "year", "rank", "investor_name", "perc_os"]
HERFINDAHL_COLUMNS = ["company_name", "year", "herfindahl_index", "amount_investors"]

# Folder that receives both Excel files; created if it does not exist yet
shareholder_data_dir = "transformed_data/shareholder_data"
os.makedirs(shareholder_data_dir, exist_ok=True)

for df in dfs:
    # All columns containing 'perc_os' (the ownership shares)
    perc_os_columns = df.columns[df.columns.str.contains('perc_os')]

    # Convert to numeric; values that cannot be parsed become NaN.
    # This updates the frames stored in `dfs` in place.
    df[perc_os_columns] = df[perc_os_columns].apply(pd.to_numeric, errors='coerce')

    # A frame without rows has no company name and contributes no results
    if df.empty:
        continue

    # Name of the company
    company_name = df["company_name"].iloc[0]

    for column in perc_os_columns:
        # The year is the last four characters of the column name.
        # NOTE(review): two filing dates in the same year for one company would
        # produce rows with the same (company, year) - confirm that each file
        # has at most one date per year.
        year = int(column[-4:])

        ###################################### Top 5 Investors per year ######################################
        # Up to five largest holders for this date. nlargest can pad its result
        # with NaN rows when fewer than five values exist; those sit at the end,
        # so dropping them leaves the ranks of the real values unchanged.
        top5 = df.nlargest(5, column)[["investor_name", column]].dropna(subset=[column])

        # Iterate over the rows that actually exist. The previous
        # `top5.loc[rank, ...]` lookup raised KeyError (not the IndexError that
        # was being caught) for companies with fewer than five investors.
        for rank, (investor, perc) in enumerate(top5.itertuples(index=False, name=None), start=1):
            top_investors_list.append({
                "company_name": company_name,
                "year": year,
                "rank": rank,
                "investor_name": investor,
                "perc_os": perc
            })

        ###################################### Top 5 Investors per year ######################################

        ###################################### Herfindahl Index ######################################

        # All reported (non-missing) shares for this date
        shares = df[column].dropna()

        # Herfindahl index: sum of the squared shares
        herfindahl = sum(math.pow(value, 2) for value in shares)

        # Number of investors with a reported share for this date
        amount_investors = len(shares)

        herfindal_list.append({
            "company_name": company_name,
            "year": year,
            "herfindahl_index": herfindahl,
            "amount_investors": amount_investors
        })

        ###################################### Herfindahl Index ######################################


###################################### Top 5 Investors per year ######################################

# Convert to a DataFrame
top_investors_df = pd.DataFrame(top_investors_list, columns=TOP_INVESTOR_COLUMNS)

# Sort by company, year, rank
top_investors_df = top_investors_df.sort_values(by=["company_name", "year", "rank"])

top_investors_df.to_excel("transformed_data/shareholder_data/top_investors.xlsx", index=False)
###################################### Top 5 Investors per year ######################################

###################################### Herfindahl Index ######################################

herfindahl_df = pd.DataFrame(herfindal_list, columns=HERFINDAHL_COLUMNS)

# Sort by company, year
herfindahl_df = herfindahl_df.sort_values(by=["company_name", "year"])

herfindahl_df.to_excel("transformed_data/shareholder_data/herfindahl_index.xlsx", index=False)

###################################### Herfindahl Index ######################################
