In [10]:
import pandas as pd

In [11]:
# Load the workbook sheet that lists cartels together with their member firms.
df_entities = pd.read_excel("./raw_data/cartel_connections/Cartels.xls", sheet_name='Data_on_all_cartelfirms')

# Distinct values of the "Entity Name" column.
entities = df_entities["Entity Name"].unique()

# Load the mapping workbook (one row per file name, with candidate entity-name columns).
df_mapping = pd.read_excel("./raw_data/mapping/filename_to_entity_mapping.xlsx")

# Resolve one entity name per row: keep the direct match where it is present,
# otherwise fall back to the first indirect match.
# Series.where is the vectorised form of the former row-wise apply(axis=1): it avoids a
# Python call per row and still produces a Series when the mapping file has no rows.
direct_match = df_mapping['entity_name_direct_match']
df_mapping['entity_name'] = direct_match.where(
    direct_match.notna(), df_mapping['entity_name_indirect_match_1'])

# Distinct resolved names.
# NOTE: unique() keeps NaN as a value, so rows where neither match column is filled
# contribute one `nan` entry to this array and are included in the count below.
mapping_entities = df_mapping['entity_name'].unique()

# Number of distinct resolved names (including the NaN entry, if any).
mapping_entities_count = len(mapping_entities)

print(mapping_entities)
print(mapping_entities_count)


['Aalberts Industries NV' 'ABB' 'Akzo Nobel NV' 'Alstom SA' 'AREVA T&D SA'
 'Asahi' 'AU Optronics' 'Koninklijke BAM Groep NV' nan 'Boliden AB'
 'British Airways' 'Cathay Pacific Airways' 'Chemtura'
 'Chimei InnoLux Corporation' 'Chiquita' 'Commerzbank AG' 'Del Monte '
 'Dow' 'Elpida ' 'ENI' 'Exxon Mobil' 'Fuji Electric'
 'Fujifilm Holdings Corp.' 'GDF SUEZ SA' 'HannStar Display Corporation'
 'Henkel AG & Co. ' 'Hitachi Ltd.' 'Hitachi Maxell Limited' 'ICI'
 'IMI PLC' 'Infineon' 'LG Display' 'Micron' 'Mitsubishi'
 'Mueller Industries Inc' 'Nanya' 'NEC Corporation'
 'Nippon Electric Glass' 'Panasonic Corporation' 'Pilkington'
 'The Procter & Gamble Company' 'Qantas' 'Rautaruukki' 'Repsol YPF SA'
 'Samsung' 'SAS AB' 'Siemens' 'Singapore Airlines'
 'SKW Stahl- Metallurgie and ARQUES Industries' 'Sony Corporation'
 'Toshiba' 'Total' 'Unilever NV' 'Unilever PLC' 'Unipetrol'
 'United Technologies Corp.' 'Uralita' 'Whirlpool' 'Zeon']
59


In [12]:
# Pair every cartel entity with the file name recorded for it in the mapping file.
#
# The lookup is built once instead of scanning df_mapping for every entity, and the result
# frame is created in a single call instead of being grown row by row with .loc[len(df)].
# Rows without a resolved entity name are dropped first so NaN can never act as a key;
# keep='first' preserves the original behaviour of taking the first matching row.
mapped_rows = (
    df_mapping
    .dropna(subset=['entity_name'])
    .drop_duplicates(subset='entity_name', keep='first')
)
file_name_by_entity = dict(zip(mapped_rows['entity_name'], mapped_rows['file_name']))

# Two columns: entity_name and file_name; file_name is None when the entity has no mapping row.
df_results = pd.DataFrame(
    [(entity, file_name_by_entity.get(entity)) for entity in entities],
    columns=['entity_name', 'file_name'],
)

# Boolean mask: True where a file name was found for the entity.
has_file = df_results['file_name'].notna()
# count entities where file_name is not None
df_results_count = int(has_file.sum())
# count entities where file_name is None
df_results_none_count = int((~has_file).sum())

print(f"Number of Entities with investor file: {df_results_count}")
print(f"Number of Entities without investor file: {df_results_none_count}")
print(f"Number of Entities total: {len(df_results)}")

Number of Entities with investor file: 58
Number of Entities without investor file: 171
Number of Entities total: 229


In [13]:
# Write the entity/file-name pairing to an Excel workbook; index=False leaves the row index out.
df_results.to_excel(excel_writer="./temp_data/entities_with_corresponding_filenames.xlsx", index=False)

In [14]:
# Report every file name listed in the mapping file that does not appear in
# df_results['file_name'], i.e. that was not paired with any entity there.
# The array of paired file names is loop-invariant, so it is extracted once up front.
matched_file_names = df_results['file_name'].values
for file_name in df_mapping['file_name']:
    # not paired with an entity -> print it for manual inspection
    if file_name not in matched_file_names:
        print(file_name)


Bayern
Chungwa
EON


In [15]:
import os

# Directory whose entries are checked against the mapping file.
directory = "./raw_data/company_shareholders/"
# os.listdir returns entries in arbitrary order, so sort them to make the report reproducible.
# Only a trailing '.xlsx' is removed: str.replace would also delete '.xlsx' occurring in the
# middle of a name. Entries with any other extension are kept unchanged, as before.
filenames = [
    filename[:-len('.xlsx')] if filename.endswith('.xlsx') else filename
    for filename in sorted(os.listdir(directory))
]

# File names known to the mapping file; loop-invariant, so extracted once.
mapped_file_names = df_mapping['file_name'].values
for filename in filenames:
    # present on disk but missing from the mapping file -> print it
    if filename not in mapped_file_names:
        print(filename)