PCLab#5 - Group 2 - Emanuele Sala, Luca Soleri, Fabio Stefana

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Importing libraries and Dataset</h1>
</div>


In [1]:
import pandas as pd
import os

In [None]:
directory = "data/sigwatch_data"

# os.listdir returns entries in arbitrary order, so the file names are sorted
# to make the row order of the concatenated frame reproducible across runs
# and machines.
dta_files = sorted(
    file for file in os.listdir(directory) if file.endswith(".dta")
)

# One DataFrame per Stata file found in the directory.
df_list = [pd.read_stata(os.path.join(directory, file)) for file in dta_files]

# Stack all files on top of each other and rebuild a clean 0..n-1 index.
# NOTE: pd.concat raises ValueError if no .dta file was found in `directory`.
df = pd.concat(df_list, ignore_index=True)

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Preliminary data exploration</h1>
</div>

In [None]:
# Keep only the rows whose primary industry sector is "Finance"; this is the
# filter used here to isolate the banks.
is_finance = df["corp_industry_sector1"] == "Finance"
df = df[is_finance]

In [None]:
# Keep only the rows whose company country is in the US / UK / EU selection below.
# NOTE(review): both 'US' and 'USA' are listed, presumably to cover two spellings
# in the data; 'Montenegro' and 'Bosnia and Herzegovina' are not EU member
# states - confirm they are meant to be included.
countries = [
    'Austria', 'US', 'Denmark', 'UK', 'Germany', 'Luxembourg', 'France',
    'Italy', 'Netherlands', 'Belgium', 'Sweden', 'Spain', 'Ireland',
    'Portugal', 'Poland', 'Finland', 'USA', 'Croatia', 'Bulgaria',
    'Montenegro', 'Bosnia and Herzegovina',
]

in_selected_country = df['country_corp'].isin(countries)
df = df[in_selected_country]

In [None]:
# A single campaign can span several rows that share the same uid_archive, so
# we count distinct uid_archive values rather than rows. dropna=False keeps a
# missing id (if any) counted as one value, exactly like len(unique()) would.
n_of_campaigns = df["uid_archive"].nunique(dropna=False)
print(f"There are {n_of_campaigns} unique campaigns for US UK and EU banks")

In [None]:
# The NGOs involved in a campaign are spread over the columns ngo_name1..ngo_name5.
ngo_columns = [f"ngo_name{position}" for position in range(1, 6)]

# Flatten the five columns into one long list of names (missing entries included).
list_of_ngo_columns = [name for column in ngo_columns for name in df[column]]

# De-duplicate while explicitly discarding missing entries (NaN/None and blank
# strings). Filtering them out is safer than subtracting 1 from the count, which
# is only right when exactly one distinct null marker is present.
unique_ngos = list({
    name
    for name in list_of_ngo_columns
    if not pd.isna(name) and str(name).strip() != ""
})

print(f"There are {len(unique_ngos)} unique NGO organizations involved in this dataset")

In [None]:
# Number of targeted companies (TODO: not computed yet)

In [None]:
# Subset of columns of interest, grouped by what they describe.
columns_we_want = [
    # Campaign identifier and date
    "uid_archive", "date",
    # Targeted company: name, country and industry
    "company", "country_corp", "corp_industry_sector1",
    # Parent company and its country
    "company_parent", "company_parent_country",
    # Sentiment of the campaign
    "sentiment",
    # Up to three issues attached to the campaign
    *[f"issue_name{position}" for position in range(1, 4)],
]

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Task #4 : estimate parameters of the CAPM </h1>
</div>

In [3]:
def read_banks_ri(directory):
    """Load all data sheets of the banks total-return workbook into one wide frame.

    The first sheet of the workbook is skipped (per the authors' note it is a
    request table, not data). The first column of the first data sheet is kept
    and renamed to "Date"; for every other sheet the first column is dropped
    and the remaining columns are appended side by side.

    Parameters
    ----------
    directory : str or path-like
        Path to the Excel workbook itself (despite the name, this is a file
        path, e.g. "data/banks_data_bocconi/banks_ri.xlsm").

    Returns
    -------
    pandas.DataFrame
        A "Date" column followed by the columns of every data sheet.

    Raises
    ------
    IndexError
        If the workbook has no sheet after the first one.
    """
    # Open the workbook in a `with` block so the underlying file handle is
    # closed as soon as all sheets have been parsed.
    with pd.ExcelFile(directory) as banks_ri:
        # Per the authors' note, the sheets split the banks by country and the
        # first sheet is only a request table, so it is ignored.
        banks_ri_sheets = banks_ri.sheet_names[1:]

        # The first data sheet provides the shared date column.
        first_df = banks_ri.parse(banks_ri_sheets[0])
        first_df = first_df.rename(columns={first_df.columns[0]: "Date"})

        # Every other sheet repeats the date in its first column, so skip it.
        other_dfs = [
            banks_ri.parse(sheet_name).iloc[:, 1:]
            for sheet_name in banks_ri_sheets[1:]
        ]

    # Sheets are glued together on the row index, not on the date values.
    # NOTE(review): this assumes every sheet lists the same dates in the same
    # order - confirm against the workbook.
    banks_ri_df = pd.concat([first_df] + other_dfs, axis=1)
    return banks_ri_df

def clean_banks_ri(banks_ri_df):
    """Shorten the bank column names and drop the banks flagged as dead.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    banks_ri_df : pandas.DataFrame
        Frame whose first column is the date and whose other columns are one
        series per bank, with string column labels.

    Returns
    -------
    pandas.DataFrame
        The same frame, with renamed columns and without the dead banks.
    """
    # Every column after the first loses its last 17 characters.
    # NOTE(review): presumably this strips a fixed-length suffix appended by the
    # data provider - confirm that all labels really end with it.
    shortened = {label: label[:-17] for label in banks_ri_df.columns[1:]}
    banks_ri_df.rename(columns={"Date": "Date", **shortened}, inplace=True)

    # Banks whose (shortened) name mentions "dead" are simply removed. Per the
    # authors' note this only affects 3 of the 158 banks, which is why dropping
    # them is considered an acceptable shortcut.
    delisted = [
        label for label in banks_ri_df.columns[1:] if "dead" in label.lower()
    ]
    banks_ri_df.drop(columns=delisted, inplace=True)
    return banks_ri_df

# Read the total-return workbook and clean its column names in one step,
# then display the resulting frame.
banks_ri_df = clean_banks_ri(read_banks_ri("data/banks_data_bocconi/banks_ri.xlsm"))
banks_ri_df

Unnamed: 0,Date,AAREAL BANK,COMDIRECT BANK,COMMERZBANK,DT.PFANDBRIEFBANK,PROCREDIT HOLDING,UMWELTBANK,ALLIANZ,DEUTSCHE BANK,BANQUE NATIONALE DE BELGIQUE,...,VIRGIN MONEY UK,HSBC HOLDINGS,LLOYDS BANKING GROUP,METRO BANK,ROYAL BANK OF SCTL.GP.,STANDARD CHARTERED,TBC BANK GROUP,ADMIRAL GROUP,ALLIANCE TRUST,SAGA GROUP
0,2008-01-01,182.27,33.09,974.59,,,260.15,3364.43,1937.46,2658.11,...,,1502.17,271.20,,82190.06,79476.00,,449.44,14717.18,1438.33
1,2008-01-02,183.81,33.05,956.43,,,259.22,3304.22,1900.87,2684.50,...,,1489.68,269.47,,81079.38,79001.88,,447.39,14764.00,1438.33
2,2008-01-03,181.73,33.05,944.94,,,261.69,3285.59,1900.00,2682.82,...,,1495.93,270.91,,81912.38,78657.13,,449.44,14866.32,1438.33
3,2008-01-04,173.31,33.09,939.75,,,263.24,3220.61,1881.60,2641.91,...,,1476.30,261.43,,78348.94,77665.81,,436.36,14538.91,1438.33
4,2008-01-07,167.91,33.20,943.08,,,259.37,3179.94,1880.52,2632.54,...,,1486.11,259.71,,77423.38,77924.38,,431.87,14549.14,1438.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3211,2020-04-22,126.90,86.08,21.01,81.91,40.14,1387.93,6352.29,201.17,2595.15,...,35.26,1551.10,47.22,4.65,2799.64,28616.78,79.08,1936.39,38350.87,1438.33
3212,2020-04-23,125.18,87.56,21.56,81.53,40.47,1382.27,6220.43,206.42,2620.59,...,36.34,1569.07,48.03,4.60,2846.44,29297.06,79.38,1932.15,38515.23,1438.33
3213,2020-04-24,127.08,87.16,21.35,80.14,40.81,1348.28,6189.82,196.74,2582.43,...,35.83,1537.90,46.94,4.50,2796.97,28130.86,77.57,1957.59,38131.72,1438.33
3214,2020-04-27,127.17,87.69,22.33,83.36,41.98,1319.95,6310.70,219.14,2582.43,...,38.63,1592.02,48.20,4.66,2874.51,28639.21,78.78,1938.93,39337.04,1438.33


In [None]:
# A column is kept only if at least 70% of its rows are non-missing.
threshold = int(0.7 * len(banks_ri_df))

# dropna returns a NEW frame and leaves banks_ri_df untouched, so the result is
# bound to its own name instead of being discarded after display. Use
# banks_ri_filtered wherever the thinned-out set of banks is needed.
banks_ri_filtered = banks_ri_df.dropna(axis=1, thresh=threshold)
banks_ri_filtered