PCLab#5 - Group 2 - Emanuele Sala, Luca Soleri, Fabio Stefana

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Importing libraries and Dataset</h1>
</div>


In [1]:
import pandas as pd
import os
import gc

In [None]:
# Load every Stata (.dta) export found in the SigWatch data folder and stack
# them into one DataFrame.
directory = "data/sigwatch_data"

# os.listdir returns entries in an arbitrary, platform-dependent order; sorting
# the file names keeps the row order of `df` reproducible across runs/machines.
dta_files = sorted(file for file in os.listdir(directory) if file.endswith(".dta"))

df_list = [pd.read_stata(os.path.join(directory, file)) for file in dta_files]

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row labels.
df = pd.concat(df_list, ignore_index=True)

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Preliminary data exploration</h1>
</div>

In [None]:
# Keep only the rows whose primary industry sector is "Finance"
# (the sector used in this notebook to identify the banks).
is_finance = df["corp_industry_sector1"] == "Finance"
df = df.loc[is_finance]

In [None]:
# Keep only the companies located in the US, the UK or Europe.
# Both 'US' and 'USA' are listed — presumably the data uses both spellings.
# NOTE(review): Montenegro and Bosnia and Herzegovina are not EU member states;
# confirm that they are meant to be part of the sample.
countries = [
    'Austria', 'US', 'Denmark', 'UK', 'Germany', 'Luxembourg', 'France',
    'Italy', 'Netherlands', 'Belgium', 'Sweden', 'Spain', 'Ireland',
    'Portugal', 'Poland', 'Finland', 'USA', 'Croatia', 'Bulgaria',
    'Montenegro', 'Bosnia and Herzegovina',
]

in_selected_country = df['country_corp'].isin(countries)
df = df.loc[in_selected_country]

In [None]:
# A single campaign can span several rows that share one uid_archive, so the
# campaigns are counted as distinct uid_archive values rather than as rows.
# dropna=False keeps a missing uid_archive counted as its own value.
n_of_campaigns = df["uid_archive"].nunique(dropna=False)
print(f"There are {n_of_campaigns} unique campaigns for US UK and EU banks")

In [None]:
# Pool the NGO names stored in the five columns ngo_name1 ... ngo_name5 into one
# flat list, then de-duplicate them with a set.
ngo_column_names = [f"ngo_name{position}" for position in range(1, 6)]
list_of_ngo_columns = [
    ngo_name
    for column_name in ngo_column_names
    for ngo_name in list(df[column_name])
]
unique_ngos = list(set(list_of_ngo_columns))

# One entry is subtracted to discount the placeholder used for "no NGO".
# NOTE(review): this assumes the set holds exactly one null-like value — confirm.
print(f"There are {len(unique_ngos) - 1} unique NGO organizations involved in this dataset")

In [None]:
# Number of targeted companies

In [None]:
# Subset of columns kept for the analysis: campaign identifier and date, the
# targeted company and its parent, the sentiment, and up to three issue labels.
columns_we_want = [
    "uid_archive",
    "date",
    "company",
    "country_corp",            # country of the company
    "corp_industry_sector1",   # industry of the company
    "company_parent",
    "company_parent_country",
    "sentiment",
    "issue_name1",
    "issue_name2",
    "issue_name3",
]

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Task #4 : estimate parameters of the CAPM </h1>
</div>

IMPORTANT NOTE

- Price Index: This index only reflects the price movements of a stock without considering dividends or other distributions. It’s a pure measure of price appreciation, so when you use the Price Index, you’re capturing the changes in the stock’s market price alone.
- Total Return Index: This includes both price changes and the effect of dividends reinvested back into the stock. If the bank pays dividends, these dividends are considered to be reinvested, adding to the growth of the index. This index provides a more comprehensive view of the stock’s overall return by including income from dividends, making it useful for capturing the full picture of what an investor earns from holding the stock.

For CAPM estimation, you typically need the Total Return Index. CAPM aims to model the total returns an investor would expect, factoring in all sources of return. Calculating returns using the Total Return Index allows you to include dividends, which are a critical component of a stock’s performance from an investor’s perspective.

In [2]:
def read_banks_ri(directory):
    """Read the total-return workbook into one wide DataFrame.

    Parameters
    ----------
    directory : str or path-like
        Path to the workbook file itself (e.g. ``banks_ri.xlsm``); despite the
        parameter name this is a file path, not a folder.

    Returns
    -------
    pandas.DataFrame
        All columns of the first data sheet, with its first column renamed to
        ``"Date"``, followed by every column except the first of each remaining
        sheet, concatenated side by side.
    """
    # The context manager closes the workbook's file handle as soon as every
    # sheet has been parsed instead of leaving it open until garbage collection.
    with pd.ExcelFile(directory) as banks_ri:
        # The first sheet is a request table, so it is skipped; each remaining
        # sheet holds the banks of one country.
        banks_ri_sheets = banks_ri.sheet_names[1:]

        # Every sheet starts with its own date column: keep it for the first
        # sheet only and drop it from the others.
        first_df = banks_ri.parse(banks_ri_sheets[0])
        other_dfs = [
            banks_ri.parse(sheet_name).iloc[:, 1:]
            for sheet_name in banks_ri_sheets[1:]
        ]

    first_df = first_df.rename(columns={first_df.columns[0]: "Date"})

    # The sheets are joined on their row index, i.e. they are assumed to list
    # the same dates in the same order.
    banks_ri_df = pd.concat([first_df] + other_dfs, axis=1)
    return banks_ri_df

def clean_banks_ri(banks_ri_df):
    """Shorten the total-return column names and remove the "dead" banks.

    The frame is modified in place and the same object is returned.
    """
    # Every column after the first loses its last 17 characters and gets the
    # "_TR" suffix instead; the first column is left untouched.
    shorter_names = {
        column: column[:-17] + "_TR" for column in banks_ri_df.columns[1:]
    }
    banks_ri_df.rename(columns=shorter_names, inplace=True)

    # Banks flagged as dead (any casing of "dead" in the column name) are simply
    # discarded; per the original authors this removes only a few of the banks.
    dead_banks = [
        column for column in banks_ri_df.columns[1:] if "dead" in column.lower()
    ]
    banks_ri_df.drop(columns=dead_banks, inplace=True)
    return banks_ri_df

# Read the total-return workbook and tidy its columns in a single step.
banks_ri_df = clean_banks_ri(read_banks_ri("data/banks_data_bocconi/banks_ri.xlsm"))

In [3]:
def read_banks_pi(directory):
    """Read the price-index workbook into one wide DataFrame.

    Parameters
    ----------
    directory : str or path-like
        Path to the workbook file itself (e.g. ``banks_pi.xlsm``); despite the
        parameter name this is a file path, not a folder.

    Returns
    -------
    pandas.DataFrame
        All columns of the first data sheet, with its first column renamed to
        ``"Date"``, followed by every column except the first of each remaining
        sheet, concatenated side by side.
    """
    # The context manager closes the workbook's file handle as soon as every
    # sheet has been parsed instead of leaving it open until garbage collection.
    with pd.ExcelFile(directory) as banks_pi:
        # The first sheet is a request table, so it is skipped; each remaining
        # sheet holds the banks of one country.
        banks_pi_sheets = banks_pi.sheet_names[1:]

        # Every sheet starts with its own date column: keep it for the first
        # sheet only and drop it from the others.
        first_df = banks_pi.parse(banks_pi_sheets[0])
        other_dfs = [
            banks_pi.parse(sheet_name).iloc[:, 1:]
            for sheet_name in banks_pi_sheets[1:]
        ]

    first_df = first_df.rename(columns={first_df.columns[0]: "Date"})

    # The sheets are joined on their row index, i.e. they are assumed to list
    # the same dates in the same order.
    banks_pi_df = pd.concat([first_df] + other_dfs, axis=1)
    return banks_pi_df

def clean_banks_pi(banks_pi_df):
    """Shorten the price-index column names and remove the "dead" banks.

    The frame is modified in place and the same object is returned.
    """
    # Every column after the first loses its last 14 characters and gets the
    # "_PI" suffix instead; the first column is left untouched.
    shorter_names = {
        column: column[:-14] + "_PI" for column in banks_pi_df.columns[1:]
    }
    banks_pi_df.rename(columns=shorter_names, inplace=True)

    # Banks flagged as dead (any casing of "dead" in the column name) are simply
    # discarded; per the original authors this removes only a few of the banks.
    dead_banks = [
        column for column in banks_pi_df.columns[1:] if "dead" in column.lower()
    ]
    banks_pi_df.drop(columns=dead_banks, inplace=True)
    return banks_pi_df

# Read the price-index workbook and tidy its columns in a single step.
banks_pi_df = clean_banks_pi(read_banks_pi("data/banks_data_bocconi/banks_pi.xlsm"))

# The total-return frame already carries the dates, so "Date" is removed here to
# avoid a duplicated column once the two frames are placed side by side.
banks_pi_df = banks_pi_df.drop(columns="Date")

The banks_pi dataset contains 3 more banks than the banks_ri dataset (TODO: confirm which of the two datasets is the larger one). All three of these banks are located in Austria:
- ERSTE GROUP BANK
- RAIFFEISEN BANK INTL
- VOLKSBANK VBG.PARTN.


In [4]:
# Put the two bank tables side by side; rows are matched on the row index.
bank_data = pd.concat((banks_ri_df, banks_pi_df), axis="columns")

# The separate frames are not needed any more: release them and ask the garbage
# collector to reclaim the memory right away.
del banks_ri_df
del banks_pi_df
gc.collect()

3857

In [5]:
# Display the combined bank table as this cell's output.
bank_data

Unnamed: 0,Date,AAREAL BANK_TR,COMDIRECT BANK_TR,COMMERZBANK_TR,DT.PFANDBRIEFBANK_TR,PROCREDIT HOLDING_TR,UMWELTBANK_TR,ALLIANZ_TR,DEUTSCHE BANK_TR,BANQUE NATIONALE DE BELGIQUE_TR,...,CLOSE BROTHERS GROUP_PI,VIRGIN MONEY UK_PI,HSBC HOLDINGS_PI,LLOYDS BANKING GROUP_PI,METRO BANK_PI,ROYAL BANK OF SCTL.GP._PI,STANDARD CHARTERED_PI,ADMIRAL GROUP_PI,ALLIANCE TRUST_PI,SAGA GROUP_PI
0,2008-01-01,182.27,33.09,974.59,,,260.15,3364.43,1937.46,2658.11,...,8202.5,,784.5,137.4,,11418.8,8372.5,400.0,3044.2,854.3
1,2008-01-02,183.81,33.05,956.43,,,259.22,3304.22,1900.87,2684.50,...,8198.2,,778.0,136.5,,11264.5,8322.5,398.2,3037.9,854.3
2,2008-01-03,181.73,33.05,944.94,,,261.69,3285.59,1900.00,2682.82,...,8202.5,,781.2,137.3,,11380.3,8286.2,400.0,3059.0,854.3
3,2008-01-04,173.31,33.09,939.75,,,263.24,3220.61,1881.60,2641.91,...,8332.0,,771.0,132.5,,10885.2,8181.8,388.4,2991.6,854.3
4,2008-01-07,167.91,33.20,943.08,,,259.37,3179.94,1880.52,2632.54,...,8336.3,,776.1,131.6,,10756.6,8209.0,384.4,2993.7,854.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3211,2020-04-22,126.90,86.08,21.01,81.91,40.14,1387.93,6352.29,201.17,2595.15,...,9005.5,34.6,433.6,17.6,4.7,321.6,2165.4,830.2,5894.7,854.3
3212,2020-04-23,125.18,87.56,21.56,81.53,40.47,1382.27,6220.43,206.42,2620.59,...,9204.0,35.6,438.7,17.9,4.6,326.9,2216.9,828.4,5920.0,854.3
3213,2020-04-24,127.08,87.16,21.35,80.14,40.81,1348.28,6189.82,196.74,2582.43,...,9005.5,35.1,430.0,17.5,4.5,321.3,2128.7,839.3,5861.1,854.3
3214,2020-04-27,127.17,87.69,22.33,83.36,41.98,1319.95,6310.70,219.14,2582.43,...,8867.3,37.9,445.1,17.9,4.7,330.2,2167.1,831.3,6046.3,854.3


In [6]:
# EU and US banks use different Fama-French factors, so we need to know which
# banks are American. According to the authors both workbooks contain the same
# US banks, so the names are taken from the "US" sheet of banks_ri.xlsm.
ri_workbook = pd.ExcelFile("data/banks_data_bocconi/banks_ri.xlsm")

# The first column of the sheet is skipped; the other headers are bank names.
us_banks_raw = list(ri_workbook.parse("US").columns)[1:]

# Cut the last 17 characters off every header, the same trim clean_banks_ri
# applies to the total-return columns.
us_banks = [raw_name[:-17] for raw_name in us_banks_raw]

# Release the workbook and the raw header list to free memory.
del ri_workbook, us_banks_raw
gc.collect()

3847

In [8]:
# Read the daily Fama-French 3-factor files for Europe and North America and
# combine them into one table covering the period of the bank data.
# First we get the start and end date of our bank data.
start = bank_data["Date"].iloc[0]
end = bank_data["Date"].iloc[-1]


def _load_ff_factors(path, region, first_day, last_day):
    """Load one factor file, keep the requested period and tag the columns.

    Parameters
    ----------
    path : str
        Location of the Excel file; it must contain a ``date`` column.
    region : str
        Label appended (as ``_<region>``) to every column except ``date`` and
        used as the prefix of the two row-count messages that are printed.
    first_day, last_day
        Inclusive bounds of the period to keep.

    Returns
    -------
    pandas.DataFrame
        The filtered factors, still carrying the ``date`` column.
    """
    factors = pd.read_excel(path)
    print(f"{region}:", len(factors))

    factors["date"] = pd.to_datetime(factors["date"], format="%m/%d/%Y")
    in_period = factors["date"].between(first_day, last_day)

    # Renaming by column name (rather than by position) keeps "date" intact
    # wherever it sits in the file.
    factors = factors.loc[in_period].rename(
        columns=lambda col: col if col == "date" else f"{col}_{region}"
    )
    print(f"{region}:", len(factors))
    return factors


EU_FF = _load_ff_factors(
    "data/banks_data_bocconi/Europe_3_Factors_Daily.xlsx", "EU", start, end
)
US_FF_dated = _load_ff_factors(
    "data/banks_data_bocconi/North_America_3_Factors_Daily.xlsx", "US", start, end
)

# Join the two regions on the calendar date instead of on the row position, so
# a day that is present in only one file cannot shift the other region's
# factors onto the wrong date. validate= fails loudly on duplicated dates.
FF = EU_FF.merge(US_FF_dated, on="date", how="outer", validate="one_to_one")

# Keep US_FF in the shape later cells may expect: factor columns only, no date.
US_FF = US_FF_dated.drop(columns="date")

FF

EU: 7893
EU: 3216
US: 7893
US: 3216


Unnamed: 0,date,Mkt-RF_EU,SMB_EU,HML_EU,RF_EU,Mkt-RF_US,SMB_US,HML_US,RF_US
0,2008-01-01,0.02,0.00,0.00,0.01,0.04,0.05,0.01,0.01
1,2008-01-02,-0.11,0.94,0.23,0.01,-1.25,0.09,-0.22,0.01
2,2008-01-03,-0.13,-0.22,0.11,0.01,-0.03,-0.66,-0.42,0.01
3,2008-01-04,-1.74,0.39,0.30,0.01,-2.59,-0.45,0.29,0.01
4,2008-01-07,-0.79,-1.03,0.16,0.01,0.02,-0.22,0.24,0.01
...,...,...,...,...,...,...,...,...,...
3211,2020-04-22,1.29,-0.77,0.11,0.00,2.27,-0.45,-1.10,0.00
3212,2020-04-23,0.42,0.66,0.92,0.00,0.13,1.03,0.54,0.00
3213,2020-04-24,-0.33,0.40,-0.37,0.00,1.40,0.35,-0.33,0.00
3214,2020-04-27,1.89,-0.58,0.44,0.00,1.75,1.90,1.37,0.00


In [11]:
# Attach the Fama-French factors to the bank data by matching calendar dates
# rather than row positions, so a missing or extra trading day in either table
# cannot silently put the factors on the wrong rows.
bank_data_complete = bank_data.merge(
    FF,
    how="left",               # keep every row of the bank data
    left_on="Date",
    right_on="date",
    validate="many_to_one",   # each bank-data date may match one factor row at most
)

# After the merge the factor table's own "date" column only duplicates "Date".
bank_data_complete = bank_data_complete.drop(columns="date")

In [None]:
# Before we estimate the CAPM we need to define the market returns and the risk-free rate

In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings("ignore")


def estimateCAPM(returns_df):
    """Regress every stock column on the market return and collect the results.

    For each column of ``returns_df`` except the first and the last one
    (assumed to be the date column and the market column — TODO confirm), an
    OLS regression of that column on ``returns_df["sp500_r"]`` plus an
    intercept is fitted.

    Parameters
    ----------
    returns_df : pandas.DataFrame
        Must contain a ``"sp500_r"`` column with the market returns.

    Returns
    -------
    pandas.DataFrame
        One row per stock with the columns ``Stock``, ``Alpha`` (intercept),
        ``Beta`` (slope on the market return), ``P Value Alpha`` and
        ``P Value Beta``.
    """
    # NOTE(review): `sm` is expected to be `statsmodels.api`; no import of it is
    # visible in this notebook — confirm it is imported before this cell runs.
    mkt = returns_df["sp500_r"]

    # The regressors are identical for every stock, so they are built once.
    # add_constant prepends the intercept column, which becomes the Alpha.
    X = sm.add_constant(mkt)

    records = []
    for stock in list(returns_df.columns)[1:-1]:
        model = sm.OLS(returns_df[stock], X).fit()

        # .iloc makes the positional access explicit: position 0 is the
        # intercept, position 1 the market coefficient.
        records.append({
            "Stock": stock,
            "Alpha": model.params.iloc[0],
            "Beta": model.params.iloc[1],
            "P Value Alpha": model.pvalues.iloc[0],
            "P Value Beta": model.pvalues.iloc[1],
        })

    # Building the frame once from the collected records avoids growing a
    # DataFrame with pd.concat on every iteration.
    return pd.DataFrame(
        records,
        columns=["Stock", "Alpha", "Beta", "P Value Alpha", "P Value Beta"],
    )

In [None]:
# Keep only the columns that hold a value on at least 70% of the rows.
# `banks_ri_df` is deleted right after it is merged into `bank_data`, so on a
# fresh "Run All" it no longer exists here; the filter is therefore applied to
# `bank_data`. dropna() returns a new frame, so the result has to be bound to a
# name for the filtering to have any effect.
# NOTE(review): confirm that filtering the merged table (TR and PI columns) is
# the intended target.
threshold = int(0.7 * len(bank_data))

bank_data_filtered = bank_data.dropna(axis=1, thresh=threshold)
bank_data_filtered