PCLab#5 - Group 2 - Emanuele Sala, Luca Soleri, Fabio Stefana

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Importing libraries and Dataset</h1>
</div>


In [1]:
import pandas as pd
import os
import gc

In [None]:
# Read every Stata (.dta) file found in the SigWatch folder and stack the
# resulting frames into a single DataFrame with a fresh 0..n-1 index.
directory = "data/sigwatch_data"
df_list = [
    pd.read_stata(os.path.join(directory, file))
    for file in os.listdir(directory)
    if file.endswith(".dta")
]

df = pd.concat(df_list, ignore_index=True)

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Preliminary data exploration</h1>
</div>

In [None]:
# Keep only the rows whose primary industry sector is "Finance"; this is how
# the notebook isolates the banks.
is_finance = df["corp_industry_sector1"] == "Finance"
df = df[is_finance]

In [None]:
# Keep only the companies located in one of the countries listed below
# (the US, the UK and the European countries selected for the analysis).
# NOTE(review): Montenegro and Bosnia and Herzegovina are not EU members --
# confirm that they are meant to be part of the selection.
countries = [
    'Austria', 'US', 'Denmark', 'UK', 'Germany', 'Luxembourg', 'France',
    'Italy', 'Netherlands', 'Belgium', 'Sweden', 'Spain', 'Ireland',
    'Portugal', 'Poland', 'Finland', 'USA', 'Croatia', 'Bulgaria',
    'Montenegro', 'Bosnia and Herzegovina',
]

in_selected_country = df['country_corp'].isin(countries)
df = df[in_selected_country]

In [None]:
# A single campaign can span several rows, so count the distinct uid_archive
# values instead of the number of rows (a missing id counts as one value).
n_of_campaigns = df["uid_archive"].nunique(dropna=False)
print(f"There are {n_of_campaigns} unique campaigns for US UK and EU banks")

In [None]:
# Pool the values of the five NGO name columns (ngo_name1 ... ngo_name5) and
# de-duplicate them to get every organisation that appears at least once.
list_of_ngo_columns = [
    ngo
    for position in range(1, 6)
    for ngo in df[f"ngo_name{position}"]
]
unique_ngos = list(set(list_of_ngo_columns))

# One entry is subtracted on the assumption that exactly one of the distinct
# values is the placeholder for "no NGO" (the null value) -- TODO confirm
# against the data.
print(f"There are {len(unique_ngos) - 1} unique NGO organizations involved in this dataset")

In [None]:
# Number of targeted companies

In [None]:
# Subset of campaign columns of interest: identifiers, the targeted company
# and its parent, the sentiment score and the three issue labels.
columns_we_want = [
    "uid_archive",
    "date",
    "company",
    "country_corp",            # country of the company
    "corp_industry_sector1",   # industry of the company
    "company_parent",
    "company_parent_country",
    "sentiment",
    *[f"issue_name{position}" for position in range(1, 4)],
]

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Day 2: Exploring bank data</h1>
</div>

In [None]:
def read_banks_ri(directory):
    """Load every data sheet of the total-return workbook into one wide frame.

    Parameters
    ----------
    directory : str
        Path of the Excel workbook itself (despite the name, a file path,
        e.g. "data/banks_data_bocconi/banks_ri.xlsm").

    Returns
    -------
    pandas.DataFrame
        The first data sheet in full, with its first column renamed to
        "Date", followed by the columns of every other data sheet except
        their own first (date) column.
    """
    # The context manager closes the workbook handle once all sheets are read.
    with pd.ExcelFile(directory) as banks_ri:
        # The first sheet is a request table, not data, so it is skipped.
        # The remaining sheets split the banks by country.
        banks_ri_sheets = banks_ri.sheet_names[1:]

        # Every sheet carries its own date column; keep it only once, from
        # the first data sheet, and drop it from all the others.
        first_df = banks_ri.parse(banks_ri_sheets[0])
        first_df = first_df.rename(columns={first_df.columns[0]: "Date"})
        other_dfs = [
            banks_ri.parse(sheet_name).iloc[:, 1:]
            for sheet_name in banks_ri_sheets[1:]
        ]

    # Sheets are placed side by side, matched on their row index.
    # NOTE(review): this assumes every sheet lists the same dates in the
    # same order -- confirm against the workbook.
    banks_ri_df = pd.concat([first_df] + other_dfs, axis=1)
    return banks_ri_df

def clean_banks_ri(banks_ri_df):
    """Shorten the bank column names and drop the banks flagged as dead.

    Every column except the first one loses its last 17 characters and gets
    a "_TR" suffix instead. Columns whose new name contains "dead"
    (case-insensitive) are then removed. The frame is modified in place and
    also returned.
    """
    # Map each bank column to its shortened "<bank>_TR" name.
    shorter_names = {
        column: column[:-17] + "_TR"
        for column in banks_ri_df.columns[1:]
    }
    banks_ri_df.rename(columns=shorter_names, inplace=True)

    # Dropping the dead banks is a deliberately simple approach: the original
    # authors note that only a handful of banks are affected.
    dead_banks = [
        column
        for column in banks_ri_df.columns[1:]
        if "dead" in column.lower()
    ]
    banks_ri_df.drop(columns=dead_banks, inplace=True)
    return banks_ri_df

# Load the total-return workbook, tidy its columns and preview the first rows.
banks_ri_df = clean_banks_ri(read_banks_ri("data/banks_data_bocconi/banks_ri.xlsm"))
banks_ri_df.head()

The banks_ri dataset provides the Total Return Index, which includes both price changes and the effect of dividends reinvested back into the stock. If the bank pays dividends, they are treated as reinvested, adding to the growth of the index. 

This index provides a more comprehensive view of the stock’s overall return by including income from dividends, making it useful for capturing the full picture of what an investor earns from holding the stock.

In [None]:
def read_banks_pi(directory):
    """Load every data sheet of the price-index workbook into one wide frame.

    Parameters
    ----------
    directory : str
        Path of the Excel workbook itself (despite the name, a file path,
        e.g. "data/banks_data_bocconi/banks_pi.xlsm").

    Returns
    -------
    pandas.DataFrame
        The first data sheet in full, with its first column renamed to
        "Date", followed by the columns of every other data sheet except
        their own first (date) column.
    """
    # The context manager closes the workbook handle once all sheets are read.
    with pd.ExcelFile(directory) as banks_pi:
        # The first sheet of the workbook is skipped; only the remaining
        # sheets are read.
        banks_pi_sheets = banks_pi.sheet_names[1:]

        # Keep the date column only once, from the first data sheet.
        first_df = banks_pi.parse(banks_pi_sheets[0])
        first_df = first_df.rename(columns={first_df.columns[0]: "Date"})
        other_dfs = [
            banks_pi.parse(sheet_name).iloc[:, 1:]
            for sheet_name in banks_pi_sheets[1:]
        ]

    # Sheets are placed side by side, matched on their row index.
    # NOTE(review): this assumes every sheet lists the same dates in the
    # same order -- confirm against the workbook.
    banks_pi_df = pd.concat([first_df] + other_dfs, axis=1)
    return banks_pi_df

def clean_banks_pi(banks_pi_df):
    """Shorten the bank column names and drop the banks flagged as dead.

    Every column except the first one loses its last 14 characters and gets
    a "_PI" suffix instead. Columns whose new name contains "dead"
    (case-insensitive) are then removed. The frame is modified in place and
    also returned.
    """
    # Map each bank column to its shortened "<bank>_PI" name.
    shorter_names = {
        column: column[:-14] + "_PI"
        for column in banks_pi_df.columns[1:]
    }
    banks_pi_df.rename(columns=shorter_names, inplace=True)

    # Collect and remove the columns of banks marked as dead.
    dead_banks = [
        column
        for column in banks_pi_df.columns[1:]
        if "dead" in column.lower()
    ]
    banks_pi_df.drop(columns=dead_banks, inplace=True)
    return banks_pi_df

# Load the price-index workbook, tidy its columns and preview the first rows.
banks_pi_df = clean_banks_pi(read_banks_pi("data/banks_data_bocconi/banks_pi.xlsm"))
banks_pi_df.head()

NOTE: The banks_pi dataset contains 3 more banks than the banks_ri dataset; all 3 are located in Austria:
- ERSTE GROUP BANK
- RAIFFEISEN BANK INTL
- VOLKSBANK VBG.PARTN.

On the other hand, the banks_pi dataset contains the Price Index, which only reflects a stock’s price movements without accounting for dividends or other distributions. It provides a pure measure of price appreciation, capturing changes in the stock's market price alone. 

This makes it more suitable for CAPM estimation, as we are focused on price shifts rather than the total return an investor would earn if they held the stock and reinvested dividends.

<strong>To clarify, we'll now rely exclusively on the banks_pi dataset to carry out our analysis.</strong>

<div style="border: 4px solid #007bff; padding: 10px">
    <h3 style="color: #007bff;">Now we load the Fama-French info</h3>
</div>

In [None]:
# First we get the start and end date of our bank data
start = banks_pi_df["Date"].iloc[0]
end = banks_pi_df["Date"].iloc[-1]


def _read_factors(path, suffix, first_day, last_day):
    """Read one daily factor file and prepare it for the combined panel.

    The "date" column is parsed as month/day/year, only the rows between
    first_day and last_day (both included) are kept, and every column other
    than "date" gets `suffix` appended to its name.
    """
    factors = pd.read_excel(path)
    factors["date"] = pd.to_datetime(factors["date"], format="%m/%d/%Y")
    in_range = (factors["date"] >= first_day) & (factors["date"] <= last_day)
    factors = factors[in_range]
    return factors.rename(
        columns={col: col + suffix for col in factors.columns if col != "date"}
    )


# Read both factor files and keep only the days covered by the bank data
EU_FF = _read_factors(
    "data/banks_data_bocconi/Europe_3_Factors_Daily.xlsx", "_EU", start, end
)
US_FF = _read_factors(
    "data/banks_data_bocconi/North_America_3_Factors_Daily.xlsx", "_US", start, end
)

# Join the two regions on the calendar date. Placing them side by side on
# their leftover row index would silently pair different days whenever the
# two files do not list exactly the same dates at the same row positions.
FF = EU_FF.merge(US_FF, on="date", how="outer", sort=True)
FF.reset_index(drop=True, inplace=True)

# As before, US_FF on its own carries no date column after this cell.
US_FF = US_FF.drop(columns="date")

# The files provide the excess return (rm - rf) and rf, so the market return
# is their sum: rm = (rm - rf) + rf.
FF["MKT_EU"] = FF["Mkt-RF_EU"] + FF["RF_EU"]
FF["MKT_US"] = FF["Mkt-RF_US"] + FF["RF_US"]

# The two lengths should match for the row-wise panel built later on.
print(f"Banks_pi length: {len(banks_pi_df)}")
print(f"FF length: {len(FF)}")
FF

<div style="border: 4px solid #007bff; padding: 10px">
    <h3 style="color: #007bff;">Now we calculate the daily returns for each bank</h3>
</div>

In [None]:
def calculate_return(stock, returns_df):
    """Return the simple period-over-period return of one column.

    Parameters
    ----------
    stock : str
        Name of the column in `returns_df` holding the index levels.
    returns_df : pandas.DataFrame
        Frame containing the `stock` column.

    Returns
    -------
    pandas.Series
        (level - previous level) / previous level, named "<stock>_r".
        The first entry is NaN because it has no previous level.
    """
    levels = returns_df[stock]
    change_over_previous = levels.diff() / levels.shift(1)
    return change_over_previous.rename(f"{stock}_r")

# Every column after the first (the date) is a bank; compute its return
# series and collect the results under the "<bank>_r" key.
tickers = banks_pi_df.columns[1:]
returns_dict = {
    f"{stock}_r": calculate_return(stock, banks_pi_df)
    for stock in tickers
}

# Put all the return series side by side in one frame.
returns_df = pd.concat(list(returns_dict.values()), axis=1)

<div style="border: 4px solid #007bff; padding: 10px">
    <h3 style="color: #007bff;">Now we put everything together into a single panel</h3>
</div>

In [None]:
# Place prices, returns and factor data side by side in one panel; the rows
# of the three frames are matched on their index.
panel_parts = [banks_pi_df, returns_df, FF]
bank_data = pd.concat(panel_parts, axis=1)
bank_data

In [None]:
# Inspect the Python type of the "Date" entry stored under index label 0 of the panel.
type(bank_data["Date"][0])

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Task 4: Estimating the CAPM </h1>
</div>

In [None]:
# Build the list of US bank column names as they appear in the price panel.
# The names are read from the "US" sheet of the total-return workbook, whose
# headers end with a 17-character suffix; that suffix is replaced by "_PI".
# NOTE(review): this assumes the bank names are identical in the RI and PI
# workbooks -- confirm.
with pd.ExcelFile("data/banks_data_bocconi/banks_ri.xlsm") as banks_workbook:
    # Skip the first column of the sheet: it holds the dates, not a bank.
    us_banks_raw = list(banks_workbook.parse("US").columns)[1:]

us_banks = [bank[:-17] + "_PI" for bank in us_banks_raw]

# The workbook is already closed by the `with` block; drop the temporary
# names and run a collection to free memory.
del banks_workbook, us_banks_raw
gc.collect()

In [None]:
# Display the derived list of US bank column names.
us_banks

Now we will estimate the CAPM on a rolling window. We decided to use a window of 130 days (as there are about 130 working days in 6 months).

In [None]:

def get_mkt_data(bank_data, bank, us_banks):
    """Return a copy of the market-return column that matches the bank.

    Banks listed in `us_banks` are paired with the "MKT_US" column of
    `bank_data`; every other bank is paired with "MKT_EU".
    """
    market_column = "MKT_US" if bank in us_banks else "MKT_EU"
    return bank_data[market_column].copy()


def _capm_ols(bank_returns, market_returns):
    """Fit bank_returns = alpha + beta * market_returns by ordinary least squares.

    Rows where either series is missing are ignored. Returns the tuple
    (alpha, beta, p-value of alpha, p-value of beta); all four are NaN when
    fewer than three complete rows are available or the market series is
    constant. The p-value of alpha is NaN when the fit is exact (zero
    standard error).
    """
    # Local import: the notebook has no file-level import providing OLS.
    from scipy import stats

    nan = float("nan")
    pairs = pd.DataFrame({"y": bank_returns, "x": market_returns}).dropna()
    if len(pairs) < 3 or pairs["x"].nunique() < 2:
        return nan, nan, nan, nan

    fit = stats.linregress(
        pairs["x"].to_numpy(dtype=float),
        pairs["y"].to_numpy(dtype=float),
    )

    # linregress reports the p-value of the slope only; the intercept gets
    # the same two-sided t-test with n - 2 degrees of freedom.
    degrees_of_freedom = len(pairs) - 2
    if fit.intercept_stderr > 0:
        t_alpha = abs(fit.intercept / fit.intercept_stderr)
        p_value_alpha = float(2 * stats.t.sf(t_alpha, degrees_of_freedom))
    else:
        p_value_alpha = nan
    return float(fit.intercept), float(fit.slope), p_value_alpha, float(fit.pvalue)


def rollin_window_CAPM(bank_data, bank, us_banks, window_size = 150):
    """Estimate the CAPM of one bank on a rolling window.

    For every block of `window_size` consecutive rows, the bank's return
    (column "<bank>_r" of `bank_data`) is regressed on the market return of
    its region, and the estimates are stored on the last row of the block.

    Parameters
    ----------
    bank_data : pandas.DataFrame
        Panel with the columns "Date", `bank`, "<bank>_r", "MKT_US" and
        "MKT_EU".
    bank : str
        Name of the bank's price column (e.g. "BANK OF AMERICA_PI").
    us_banks : list
        Price-column names of the US banks; decides which market is used.
    window_size : int
        Number of rows per regression window.
        NOTE(review): the notebook text mentions a 130-day window while the
        default here is 150; the default is kept unchanged -- confirm which
        one is intended.

    Returns
    -------
    pandas.DataFrame
        Columns "Date", `bank`, "MKT", "<bank>_r", "Alpha", "Beta",
        "P Value Alpha" and "P Value Beta". The four estimate columns are
        NaN on rows that do not close a full window.

    Raises
    ------
    ValueError
        If `window_size` is smaller than 1.
    KeyError
        If one of the required columns is missing from `bank_data`.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    returns_column = f"{bank}_r"
    current_bank = bank_data[["Date", f"{bank}"]].copy()
    current_bank["MKT"] = get_mkt_data(bank_data, bank, us_banks)
    # The CAPM is estimated on returns, not on the price level itself.
    current_bank[returns_column] = bank_data[returns_column]

    nan = float("nan")
    estimates = [(nan, nan, nan, nan)] * len(current_bank)

    # Iterate using sliding window approach
    for i in range(len(current_bank) - window_size + 1):
        window = current_bank.iloc[i:i + window_size]
        # Store the estimates on the row that closes the window.
        estimates[i + window_size - 1] = _capm_ols(
            window[returns_column], window["MKT"]
        )

    estimates_df = pd.DataFrame(
        estimates,
        columns=["Alpha", "Beta", "P Value Alpha", "P Value Beta"],
        index=current_bank.index,
    )
    return pd.concat([current_bank, estimates_df], axis=1)
    

In [5]:
# Toy example of the sliding-window iteration used for the rolling CAPM.
# Dedicated names are used so that this scratch cell does not overwrite the
# notebook-wide `df` and `window_size`.
sample_df = pd.DataFrame({
    'Date': pd.date_range('2024-01-01', periods=10, freq='D'),
    'Value': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
})

# Window size
sample_window = 5

# Iterate using sliding window approach: one window per possible first row
for first_row in range(len(sample_df) - sample_window + 1):
    print(sample_df.iloc[first_row:first_row + sample_window])

        Date  Value
0 2024-01-01      1
1 2024-01-02      2
2 2024-01-03      3
3 2024-01-04      4
4 2024-01-05      5
        Date  Value
1 2024-01-02      2
2 2024-01-03      3
3 2024-01-04      4
4 2024-01-05      5
5 2024-01-06      6
        Date  Value
2 2024-01-03      3
3 2024-01-04      4
4 2024-01-05      5
5 2024-01-06      6
6 2024-01-07      7
        Date  Value
3 2024-01-04      4
4 2024-01-05      5
5 2024-01-06      6
6 2024-01-07      7
7 2024-01-08      8
        Date  Value
4 2024-01-05      5
5 2024-01-06      6
6 2024-01-07      7
7 2024-01-08      8
8 2024-01-09      9
        Date  Value
5 2024-01-06      6
6 2024-01-07      7
7 2024-01-08      8
8 2024-01-09      9
9 2024-01-10     10


In [None]:
# Trial run of the rolling CAPM on a single US bank; show the returned frame.
prova = rollin_window_CAPM(
    bank_data=bank_data,
    bank="BANK OF AMERICA_PI",
    us_banks=us_banks,
)
prova

In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings("ignore")


def _fit_capm_line(stock_returns, market_returns):
    """Fit stock_returns = alpha + beta * market_returns by ordinary least squares.

    Rows where either series is missing are ignored. Returns the tuple
    (alpha, beta, p-value of alpha, p-value of beta); all four are NaN when
    fewer than three complete rows are available or the market series is
    constant. The p-value of alpha is NaN when the fit is exact (zero
    standard error).
    """
    # Local import: the notebook has no file-level import providing OLS.
    from scipy import stats

    nan = float("nan")
    pairs = pd.DataFrame({"y": stock_returns, "x": market_returns}).dropna()
    if len(pairs) < 3 or pairs["x"].nunique() < 2:
        return nan, nan, nan, nan

    fit = stats.linregress(
        pairs["x"].to_numpy(dtype=float),
        pairs["y"].to_numpy(dtype=float),
    )

    # linregress reports the p-value of the slope only; the intercept gets
    # the same two-sided t-test with n - 2 degrees of freedom.
    degrees_of_freedom = len(pairs) - 2
    if fit.intercept_stderr > 0:
        t_alpha = abs(fit.intercept / fit.intercept_stderr)
        p_value_alpha = float(2 * stats.t.sf(t_alpha, degrees_of_freedom))
    else:
        p_value_alpha = nan
    return float(fit.intercept), float(fit.slope), p_value_alpha, float(fit.pvalue)


def estimateCAPM(returns_df):
    """Estimate the CAPM of every stock column against the "sp500_r" column.

    Parameters
    ----------
    returns_df : pandas.DataFrame
        Frame whose first column is skipped (the date), whose last column is
        skipped as well (the market), and which contains a "sp500_r" column
        with the market returns. Every column in between is treated as one
        stock's return series.

    Returns
    -------
    pandas.DataFrame
        One row per stock with the columns "Stock", "Alpha", "Beta",
        "P Value Alpha" and "P Value Beta".
    """
    mkt = returns_df["sp500_r"]

    # Collect one record per stock and build the frame once at the end.
    records = []
    # Loop through all stock columns, excluding the first and the last one
    for stock in list(returns_df.columns)[1:-1]:
        alpha, beta, p_value_alpha, p_value_beta = _fit_capm_line(
            returns_df[stock], mkt
        )
        records.append({"Stock": stock,
                        "Alpha": alpha,
                        "Beta": beta,
                        "P Value Alpha": p_value_alpha,
                        "P Value Beta": p_value_beta})

    CAPM_df = pd.DataFrame(
        records,
        columns=["Stock", "Alpha", "Beta", "P Value Alpha", "P Value Beta"],
    )
    return CAPM_df

In [None]:
# Minimum number of non-missing observations a column needs: 70% of the rows.
threshold = int(len(banks_ri_df) * 0.7)

# Show the frame without the columns that fall short of the threshold.
# The result is not assigned, so banks_ri_df itself is left unchanged.
banks_ri_df.dropna(axis="columns", thresh=threshold)