PCLab#5 - Group 2 - Emanuele Sala, Luca Soleri, Fabio Stefana

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Importing libraries and Dataset</h1>
</div>


In [4]:
import pandas as pd
import os

In [5]:
directory = "data/sigwatch_data"
df_list = []

for file in os.listdir(directory):
    if file.endswith(".dta"):
        file_path = os.path.join(directory, file)
        temp_df = pd.read_stata(file_path)
        df_list.append(temp_df)
        
df = pd.concat(df_list, ignore_index=True)

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Preliminary data exploration</h1>
</div>

In [6]:
# With this filter we keep only the banks
df = df[df["corp_industry_sector1"] == "Finance"]

In [7]:
# And with this filter we only keep the countries form the US, UK or EU
countries = ['Austria',
             'US',
             'Denmark',
             'UK',
             'Germany',
             'Luxembourg',
             'France',
             'Italy',
             'Netherlands',
             'Belgium',
             'Sweden',
             'Spain',
             'Ireland',
             'Portugal',
             'Poland',
             'Finland',
             'USA',
             'Croatia',
             'Bulgaria',
             'Montenegro',
             'Bosnia and Herzegovina']

df = df[df['country_corp'].isin(countries)]

In [8]:
# We count the unique number of ud_archive as some have more than one row but still count as one isngle campaing
n_of_campaigns = len(list(df["uid_archive"].unique()))
print(f"There are {n_of_campaigns} unique campaigns for US UK and EU banks")

There are 2370 unique campaigns for US UK and EU banks


In [9]:
list_of_ngo_columns = []
for i in range(5):
    i = i+1
    ngo_column_number = f"ngo_name{i}"
    ngo_col = list(df[ngo_column_number])
    list_of_ngo_columns += ngo_col
unique_ngos = list(set(list_of_ngo_columns))

# we do -1 because we have to account for the null value
print(f"There are {len(unique_ngos) - 1} unique NGO organizations involved in this dataset")

There are 932 unique NGO organizations involved in this dataset


In [None]:
# Numero di aziende targettate

In [None]:
columns_we_want = ["uid_archive", 
                   "date", 
                   "company",
                   'country_corp', # Country of the Company
                   'corp_industry_sector1', # Industry of the company
                   'company_parent',
                   'company_parent_country',
                   "sentiment",
                   'issue_name1',
                   'issue_name2',
                   'issue_name3']

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Task #4 : estimate parameters of the CAPM </h1>
</div>

IMPORTANT NOTE

- Price Index: This index only reflects the price movements of a stock without considering dividends or other distributions. It’s a pure measure of price appreciation, so when you use the Price Index, you’re capturing the changes in the stock’s market price alone.
- Total Return Index: This includes both price changes and the effect of dividends reinvested back into the stock. If the bank pays dividends, these dividends are considered to be reinvested, adding to the growth of the index. This index provides a more comprehensive view of the stock’s overall return by including income from dividends, making it useful for capturing the full picture of what an investor earns from holding the stock.

For CAPM estimation, you typically need the Total Return Index. CAPM aims to model the total returns an investor would expect, factoring in all sources of return. Calculating returns using the Total Return Index allows you to include dividends, which are a critical component of a stock’s performance from an investor’s perspective.

In [2]:
def read_banks_ri(directory):
    # banks_ri.xlsm has the total return for each bank
    banks_ri = pd.ExcelFile(directory)
    
    # Each workbook is divided into sheets that divide the banks into different countires
    # the first sheet is a request table so we will ignore it
    banks_ri_sheets = banks_ri.sheet_names[1:]
    
    # We are gonna read all the different sheets and put them into a dataframe, store it 
    # into a list and concatenate them all toghether so we will have a big df with all the 
    # banks_ri info in it.
    # Since all the sheets have a column for the date, we will only read it for the first
    # sheet and skip it for the others.
    first_df = banks_ri.parse(banks_ri_sheets[0])
    first_df = first_df.rename(columns={first_df.columns[0]: "Date"})
    other_dfs = [banks_ri.parse(sheet_name).iloc[:, 1:] for sheet_name in banks_ri_sheets[1:]]
    
    banks_ri_df = pd.concat([first_df] + other_dfs, axis=1)
    return banks_ri_df

def clean_banks_ri(banks_ri_df):
    # Rename columns so that they have cleaner names
    rename_dict = {"Date": "Date"}
    for col in banks_ri_df.columns[1:]:
        rename_dict[col] = col[:-17] + "_TR"
    banks_ri_df.rename(columns=rename_dict, inplace=True)
    
    # As we have 158 banks, we will drop all the banks that are marked as Dead, this is a simple 
    # solution around this problem and we are only doing this because we can afford to do so 
    # seeing how many banks we have. 
    # With this operation we will be dropping only 3 banks.
    dead_banks = []
    for col in banks_ri_df.columns[1:]:
        if "dead" in col.lower():
            print(col)
            dead_banks.append(col)
    banks_ri_df.drop(dead_banks, axis = 1, inplace = True)
    return banks_ri_df

banks_ri_df = read_banks_ri("data/banks_data_bocconi/banks_ri.xlsm")
banks_ri_df = clean_banks_ri(banks_ri_df)
banks_ri_df

BANQUE DE SAVOIE DEAD - 01/04/10_TR
DELTA LLOYD GROUP DEAD - DEAD 01/06/17_TR
ACE CASH EXPRESS (BER) DEAD - 10/10/06_TR


Unnamed: 0,Date,AAREAL BANK_TR,COMDIRECT BANK_TR,COMMERZBANK_TR,DT.PFANDBRIEFBANK_TR,PROCREDIT HOLDING_TR,UMWELTBANK_TR,ALLIANZ_TR,DEUTSCHE BANK_TR,BANQUE NATIONALE DE BELGIQUE_TR,...,VIRGIN MONEY UK_TR,HSBC HOLDINGS_TR,LLOYDS BANKING GROUP_TR,METRO BANK_TR,ROYAL BANK OF SCTL.GP._TR,STANDARD CHARTERED_TR,TBC BANK GROUP_TR,ADMIRAL GROUP_TR,ALLIANCE TRUST_TR,SAGA GROUP_TR
0,2008-01-01,182.27,33.09,974.59,,,260.15,3364.43,1937.46,2658.11,...,,1502.17,271.20,,82190.06,79476.00,,449.44,14717.18,1438.33
1,2008-01-02,183.81,33.05,956.43,,,259.22,3304.22,1900.87,2684.50,...,,1489.68,269.47,,81079.38,79001.88,,447.39,14764.00,1438.33
2,2008-01-03,181.73,33.05,944.94,,,261.69,3285.59,1900.00,2682.82,...,,1495.93,270.91,,81912.38,78657.13,,449.44,14866.32,1438.33
3,2008-01-04,173.31,33.09,939.75,,,263.24,3220.61,1881.60,2641.91,...,,1476.30,261.43,,78348.94,77665.81,,436.36,14538.91,1438.33
4,2008-01-07,167.91,33.20,943.08,,,259.37,3179.94,1880.52,2632.54,...,,1486.11,259.71,,77423.38,77924.38,,431.87,14549.14,1438.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3211,2020-04-22,126.90,86.08,21.01,81.91,40.14,1387.93,6352.29,201.17,2595.15,...,35.26,1551.10,47.22,4.65,2799.64,28616.78,79.08,1936.39,38350.87,1438.33
3212,2020-04-23,125.18,87.56,21.56,81.53,40.47,1382.27,6220.43,206.42,2620.59,...,36.34,1569.07,48.03,4.60,2846.44,29297.06,79.38,1932.15,38515.23,1438.33
3213,2020-04-24,127.08,87.16,21.35,80.14,40.81,1348.28,6189.82,196.74,2582.43,...,35.83,1537.90,46.94,4.50,2796.97,28130.86,77.57,1957.59,38131.72,1438.33
3214,2020-04-27,127.17,87.69,22.33,83.36,41.98,1319.95,6310.70,219.14,2582.43,...,38.63,1592.02,48.20,4.66,2874.51,28639.21,78.78,1938.93,39337.04,1438.33


In [3]:
def read_banks_pi(directory):
    # banks_ri.xlsm has the total return for each bank
    banks_pi = pd.ExcelFile(directory)
    
    # Each workbook is divided into sheets that divide the banks into different countires
    # the first sheet is a request table so we will ignore it
    banks_pi_sheets = banks_pi.sheet_names[1:]
    
    # We are gonna read all the different sheets and put them into a dataframe, store it 
    # into a list and concatenate them all toghether so we will have a big df with all the 
    # banks_ri info in it.
    # Since all the sheets have a column for the date, we will only read it for the first
    # sheet and skip it for the others.
    first_df = banks_pi.parse(banks_pi_sheets[0])
    first_df = first_df.rename(columns={first_df.columns[0]: "Date"})
    other_dfs = [banks_pi.parse(sheet_name).iloc[:, 1:] for sheet_name in banks_pi_sheets[1:]]
    
    banks_ri_df = pd.concat([first_df] + other_dfs, axis=1)
    return banks_ri_df

def clean_banks_pi(banks_pi_df):
    # Rename columns so that they have cleaner names
    rename_dict = {"Date": "Date"}
    for col in banks_pi_df.columns[1:]:
        rename_dict[col] = col[:-17] + "_PI"
    banks_pi_df.rename(columns=rename_dict, inplace=True)
    
    # As we have 158 banks, we will drop all the banks that are marked as Dead, this is a simple 
    # solution around this problem and we are only doing this because we can afford to do so 
    # seeing how many banks we have. 
    # With this operation we will be dropping only 3 banks.
    dead_banks = []
    for col in banks_pi_df.columns[1:]:
        if "dead" in col.lower():
            print(col)
            dead_banks.append(col)
    banks_pi_df.drop(dead_banks, axis = 1, inplace = True)
    return banks_pi_df

banks_pi_df = read_banks_pi("data/banks_data_bocconi/banks_pi.xlsm")
banks_pi_df = clean_banks_pi(banks_pi_df)
banks_pi_df

BANQUE DE SAVOIE DEAD - 01/04_PI
DELTA LLOYD GROUP DEAD - DEAD 01/06_PI
ACE CASH EXPRESS (BER) DEAD - 10/10_PI


Unnamed: 0,Date,AAREAL B_PI,COMDIRECT B_PI,COMMERZB_PI,DT.PFANDBRIEFB_PI,PROCREDIT HOLD_PI,UMWELTB_PI,ALLI_PI,DEUTSCHE B_PI,BANQUE NATIONALE DE BELGI_PI,...,CLOSE BROTHERS GR_PI,VIRGIN MONEY_PI,HSBC HOLDI_PI,LLOYDS BANKING GR_PI,METRO B_PI,ROYAL BANK OF SCTL._PI,STANDARD CHARTE_PI,ADMIRAL GR_PI,ALLIANCE TR_PI,SAGA GR_PI
0,2008-01-01,171.1,27.2,353.5,,,224.0,2132.9,802.2,714.9,...,8202.5,,784.5,137.4,,11418.8,8372.5,400.0,3044.2,854.3
1,2008-01-02,172.6,27.1,346.9,,,223.2,2094.8,787.1,722.0,...,8198.2,,778.0,136.5,,11264.5,8322.5,398.2,3037.9,854.3
2,2008-01-03,170.6,27.1,342.7,,,225.3,2082.9,786.7,721.5,...,8202.5,,781.2,137.3,,11380.3,8286.2,400.0,3059.0,854.3
3,2008-01-04,162.7,27.2,340.8,,,226.7,2041.8,779.1,710.5,...,8332.0,,771.0,132.5,,10885.2,8181.8,388.4,2991.6,854.3
4,2008-01-07,157.7,27.3,342.1,,,223.3,2016.0,778.7,708.0,...,8336.3,,776.1,131.6,,10756.6,8209.0,384.4,2993.7,854.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3211,2020-04-22,86.6,41.4,6.9,60.5,36.7,830.9,2331.4,65.2,467.6,...,9005.5,34.6,433.6,17.6,4.7,321.6,2165.4,830.2,5894.7,854.3
3212,2020-04-23,85.4,42.1,7.1,60.2,37.0,827.5,2283.0,66.9,472.2,...,9204.0,35.6,438.7,17.9,4.6,326.9,2216.9,828.4,5920.0,854.3
3213,2020-04-24,86.7,41.9,7.0,59.2,37.3,807.1,2271.8,63.8,465.3,...,9005.5,35.1,430.0,17.5,4.5,321.3,2128.7,839.3,5861.1,854.3
3214,2020-04-27,86.8,42.2,7.4,61.5,38.3,790.2,2316.1,71.0,465.3,...,8867.3,37.9,445.1,17.9,4.7,330.2,2167.1,831.3,6046.3,854.3


In [None]:
# Before we estimate the CAPM we need to define market returns and the risk free

In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

def estimateCAPM(returns_df):
    # Initialize an empty DataFrame to store results
    CAPM_df = pd.DataFrame()  
     # Loop through all stock columns, excluding the Date and sp500
    for stock in list(returns_df.columns)[1:-1]: 
        stock_i = returns_df[f"{stock}"]  
        mkt = returns_df["sp500_r"]
        
        # Add a constant term (intercept) to the regression model, this will be our Alpha
        X = sm.add_constant(mkt)  
        # Fit the OLS regression
        model = sm.OLS(stock_i, X).fit()  
    
        # Extract parameter of intrest
        alpha = model.params[0]
        beta = model.params[1]
        p_value_alpha = model.pvalues[0]
        p_value_beta = model.pvalues[1]
    
        # Create a dictionary to store the results for the current stock
        result_i = {"Alpha": alpha,
                    "Beta": beta,
                    "P Value Alpha": p_value_alpha,
                    "P Value Beta": p_value_beta}
    
        # Convert result_i to a DataFrame and concatenate it to CAPM_df
        result_df = pd.DataFrame(result_i, index=[stock])  
        CAPM_df = pd.concat([CAPM_df, result_df])  

CAPM_df.reset_index(inplace = True)
CAPM_df.rename(columns={'index': 'Stock'}, inplace = True)

In [None]:
threshold = int(0.7 * len(banks_ri_df))

# Drop columns that don't meet the threshold
banks_ri_df.dropna(axis=1, thresh=threshold)