# Final Project

- Saige Belanger
    - (20951877)
- Dylan Faelker
    - (20960747)
- Ethan Liu
    - (20959615)
- Timothy Zheng
    - t54zheng (20939203)

In [77]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn import linear_model
import statsmodels.api as sm
import scipy.stats as stats
from math import sqrt
import math

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
import datetime as dt

import os.path

warnings.filterwarnings('ignore')

# Factors
We start with an initial list of factors from the provided list of 50 Factors in the ML examples.

TODO: Increase our breadth of factors to the category chosen by downloading and creating them, then merging them 

https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/

In [78]:
all_monthly_data = pd.read_sas("merged_df.sas7bdat", encoding = 'ISO-8859-1')

In [79]:
# Save all gvkeys - for WRDS Queries

# with open("gvkeys.txt", "w") as file:
#     for gvkey in set(all_monthly_data["gvkey"].dropna()):
#         file.write(f"{int(gvkey)},\n")

In [80]:
permnos = set(all_monthly_data["permno"])

In [81]:
gvkey_permno_dict = {}
for gvkey in set(all_monthly_data["gvkey"].dropna()):
    permno = all_monthly_data[all_monthly_data["gvkey"] == gvkey]["permno"].dropna().iloc[0]
    gvkey_permno_dict[gvkey] = permno

KeyboardInterrupt: 

In [None]:
all_monthly_data.drop(["ticker", "conm", "gvkey", "cusip", "naics", "gsubind"], axis=1, inplace=True) # We don't use these columns anyway, drop them

In [None]:
factors = ['IM', 'range_20', 'log_vol_dollar_20',
       'range_120', 'log_vol_dollar_120', 'xret_5', 'xret_10', 'xret_20',
       'xret_indsize_20', 'xret_indsize_std20', 'xret_40', 'xret_120',
       'xret_indsize_120', 'xret_indsize_std120', 'KDJ_20', 'deviation_pct20',
       'MoneyFlowIndex_20', 'RSI_20', 'KDJ_120', 'deviation_pct120',
       'MoneyFlowIndex_120', 'RSI_120', 'IV_capm', 'mdr', 'ami_3', 'beta_3y',
       'beta_5y', 'tail_2y', 'dp', 'leverage', 'BL', 'roe', 'roa',
       'profitability', 'sales_g_q', 'sales_g_ttm', 'op_income_g_q', 'ni_g_q',
       'op_income_g_ttm', 'ni_g_ttm', 'sue_NI', 'BM', 'AM', 'EP', 'SP',
       'roe_q', 'roa_q', 'Cto', 'pe_ttm', 'lag_log_size']

ret_cols = ['ret_f1', 'ret_f2', 'ret_f3', 'ret_f4', 'ret_f5', 'ret_f6', 
            'ret_f7', 'ret_f8', 'ret_f9', 'ret_f10', 'ret_f11', 'ret_f12']

In [None]:
non_data_cols = [x for x in all_monthly_data.columns if x not in factors and x not in ret_cols]
non_data_cols

['permno', 'yyyymm', 'monthid', 'PRC', 'VOL', 'RET', 'SHROUT']

# Adding New Factors
* When you add a factor, document it here: [link](https://docs.google.com/spreadsheets/d/1rs9633QSYLVY5Z5DoGNy3USP2MROGtqTIKcbLG68wpE/edit#gid=1579135478) and fill properly
* Download the data file, if it's too large add it to the drive
* Also download the other files that arent on github but on the drive before working on this part of the notebook
    * https://drive.google.com/drive/u/0/folders/1D1eIYlkNxNLfzHJLzkGeE9ymr7doXg_6

## IMPORTANT NOTE - FACTOR/RETURN TIME
- When adding factors make sure you add such that factor is reported at t-1, **RET** has **T** returns (in same row)
- This means you need to download data from the range **(1979-12 to 2019-11)**

***

- Treasury and CPI Rates: [Link](https://wrds-www.wharton.upenn.edu/pages/get-data/center-research-security-prices-crsp/annual-update/index-treasury-and-inflation/us-treasury-and-inflation-indexes/)
- Federal Reserve Data: [Link](https://wrds-www.wharton.upenn.edu/pages/get-data/federal-reserve-bank-reports/interest-rates/data/)
- SEC Filings: https://wrds-www.wharton.upenn.edu/pages/get-data/wrds-sec-analytics-suite/wrds-sec-filings-queries/list-of-filings-exhibits/
- Analyzed Data: https://wrds-www.wharton.upenn.edu/pages/get-data/wrds-sec-analytics-suite/wrds-sec-text-analysis/readability-and-sentiment/

**TBD**
- Other Factors: Downloaded from https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/

In [None]:
# Add new generated factors here
macro_factors = ['FF_O', 'SL_Y20', 'MORTG_NA', 
                'PRIME_NA', 'CD_M1', 'CD_M3', 'CD_M6', 'ED_M1', 'ED_M3', 
                'ED_M6', 'TB_M3', 'TB_M6', 'TB_Y1',
                'TCMNOM_M3', 'TCMNOM_M6', 'TCMNOM_Y1', 'TCMNOM_Y2', 
                'TCMNOM_Y3', 'TCMNOM_Y5', 'TCMNOM_Y7', 'TCMNOM_Y10', 
                'TCMNOM_Y20', 'TCMNOM_Y30', 'AAA_NA', 'BAA_NA', 'B30RET', 'B20RET', 
                 'B10RET', 'B7RET', 'B5RET', 'B2RET', 'B1RET', 
                'T90RET', 'T30RET', 'CPIRET', ]
recession_factors = ["10M2", "volinc", "recession_affinity"]
tone_factors = ['10Q_tone_3m', '10Q_tone_6m', 
                '10K_tone_1y', 
                '8K_tone_1y',]
factors += macro_factors
factors += recession_factors
factors += tone_factors

# Macro Factors

In [None]:
treasury_inflation = pd.read_sas("treasury_inflation.sas7bdat", encoding = 'ISO-8859-1')
federal_reserve = pd.read_sas("federal_reserve.sas7bdat", encoding = 'ISO-8859-1')

In [None]:
#dropping factors with excess null values
federal_reserve.dropna(axis = 1, thresh = 300, inplace=True)

In [None]:
#adding yyyymm to merge taking into account 1 month shift
federal_reserve["yyyymm"] = (federal_reserve["date"] + pd.DateOffset(months=1)).dt.strftime("%Y%m").astype(float)
treasury_inflation["yyyymm"] = (treasury_inflation["CALDT"] + pd.DateOffset(months=1)).dt.strftime("%Y%m").astype(float)


In [None]:
#merging to all_monthly_data
all_monthly_data = pd.merge(all_monthly_data, federal_reserve, on="yyyymm")
all_monthly_data = pd.merge(all_monthly_data, treasury_inflation, on="yyyymm")

# SEC Filings Sentiment Factors

In [None]:
sec_analytics = pd.read_sas("sec_filing_analysis_wrds.sas7bdat", encoding = 'ISO-8859-1')

In [None]:
sec_analytics["PERMNO"] = sec_analytics["GVKEY"].map(gvkey_permno_dict)

In [None]:
# Import from Dylan's code
sec_analytics['yyyymm'] = (sec_analytics['FDATE'].astype('string').str[:4] + sec_analytics['FDATE'].astype('string').str[5:7]).astype('float64')
sec_analytics_subset_forms = sec_analytics[(sec_analytics['FORM'].isin(['10-K', '10-Q', '8-K']))][["PERMNO", "yyyymm", "FORM", "LM_POSITIVE_COUNT", "LM_NEGATIVE_COUNT", "WORD_COUNT"]]

sec_analytics_subset_forms['tone'] = (sec_analytics_subset_forms['LM_POSITIVE_COUNT'] - sec_analytics_subset_forms['LM_NEGATIVE_COUNT']) / sec_analytics_subset_forms['WORD_COUNT']

sec_analytics_subset_forms.head()

Unnamed: 0,PERMNO,yyyymm,FORM,LM_POSITIVE_COUNT,LM_NEGATIVE_COUNT,WORD_COUNT,tone
0,86594.0,201306.0,10-K,2387.0,2500.0,260142.0,-0.000434
9,86594.0,200703.0,10-Q,110.0,115.0,14978.0,-0.000334
11,86594.0,201209.0,8-K,35.0,31.0,3111.0,0.001286
14,86594.0,200502.0,8-K,279.0,931.0,53640.0,-0.012155
17,86594.0,200606.0,8-K,15.0,6.0,907.0,0.009923


In [None]:

def yyyymm_add(yyyymm, years, months):
    return yyyymm + ((years + math.floor(months / 12)) * 100) + (months % 12)

def yyyymm_sub(yyyymm, years, months):
    return yyyymm - ((years + math.floor(months / 12)) * 100) - (months % 12)

In [None]:
!pip install multiprocess



In [None]:
from multiprocess import cpu_count # You might have to change to multiprocessing if on windows
from multiprocess.pool import ThreadPool

In [None]:
# Threaded Approach. Takes estimate >2h
def calc_tone_factors(permno):
    results = []

    for date in all_monthly_data['yyyymm'].unique():
        sentiments = {}
        # when there is no relevant data. Saves time
        if sec_analytics_subset_forms.loc[(permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0))].shape[0] == 0:
            sentiments = {"permno": permno, 
                            "yyyymm": date,
                            '10Q_tone_3m': 0,
                            '10Q_tone_6m': 0,
                            '10K_tone_1y': 0,
                            '8K_tone_1y': 0,
                            }
        elif sec_analytics_subset_forms.loc[(permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 6))].shape[0] == 0:
            sentiments = {"permno": permno, 
                            "yyyymm": date,
                            '10Q_tone_3m': 0,
                            '10Q_tone_6m': 0,
                            '10K_tone_1y': sec_analytics_subset_forms.loc[('10-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            '8K_tone_1y': sec_analytics_subset_forms.loc[('8-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            }
        elif sec_analytics_subset_forms.loc[(permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 3))].shape[0] == 0:
            sentiments = {"permno": permno, 
                            "yyyymm": date,
                            '10Q_tone_3m': 0,
                            '10Q_tone_6m': sec_analytics_subset_forms.loc[('10-Q' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 6)), 'tone'].sum(),
                            '10K_tone_1y': sec_analytics_subset_forms.loc[('10-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            '8K_tone_1y': sec_analytics_subset_forms.loc[('8-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            }
        else:
            sentiments = {"permno": permno, 
                            "yyyymm": date,
                            '10Q_tone_3m': sec_analytics_subset_forms.loc[('10-Q' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 3)), 'tone'].sum(),
                            '10Q_tone_6m': sec_analytics_subset_forms.loc[('10-Q' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 6)), 'tone'].sum(),
                            '10K_tone_1y': sec_analytics_subset_forms.loc[('10-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            '8K_tone_1y': sec_analytics_subset_forms.loc[('8-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            }
        results.append(sentiments)
    
    return results

permnos = set(all_monthly_data["permno"])
with ThreadPool(cpu_count() - 1) as P:
    sentiments = P.map(calc_tone_factors, permnos)
    sentiments = [item for sublist in sentiments for item in sublist]
    sentiments = pd.DataFrame(sentiments)
    all_monthly_data = pd.merge(all_monthly_data, sentiments, on=["permno", "yyyymm"], how='outer')

In [None]:
all_monthly_data[all_monthly_data["permno"] == 86594.0].sort_values(by="monthid").head()

Unnamed: 0,permno,yyyymm,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,B5IND,B2IND,B1IND,T90IND,T30IND,CPIIND,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y
578148,86594.0,199901.0,229.0,,,,,,,,...,911.8428,834.0447,814.6826,684.099,579.5938,385.647,0.0,0.0,0.0,0.0
578149,86594.0,199902.0,230.0,,,,,,,,...,917.7434,837.0339,817.7434,686.5426,581.6188,386.5882,0.0,0.0,0.0,0.0
578150,86594.0,199903.0,231.0,,,,,,,,...,893.9178,831.4367,817.5643,688.631,583.5801,387.0588,0.0,0.0,0.0,-0.023482
578151,86594.0,199904.0,232.0,,,,,,,,...,901.8773,838.1214,822.952,691.6135,585.9588,388.2352,0.000887,0.000887,0.0,-0.023482
578152,86594.0,199905.0,233.0,,,,,,,,...,903.9543,840.5721,826.0899,694.0735,588.176,391.0588,0.000887,0.000887,0.0,-0.023482


# Recession Factor
- During recessions, companies that sell essential products/services typically outperform companies that offer products that are categorized as discretionary spending by consumers.
- Basically we want to come up with a factor such that during recessions, the factor is high for companies selling essential products/services and low for companies producing goods/services that are highly sensitive to recessions. Then we want the factor to be flipped when the company is out of a recession.
- We will categorize a recession as whenever the yield curve is inverted

### Proxy factors
- [10M2] Yield curve: US Treasury 10Y - US Treasury 2 Year: https://fred.stlouisfed.org/series/T10Y2YM
    - Recession if 10M2 < 0 [inverted yield curve]

- [volinc] Annual Income Volatility: Standard Deviation of annual net income growth [ni_g_ttm] for past 5 years, minimum past 1 year
    - Using trailing twelve month (TTM) measure because it avoids any seasonality considerations

In [None]:
all_monthly_data[all_monthly_data["permno"] == 86594.0][["ni_g_ttm", "yyyymm"]].dropna().head(12)

Unnamed: 0,ni_g_ttm,yyyymm
578160,-0.319421,200001.0
578161,-0.319421,200002.0
578162,-0.319421,200003.0
578163,-0.298619,200004.0
578164,-0.298619,200005.0
578165,-0.298619,200006.0
578166,0.319728,200007.0
578167,0.319728,200008.0
578168,0.319728,200009.0
578169,0.334539,200010.0


In [None]:
# Add annual income volatility - std of ni_g_ttm for past 5 years, minimum of past 1 year

volinc = {"yyyymm": [], "permno": [], "volinc": []}
for permno in permnos:
    ni_g_ttm = all_monthly_data[all_monthly_data["permno"] == permno][["ni_g_ttm", "yyyymm"]].dropna()
    date_range = sorted(list(ni_g_ttm["yyyymm"]))

    for i, yyyymm in enumerate(date_range):
        
        window = set(date_range[max(0, i-59):i+1]) # Look past 5 years (60 months)
        window_data = ni_g_ttm[ni_g_ttm["yyyymm"].isin(window)]
        
        if len(window_data) < 12:
            continue

        # Add std ni_g_ttm of past 5 years to volinc factor
        # Note this avoids lookahead bias because the data up to and including i
        # should be known (since ni_g_ttm is from i-1 as per data manual)
        volinc["yyyymm"].append(yyyymm)
        volinc["permno"].append(permno)
        volinc["volinc"].append(window_data["ni_g_ttm"].std())

volinc_df = pd.DataFrame(volinc)
volinc_df.head()

Unnamed: 0,yyyymm,permno,volinc
0,198012.0,49154.0,0.023537
1,198101.0,49154.0,0.02416
2,198102.0,49154.0,0.024472
3,198103.0,49154.0,0.024585
4,198104.0,49154.0,0.023754


In [None]:
all_monthly_data = pd.merge(all_monthly_data, volinc_df, on=["yyyymm", "permno"], how="outer")

## 10M2 Yield Curve

In [None]:
treasury_inflation = pd.read_sas("treasury_inflation.sas7bdat", encoding = 'ISO-8859-1')

In [None]:
fact_10M2 = treasury_inflation[["CALDT", "B2RET", "B10RET"]]

# Add one month to fit RET and factor time
fact_10M2["yyyymm"] = (fact_10M2["CALDT"] + pd.DateOffset(months=1)).dt.strftime("%Y%m").astype(float)

In [None]:
fact_10M2["10M2"] = fact_10M2["B10RET"] - fact_10M2["B2RET"]
fact_10M2

Unnamed: 0,CALDT,B2RET,B10RET,yyyymm,10M2
0,1979-12-31,0.005695,0.011951,198001.0,0.006256
1,1980-01-31,-0.000164,-0.037477,198002.0,-0.037313
2,1980-02-29,-0.036947,-0.050507,198003.0,-0.013560
3,1980-03-31,0.010329,0.048345,198004.0,0.038016
4,1980-04-30,0.084198,0.084375,198005.0,0.000177
...,...,...,...,...,...
477,2019-09-30,-0.001297,-0.013852,201910.0,-0.012555
478,2019-10-31,0.003274,-0.000742,201911.0,-0.004016
479,2019-11-29,-0.001038,-0.007410,201912.0,-0.006372
480,2019-12-31,0.002274,-0.011292,202001.0,-0.013566


In [None]:
all_monthly_data = pd.merge(all_monthly_data, fact_10M2[["yyyymm", "10M2"]], on="yyyymm")

## Recession Affinity
* Recession affinity is calculated as

- 1 / volinc **if 10M2 < 0** (recession)
- volinc $\times$ 1500 **if 10M2 > 0** (no recession)
    - TBH Times 1500 descision is arbitrary but it makes sense in "levelling" both sides of the variable, ie:
    - Values when 10M2 < 0 and 10M2 > 0 are relatively equal

In [None]:
all_monthly_data["recession_affinity"] = np.where(all_monthly_data["10M2"] < 0, 1 / all_monthly_data["volinc"], 1500 * all_monthly_data["volinc"])

In [None]:
test = all_monthly_data[["10M2", "volinc", "recession_affinity"]].dropna()

In [None]:
test[test["10M2"] < 0]["recession_affinity"].mean()

105.45824976257666

In [None]:
test[test["10M2"] > 0]["recession_affinity"].mean()

78.82993778650686

In [None]:
all_monthly_data

Unnamed: 0,permno,yyyymm,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,T90IND,T30IND,CPIIND,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y,volinc,10M2,recession_affinity
0,10026.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
1,10032.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
2,10051.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
3,10104.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
4,10107.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718555,93374.0,201912.0,480.0,-0.147586,0.010868,17.310472,0.015246,17.388666,0.000246,-0.019599,...,1021.196,832.7484,605.1952,-0.010593,-0.020941,-0.011014,-0.068925,0.008777,-0.006372,113.930650
718556,93419.0,201912.0,480.0,-0.147586,0.012941,16.775314,0.015804,16.961089,0.008011,0.004462,...,1021.196,832.7484,605.1952,-0.001085,-0.003564,-0.002398,-0.050815,0.010539,-0.006372,94.884444
718557,93422.0,201912.0,480.0,-0.046054,0.059361,16.763782,0.067228,17.048398,0.015794,-0.033674,...,1021.196,832.7484,605.1952,-0.018540,-0.037721,-0.017328,-0.178637,0.125235,-0.006372,7.984961
718558,93427.0,201912.0,480.0,0.260954,0.025465,16.473392,0.028009,16.544964,-0.005524,-0.006388,...,1021.196,832.7484,605.1952,-0.005819,-0.005819,-0.004089,-0.018317,0.016243,-0.006372,61.566110


***
# Data Cleanup
Done creating all factors, will clean up data before training step 

In [None]:
# Inputation - as in ML Lecture 1

# Drop NA in all non-numerical columns
all_monthly_data.dropna(subset=non_data_cols, inplace=True)

grouped_med = all_monthly_data.groupby(by='monthid')
# the lambda function gets the median per group in the groupby object, and fills the NaN values with the median per group
imputed_grouped = grouped_med.transform(lambda y: y.fillna(y.median()))

# This line assigns the values of the medians 
all_monthly_data = all_monthly_data.assign(**imputed_grouped.to_dict(orient='series'))
all_monthly_data.dropna(inplace=True)

In [None]:
# Filtering data by min price and min market share for each year

# Commenting out for runtime - **does not drop any rows**

# all_monthly_data['yyyy'] = all_monthly_data['yyyymm'].astype(str).str[:4]
# all_monthly_data['MKTSHR'] = all_monthly_data['PRC'] * all_monthly_data['SHROUT'] * 1_000

# to_drop_indices = []

# for permno in all_monthly_data.permno.unique():
#     for year in all_monthly_data['yyyy'].unique():
#         mask = (all_monthly_data['permno'] == permno) & (all_monthly_data['yyyy'] == year)
#         if all_monthly_data[mask].shape[0] != 0 != 0 and (all_monthly_data[mask]['MKTSHR'].iloc[0] < 100_000_000 or all_monthly_data[mask]['PRC'].iloc[0] <= 5):
#             to_drop_indices += list(all_monthly_data[mask].index)
# all_monthly_data.drop(to_drop_indices, inplace=True)

In [None]:
# Winsorizing factors--should winsorize the variables by quarter
for column in factors:
    for date in set(list(all_monthly_data["monthid"])):
        mask = (all_monthly_data["monthid"] == date)
        
        std = all_monthly_data[column][mask].std()
        mean = all_monthly_data[column][mask].mean()

        upper = mean + 3 * std
        lower = mean - 3 * std
        
        all_monthly_data[column][mask].clip(lower, upper, inplace= True)

In [None]:
all_monthly_data

Unnamed: 0,permno,yyyymm,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,T90IND,T30IND,CPIIND,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y,volinc,10M2,recession_affinity
37432,10145.0,198202.0,26.0,-0.102886,0.021682,14.425697,0.019532,14.615516,0.046313,-0.002667,...,217.0908,207.4914,221.8823,0.000000,0.000000,0.000000,0.000000,0.043023,-0.001655,23.243621
37434,10241.0,198202.0,26.0,-0.078001,0.021561,14.424110,0.023697,14.170329,-0.043787,-0.013684,...,217.0908,207.4914,221.8823,0.000000,0.000000,0.000000,0.000000,0.009131,-0.001655,109.517034
37444,10460.0,198202.0,26.0,-0.073044,0.015622,12.621541,0.019425,12.246832,-0.012173,0.002418,...,217.0908,207.4914,221.8823,0.000000,0.000000,0.000000,0.000000,0.008639,-0.001655,115.756923
37447,10516.0,198202.0,26.0,0.027852,0.022940,14.384263,0.028358,14.281943,0.026917,0.045042,...,217.0908,207.4914,221.8823,0.000000,0.000000,0.000000,0.000000,0.010577,-0.001655,94.548065
37458,10866.0,198202.0,26.0,-0.073738,0.024308,12.248607,0.024539,12.066522,-0.018919,-0.016064,...,217.0908,207.4914,221.8823,0.000000,0.000000,0.000000,0.000000,0.004508,-0.001655,221.849373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603286,93374.0,201307.0,403.0,0.269122,0.025135,17.095573,0.022153,16.875807,0.021980,-0.028213,...,968.2475,794.6296,549.4211,-0.007449,-0.007449,-0.005938,-0.063278,0.017716,-0.029220,56.445842
603287,93419.0,201307.0,403.0,0.161248,0.022893,16.642219,0.021335,16.664899,0.002161,0.010046,...,968.2475,794.6296,549.4211,-0.001870,-0.001870,-0.003679,-0.059306,0.031049,-0.029220,32.207279
603288,93422.0,201307.0,403.0,0.134789,0.024668,17.528577,0.025422,17.722424,-0.028588,-0.040312,...,968.2475,794.6296,549.4211,-0.004185,-0.004185,-0.005346,-0.163066,0.018327,-0.029220,54.562960
603289,93427.0,201307.0,403.0,0.161248,0.022893,16.642219,0.021335,16.664899,0.002161,0.010046,...,968.2475,794.6296,549.4211,-0.006563,-0.013559,-0.003775,-0.038431,0.031049,-0.029220,32.207279


In [83]:

if os.path.isfile("all_monthly_data_final.csv"):
    all_monthly_data = pd.read_csv("all_monthly_data_final.csv", index_col=0)
else:
    all_monthly_data.to_csv("all_monthly_data_final.csv")

## Factor Code

In [84]:
ff4_factors = pd.read_sas("ff4_factors.sas7bdat", encoding = 'ISO-8859-1')
ff4_factors["monthid"] = ff4_factors.index + 1
ff4_factors.head()

Unnamed: 0,DATEFF,SMB,HML,MKTRF,RF,UMD,monthid
0,1980-01-31,0.0162,0.0175,0.0551,0.008,0.0755,1
1,1980-02-29,-0.0185,0.0061,-0.0122,0.0089,0.0788,2
2,1980-03-31,-0.0664,-0.0101,-0.129,0.0121,-0.0955,3
3,1980-04-30,0.0105,0.0106,0.0397,0.0126,-0.0043,4
4,1980-05-30,0.0213,0.0038,0.0526,0.0081,-0.0112,5


In [85]:
dates = [int(x) for x in sorted(list(set(list(all_monthly_data["yyyymm"]))))]
dates[0], dates[-1]

(198202, 201307)

In [86]:
monthids = [int(x) for x in sorted(list(set(list(all_monthly_data["monthid"]))))]
monthids[0], monthids[-1], len(monthids)

(26, 403, 216)

In [87]:
testing_range = monthids[0:2*(len(monthids) // 3)]
validation_range = monthids[2 * len(monthids) // 3:]

# Validate that ranges have correct ratios
len(testing_range) / len(monthids), len(validation_range) / len(monthids), len(testing_range) + len(validation_range)

(0.6666666666666666, 0.3333333333333333, 216)

## Testing Factors

In [88]:
model_factors = ['IM', 'range_20', 'log_vol_dollar_20',
       'range_120', 'log_vol_dollar_120', 'xret_5', 'xret_10', 'xret_20',
       'xret_indsize_20', 'xret_indsize_std20', 'xret_40', 'xret_120',
       'xret_indsize_120', 'xret_indsize_std120', 'KDJ_20', 'deviation_pct20',
       'MoneyFlowIndex_20', 'RSI_20', 'KDJ_120', 'deviation_pct120',
       'MoneyFlowIndex_120', 'RSI_120', 'IV_capm', 'mdr', 'ami_3', 'beta_3y',
       'beta_5y', 'tail_2y', 'dp', 'leverage', 'BL', 'roe', 'roa',
       'profitability', 'sales_g_q', 'sales_g_ttm', 'op_income_g_q', 'ni_g_q',
       'op_income_g_ttm', 'ni_g_ttm', 'sue_NI', 'BM', 'AM', 'EP', 'SP',
       'roe_q', 'roa_q', 'Cto', 'pe_ttm', 'lag_log_size']

In [89]:
model_factors += macro_factors
model_factors += recession_factors
model_factors += tone_factors

In [90]:
all_monthly_data = pd.merge(ff4_factors, all_monthly_data, on="monthid")

In [None]:
all_monthly_data[["monthid", "permno"] + model_factors[::-1]]

Unnamed: 0,monthid,permno,8K_tone_1y,10K_tone_1y,10Q_tone_6m,10Q_tone_3m,recession_affinity,volinc,10M2,CPIRET,...,xret_indsize_std20,xret_indsize_20,xret_20,xret_10,xret_5,log_vol_dollar_120,range_120,log_vol_dollar_20,range_20,IM
0,26,10145.0,0.000000,0.000000,0.000000,0.000000,23.243621,0.043023,-0.001655,0.003191,...,0.016863,0.051664,0.028496,-0.002667,0.046313,14.615516,0.019532,14.425697,0.021682,-0.102886
1,26,10241.0,0.000000,0.000000,0.000000,0.000000,109.517034,0.009131,-0.001655,0.003191,...,0.023591,-0.051245,-0.020178,-0.013684,-0.043787,14.170329,0.023697,14.424110,0.021561,-0.078001
2,26,10460.0,0.000000,0.000000,0.000000,0.000000,115.756923,0.008639,-0.001655,0.003191,...,0.013734,-0.112451,-0.057195,0.002418,-0.012173,12.246832,0.019425,12.621541,0.015622,-0.073044
3,26,10516.0,0.000000,0.000000,0.000000,0.000000,94.548065,0.010577,-0.001655,0.003191,...,0.012684,-0.040786,-0.017390,0.045042,0.026917,14.281943,0.028358,14.384263,0.022940,0.027852
4,26,10866.0,0.000000,0.000000,0.000000,0.000000,221.849373,0.004508,-0.001655,0.003191,...,0.018356,-0.009933,0.015425,-0.016064,-0.018919,12.066522,0.024539,12.248607,0.024308,-0.073738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183031,403,93374.0,-0.063278,-0.005938,-0.007449,-0.007449,56.445842,0.017716,-0.029220,0.002400,...,0.012553,-0.064664,-0.057246,-0.028213,0.021980,16.875807,0.022153,17.095573,0.025135,0.269122
183032,403,93419.0,-0.059306,-0.003679,-0.001870,-0.001870,32.207279,0.031049,-0.029220,0.002400,...,0.011120,0.009194,0.013240,0.010046,0.002161,16.664899,0.021335,16.642219,0.022893,0.161248
183033,403,93422.0,-0.163066,-0.005346,-0.004185,-0.004185,54.562960,0.018327,-0.029220,0.002400,...,0.010607,0.003925,-0.004690,-0.040312,-0.028588,17.722424,0.025422,17.528577,0.024668,0.134789
183034,403,93427.0,-0.038431,-0.003775,-0.013559,-0.006563,32.207279,0.031049,-0.029220,0.002400,...,0.011120,0.009194,0.013240,0.010046,0.002161,16.664899,0.021335,16.642219,0.022893,0.161248


## [m, n, l] model for Fama-MacBeth Double Regression
We will use the technique employed during Assignment 2, utilizing a 36-month lookback for factor data to generate our betas (**First Stage**)
* For period $t_i$, we will use data starting at $t_{i-36} ... t_{i-1}$ if available. Worst case we look for 12 prior samples.

In [None]:
# Threaded Approach
def add_betas(permno):
    results = []
    for (i, monthid) in enumerate(testing_range): 
        window = set(testing_range[max(0, i-36):i]) # betas calculated using t_(i-37) to t_(i-2) factors and t_(i-36) to t_(i-1) returns.
                                                    # betas added alongside t_i returns (dated to be compared against t_(i-1) factors)
        window_data = all_monthly_data[(all_monthly_data["permno"] == permno) & (all_monthly_data["monthid"].isin(window))]
        
        if len(window_data) < 12:
            continue

        explanatory_vars = window_data[model_factors + ["monthid"]]
        explanatory_vars.sort_values(by="monthid", inplace=True)
        explanatory_vars.set_index("monthid", inplace=True)
    
        explained_var = window_data[["monthid", "RET"]] # Since factors are from t-1
        explained_var.sort_values(by="monthid", inplace=True)
        explained_var.set_index("monthid", inplace=True)
        
        model = linear_model.LinearRegression().fit(explanatory_vars, 
                                                    explained_var["RET"])
        
        results.append({"monthid": monthid, 
                        "permno": permno, 
                        "RET": explained_var["RET"].iloc[-1], 
                    **{f"{factor}": model.coef_[i] for i, factor in enumerate(model_factors)}
                       })  
    return results

# UNCOMMENT THIS AND ADD TO LINE BELOW INSTEAD OF `permnos`
# FOR DEVELOPMENT - THIS CODE BLOCK TAKES LIKE 30 MINS TO RUN

# smaller_permno_list = list(permnos)[:10]

# Only compute if not in files (delete local copy of file if code above if modifying factors or code above)
if os.path.isfile("first_stage_df.csv"):
    first_stage_df = pd.read_csv("first_stage_df.csv", index_col=0)
else:
    # Runs once basically
    summary_results = []
    with ThreadPool(cpu_count() - 1) as P:
        summary_results = P.map(add_betas, permnos)
        summary_results = [item for sublist in summary_results for item in sublist]
        first_stage_df = pd.DataFrame(summary_results)
        
        # Save first stage df for easy loading
        first_stage_df.to_csv("first_stage_df.csv")
first_stage_df

Unnamed: 0,monthid,permno,RET,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,T90RET,T30RET,CPIRET,10M2,volinc,recession_affinity,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y
0,38,49154.0,0.002268,-0.004932,-0.001411,0.021575,0.002977,0.024930,-0.004523,-0.003355,...,0.000153,0.000046,-0.000034,0.001311,-6.277725e-07,-0.005903,0.0,0.0,0.0,0.0
1,39,49154.0,0.089140,0.001284,-0.003611,0.079513,-0.000156,0.040482,0.001613,-0.003048,...,0.000635,0.000066,0.000064,-0.005588,-1.774521e-05,-0.001300,0.0,0.0,0.0,0.0
2,40,49154.0,0.141962,-0.001128,-0.001604,0.079359,0.000222,0.038977,0.000326,-0.003035,...,0.000614,0.000074,-0.000007,-0.004926,-2.744974e-05,-0.001477,0.0,0.0,0.0,0.0
3,41,49154.0,-0.005484,-0.002096,-0.001061,0.074409,0.000094,0.043385,0.001489,-0.003119,...,0.000700,0.000056,0.000051,-0.005854,-2.347649e-05,-0.002075,0.0,0.0,0.0,0.0
4,42,49154.0,0.020956,0.002188,-0.005366,0.078923,0.001397,0.018418,0.026170,0.025796,...,0.001169,-0.000621,-0.004618,-0.013788,-8.703041e-05,-0.000629,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88656,246,81917.0,0.033708,0.151287,0.000038,0.304989,0.075606,0.508505,-0.226907,0.874659,...,-0.062905,-0.048778,0.164945,0.266030,3.247630e-02,-0.002713,0.0,0.0,0.0,0.0
88657,247,81917.0,-0.019022,0.174183,0.079677,0.295627,-0.004475,0.222086,0.364953,0.510210,...,-0.029084,-0.027794,0.019834,0.303631,3.560299e-02,-0.002219,0.0,0.0,0.0,0.0
88658,248,81917.0,0.127535,0.221421,0.042634,0.289005,-0.068794,-0.108444,-0.453800,0.420478,...,0.004400,-0.012994,-0.118889,0.275127,5.129429e-02,-0.009159,0.0,0.0,0.0,0.0
88659,249,81917.0,-0.060606,0.190180,-0.014226,0.289051,-0.033812,0.393873,-0.354715,-0.614383,...,-0.052365,-0.120383,0.128703,0.817861,1.133618e-01,-0.000201,0.0,0.0,0.0,0.0


In [None]:
first_stage_df[["monthid", "permno", "RET"] + model_factors[::-1]]

Unnamed: 0,monthid,permno,RET,8K_tone_1y,10K_tone_1y,10Q_tone_6m,10Q_tone_3m,recession_affinity,volinc,10M2,...,xret_indsize_std20,xret_indsize_20,xret_20,xret_10,xret_5,log_vol_dollar_120,range_120,log_vol_dollar_20,range_20,IM
0,38,49154.0,0.002268,0.0,0.0,0.0,0.0,-0.005903,-6.277725e-07,0.001311,...,-0.000169,-0.000263,0.000845,-0.003355,-0.004523,0.024930,0.002977,0.021575,-0.001411,-0.004932
1,39,49154.0,0.089140,0.0,0.0,0.0,0.0,-0.001300,-1.774521e-05,-0.005588,...,-0.001070,0.006996,0.008289,-0.003048,0.001613,0.040482,-0.000156,0.079513,-0.003611,0.001284
2,40,49154.0,0.141962,0.0,0.0,0.0,0.0,-0.001477,-2.744974e-05,-0.004926,...,-0.001218,0.006014,0.008705,-0.003035,0.000326,0.038977,0.000222,0.079359,-0.001604,-0.001128
3,41,49154.0,-0.005484,0.0,0.0,0.0,0.0,-0.002075,-2.347649e-05,-0.005854,...,-0.001217,0.007408,0.011121,-0.003119,0.001489,0.043385,0.000094,0.074409,-0.001061,-0.002096
4,42,49154.0,0.020956,0.0,0.0,0.0,0.0,-0.000629,-8.703041e-05,-0.013788,...,-0.001814,0.041100,0.052014,0.025796,0.026170,0.018418,0.001397,0.078923,-0.005366,0.002188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88656,246,81917.0,0.033708,0.0,0.0,0.0,0.0,-0.002713,3.247630e-02,0.266030,...,-0.059913,0.355551,0.258881,0.874659,-0.226907,0.508505,0.075606,0.304989,0.000038,0.151287
88657,247,81917.0,-0.019022,0.0,0.0,0.0,0.0,-0.002219,3.560299e-02,0.303631,...,-0.035769,0.480834,0.669103,0.510210,0.364953,0.222086,-0.004475,0.295627,0.079677,0.174183
88658,248,81917.0,0.127535,0.0,0.0,0.0,0.0,-0.009159,5.129429e-02,0.275127,...,-0.023666,0.604863,0.981214,0.420478,-0.453800,-0.108444,-0.068794,0.289005,0.042634,0.221421
88659,249,81917.0,-0.060606,0.0,0.0,0.0,0.0,-0.000201,1.133618e-01,0.817861,...,-0.008702,0.504155,1.396807,-0.614383,-0.354715,0.393873,-0.033812,0.289051,-0.014226,0.190180


In [None]:
# Second stage regression
lambdas = {"monthid": []}
for factor in model_factors:
    lambdas[f"{factor}"] = []
    
for monthid in testing_range:
    monthid_returns = first_stage_df.loc[first_stage_df["monthid"] == monthid]

    # If empty
    if monthid_returns.empty:
        continue
    
    explanatory_vars = monthid_returns[model_factors + ["permno"]]
    explanatory_vars.sort_values(by="permno", inplace=True)
    explanatory_vars.set_index("permno", inplace=True)

    explained_var = monthid_returns[["permno", "RET"]]
    explained_var.sort_values(by="permno", inplace=True)
    explained_var.set_index("permno", inplace=True)
    
    model = linear_model.LinearRegression(n_jobs=len(model_factors)).fit(explanatory_vars, 
                                                                         explained_var["RET"])

    lambdas["monthid"].append(monthid)

    for (i, factor) in enumerate(model_factors):
        lambdas[factor].append(model.coef_[i])

In [None]:
second_stage_df = pd.DataFrame(lambdas)
second_stage_df

Unnamed: 0,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,xret_20,xret_indsize_20,...,T90RET,T30RET,CPIRET,10M2,volinc,recession_affinity,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y
0,38,-0.842297,-1.433797,-0.210560,3.523173,0.006848,-0.469324,-1.788144,-1.828756,2.078370,...,15348.473288,-24234.092463,2482.284664,-1617.953163,94.475266,-0.070434,0.000000,0.000000,0.000000,0.000000
1,39,0.170858,-1.270637,0.072215,3.117987,-0.479138,-0.141052,0.633689,-1.422326,0.440524,...,-6510.929915,4118.864799,-2864.693540,-114.244438,-33.692280,0.275385,0.000000,0.000000,0.000000,0.000000
2,40,-0.073260,-0.474409,0.194630,-1.849567,0.078071,-0.828217,0.740247,0.132011,-0.338318,...,-10736.433388,17909.214277,-1878.281598,502.196517,-12.175531,-1.761136,0.000000,0.000000,0.000000,0.000000
3,41,-0.157924,1.745192,0.012517,-2.595289,0.238953,-0.252336,0.068552,-0.193489,0.519593,...,778.417608,2428.651821,5391.626657,-1211.525494,-0.784913,1.167292,0.000000,0.000000,0.000000,0.000000
4,42,0.232254,-1.095134,0.087950,-1.511833,-0.087311,0.267514,0.339152,0.046814,-0.096413,...,11280.741595,-29911.737866,-1323.716697,522.211926,8.551870,2.050129,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,246,0.011886,-0.019916,0.007471,0.006344,0.018156,-0.004351,-0.004101,-0.008452,-0.008925,...,-8.501153,4.537920,-0.819147,-107.657263,-0.027019,-0.252135,0.340910,-0.999641,1.682145,0.106481
128,247,-0.057662,-0.089261,-0.074302,0.077059,0.008541,-0.016966,-0.018030,0.023882,0.025521,...,-1.246681,5.291516,-0.031906,-13.652197,-0.025355,-1.952517,0.025689,0.598068,-0.571258,-0.171439
129,248,-0.026588,-0.002047,-0.010910,0.020393,0.002767,0.015968,0.000699,-0.009305,0.007924,...,-2.604145,5.019466,0.171079,-929.526170,0.045447,0.561437,-0.129484,-0.023453,0.637937,0.328537
130,249,-0.015841,-0.047512,-0.027555,0.392826,0.003044,-0.000077,-0.009533,-0.004909,-0.018083,...,-6.168466,20.259399,-1.823011,-325.705048,0.087127,-0.946079,3.070209,-0.593500,1.173888,-1.393214


In [None]:
# Get p values
p_value_dict = {"factor": [], "p-value": []}
t_stat_dict = {"factor": [], "t": [], "|t|": []}
for factor in model_factors:
    lambdas = second_stage_df[factor]
    ttest = stats.ttest_ind(lambdas, np.zeros(len(lambdas))) # Compare to see if any lambdas are significantly different from zero
    p_value_dict['factor'].append(factor)
    p_value_dict['p-value'].append(ttest[1])
    
    t_stat_dict['factor'].append(factor)
    t_stat_dict['t'].append(ttest[0])
    t_stat_dict['|t|'].append(abs(ttest[0]))

p_df = pd.DataFrame.from_dict(p_value_dict, orient='index')
t_df = pd.DataFrame.from_dict(t_stat_dict, orient='index')
t_df.round(2).T.sort_values(by="|t|", ascending=False)

Unnamed: 0,factor,t,|t|
77,B10RET,2.158209,2.158209
87,recession_affinity,2.136104,2.136104
49,lag_log_size,2.107963,2.107963
17,RSI_20,1.97471,1.97471
16,MoneyFlowIndex_20,1.91373,1.91373
...,...,...,...
68,TCMNOM_Y5,0.065823,0.065823
37,ni_g_q,0.044767,0.044767
55,CD_M3,0.040304,0.040304
47,Cto,0.037892,0.037892


In [None]:
p_df.round(2).T.sort_values(by="p-value")

Unnamed: 0,factor,p-value
77,B10RET,0.03182
87,recession_affinity,0.033598
49,lag_log_size,0.035984
17,RSI_20,0.049351
16,MoneyFlowIndex_20,0.056744
...,...,...
68,TCMNOM_Y5,0.947569
37,ni_g_q,0.964327
55,CD_M3,0.967881
47,Cto,0.969803


# Final Factors

As a group, we have decided to use the following factors for our final model:
- lag_log_size
- recession_affinity
- RSI_20
- CD_M1
- MoneyFlowIndex_20
- sue_NI
- xret_indsize_120
- ED_M6
- BL
- T30RET
- 10K_tone_1y
- volinc

In [None]:
predict_factors = ["lag_log_size", 
                    "recession_affinity",
                    "RSI_20",
                    "CD_M1",
                    "MoneyFlowIndex_20",
                    "sue_NI",
                    "xret_indsize_120",
                    "ED_M6",
                    "BL",
                    "T30RET",
                    "10K_tone_1y",
                    "volinc"]

In [None]:
# Threaded Approach
def add_betas_predict(permno):
    results = []
    for (i, monthid) in enumerate(validation_range): 
        window = set(validation_range[max(0, i-36):i]) # betas calculated using t_(i-37) to t_(i-2) factors and t_(i-36) to t_(i-1) returns.
                                                       # betas added alongside t_i returns (dated to be compared against t_(i-1) factors)
        window_data = all_monthly_data[(all_monthly_data["permno"] == permno) & (all_monthly_data["monthid"].isin(window))]
        
        if len(window_data) < 12:
            continue

        explanatory_vars = window_data[predict_factors + ["monthid"]]
        explanatory_vars.sort_values(by="monthid", inplace=True)
        explanatory_vars.set_index("monthid", inplace=True)
    
        explained_var = window_data[["monthid", "RET"]] # Since factors are from t-1
        explained_var.sort_values(by="monthid", inplace=True)
        explained_var.set_index("monthid", inplace=True)
        
        model = linear_model.LinearRegression().fit(explanatory_vars, 
                                                    explained_var["RET"])
        
        results.append({"monthid": monthid, 
                        "permno": permno, 
                        "RET": explained_var["RET"].iloc[-1], 
                    **{f"{factor}": model.coef_[i] for i, factor in enumerate(predict_factors)}
                       })  
    return results

# UNCOMMENT THIS AND ADD TO LINE BELOW INSTEAD OF `permnos`
# FOR DEVELOPMENT - THIS CODE BLOCK TAKES LIKE 30 MINS TO RUN

# smaller_permno_list = list(permnos)[:10]

# Only compute if not in files (delete local copy of file if code above if modifying factors or code above)
if os.path.isfile("linear_predict_betas.csv"):
    linear_betas = pd.read_csv("linear_predict_betas.csv", index_col=0)
else:
    # Runs once basically
    summary_results = []
    with ThreadPool(cpu_count() - 1) as P:
        summary_results = P.map(add_betas_predict, permnos)
        summary_results = [item for sublist in summary_results for item in sublist]
        linear_betas = pd.DataFrame(summary_results)
        
        # Save first stage df for easy loading
        linear_betas.to_csv("linear_predict_betas.csv")
linear_betas.describe()

Unnamed: 0,monthid,permno,RET,lag_log_size,recession_affinity,RSI_20,CD_M1,MoneyFlowIndex_20,sue_NI,xret_indsize_120,ED_M6,BL,T30RET,10K_tone_1y,volinc
count,68294.0,68294.0,68294.0,68294.0,68294.0,68294.0,68294.0,68294.0,68294.0,68294.0,68294.0,68294.0,68294.0,68294.0,68294.0
mean,374.582116,62927.921897,0.015357,-0.405096,0.000666,-0.000371,0.038329,0.000299,-0.024616,-0.083756,-0.031748,-0.025742,-43.4386,-1.943319,-13.328042
std,17.044542,27457.377471,0.128052,15.293134,0.138353,0.240398,2.344355,0.210455,7.342021,11.56325,2.952279,28.454518,4978.929,2902.544841,2657.638327
min,344.0,10026.0,-0.878327,-282.740835,-4.534503,-60.346467,-160.955754,-19.065641,-1851.835562,-2585.718067,-466.267973,-4031.841632,-1258950.0,-252054.862101,-671815.606437
25%,360.0,39693.0,-0.043542,-0.698896,-0.000531,-0.002014,-0.165192,-0.002032,-0.02927,-0.257095,-0.167977,-0.214791,-154.5008,0.0,-11.942057
50%,375.0,76721.0,0.014708,-0.414806,-2.7e-05,-1.2e-05,0.016133,0.000217,0.002011,-0.056196,-0.034942,0.001077,15.85528,0.0,0.411508
75%,389.0,85488.0,0.070913,-0.187785,0.000384,0.00223,0.200073,0.00243,0.03616,0.143052,0.101307,0.244493,147.7091,0.0,14.593884
max,403.0,93429.0,4.140351,3620.963737,35.168446,10.162978,275.22349,49.758283,296.078303,1218.707613,195.467009,4401.08234,161904.9,571592.297685,27127.566088


In [None]:
# # Calculate predicted returns using factors and betas (dot product)

# lookup_set = set(linear_betas[['monthid', 'permno']].apply(tuple, axis=1))
# mask = all_monthly_data[['monthid', 'permno']].apply(tuple, axis=1).isin(lookup_set)
# pred_factors = all_monthly_data[mask][["monthid", "permno"] + predict_factors].set_index(["monthid", "permno"]).sort_index()

# lookup_set = set(pred_factors.index.values)
# mask = linear_betas[['monthid', 'permno']].apply(tuple, axis=1).isin(lookup_set)
# pred_betas = linear_betas[mask].set_index(["monthid", "permno"]).sort_index()

# pred_returns = (pred_factors * pred_betas[predict_factors]).sum(axis=1) # Dot product

In [None]:
# Using the scoring method in Step 10 in assignment instead

# Second stage regression
lambdas = {"monthid": []}
for factor in predict_factors:
    lambdas[f"{factor}"] = []
    
for monthid in validation_range:
    monthid_returns = linear_betas.loc[linear_betas["monthid"] == monthid]

    # If empty
    if monthid_returns.empty:
        continue

    explanatory_vars = monthid_returns[predict_factors + ["permno"]]
    explanatory_vars.sort_values(by="permno", inplace=True)
    explanatory_vars.set_index("permno", inplace=True)

    explained_var = monthid_returns[["permno", "RET"]]
    explained_var.sort_values(by="permno", inplace=True)
    explained_var.set_index("permno", inplace=True)
    
    model = linear_model.LinearRegression(n_jobs=len(predict_factors)).fit(explanatory_vars, 
                                                                           explained_var["RET"])

    lambdas["monthid"].append(monthid)

    for (i, factor) in enumerate(predict_factors):
        lambdas[factor].append(model.coef_[i])

lambdas_df = pd.DataFrame(lambdas)

# Get p values
t_stat_dict = {"factor": [], "t": [], "|t|": []}
t_dict = {}
for factor in predict_factors:
    lambdas = lambdas_df[factor]
    ttest = stats.ttest_ind(lambdas, np.zeros(len(lambdas))) # Compare to see if any lambdas are significantly different from zero
    p_value_dict['factor'].append(factor)
    p_value_dict['p-value'].append(ttest[1])
    
    t_stat_dict['factor'].append(factor)
    t_stat_dict['t'].append(ttest[0])
    t_stat_dict['|t|'].append(abs(ttest[0]))
    t_dict[factor] = ttest[0]

# t_df = pd.DataFrame.from_dict(t_stat_dict, orient='index').T.sort_values(by="|t|", ascending=False)

In [None]:
# Now we just need to get a z score for each factor measurement for each permno for each date in our validation window

predict_data = all_monthly_data[all_monthly_data["monthid"].isin(validation_range)]

z_scores_dict = {"monthid": [], "permno": []}
for factor in predict_factors:
    z_scores_dict[factor] = []

for permno in permnos:
    permno_factor_data = predict_data[predict_data["permno"] == permno]
    
    factor_z_scores = {}
    for factor in predict_factors:
        factor_z_scores[factor] = list(stats.zscore(list(permno_factor_data[factor])))
    
    for i, monthid in enumerate(list(permno_factor_data["monthid"])):
        z_scores_dict["monthid"].append(monthid)
        z_scores_dict["permno"].append(permno)
        for factor in predict_factors:
            z_scores_dict[factor].append(factor_z_scores[factor][i] if not np.isnan(factor_z_scores[factor][i]) else 0)

In [None]:
factor_z_scores = pd.DataFrame(z_scores_dict)
factor_z_scores

Unnamed: 0,monthid,permno,lag_log_size,recession_affinity,RSI_20,CD_M1,MoneyFlowIndex_20,sue_NI,xret_indsize_120,ED_M6,BL,T30RET,10K_tone_1y,volinc
0,251,49154.0,-2.249096,-0.887619,0.507254,2.863534,1.102197,1.924875,-0.744959,2.821302,-0.137629,3.038973,0.0,-0.884117
1,252,49154.0,-1.699856,-0.896350,0.437340,2.868881,1.627591,-0.606106,1.033693,2.804159,1.521932,2.867931,0.0,-0.926541
2,253,49154.0,-1.297563,-0.905584,0.292334,2.900966,0.014198,-0.606106,1.205896,2.615591,1.521932,2.829088,0.0,-0.971408
3,254,49154.0,-0.235284,1.815703,0.763386,2.478521,0.927602,-0.606106,1.325381,2.135599,1.521932,3.056691,0.0,-1.018984
4,255,49154.0,-0.063142,-0.925789,0.328586,2.286015,0.035254,-0.606106,3.486712,1.947031,1.521932,1.973195,0.0,-1.069589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83239,399,81917.0,1.200411,1.098539,0.241194,-0.542761,0.638047,0.520426,-0.356390,-0.710067,0.198691,-0.479320,0.0,0.610895
83240,400,81917.0,1.367306,0.984145,1.407853,-0.548109,1.324179,0.494818,-0.231876,-0.721496,0.206336,-0.488179,0.0,0.421845
83241,401,81917.0,1.366393,1.053065,0.066031,-0.542761,0.149142,0.159272,-0.606852,-0.727210,-0.139371,-0.490904,0.0,0.535743
83242,402,81917.0,1.526387,-1.264714,1.007908,-0.548109,1.071730,-0.123869,-0.725506,-0.732924,-0.105132,-0.507940,0.0,0.619465


In [100]:
scores = factor_z_scores.copy()
for factor in predict_factors:
    scores[factor] *= t_dict[factor]

scores.set_index(["monthid", "permno"], inplace=True)
scores = scores.sum(axis=1)
scores

monthid  permno 
251      49154.0    -8.033633
252      49154.0    -8.030536
253      49154.0    -9.064127
254      49154.0    -7.921881
255      49154.0   -10.485026
                      ...    
399      81917.0     3.501669
400      81917.0     3.972374
401      81917.0     3.686816
402      81917.0     7.283755
403      81917.0     5.624808
Length: 83244, dtype: float64

In [105]:
score_df = pd.DataFrame(scores.reset_index()).rename({0: "exposure score"}, axis=1)
score_df

Unnamed: 0,monthid,permno,exposure score
0,251,49154.0,-8.033633
1,252,49154.0,-8.030536
2,253,49154.0,-9.064127
3,254,49154.0,-7.921881
4,255,49154.0,-10.485026
...,...,...,...
83239,399,81917.0,3.501669
83240,400,81917.0,3.972374
83241,401,81917.0,3.686816
83242,402,81917.0,7.283755


# Machine Learning

We will attempt implementign **PLS** as our machine learning method as it provides the most potent return capabilities as outlined in Lecture 4's code example, and in the ML slides.

In [106]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid

In [107]:
# ML Code from ML Tutorials

# validation function
def validate_model(model_type, param_grid, x_train, y_train, x_validate, y_validate):
    # Special case for LinearRegression because it doesn't have hyperparameters to tune
    if model_type == LinearRegression:
        model = LinearRegression()
        model.fit(x_train, y_train)
        pred = model.predict(x_validate)
        r2 = r2_score(y_validate, pred)
        
        return r2
    else: # The other cases
        
        # Establishses the ParameterGrid
        model_param_grid = ParameterGrid(param_grid)
        
        # Initialize values
        best_MAE = 0
        best_r2 = 1
        best_config = None
        # Iterate through the parameter grid, fit models to the hyperparameters
        # and check for MAE and R2 values
        
        # each param_config in that validation function would represent 1 combination of the possible parameters.
        # for example in Lab 6, when I'm validating for the elastic net regression, I have 
        # 2 possible hyperparameters: alpha and l1_ratio. 
        #alpha can take on values 0.0001, 0.0005, etc, and l1_ratio can take on values 0, 1, 0.01. 
        #So each param_config in the for loop in validate_model would go over 1 possible 
        #combination of the hyperparameter and keep the one that gives us the best MAE/R2
        for param_config in model_param_grid:
            curr_config_MAEs = []
            model = model_type(**param_config)
            model.fit(x_train, y_train)
            pred = model.predict(x_validate)
            MAE = mean_squared_error(y_validate,pred)
            r2 = r2_score(y_validate, pred)
            curr_config_MAEs.append(MAE)
            if best_MAE == 0 or (MAE < best_MAE):
                best_MAE = MAE
                best_config = param_config
            if best_r2 == 1 or (r2 > best_r2):
                best_r2 = r2
        return best_config, best_MAE, best_r2

# Predictions
def pred(model_type, x_train, y_train, x_test, y_test):
    # Fit model and predict 
    model = model_type.fit(x_train, y_train)
    pred = model.predict(x_test)
    
    # Format prediction as DataFrame
    pred_df = pd.DataFrame(pred, columns = ['RET_pred'])
    pred_df.set_index(x_test.index, inplace = True)
    
    r2 = r2_score(y_test, pred)
    return pred_df, r2

In [108]:
# using a 60/20/20 split
# train, validate, test = \
#                         np.split(all_monthly_data.sample(frac=1, random_state=42), 
#                         [int(.6*len(all_monthly_data)), int(.8*len(all_monthly_data))])

# No subset
train, validate, test = \
                        np.split(all_monthly_data,
                        [int(.6*len(all_monthly_data)), int(.8*len(all_monthly_data))])

x_train = train[model_factors + ["yyyymm", "permno"]].set_index(["yyyymm", "permno"])
y_train = train[['RET', "yyyymm", "permno"]].set_index(["yyyymm", "permno"])

x_validate = validate[model_factors + ["yyyymm", "permno"]].set_index(["yyyymm", "permno"])
y_validate = validate[['RET', "yyyymm", "permno"]].set_index(["yyyymm", "permno"])

x_test = test[model_factors + ["yyyymm", "permno"]].set_index(["yyyymm", "permno"])
y_test = test[['RET', "yyyymm", "permno"]].set_index(["yyyymm", "permno"])

In [109]:
x_train

Unnamed: 0_level_0,Unnamed: 1_level_0,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,xret_20,xret_indsize_20,xret_indsize_std20,...,T90RET,T30RET,CPIRET,10M2,volinc,recession_affinity,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y
yyyymm,permno,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
198202.0,10145.0,-0.102886,0.021682,14.425697,0.019532,14.615516,0.046313,-0.002667,0.028496,0.051664,0.016863,...,0.007271,0.007306,0.003191,-0.001655,0.043023,23.243621,0.0,0.0,0.0,0.0
198202.0,10241.0,-0.078001,0.021561,14.424110,0.023697,14.170329,-0.043787,-0.013684,-0.020178,-0.051245,0.023591,...,0.007271,0.007306,0.003191,-0.001655,0.009131,109.517034,0.0,0.0,0.0,0.0
198202.0,10460.0,-0.073044,0.015622,12.621541,0.019425,12.246832,-0.012173,0.002418,-0.057195,-0.112451,0.013734,...,0.007271,0.007306,0.003191,-0.001655,0.008639,115.756923,0.0,0.0,0.0,0.0
198202.0,10516.0,0.027852,0.022940,14.384263,0.028358,14.281943,0.026917,0.045042,-0.017390,-0.040786,0.012684,...,0.007271,0.007306,0.003191,-0.001655,0.010577,94.548065,0.0,0.0,0.0,0.0
198202.0,10866.0,-0.073738,0.024308,12.248607,0.024539,12.066522,-0.018919,-0.016064,0.015425,-0.009933,0.018356,...,0.007271,0.007306,0.003191,-0.001655,0.004508,221.849373,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200109.0,25081.0,-0.020468,0.017343,17.314568,0.022834,17.508521,0.000192,-0.027837,0.036147,0.029839,0.009526,...,0.003284,0.002996,0.000000,0.007936,0.001587,2.380553,0.0,0.0,0.0,0.0
200109.0,25129.0,-0.020468,0.014896,15.387844,0.020314,15.428555,0.060157,0.030661,0.092748,0.086440,0.009600,...,0.003284,0.002996,0.000000,0.007936,0.000240,0.360110,0.0,0.0,0.0,0.0
200109.0,25232.0,-0.088292,0.027459,15.128745,0.028332,15.322000,-0.011431,-0.052600,-0.019621,-0.075675,0.014484,...,0.003284,0.002996,0.000000,0.007936,0.014834,22.250409,0.0,0.0,0.0,0.0
200109.0,25304.0,0.110344,0.020714,15.343086,0.022601,14.786562,0.055170,0.080436,0.205251,0.163889,0.019753,...,0.003284,0.002996,0.000000,0.007936,0.072670,109.004292,0.0,0.0,0.0,0.0


In [110]:
pls_grid = dict()
pls_grid['n_components'] = np.arange(1, len(model_factors)+1, 1)

pls_best_config, pls_best_MAE, pls_best_r2 = validate_model(PLSRegression, pls_grid, x_train, y_train, x_validate\
                                                            , y_validate)
print('Best config:' + str(pls_best_config))
print('Validation R2: ' + str(pls_best_r2))

Best config:{'n_components': 1}
Validation R2: -0.009512947483404144


In [111]:
# Run PLS with best determined components
pls_pred_df, pls_test_r2 = pred(PLSRegression(pls_best_config['n_components']), x_train, y_train, x_test, y_test)

In [112]:
pls_pred_df.describe()

Unnamed: 0,RET_pred
count,36608.0
mean,-0.001625
std,0.006712
min,-0.149957
25%,-0.003976
50%,-0.001397
75%,0.001609
max,0.034561


# Performance Analysis

You should compute 

(1) Raw return

(2) Sharpe ratio

(3) CAPM alpha

(4) 4-Factor alpha

(5) Information Ratio using 4-factor model.

In [154]:
# Time frames don't have to match up but actual returns must be a superset of predicted returns
def generate_monthly_portfolios(predicted_returns, actual_returns, pred_col: str) -> pd.DataFrame:
    if "monthid" not in predicted_returns.columns:
        merged = pd.merge(predicted_returns, actual_returns, on=["permno", "yyyymm"])
        monthTag = "yyyymm" # I don't like this im sorry
        months = merged["yyyymm"]
    else:
        merged = pd.merge(predicted_returns, actual_returns, on=["permno", "monthid"])
        monthTag = "monthid" # I don't like this im sorry
        months = merged["monthid"]
    
    result = {"month": [], "hedged_ret": [], "MKTRF": [], "SMB": [], "HML": [], "UMD": [], "RF": []}
    
    for month in months:
        month_predictions = merged[merged[monthTag] == month].sort_values(by=pred_col)
        ten_percent = int(len(month_predictions) / 10)
        
        # Short leg
        bottom_ten_ret = month_predictions.iloc[:ten_percent]["RET"].mean()
        
        # Long leg
        top_ten_ret = month_predictions.iloc[-ten_percent:]["RET"].mean()
        
        # Hedged (equally weighted)
        hedged_ret = top_ten_ret - bottom_ten_ret
        
        result["month"].append(month)
        result["hedged_ret"].append(hedged_ret)
        
        # Add stats for benchmarking
        result["MKTRF"].append(month_predictions["MKTRF"].iloc[0])
        result["SMB"].append(month_predictions["SMB"].iloc[0])
        result["HML"].append(month_predictions["HML"].iloc[0])
        result["UMD"].append(month_predictions["UMD"].iloc[0])
        result["RF"].append(month_predictions["RF"].iloc[0])
    return pd.DataFrame(result)

In [155]:
ml_model_results = generate_monthly_portfolios(pls_pred_df, all_monthly_data, "RET_pred")
linear_model_results = generate_monthly_portfolios(score_df, all_monthly_data, "exposure score")

In [187]:
def total_ret(port_ret):
    return port_ret.sum()

# Annualized because our two prediction models are on different time ranges
def annual_total_ret(port_ret):
    return port_ret.mean() * 12
    # return np.prod(port_ret + 1) - 1

def tracking_error(port_ret, bench_ret):
    return (port_ret - bench_ret).std()

def information_ratio(alpha, port_ret, bench_ret):
    return alpha / tracking_error(port_ret, bench_ret)

def sharpe_ratio(port_xret):
    return port_xret.mean() / port_xret.std()

def analyze_returns(combined, ret_col='hedged_ret'):
    returns = combined[ret_col]
    rf = combined["RF"]
    excess_returns = returns - rf
    
    capm_x = sm.add_constant(combined[["MKTRF"]])
    CAPMmodel = sm.OLS(excess_returns, capm_x).fit()
    
    ff4_x = sm.add_constant(combined[["MKTRF", "SMB", "HML", "UMD"]]) # UMD is our Momentum (MOM) factor
    FF4model = sm.OLS(excess_returns, ff4_x).fit()
    
    MKTRF_beta = FF4model.params[0]
    SMB_beta = FF4model.params[1]
    HML_beta = FF4model.params[2]
    UMD_beta = FF4model.params[3]
    
    # Predicts excess returns
    combined["FF4_predicted_ret"] = combined["MKTRF"] * MKTRF_beta + combined["SMB"] * SMB_beta + combined["HML"] * HML_beta + combined["UMD"] * UMD_beta
    
    # Convert to raw returns
    combined["FF4_predicted_ret"] += rf
    
    #### 
    summary_results = {}
    
    summary_results["raw return % (annual)"] = annual_total_ret(returns) * 100
    summary_results["sharpe_ratio"] = sharpe_ratio(returns)
    summary_results["CAPM alpha (%)"] = CAPMmodel.params.const * 100
    summary_results["CAPM alpha p-value"] = CAPMmodel.pvalues.const
    
    summary_results["FF4 alpha (%)"] = FF4model.params.const * 100
    summary_results["FF4 alpha p-value"] = FF4model.pvalues.const
    
    # information ratio = FF4 alpha / tracking error (lecture 2)
    summary_results["Information Ratio (FF4)"] = information_ratio(FF4model.params.const, returns, combined["FF4_predicted_ret"])
    
    return pd.DataFrame.from_dict(summary_results, orient='index').round(2)

In [188]:
ml_model_results

Unnamed: 0,month,hedged_ret,MKTRF,SMB,HML,UMD,RF,FF4_predicted_ret
0,201101.0,0.056467,0.0199,-0.0250,0.0083,-0.0029,0.0001,0.000027
1,201101.0,0.056467,0.0199,-0.0250,0.0083,-0.0029,0.0001,0.000027
2,201101.0,0.056467,0.0199,-0.0250,0.0083,-0.0029,0.0001,0.000027
3,201101.0,0.056467,0.0199,-0.0250,0.0083,-0.0029,0.0001,0.000027
4,201101.0,0.056467,0.0199,-0.0250,0.0083,-0.0029,0.0001,0.000027
...,...,...,...,...,...,...,...,...
36603,201307.0,0.038365,0.0565,0.0187,0.0057,0.0176,0.0000,0.010627
36604,201307.0,0.038365,0.0565,0.0187,0.0057,0.0176,0.0000,0.010627
36605,201307.0,0.038365,0.0565,0.0187,0.0057,0.0176,0.0000,0.010627
36606,201307.0,0.038365,0.0565,0.0187,0.0057,0.0176,0.0000,0.010627


In [191]:
ml_model_results.describe()

Unnamed: 0,month,hedged_ret,MKTRF,SMB,HML,UMD,RF,FF4_predicted_ret
count,36608.0,36608.0,36608.0,36608.0,36608.0,36608.0,36608.0,36608.0
mean,201193.579709,0.00908,0.011871,5.4e-05,0.001431,0.003201,3e-05,0.001488
std,75.497393,0.033046,0.037815,0.016075,0.015287,0.025988,4.6e-05,0.011891
min,201101.0,-0.048732,-0.0759,-0.0331,-0.0243,-0.0791,0.0,-0.024075
25%,201109.0,-0.009617,-0.012,-0.0064,-0.009,-0.0114,0.0,-0.003076
50%,201205.0,0.006742,0.0129,0.0007,0.001,0.0018,0.0,0.001674
75%,201212.0,0.025832,0.0349,0.0133,0.0114,0.0181,0.0001,0.008161
max,201307.0,0.096351,0.1135,0.0328,0.0359,0.0649,0.0001,0.024856


In [189]:
analyze_returns(ml_model_results)

Unnamed: 0,0
raw return % (annual),10.9
sharpe_ratio,0.27
CAPM alpha (%),0.24
CAPM alpha p-value,0.0
FF4 alpha (%),0.82
FF4 alpha p-value,0.0
Information Ratio (FF4),0.26


In [192]:
linear_model_results.describe()

Unnamed: 0,month,hedged_ret,MKTRF,SMB,HML,UMD,RF,FF4_predicted_ret
count,83244.0,83244.0,83244.0,83244.0,83244.0,83244.0,83244.0,83244.0
mean,358.193744,-0.003779,0.003262,0.003651,0.005174,-0.004364,0.000615,0.002246
std,42.803708,0.041742,0.055877,0.025092,0.037331,0.066296,0.001323,0.017008
min,251.0,-0.120243,-0.1723,-0.0623,-0.1129,-0.343,0.0,-0.026074
25%,351.0,-0.0222,-0.0235,-0.0115,-0.0112,-0.0232,0.0,-0.005609
50%,369.0,-0.006497,0.0119,0.0025,0.0014,0.0028,0.0001,-0.00162
75%,387.0,0.01035,0.0389,0.0187,0.0231,0.0301,0.0001,0.005805
max,403.0,0.156689,0.1135,0.0668,0.1247,0.1257,0.0054,0.090518


In [190]:
analyze_returns(linear_model_results)

Unnamed: 0,0
raw return % (annual),-4.53
sharpe_ratio,-0.09
CAPM alpha (%),-0.5
CAPM alpha p-value,0.0
FF4 alpha (%),-0.49
FF4 alpha p-value,0.0
Information Ratio (FF4),-0.13


In [None]:
# Cool Graphs

