# Final Project

- Saige Belanger
    - (20951877)
- Dylan Faelker
    - (20960747)
- Ethan Liu
    - (20959615)
- Timothy Zheng
    - t54zheng (20939203)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn import linear_model
import statsmodels.api as sm
import scipy.stats as stats
from math import sqrt
import math

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
import datetime as dt

import os.path

warnings.filterwarnings('ignore')

# Factors
We start with an initial list of factors from the provided list of 50 Factors in the ML examples.

TODO: Increase our breadth of factors to the category chosen by downloading and creating them, then merging them 

https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/

In [2]:
all_monthly_data = pd.read_sas("merged_df.sas7bdat", encoding = 'ISO-8859-1')

In [3]:
# Save all gvkeys - for WRDS Queries

# with open("gvkeys.txt", "w") as file:
#     for gvkey in set(all_monthly_data["gvkey"].dropna()):
#         file.write(f"{int(gvkey)},\n")

In [4]:
permnos = set(all_monthly_data["permno"])

In [5]:
gvkey_permno_dict = {}
for gvkey in set(all_monthly_data["gvkey"].dropna()):
    permno = all_monthly_data[all_monthly_data["gvkey"] == gvkey]["permno"].dropna().iloc[0]
    gvkey_permno_dict[gvkey] = permno

In [6]:
all_monthly_data.drop(["ticker", "conm", "gvkey", "cusip", "naics", "gsubind"], axis=1, inplace=True) # We don't use these columns anyway, drop them

In [7]:
factors = ['IM', 'range_20', 'log_vol_dollar_20',
       'range_120', 'log_vol_dollar_120', 'xret_5', 'xret_10', 'xret_20',
       'xret_indsize_20', 'xret_indsize_std20', 'xret_40', 'xret_120',
       'xret_indsize_120', 'xret_indsize_std120', 'KDJ_20', 'deviation_pct20',
       'MoneyFlowIndex_20', 'RSI_20', 'KDJ_120', 'deviation_pct120',
       'MoneyFlowIndex_120', 'RSI_120', 'IV_capm', 'mdr', 'ami_3', 'beta_3y',
       'beta_5y', 'tail_2y', 'dp', 'leverage', 'BL', 'roe', 'roa',
       'profitability', 'sales_g_q', 'sales_g_ttm', 'op_income_g_q', 'ni_g_q',
       'op_income_g_ttm', 'ni_g_ttm', 'sue_NI', 'BM', 'AM', 'EP', 'SP',
       'roe_q', 'roa_q', 'Cto', 'pe_ttm', 'lag_log_size']

ret_cols = ['ret_f1', 'ret_f2', 'ret_f3', 'ret_f4', 'ret_f5', 'ret_f6', 
            'ret_f7', 'ret_f8', 'ret_f9', 'ret_f10', 'ret_f11', 'ret_f12']

In [8]:
non_data_cols = [x for x in all_monthly_data.columns if x not in factors and x not in ret_cols]
non_data_cols

['permno', 'yyyymm', 'monthid', 'PRC', 'VOL', 'RET', 'SHROUT']

# Adding New Factors
* When you add a factor, document it here: [link](https://docs.google.com/spreadsheets/d/1rs9633QSYLVY5Z5DoGNy3USP2MROGtqTIKcbLG68wpE/edit#gid=1579135478) and fill properly
* Download the data file, if it's too large add it to the drive
* Also download the other files that arent on github but on the drive before working on this part of the notebook
    * https://drive.google.com/drive/u/0/folders/1D1eIYlkNxNLfzHJLzkGeE9ymr7doXg_6

## IMPORTANT NOTE - FACTOR/RETURN TIME
- When adding factors make sure you add such that factor is reported at t-1, **RET** has **T** returns (in same row)
- This means you need to download data from the range **(1979-12 to 2019-11)**

***

- Treasury and CPI Rates: [Link](https://wrds-www.wharton.upenn.edu/pages/get-data/center-research-security-prices-crsp/annual-update/index-treasury-and-inflation/us-treasury-and-inflation-indexes/)
- Federal Reserve Data: [Link](https://wrds-www.wharton.upenn.edu/pages/get-data/federal-reserve-bank-reports/interest-rates/data/)
- SEC Filings: https://wrds-www.wharton.upenn.edu/pages/get-data/wrds-sec-analytics-suite/wrds-sec-filings-queries/list-of-filings-exhibits/
- Analyzed Data: https://wrds-www.wharton.upenn.edu/pages/get-data/wrds-sec-analytics-suite/wrds-sec-text-analysis/readability-and-sentiment/

**TBD**
- Other Factors: Downloaded from https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/

In [9]:
# Add new generated factors here
macro_factors = ['FF_O', 'SL_Y20', 'MORTG_NA', 
                'PRIME_NA', 'CD_M1', 'CD_M3', 'CD_M6', 'ED_M1', 'ED_M3', 
                'ED_M6', 'TB_M3', 'TB_M6', 'TB_Y1',
                'TCMNOM_M3', 'TCMNOM_M6', 'TCMNOM_Y1', 'TCMNOM_Y2', 
                'TCMNOM_Y3', 'TCMNOM_Y5', 'TCMNOM_Y7', 'TCMNOM_Y10', 
                'TCMNOM_Y20', 'TCMNOM_Y30', 'AAA_NA', 'BAA_NA', 'B30RET', 'B20RET', 
                 'B10RET', 'B7RET', 'B5RET', 'B2RET', 'B1RET', 
                'T90RET', 'T30RET', 'CPIRET', ]
recession_factors = ["10M2", "volinc", "recession_affinity"]
tone_factors = ['10Q_tone_3m', '10Q_tone_6m', 
                '10K_tone_1y', 
                '8K_tone_1y',]
factors += macro_factors
factors += recession_factors
factors += tone_factors

# Macro Factors

In [10]:
treasury_inflation = pd.read_sas("treasury_inflation.sas7bdat", encoding = 'ISO-8859-1')
federal_reserve = pd.read_sas("federal_reserve.sas7bdat", encoding = 'ISO-8859-1')

In [11]:
#dropping factors with excess null values
federal_reserve.dropna(axis = 1, thresh = 300, inplace=True)

In [12]:
#adding yyyymm to merge taking into account 1 month shift
federal_reserve["yyyymm"] = (federal_reserve["date"] + pd.DateOffset(months=1)).dt.strftime("%Y%m").astype(float)
treasury_inflation["yyyymm"] = (treasury_inflation["CALDT"] + pd.DateOffset(months=1)).dt.strftime("%Y%m").astype(float)


In [13]:
#merging to all_monthly_data
all_monthly_data = pd.merge(all_monthly_data, federal_reserve, on="yyyymm")
all_monthly_data = pd.merge(all_monthly_data, treasury_inflation, on="yyyymm")

# SEC Filings Sentiment Factors

In [14]:
sec_analytics = pd.read_sas("sec_filing_analysis_wrds.sas7bdat", encoding = 'ISO-8859-1')

In [15]:
sec_analytics["PERMNO"] = sec_analytics["GVKEY"].map(gvkey_permno_dict)

In [16]:
# Import from Dylan's code
sec_analytics['yyyymm'] = (sec_analytics['FDATE'].astype('string').str[:4] + sec_analytics['FDATE'].astype('string').str[5:7]).astype('float64')
sec_analytics_subset_forms = sec_analytics[(sec_analytics['FORM'].isin(['10-K', '10-Q', '8-K']))][["PERMNO", "yyyymm", "FORM", "LM_POSITIVE_COUNT", "LM_NEGATIVE_COUNT", "WORD_COUNT"]]

sec_analytics_subset_forms['tone'] = (sec_analytics_subset_forms['LM_POSITIVE_COUNT'] - sec_analytics_subset_forms['LM_NEGATIVE_COUNT']) / sec_analytics_subset_forms['WORD_COUNT']

sec_analytics_subset_forms.head()

Unnamed: 0,PERMNO,yyyymm,FORM,LM_POSITIVE_COUNT,LM_NEGATIVE_COUNT,WORD_COUNT,tone
0,86594.0,201306.0,10-K,2387.0,2500.0,260142.0,-0.000434
9,86594.0,200703.0,10-Q,110.0,115.0,14978.0,-0.000334
11,86594.0,201209.0,8-K,35.0,31.0,3111.0,0.001286
14,86594.0,200502.0,8-K,279.0,931.0,53640.0,-0.012155
17,86594.0,200606.0,8-K,15.0,6.0,907.0,0.009923


In [17]:

def yyyymm_add(yyyymm, years, months):
    return yyyymm + ((years + math.floor(months / 12)) * 100) + (months % 12)

def yyyymm_sub(yyyymm, years, months):
    return yyyymm - ((years + math.floor(months / 12)) * 100) - (months % 12)

In [18]:
!pip install multiprocess



In [19]:
from multiprocess import cpu_count # You might have to change to multiprocessing if on windows
from multiprocess.pool import ThreadPool

In [20]:
# Threaded Approach. Takes estimate >2h
def calc_tone_factors(permno):
    results = []

    for date in all_monthly_data['yyyymm'].unique():
        sentiments = {}
        # when there is no relevant data. Saves time
        if sec_analytics_subset_forms.loc[(permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0))].shape[0] == 0:
            sentiments = {"permno": permno, 
                            "yyyymm": date,
                            '10Q_tone_3m': 0,
                            '10Q_tone_6m': 0,
                            '10K_tone_1y': 0,
                            '8K_tone_1y': 0,
                            }
        elif sec_analytics_subset_forms.loc[(permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 6))].shape[0] == 0:
            sentiments = {"permno": permno, 
                            "yyyymm": date,
                            '10Q_tone_3m': 0,
                            '10Q_tone_6m': 0,
                            '10K_tone_1y': sec_analytics_subset_forms.loc[('10-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            '8K_tone_1y': sec_analytics_subset_forms.loc[('8-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            }
        elif sec_analytics_subset_forms.loc[(permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 3))].shape[0] == 0:
            sentiments = {"permno": permno, 
                            "yyyymm": date,
                            '10Q_tone_3m': 0,
                            '10Q_tone_6m': sec_analytics_subset_forms.loc[('10-Q' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 6)), 'tone'].sum(),
                            '10K_tone_1y': sec_analytics_subset_forms.loc[('10-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            '8K_tone_1y': sec_analytics_subset_forms.loc[('8-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            }
        else:
            sentiments = {"permno": permno, 
                            "yyyymm": date,
                            '10Q_tone_3m': sec_analytics_subset_forms.loc[('10-Q' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 3)), 'tone'].sum(),
                            '10Q_tone_6m': sec_analytics_subset_forms.loc[('10-Q' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 6)), 'tone'].sum(),
                            '10K_tone_1y': sec_analytics_subset_forms.loc[('10-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            '8K_tone_1y': sec_analytics_subset_forms.loc[('8-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            }
        results.append(sentiments)
    
    return results

permnos = set(all_monthly_data["permno"])
with ThreadPool(cpu_count() - 1) as P:
    sentiments = P.map(calc_tone_factors, permnos)
    sentiments = [item for sublist in sentiments for item in sublist]
    sentiments = pd.DataFrame(sentiments)
    all_monthly_data = pd.merge(all_monthly_data, sentiments, on=["permno", "yyyymm"], how='outer')

In [None]:
all_monthly_data[all_monthly_data["permno"] == 86594.0].sort_values(by="monthid").head()

Unnamed: 0,permno,yyyymm,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,ret_f7,ret_f8,ret_f9,ret_f10,ret_f11,ret_f12,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y
578148,86594.0,199901.0,229.0,,,,,,,,...,0.223744,0.376866,-0.03523,0.016854,0.607735,-0.053265,0.0,0.0,0.0,0.0
578149,86594.0,199902.0,230.0,,,,,,,,...,0.376866,-0.03523,0.016854,0.607735,-0.053265,0.088929,0.0,0.0,0.0,0.0
578150,86594.0,199903.0,231.0,,,,,,,,...,-0.03523,0.016854,0.607735,-0.053265,0.088929,-0.206667,0.0,0.0,0.0,-0.023482
578151,86594.0,199904.0,232.0,,,,,,,,...,0.016854,0.607735,-0.053265,0.088929,-0.206667,-0.109244,0.000887,0.000887,0.0,-0.023482
578152,86594.0,199905.0,233.0,,,,,,,,...,0.607735,-0.053265,0.088929,-0.206667,-0.109244,-0.183962,0.000887,0.000887,0.0,-0.023482


# Recession Factor
- During recessions, companies that sell essential products/services typically outperform companies that offer products that are categorized as discretionary spending by consumers.
- Basically we want to come up with a factor such that during recessions, the factor is high for companies selling essential products/services and low for companies producing goods/services that are highly sensitive to recessions. Then we want the factor to be flipped when the company is out of a recession.
- We will categorize a recession as whenever the yield curve is inverted

### Proxy factors
- [10M2] Yield curve: US Treasury 10Y - US Treasury 2 Year: https://fred.stlouisfed.org/series/T10Y2YM
    - Recession if 10M2 < 0 [inverted yield curve]

- [volinc] Annual Income Volatility: Standard Deviation of annual net income growth [ni_g_ttm] for past 5 years, minimum past 1 year
    - Using trailing twelve month (TTM) measure because it avoids any seasonality considerations

In [None]:
all_monthly_data[all_monthly_data["permno"] == 86594.0][["ni_g_ttm", "yyyymm"]].dropna().head(12)

Unnamed: 0,ni_g_ttm,yyyymm
382792,-0.319421,200001.0
382793,-0.319421,200002.0
382794,-0.319421,200003.0
382795,-0.298619,200004.0
382796,-0.298619,200005.0
382797,-0.298619,200006.0
382798,0.319728,200007.0
382799,0.319728,200008.0
382800,0.319728,200009.0
382801,0.334539,200010.0


In [None]:
# Add annual income volatility - std of ni_g_ttm for past 5 years, minimum of past 1 year

volinc = {"yyyymm": [], "permno": [], "volinc": []}
for permno in permnos:
    ni_g_ttm = all_monthly_data[all_monthly_data["permno"] == permno][["ni_g_ttm", "yyyymm"]].dropna()
    date_range = sorted(list(ni_g_ttm["yyyymm"]))

    for i, yyyymm in enumerate(date_range):
        
        window = set(date_range[max(0, i-59):i+1]) # Look past 5 years (60 months)
        window_data = ni_g_ttm[ni_g_ttm["yyyymm"].isin(window)]
        
        if len(window_data) < 12:
            continue

        # Add std ni_g_ttm of past 5 years to volinc factor
        # Note this avoids lookahead bias because the data up to and including i
        # should be known (since ni_g_ttm is from i-1 as per data manual)
        volinc["yyyymm"].append(yyyymm)
        volinc["permno"].append(permno)
        volinc["volinc"].append(window_data["ni_g_ttm"].std())

volinc_df = pd.DataFrame(volinc)
volinc_df.head()

Unnamed: 0,yyyymm,permno,volinc
0,198012.0,49154.0,0.023537
1,198101.0,49154.0,0.02416
2,198102.0,49154.0,0.024472
3,198103.0,49154.0,0.024585
4,198104.0,49154.0,0.023754


In [None]:
all_monthly_data = pd.merge(all_monthly_data, volinc_df, on=["yyyymm", "permno"], how="outer")

## 10M2 Yield Curve

In [None]:
treasury_inflation = pd.read_sas("treasury_inflation.sas7bdat", encoding = 'ISO-8859-1')

In [None]:
fact_10M2 = treasury_inflation[["CALDT", "B2RET", "B10RET"]]

# Add one month to fit RET and factor time
fact_10M2["yyyymm"] = (fact_10M2["CALDT"] + pd.DateOffset(months=1)).dt.strftime("%Y%m").astype(float)

In [None]:
fact_10M2["10M2"] = fact_10M2["B10RET"] - fact_10M2["B2RET"]
fact_10M2

Unnamed: 0,CALDT,B2RET,B10RET,yyyymm,10M2
0,1979-12-31,0.005695,0.011951,198001.0,0.006256
1,1980-01-31,-0.000164,-0.037477,198002.0,-0.037313
2,1980-02-29,-0.036947,-0.050507,198003.0,-0.013560
3,1980-03-31,0.010329,0.048345,198004.0,0.038016
4,1980-04-30,0.084198,0.084375,198005.0,0.000177
...,...,...,...,...,...
477,2019-09-30,-0.001297,-0.013852,201910.0,-0.012555
478,2019-10-31,0.003274,-0.000742,201911.0,-0.004016
479,2019-11-29,-0.001038,-0.007410,201912.0,-0.006372
480,2019-12-31,0.002274,-0.011292,202001.0,-0.013566


In [None]:
all_monthly_data = pd.merge(all_monthly_data, fact_10M2[["yyyymm", "10M2"]], on="yyyymm")

## Recession Affinity
* Recession affinity is calculated as

- 1 / volinc **if 10M2 < 0** (recession)
- volinc $\times$ 1500 **if 10M2 > 0** (no recession)
    - TBH Times 1500 descision is arbitrary but it makes sense in "levelling" both sides of the variable, ie:
    - Values when 10M2 < 0 and 10M2 > 0 are relatively equal

In [None]:
all_monthly_data["recession_affinity"] = np.where(all_monthly_data["10M2"] < 0, 1 / all_monthly_data["volinc"], 1500 * all_monthly_data["volinc"])

In [None]:
test = all_monthly_data[["10M2", "volinc", "recession_affinity"]].dropna()

In [None]:
test[test["10M2"] < 0]["recession_affinity"].mean()

105.45824976257666

In [None]:
test[test["10M2"] > 0]["recession_affinity"].mean()

78.82993778650686

In [None]:
all_monthly_data

Unnamed: 0,permno,yyyymm,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,ret_f10,ret_f11,ret_f12,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y,volinc,10M2,recession_affinity
0,10026.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
1,10032.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
2,10051.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
3,10104.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
4,10107.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718555,93374.0,201912.0,480.0,-0.147586,0.010868,17.310472,0.015246,17.388666,0.000246,-0.019599,...,-0.124141,0.086342,0.075351,-0.010593,-0.020941,-0.011014,-0.068925,0.008777,-0.006372,113.930650
718556,93419.0,201912.0,480.0,-0.147586,0.012941,16.775314,0.015804,16.961089,0.008011,0.004462,...,-0.121751,0.349948,-0.066538,-0.001085,-0.003564,-0.002398,-0.050815,0.010539,-0.006372,94.884444
718557,93422.0,201912.0,480.0,-0.046054,0.059361,16.763782,0.067228,17.048398,0.015794,-0.033674,...,-0.003102,0.788889,0.484472,-0.018540,-0.037721,-0.017328,-0.178637,0.125235,-0.006372,7.984961
718558,93427.0,201912.0,480.0,0.260954,0.025465,16.473392,0.028009,16.544964,-0.005524,-0.006388,...,-0.047755,0.138121,0.135851,-0.005819,-0.005819,-0.004089,-0.018317,0.016243,-0.006372,61.566110


***
# Data Cleanup
Done creating all factors, will clean up data before training step 

In [None]:
# Inputation - as in ML Lecture 1

# Drop NA in all non-numerical columns
all_monthly_data.dropna(subset=non_data_cols, inplace=True)

grouped_med = all_monthly_data.groupby(by='monthid')
# the lambda function gets the median per group in the groupby object, and fills the NaN values with the median per group
imputed_grouped = grouped_med.transform(lambda y: y.fillna(y.median()))

# This line assigns the values of the medians 
all_monthly_data = all_monthly_data.assign(**imputed_grouped.to_dict(orient='series'))
all_monthly_data.dropna(inplace=True)

In [None]:
# Filtering data by min price and min market share for each year

# Commenting out for runtime - **does not drop any rows**

# all_monthly_data['yyyy'] = all_monthly_data['yyyymm'].astype(str).str[:4]
# all_monthly_data['MKTSHR'] = all_monthly_data['PRC'] * all_monthly_data['SHROUT'] * 1_000

# to_drop_indices = []

# for permno in all_monthly_data.permno.unique():
#     for year in all_monthly_data['yyyy'].unique():
#         mask = (all_monthly_data['permno'] == permno) & (all_monthly_data['yyyy'] == year)
#         if all_monthly_data[mask].shape[0] != 0 != 0 and (all_monthly_data[mask]['MKTSHR'].iloc[0] < 100_000_000 or all_monthly_data[mask]['PRC'].iloc[0] <= 5):
#             to_drop_indices += list(all_monthly_data[mask].index)
# all_monthly_data.drop(to_drop_indices, inplace=True)

In [None]:
# Winsorizing factors--should winsorize the variables by quarter
for column in factors:
    for date in set(list(all_monthly_data["monthid"])):
        mask = (all_monthly_data["monthid"] == date)
        
        std = all_monthly_data[column][mask].std()
        mean = all_monthly_data[column][mask].mean()

        upper = mean + 3 * std
        lower = mean - 3 * std
        
        all_monthly_data[column][mask].clip(lower, upper, inplace= True)

In [None]:
all_monthly_data

Unnamed: 0,permno,yyyymm,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,ret_f10,ret_f11,ret_f12,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y,volinc,10M2,recession_affinity
16474,10145.0,198012.0,12.0,0.374586,0.027456,15.617313,0.022920,15.145167,-0.045500,-0.051723,...,0.141141,0.002105,-0.066489,0.000000,0.000000,0.000000,0.000000,0.040110,0.013658,60.164922
16476,10241.0,198012.0,12.0,0.188068,0.018115,13.277532,0.019202,13.227823,-0.033571,-0.063374,...,0.152756,-0.034843,-0.007220,0.000000,0.000000,0.000000,0.000000,0.009316,0.013658,13.974516
16486,10460.0,198012.0,12.0,0.324759,0.036155,13.062007,0.027327,12.680238,0.077472,0.109122,...,0.106796,0.093860,0.085020,0.000000,0.000000,0.000000,0.000000,0.007682,0.013658,11.523463
16489,10516.0,198012.0,12.0,0.170149,0.031346,15.249913,0.031163,15.026468,-0.048646,-0.008587,...,0.079316,0.087302,0.124088,0.000000,0.000000,0.000000,0.000000,0.014242,0.013658,21.362453
16500,10866.0,198012.0,12.0,0.273820,0.008009,11.612416,0.013282,11.810300,0.002714,-0.027249,...,0.049107,0.004255,-0.023220,0.000000,0.000000,0.000000,0.000000,0.002783,0.013658,4.174269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718555,93374.0,201912.0,480.0,-0.147586,0.010868,17.310472,0.015246,17.388666,0.000246,-0.019599,...,-0.124141,0.086342,0.075351,-0.010593,-0.020941,-0.011014,-0.068925,0.008777,-0.006372,113.930650
718556,93419.0,201912.0,480.0,-0.147586,0.012941,16.775314,0.015804,16.961089,0.008011,0.004462,...,-0.121751,0.349948,-0.066538,-0.001085,-0.003564,-0.002398,-0.050815,0.010539,-0.006372,94.884444
718557,93422.0,201912.0,480.0,-0.046054,0.059361,16.763782,0.067228,17.048398,0.015794,-0.033674,...,-0.003102,0.788889,0.484472,-0.018540,-0.037721,-0.017328,-0.178637,0.125235,-0.006372,7.984961
718558,93427.0,201912.0,480.0,0.260954,0.025465,16.473392,0.028009,16.544964,-0.005524,-0.006388,...,-0.047755,0.138121,0.135851,-0.005819,-0.005819,-0.004089,-0.018317,0.016243,-0.006372,61.566110


## Factor Code

In [None]:
ff4_factors = pd.read_sas("ff4_factors.sas7bdat", encoding = 'ISO-8859-1')
ff4_factors["monthid"] = ff4_factors.index + 1
ff4_factors.head()

Unnamed: 0,DATEFF,SMB,HML,MKTRF,RF,UMD,monthid
0,1980-01-31,0.0162,0.0175,0.0551,0.008,0.0755,1
1,1980-02-29,-0.0185,0.0061,-0.0122,0.0089,0.0788,2
2,1980-03-31,-0.0664,-0.0101,-0.129,0.0121,-0.0955,3
3,1980-04-30,0.0105,0.0106,0.0397,0.0126,-0.0043,4
4,1980-05-30,0.0213,0.0038,0.0526,0.0081,-0.0112,5


In [None]:
dates = [int(x) for x in sorted(list(set(list(all_monthly_data["yyyymm"]))))]
dates[0], dates[-1]

(198001, 201912)

In [None]:
monthids = [int(x) for x in sorted(list(set(list(all_monthly_data["monthid"]))))]
monthids[0], monthids[-1], len(monthids)

(1, 480, 480)

In [None]:
testing_range = monthids[0:2*(len(monthids) // 3)]
validation_range = monthids[2 * len(monthids) // 3:]

# Validate that ranges have correct ratios
len(testing_range) / len(monthids), len(validation_range) / len(monthids), len(testing_range) + len(validation_range)

(0.6666666666666666, 0.3333333333333333, 480)

## Testing Factors

In [None]:
model_factors = ['IM', 'range_20', 'log_vol_dollar_20',
       'range_120', 'log_vol_dollar_120', 'xret_5', 'xret_10', 'xret_20',
       'xret_indsize_20', 'xret_indsize_std20', 'xret_40', 'xret_120',
       'xret_indsize_120', 'xret_indsize_std120', 'KDJ_20', 'deviation_pct20',
       'MoneyFlowIndex_20', 'RSI_20', 'KDJ_120', 'deviation_pct120',
       'MoneyFlowIndex_120', 'RSI_120', 'IV_capm', 'mdr', 'ami_3', 'beta_3y',
       'beta_5y', 'tail_2y', 'dp', 'leverage', 'BL', 'roe', 'roa',
       'profitability', 'sales_g_q', 'sales_g_ttm', 'op_income_g_q', 'ni_g_q',
       'op_income_g_ttm', 'ni_g_ttm', 'sue_NI', 'BM', 'AM', 'EP', 'SP',
       'roe_q', 'roa_q', 'Cto', 'pe_ttm', 'lag_log_size']

In [None]:
model_factors += macro_factors
model_factors += recession_factors
model_factors += tone_factors

In [None]:
all_monthly_data = pd.merge(ff4_factors, all_monthly_data, on="monthid")

In [None]:
all_monthly_data[["monthid", "permno"] + model_factors[::-1]]

Unnamed: 0,monthid,permno,8K_tone_1y,10K_tone_1y,10Q_tone_6m,10Q_tone_3m,recession_affinity,volinc,10M2,lag_log_size,...,xret_indsize_std20,xret_indsize_20,xret_20,xret_10,xret_5,log_vol_dollar_120,range_120,log_vol_dollar_20,range_20,IM
0,12,10145.0,0.000000,0.000000,0.000000,0.000000,60.164922,0.040110,0.013658,7.566481,...,0.024948,-0.103709,-0.032998,-0.051723,-0.045500,15.145167,0.022920,15.617313,0.027456,0.374586
1,12,10241.0,0.000000,0.000000,0.000000,0.000000,13.974516,0.009316,0.013658,6.345893,...,0.009797,-0.036481,-0.141331,-0.063374,-0.033571,13.227823,0.019202,13.277532,0.018115,0.188068
2,12,10460.0,0.000000,0.000000,0.000000,0.000000,11.523463,0.007682,0.013658,5.905587,...,0.029715,0.068687,0.094566,0.109122,0.077472,12.680238,0.027327,13.062007,0.036155,0.324759
3,12,10516.0,0.000000,0.000000,0.000000,0.000000,21.362453,0.014242,0.013658,7.397096,...,0.022241,0.084091,-0.005626,-0.008587,-0.048646,15.026468,0.031163,15.249913,0.031346,0.170149
4,12,10866.0,0.000000,0.000000,0.000000,0.000000,4.174269,0.002783,0.013658,5.526573,...,0.009537,-0.020212,-0.088989,-0.027249,0.002714,11.810300,0.013282,11.612416,0.008009,0.273820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427769,480,93374.0,-0.068925,-0.011014,-0.020941,-0.010593,113.930650,0.008777,-0.006372,8.874422,...,0.025360,0.108797,-0.007484,-0.019599,0.000246,17.388666,0.015246,17.310472,0.010868,-0.147586
427770,480,93419.0,-0.050815,-0.002398,-0.003564,-0.001085,94.884444,0.010539,-0.006372,8.621115,...,0.021830,0.076118,-0.040163,0.004462,0.008011,16.961089,0.015804,16.775314,0.012941,-0.147586
427771,480,93422.0,-0.178637,-0.017328,-0.037721,-0.018540,7.984961,0.125235,-0.006372,6.656186,...,0.029444,0.016398,-0.035357,-0.033674,0.015794,17.048398,0.067228,16.763782,0.059361,-0.046054
427772,480,93427.0,-0.018317,-0.004089,-0.005819,-0.005819,61.566110,0.016243,-0.006372,7.713477,...,0.015712,0.042201,0.039414,-0.006388,-0.005524,16.544964,0.028009,16.473392,0.025465,0.260954


In [None]:
if os.path.isfile("all_monthly_data_final.csv"):
    all_monthly_data = pd.read_csv("all_monthly_data_final.csv", index_col=0)

## [m, n, l] model for Fama-MacBeth Double Regression
We will use the technique employed during Assignment 2, utilizing a 36-month lookback for factor data to generate our betas (**First Stage**)
* For period $t_i$, we will use data starting at $t_{i-36} ... t_{i-1}$ if available. Worst case we look for 12 prior samples.

In [None]:
# Threaded Approach
def add_betas(permno):
    results = []
    for (i, monthid) in enumerate(testing_range): 
        window = set(testing_range[max(0, i-36):i]) # betas calculated using t_(i-37) to t_(i-2) factors and t_(i-36) to t_(i-1) returns.
                                                    # betas added alongside t_i returns (dated to be compared against t_(i-1) factors)
        window_data = all_monthly_data[(all_monthly_data["permno"] == permno) & (all_monthly_data["monthid"].isin(window))]
        
        if len(window_data) < 12:
            continue

        explanatory_vars = window_data[model_factors + ["monthid"]]
        explanatory_vars.sort_values(by="monthid", inplace=True)
        explanatory_vars.set_index("monthid", inplace=True)
    
        explained_var = window_data[["monthid", "RET"]] # Since factors are from t-1
        explained_var.sort_values(by="monthid", inplace=True)
        explained_var.set_index("monthid", inplace=True)
        
        model = linear_model.LinearRegression().fit(explanatory_vars, 
                                                    explained_var["RET"])
        
        results.append({"monthid": monthid, 
                        "permno": permno, 
                        "RET": explained_var["RET"].iloc[-1], 
                    **{f"{factor}": model.coef_[i] for i, factor in enumerate(model_factors)}
                       })  
    return results

# UNCOMMENT THIS AND ADD TO LINE BELOW INSTEAD OF `permnos`
# FOR DEVELOPMENT - THIS CODE BLOCK TAKES LIKE 30 MINS TO RUN

# smaller_permno_list = list(permnos)[:10]

# Only compute if not in files (delete local copy of file if code above if modifying factors or code above)
if os.path.isfile("first_stage_df.csv"):
    first_stage_df = pd.read_csv("first_stage_df.csv", index_col=0)
else:
    # Runs once basically
    summary_results = []
    with ThreadPool(cpu_count() - 1) as P:
        summary_results = P.map(add_betas, permnos)
        summary_results = [item for sublist in summary_results for item in sublist]
        first_stage_df = pd.DataFrame(summary_results)
        
        # Save first stage df for easy loading
        first_stage_df.to_csv("first_stage_df.csv")
first_stage_df

Unnamed: 0,monthid,permno,RET,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,Cto,pe_ttm,lag_log_size,10M2,volinc,recession_affinity,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y
0,24,49154.0,-0.044053,-0.625612,-0.029899,-0.215916,-0.000078,0.176263,0.208140,0.275639,...,-0.003265,-0.063603,0.077816,-0.085844,-0.000208,-0.002478,0.0,0.0,0.0,0.0
1,25,49154.0,0.105991,-0.623237,-0.026931,-0.204025,0.000630,0.231143,0.214112,0.272743,...,0.000618,-0.009450,0.073351,-0.088219,-0.000370,-0.001777,0.0,0.0,0.0,0.0
2,26,49154.0,-0.037500,-0.471871,-0.052081,-0.160436,-0.002516,0.602752,0.117984,0.308181,...,-0.018240,-0.108668,-0.156348,-0.203525,-0.000026,-0.009635,0.0,0.0,0.0,0.0
3,27,49154.0,0.065801,-0.531017,-0.043607,-0.104008,-0.000262,0.450023,0.100204,0.294732,...,-0.020370,-0.067640,-0.154103,-0.187905,-0.000079,-0.005512,0.0,0.0,0.0,0.0
4,28,49154.0,0.102459,-0.453954,-0.039963,-0.109926,0.011565,0.409227,0.445027,0.496351,...,-0.018164,0.006692,-0.135691,-0.259197,-0.001181,-0.011085,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213830,316,81917.0,-0.018652,-0.314730,22.098379,-0.594348,-3.910890,0.804654,3.343991,-5.793530,...,-0.975391,-0.050443,1.071172,2.486828,-3.698890,-0.002425,0.0,0.0,0.0,0.0
213831,317,81917.0,0.003664,-0.324453,20.852762,-0.593506,-2.795266,0.618067,3.868699,-3.589924,...,-0.881821,-0.056601,0.967369,2.737353,-3.784462,-0.002977,0.0,0.0,0.0,0.0
213832,318,81917.0,-0.021578,0.069678,14.348436,-0.527508,0.633169,0.600707,3.856108,-6.673630,...,-4.025807,-0.047436,1.769092,1.605133,-4.474000,-0.002207,0.0,0.0,0.0,0.0
213833,319,81917.0,0.041499,0.118453,29.399179,-0.564529,4.625397,0.836035,0.420080,-19.281702,...,0.925206,-0.287153,1.202388,2.821753,-4.654795,-0.000509,0.0,0.0,0.0,0.0


In [None]:
first_stage_df[["monthid", "permno", "RET"] + model_factors[::-1]]

Unnamed: 0,monthid,permno,RET,8K_tone_1y,10K_tone_1y,10Q_tone_6m,10Q_tone_3m,recession_affinity,volinc,10M2,...,xret_indsize_std20,xret_indsize_20,xret_20,xret_10,xret_5,log_vol_dollar_120,range_120,log_vol_dollar_20,range_20,IM
0,24,49154.0,-0.044053,0.0,0.0,0.0,0.0,-0.002478,-0.000208,-0.085844,...,-0.011508,-0.180978,-0.247964,0.275639,0.208140,0.176263,-0.000078,-0.215916,-0.029899,-0.625612
1,25,49154.0,0.105991,0.0,0.0,0.0,0.0,-0.001777,-0.000370,-0.088219,...,-0.005412,-0.178335,-0.284494,0.272743,0.214112,0.231143,0.000630,-0.204025,-0.026931,-0.623237
2,26,49154.0,-0.037500,0.0,0.0,0.0,0.0,-0.009635,-0.000026,-0.203525,...,0.010695,-0.432147,-0.526946,0.308181,0.117984,0.602752,-0.002516,-0.160436,-0.052081,-0.471871
3,27,49154.0,0.065801,0.0,0.0,0.0,0.0,-0.005512,-0.000079,-0.187905,...,0.016175,-0.491258,-0.590167,0.294732,0.100204,0.450023,-0.000262,-0.104008,-0.043607,-0.531017
4,28,49154.0,0.102459,0.0,0.0,0.0,0.0,-0.011085,-0.001181,-0.259197,...,0.007172,-0.368239,-0.411495,0.496351,0.445027,0.409227,0.011565,-0.109926,-0.039963,-0.453954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213830,316,81917.0,-0.018652,0.0,0.0,0.0,0.0,-0.002425,-3.698890,2.486828,...,15.606865,3.555232,3.249863,-5.793530,3.343991,0.804654,-3.910890,-0.594348,22.098379,-0.314730
213831,317,81917.0,0.003664,0.0,0.0,0.0,0.0,-0.002977,-3.784462,2.737353,...,15.396216,2.557431,3.566102,-3.589924,3.868699,0.618067,-2.795266,-0.593506,20.852762,-0.324453
213832,318,81917.0,-0.021578,0.0,0.0,0.0,0.0,-0.002207,-4.474000,1.605133,...,19.700430,-0.901880,6.487931,-6.673630,3.856108,0.600707,0.633169,-0.527508,14.348436,0.069678
213833,319,81917.0,0.041499,0.0,0.0,0.0,0.0,-0.000509,-4.654795,2.821753,...,16.404028,8.393186,7.765469,-19.281702,0.420080,0.836035,4.625397,-0.564529,29.399179,0.118453


In [None]:
# Second stage regression
lambdas = {"monthid": []}
for factor in model_factors:
    lambdas[f"{factor}"] = []
    
for monthid in testing_range:
    monthid_returns = first_stage_df.loc[first_stage_df["monthid"] == monthid]

    # If empty
    if monthid_returns.empty:
        continue
    
    explanatory_vars = monthid_returns[model_factors + ["permno"]]
    explanatory_vars.sort_values(by="permno", inplace=True)
    explanatory_vars.set_index("permno", inplace=True)

    explained_var = monthid_returns[["permno", "RET"]]
    explained_var.sort_values(by="permno", inplace=True)
    explained_var.set_index("permno", inplace=True)
    
    model = linear_model.LinearRegression(n_jobs=len(model_factors)).fit(explanatory_vars, 
                                                                         explained_var["RET"])

    lambdas["monthid"].append(monthid)

    for (i, factor) in enumerate(model_factors):
        lambdas[factor].append(model.coef_[i])

In [None]:
second_stage_df = pd.DataFrame(lambdas)
second_stage_df

Unnamed: 0,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,xret_20,xret_indsize_20,...,Cto,pe_ttm,lag_log_size,10M2,volinc,recession_affinity,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y
0,24,-0.087401,-0.838893,-0.090328,3.132225,0.030940,-0.017560,0.008128,-0.172702,0.109927,...,-0.335337,-0.007380,0.041685,0.487975,2.163892,0.298601,0.000000,0.000000,0.000000,0.000000
1,25,0.015878,-0.551612,0.007260,-1.499475,-0.007438,-0.027102,0.001939,-0.019762,-0.009209,...,-0.041752,0.027878,-0.003740,-0.056368,-1.029298,-1.629468,0.000000,0.000000,0.000000,0.000000
2,26,-0.006880,-0.133293,-0.012169,2.201899,-0.031751,-0.034279,0.018879,-0.010352,0.017789,...,0.075607,0.009850,-0.000809,-0.078125,1.821022,-1.595375,0.000000,0.000000,0.000000,0.000000
3,27,-0.001254,-0.187112,-0.031425,0.161325,0.034120,0.003525,-0.020145,0.011307,-0.050272,...,-0.183472,0.013245,-0.021362,-0.008911,0.867981,-2.725899,0.000000,0.000000,0.000000,0.000000
4,28,-0.034664,-0.222323,-0.037330,0.707055,-0.013198,0.012854,0.013067,0.005818,0.035443,...,-0.080139,-0.008789,-0.031555,-0.020386,0.611168,0.398590,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,316,-0.000949,-0.000077,-0.014905,0.000262,-0.005546,-0.000955,-0.000815,0.000819,-0.002312,...,0.000949,-0.010903,-0.002116,-0.000251,0.000355,0.224097,0.002782,0.000415,0.001647,-0.004022
293,317,0.006587,-0.000174,0.003451,0.000981,-0.002042,0.000042,0.000550,0.000817,0.002390,...,0.000072,-0.009681,-0.000339,-0.001557,0.000257,-0.511272,0.002196,-0.001648,0.000156,-0.002096
294,318,0.002913,-0.000088,-0.010161,-0.000597,-0.005507,-0.000592,-0.001627,-0.004761,-0.003929,...,-0.000537,0.003806,-0.003276,0.000108,0.000052,0.960497,-0.000045,0.005630,0.002047,-0.001480
295,319,0.003181,-0.000188,-0.003653,0.000647,-0.005228,0.002052,0.001941,0.002917,0.001691,...,0.000358,0.023345,0.000966,0.001268,-0.000263,-0.163509,-0.004005,-0.008756,-0.002724,0.002662


In [None]:
# Get p values
p_value_dict = {"factor": [], "p-value": []}
t_stat_dict = {"factor": [], "t": [], "|t|": []}
for factor in model_factors:
    lambdas = second_stage_df[factor]
    ttest = stats.ttest_ind(lambdas, np.zeros(len(lambdas))) # Compare to see if any lambdas are significantly different from zero
    p_value_dict['factor'].append(factor)
    p_value_dict['p-value'].append(ttest[1])
    
    t_stat_dict['factor'].append(factor)
    t_stat_dict['t'].append(ttest[0])
    t_stat_dict['|t|'].append(abs(ttest[0]))

p_df = pd.DataFrame.from_dict(p_value_dict, orient='index')
t_df = pd.DataFrame.from_dict(t_stat_dict, orient='index')
t_df.round(2).T.sort_values(by="|t|", ascending=False)

Unnamed: 0,factor,t,|t|
42,AM,2.565893,2.565893
49,lag_log_size,2.505643,2.505643
33,profitability,2.169747,2.169747
1,range_20,-1.968011,1.968011
44,SP,1.946136,1.946136
51,volinc,1.792666,1.792666
18,KDJ_120,1.580462,1.580462
4,log_vol_dollar_120,1.451777,1.451777
46,roa_q,1.431698,1.431698
55,10K_tone_1y,-1.405411,1.405411


In [None]:
p_df.round(2).T.sort_values(by="p-value")

Unnamed: 0,factor,p-value
42,AM,0.010536
49,lag_log_size,0.01249
33,profitability,0.030423
1,range_20,0.049533
44,SP,0.052111
51,volinc,0.073537
18,KDJ_120,0.114535
4,log_vol_dollar_120,0.147093
46,roa_q,0.152758
55,10K_tone_1y,0.160424


# TODO - Pick Factors
use these results to determine which factors to keep (among other considerations like cross-correlation, if they are in the same category, etc)

In [None]:
predict_factors = model_factors

In [None]:
# Threaded Approach
def add_betas_predict(permno):
    results = []
    for (i, monthid) in enumerate(validation_range): 
        window = set(validation_range[max(0, i-36):i]) # betas calculated using t_(i-37) to t_(i-2) factors and t_(i-36) to t_(i-1) returns.
                                                       # betas added alongside t_i returns (dated to be compared against t_(i-1) factors)
        window_data = all_monthly_data[(all_monthly_data["permno"] == permno) & (all_monthly_data["monthid"].isin(window))]
        
        if len(window_data) < 12:
            continue

        explanatory_vars = window_data[predict_factors + ["monthid"]]
        explanatory_vars.sort_values(by="monthid", inplace=True)
        explanatory_vars.set_index("monthid", inplace=True)
    
        explained_var = window_data[["monthid", "RET"]] # Since factors are from t-1
        explained_var.sort_values(by="monthid", inplace=True)
        explained_var.set_index("monthid", inplace=True)
        
        model = linear_model.LinearRegression().fit(explanatory_vars, 
                                                    explained_var["RET"])
        
        results.append({"monthid": monthid, 
                        "permno": permno, 
                        "RET": explained_var["RET"].iloc[-1], 
                    **{f"{factor}": model.coef_[i] for i, factor in enumerate(predict_factors)}
                       })  
    return results

# UNCOMMENT THIS AND ADD TO LINE BELOW INSTEAD OF `permnos`
# FOR DEVELOPMENT - THIS CODE BLOCK TAKES LIKE 30 MINS TO RUN

# smaller_permno_list = list(permnos)[:10]

# Only compute if not in files (delete local copy of file if code above if modifying factors or code above)
if os.path.isfile("linear_predict_betas.csv"):
    linear_betas = pd.read_csv("linear_predict_betas.csv", index_col=0)
else:
    # Runs once basically
    summary_results = []
    with ThreadPool(cpu_count() - 1) as P:
        summary_results = P.map(add_betas_predict, permnos)
        summary_results = [item for sublist in summary_results for item in sublist]
        linear_betas = pd.DataFrame(summary_results)
        
        # Save first stage df for easy loading
        linear_betas.to_csv("linear_predict_betas.csv")
linear_betas.describe()

Unnamed: 0,monthid,permno,RET,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,Cto,pe_ttm,lag_log_size,10M2,volinc,recession_affinity,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y
count,185151.0,185151.0,185151.0,185151.0,185151.0,185151.0,185151.0,185151.0,185151.0,185151.0,...,185151.0,185151.0,185151.0,185151.0,185151.0,185151.0,185151.0,185151.0,185151.0,185151.0
mean,409.800449,60676.60576,0.011776,-0.108886,-0.445366,-0.018285,-0.487355,-0.048539,0.081528,0.137171,...,-0.142631,-0.001571,-0.639582,0.221826,0.025059,-2.7e-05,0.001329,-0.052345,-0.011309,-0.009021
std,42.730433,29172.55569,0.115537,1.101041,6.836255,0.366114,5.855892,1.105619,3.374766,2.804053,...,4.985273,0.143607,1.65117,3.655191,3.48734,0.007464,1.502351,1.669719,1.359132,1.57889
min,333.0,10026.0,-0.878327,-21.573203,-103.773092,-6.142966,-127.823474,-15.248615,-58.4955,-67.576353,...,-156.882235,-6.159185,-33.744408,-41.978914,-98.90031,-0.313985,-59.270893,-39.742164,-47.126618,-50.278151
25%,373.0,28804.0,-0.042491,-0.607837,-2.97054,-0.180305,-1.415955,-0.549456,-1.072945,-0.967026,...,-1.474464,-0.022298,-1.445332,-1.498384,-0.629935,-0.000988,0.0,0.0,0.0,0.0
50%,412.0,76230.0,0.011617,-0.075705,-0.064966,-0.012961,-0.021375,-0.038768,0.008063,0.039998,...,-0.003743,0.0,-0.507821,0.049027,7.4e-05,-8e-06,0.0,0.0,0.0,0.0
75%,447.0,85631.0,0.06367,0.396091,1.954442,0.150035,0.855627,0.452346,1.149416,1.177155,...,1.276629,0.021176,0.190646,1.95751,0.669054,0.000999,0.0,0.0,0.0,0.0
max,480.0,93429.0,13.49505,18.631629,164.128853,7.962762,202.013038,28.004224,150.987922,52.155143,...,107.596613,7.383472,21.195844,51.921293,85.99607,0.38627,53.764546,50.373022,77.237319,35.863976


In [None]:
# # Calculate predicted returns using factors and betas (dot product)

# lookup_set = set(linear_betas[['monthid', 'permno']].apply(tuple, axis=1))
# mask = all_monthly_data[['monthid', 'permno']].apply(tuple, axis=1).isin(lookup_set)
# pred_factors = all_monthly_data[mask][["monthid", "permno"] + predict_factors].set_index(["monthid", "permno"]).sort_index()

# lookup_set = set(pred_factors.index.values)
# mask = linear_betas[['monthid', 'permno']].apply(tuple, axis=1).isin(lookup_set)
# pred_betas = linear_betas[mask].set_index(["monthid", "permno"]).sort_index()

# pred_returns = (pred_factors * pred_betas[predict_factors]).sum(axis=1) # Dot product

In [None]:
# Using the scoring method in Step 10 in assignment

# Second stage regression
lambdas = {"monthid": []}
for factor in predict_factors:
    lambdas[f"{factor}"] = []
    
for monthid in validation_range:
    monthid_returns = linear_betas.loc[linear_betas["monthid"] == monthid]

    # If empty
    if monthid_returns.empty:
        continue

    explanatory_vars = monthid_returns[predict_factors + ["permno"]]
    explanatory_vars.sort_values(by="permno", inplace=True)
    explanatory_vars.set_index("permno", inplace=True)

    explained_var = monthid_returns[["permno", "RET"]]
    explained_var.sort_values(by="permno", inplace=True)
    explained_var.set_index("permno", inplace=True)
    
    model = linear_model.LinearRegression(n_jobs=len(predict_factors)).fit(explanatory_vars, 
                                                                           explained_var["RET"])

    lambdas["monthid"].append(monthid)

    for (i, factor) in enumerate(predict_factors):
        lambdas[factor].append(model.coef_[i])

lambdas_df = pd.DataFrame(lambdas)

# Get p values
t_stat_dict = {"factor": [], "t": [], "|t|": []}
t_dict = {}
for factor in predict_factors:
    lambdas = lambdas_df[factor]
    ttest = stats.ttest_ind(lambdas, np.zeros(len(lambdas))) # Compare to see if any lambdas are significantly different from zero
    p_value_dict['factor'].append(factor)
    p_value_dict['p-value'].append(ttest[1])
    
    t_stat_dict['factor'].append(factor)
    t_stat_dict['t'].append(ttest[0])
    t_stat_dict['|t|'].append(abs(ttest[0]))
    t_dict[factor] = ttest[0]

# t_df = pd.DataFrame.from_dict(t_stat_dict, orient='index').T.sort_values(by="|t|", ascending=False)

In [None]:
# Now we just need to get a z score for each factor measurement for each permno for each date in our validation window

predict_data = all_monthly_data[all_monthly_data["monthid"].isin(validation_range)]

z_scores_dict = {"monthid": [], "permno": []}
for factor in predict_factors:
    z_scores_dict[factor] = []

for permno in permnos:
    permno_factor_data = predict_data[predict_data["permno"] == permno]
    
    factor_z_scores = {}
    for factor in predict_factors:
        factor_z_scores[factor] = list(stats.zscore(list(permno_factor_data[factor])))
    
    for i, monthid in enumerate(list(permno_factor_data["monthid"])):
        z_scores_dict["monthid"].append(monthid)
        z_scores_dict["permno"].append(permno)
        for factor in predict_factors:
            z_scores_dict[factor].append(factor_z_scores[factor][i] if not np.isnan(factor_z_scores[factor][i]) else 0)

In [None]:
factor_z_scores = pd.DataFrame(z_scores_dict)
factor_z_scores

Unnamed: 0,monthid,permno,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,xret_20,...,Cto,pe_ttm,lag_log_size,10M2,volinc,recession_affinity,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y
0,321,49154.0,-0.834298,0.041435,-1.993774,-0.265633,-2.408345,0.628027,-0.805769,0.350613,...,-0.483266,0.464568,0.197673,0.706804,-0.297823,-0.420654,0.0,0.0,0.0,0.0
1,322,49154.0,-0.631478,-0.037278,-0.678258,-0.179864,-2.131695,-0.327727,0.374475,1.746096,...,-0.483266,0.464568,0.938589,0.234868,-0.297949,-0.420768,0.0,0.0,0.0,0.0
2,323,49154.0,-0.241371,-0.212688,-0.331841,-0.153648,-1.652467,-0.078984,0.060512,-0.042025,...,-0.483266,0.464568,1.319624,0.045761,-0.298076,-0.420883,0.0,0.0,0.0,0.0
3,324,49154.0,0.229745,-0.171501,-1.297742,-0.166243,-1.725624,-0.151830,-0.165489,-0.414167,...,-0.502104,2.155499,1.216539,0.352599,-0.298741,-0.421484,0.0,0.0,0.0,0.0
4,325,49154.0,0.106986,-0.498074,-2.695981,-0.178323,-1.731538,-0.518581,-0.899214,-0.510005,...,-0.502104,2.155499,1.117616,-1.045810,-0.299432,-0.058552,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201904,476,81917.0,-0.049597,-0.467960,1.185877,-0.502459,1.580640,0.967219,1.197797,0.241315,...,-1.376008,0.859485,1.404352,0.100932,-0.352980,-0.026884,0.0,0.0,0.0,0.0
201905,477,81917.0,-1.194662,-0.516166,0.231154,-0.411406,1.562005,-2.026804,-3.638825,-0.334420,...,-4.401725,-1.318649,1.656242,1.763341,-0.273759,0.048939,0.0,0.0,0.0,0.0
201906,478,81917.0,-0.698561,-0.866323,0.945258,-0.988335,0.537750,4.233691,2.962623,2.492361,...,-4.401725,-1.318649,1.857185,-0.891735,-0.213099,-0.447474,0.0,0.0,0.0,0.0
201907,479,81917.0,-1.453524,-1.249723,-1.465737,-1.082796,0.069890,-2.341338,1.229317,-0.911460,...,-4.401725,-1.318649,1.798426,-0.382979,-0.111495,-0.536618,0.0,0.0,0.0,0.0


In [None]:
scores = factor_z_scores.copy()
for factor in predict_factors:
    scores[factor] *= t_dict[factor]

scores.set_index(["monthid", "permno"], inplace=True)
scores = scores.sum(axis=1)
scores

monthid  permno 
321      49154.0     5.768554
322      49154.0    -3.711786
323      49154.0    -5.081180
324      49154.0    -0.144236
325      49154.0    -0.229519
                      ...    
476      81917.0    -6.109044
477      81917.0   -17.669839
478      81917.0   -25.835757
479      81917.0   -22.066390
480      81917.0   -27.090869
Length: 201909, dtype: float64

In [None]:
# TODO Sort these scores
# Then compare with actual returns and refer to CFM assignment to get performance results
# Now i sleep

# Machine Learning

We will attempt implementign **PLS** as our machine learning method as it provides the most potent return capabilities as outlined in Lecture 4's code example, and in the ML slides.

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid

In [None]:
# ML Code from ML Tutorials

# validation function
def validate_model(model_type, param_grid, x_train, y_train, x_validate, y_validate):
    # Special case for LinearRegression because it doesn't have hyperparameters to tune
    if model_type == LinearRegression:
        model = LinearRegression()
        model.fit(x_train, y_train)
        pred = model.predict(x_validate)
        r2 = r2_score(y_validate, pred)
        
        return r2
    else: # The other cases
        
        # Establishses the ParameterGrid
        model_param_grid = ParameterGrid(param_grid)
        
        # Initialize values
        best_MAE = 0
        best_r2 = 1
        best_config = None
        # Iterate through the parameter grid, fit models to the hyperparameters
        # and check for MAE and R2 values
        
        # each param_config in that validation function would represent 1 combination of the possible parameters.
        # for example in Lab 6, when I'm validating for the elastic net regression, I have 
        # 2 possible hyperparameters: alpha and l1_ratio. 
        #alpha can take on values 0.0001, 0.0005, etc, and l1_ratio can take on values 0, 1, 0.01. 
        #So each param_config in the for loop in validate_model would go over 1 possible 
        #combination of the hyperparameter and keep the one that gives us the best MAE/R2
        for param_config in model_param_grid:
            curr_config_MAEs = []
            model = model_type(**param_config)
            model.fit(x_train, y_train)
            pred = model.predict(x_validate)
            MAE = mean_squared_error(y_validate,pred)
            r2 = r2_score(y_validate, pred)
            curr_config_MAEs.append(MAE)
            if best_MAE == 0 or (MAE < best_MAE):
                best_MAE = MAE
                best_config = param_config
            if best_r2 == 1 or (r2 > best_r2):
                best_r2 = r2
        return best_config, best_MAE, best_r2

# Predictions
def pred(model_type, x_train, y_train, x_test, y_test):
    # Fit model and predict 
    model = model_type.fit(x_train, y_train)
    pred = model.predict(x_test)
    
    # Format prediction as DataFrame
    pred_df = pd.DataFrame(pred, columns = ['RET_pred'])
    pred_df.set_index(x_test.index, inplace = True)
    
    r2 = r2_score(y_test, pred)
    return pred_df, r2

In [None]:
# using a 60/20/20 split
# train, validate, test = \
#                         np.split(all_monthly_data.sample(frac=1, random_state=42), 
#                         [int(.6*len(all_monthly_data)), int(.8*len(all_monthly_data))])

# No subset
train, validate, test = \
                        np.split(all_monthly_data,
                        [int(.6*len(all_monthly_data)), int(.8*len(all_monthly_data))])

x_train = train[model_factors + ["yyyymm", "permno"]].set_index(["yyyymm", "permno"])
y_train = train[['RET', "yyyymm", "permno"]].set_index(["yyyymm", "permno"])

x_validate = validate[model_factors + ["yyyymm", "permno"]].set_index(["yyyymm", "permno"])
y_validate = validate[['RET', "yyyymm", "permno"]].set_index(["yyyymm", "permno"])

x_test = test[model_factors + ["yyyymm", "permno"]].set_index(["yyyymm", "permno"])
y_test = test[['RET', "yyyymm", "permno"]].set_index(["yyyymm", "permno"])

In [None]:
x_train

Unnamed: 0_level_0,Unnamed: 1_level_0,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,xret_20,xret_indsize_20,xret_indsize_std20,...,Cto,pe_ttm,lag_log_size,10M2,volinc,recession_affinity,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y
yyyymm,permno,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
198012.0,10145.0,0.374586,0.027456,15.617313,0.022920,15.145167,-0.045500,-0.051723,-0.032998,-0.103709,0.024948,...,0.303363,6.286484,7.566481,0.013658,0.040110,60.164922,0.0,0.0,0.0,0.0
198012.0,10241.0,0.188068,0.018115,13.277532,0.019202,13.227823,-0.033571,-0.063374,-0.141331,-0.036481,0.009797,...,0.432945,6.663567,6.345893,0.013658,0.009316,13.974516,0.0,0.0,0.0,0.0
198012.0,10460.0,0.324759,0.036155,13.062007,0.027327,12.680238,0.077472,0.109122,0.094566,0.068687,0.029715,...,0.409445,12.956169,5.905587,0.013658,0.007682,11.523463,0.0,0.0,0.0,0.0
198012.0,10516.0,0.170149,0.031346,15.249913,0.031163,15.026468,-0.048646,-0.008587,-0.005626,0.084091,0.022241,...,0.580968,10.866713,7.397096,0.013658,0.014242,21.362453,0.0,0.0,0.0,0.0
198012.0,10866.0,0.273820,0.008009,11.612416,0.013282,11.810300,0.002714,-0.027249,-0.088989,-0.020212,0.009537,...,0.548834,4.723448,5.526573,0.013658,0.002783,4.174269,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200812.0,44206.0,-0.256424,0.055367,17.200459,0.038970,17.022322,-0.053123,0.005273,0.104616,0.038293,0.019833,...,0.118567,14.900494,8.196378,0.073257,0.029574,44.361233,0.0,0.0,0.0,0.0
200812.0,44274.0,-0.299851,0.097975,15.410891,0.067219,15.254340,0.174990,0.091204,0.344395,0.258588,0.039614,...,0.253302,18.345311,6.115437,0.073257,0.032275,48.412342,0.0,0.0,0.0,0.0
200812.0,44329.0,-0.282099,0.056087,16.510059,0.038861,16.602987,-0.062214,-0.068661,-0.020760,-0.010352,0.023586,...,0.120839,81.149296,7.539904,0.073257,0.031923,47.884848,0.0,0.0,0.0,0.0
200812.0,44601.0,-0.282099,0.061162,17.503754,0.043618,17.687756,0.037796,-0.016867,-0.031714,-0.021307,0.016859,...,0.257517,15.637459,8.103341,0.073257,0.015421,23.131971,0.0,0.0,0.0,0.0


In [None]:
pls_grid = dict()
pls_grid['n_components'] = np.arange(1, len(model_factors)+1, 1)

pls_best_config, pls_best_MAE, pls_best_r2 = validate_model(PLSRegression, pls_grid, x_train, y_train, x_validate\
                                                            , y_validate)
print('Best config:' + str(pls_best_config))
print('Validation R2: ' + str(pls_best_r2))

Best config:{'n_components': 10}
Validation R2: 0.0018752476200657453


In [None]:
# Run PLS with best determined components
pls_pred_df, pls_test_r2 = pred(PLSRegression(pls_best_config['n_components']), x_train, y_train, x_test, y_test)

In [None]:
pls_pred_df.describe()

Unnamed: 0,RET_pred
count,85555.0
mean,0.006113
std,0.009086
min,-0.052593
25%,0.000425
50%,0.005875
75%,0.01122
max,0.122856


# Performance Analysis

In [None]:
def generate_monthly_portfolios(predicted_returns, actual_returns) -> pd.DataFrame:
    pass

In [None]:
def total_ret(port_ret):
    return port_ret.sum()
    # return np.prod(port_ret + 1) - 1

def tracking_error(port_ret, bench_ret):
    return (port_ret - bench_ret).std()

def information_ratio(port_ret, bench_ret):
    return (total_ret(port_ret) - total_ret(bench_ret)) / tracking_error(port_ret, bench_ret)

def sharpe_ratio(port_ret, rf_ret):
    return information_ratio(port_ret, rf_ret)

def sharpe_ratio(port_xret):
    return total_ret(port_xret) / port_xret.std()

In [None]:
# Write Permnos - for WRDS Queries

# with open("permnos.txt", "w") as file:
#     for permno in permnos:
#         file.write(f"{int(permno)},\n")