# Final Project

- Saige Belanger
    - (20951877)
- Dylan Faelker
    - (20960747)
- Ethan Liu
    - (20959615)
- Timothy Zheng
    - t54zheng (20939203)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn import linear_model
import statsmodels.api as sm
import scipy.stats as stats
from math import sqrt
import math

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
import datetime as dt

import os.path

warnings.filterwarnings('ignore')

# Factors
We start with an initial list of factors from the provided list of 50 Factors in the ML examples.

TODO: Increase our breadth of factors to the category chosen by downloading and creating them, then merging them 

https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/

In [2]:
all_monthly_data = pd.read_sas("merged_df.sas7bdat", encoding = 'ISO-8859-1')

In [3]:
# Save all gvkeys - for WRDS Queries

# with open("gvkeys.txt", "w") as file:
#     for gvkey in set(all_monthly_data["gvkey"].dropna()):
#         file.write(f"{int(gvkey)},\n")

In [4]:
permnos = set(all_monthly_data["permno"])

In [5]:
gvkey_permno_dict = {}
for gvkey in set(all_monthly_data["gvkey"].dropna()):
    permno = all_monthly_data[all_monthly_data["gvkey"] == gvkey]["permno"].dropna().iloc[0]
    gvkey_permno_dict[gvkey] = permno

In [6]:
all_monthly_data.drop(["ticker", "conm", "gvkey", "cusip", "naics", "gsubind"], axis=1, inplace=True) # We don't use these columns anyway, drop them

In [7]:
factors = ['IM', 'range_20', 'log_vol_dollar_20',
       'range_120', 'log_vol_dollar_120', 'xret_5', 'xret_10', 'xret_20',
       'xret_indsize_20', 'xret_indsize_std20', 'xret_40', 'xret_120',
       'xret_indsize_120', 'xret_indsize_std120', 'KDJ_20', 'deviation_pct20',
       'MoneyFlowIndex_20', 'RSI_20', 'KDJ_120', 'deviation_pct120',
       'MoneyFlowIndex_120', 'RSI_120', 'IV_capm', 'mdr', 'ami_3', 'beta_3y',
       'beta_5y', 'tail_2y', 'dp', 'leverage', 'BL', 'roe', 'roa',
       'profitability', 'sales_g_q', 'sales_g_ttm', 'op_income_g_q', 'ni_g_q',
       'op_income_g_ttm', 'ni_g_ttm', 'sue_NI', 'BM', 'AM', 'EP', 'SP',
       'roe_q', 'roa_q', 'Cto', 'pe_ttm', 'lag_log_size']

ret_cols = ['ret_f1', 'ret_f2', 'ret_f3', 'ret_f4', 'ret_f5', 'ret_f6', 
            'ret_f7', 'ret_f8', 'ret_f9', 'ret_f10', 'ret_f11', 'ret_f12']

In [8]:
non_data_cols = [x for x in all_monthly_data.columns if x not in factors and x not in ret_cols]
non_data_cols

['permno', 'yyyymm', 'monthid', 'PRC', 'VOL', 'RET', 'SHROUT']

# Adding New Factors
* When you add a factor, document it here: [link](https://docs.google.com/spreadsheets/d/1rs9633QSYLVY5Z5DoGNy3USP2MROGtqTIKcbLG68wpE/edit#gid=1579135478) and fill properly
* Download the data file, if it's too large add it to the drive
* Also download the other files that arent on github but on the drive before working on this part of the notebook
    * https://drive.google.com/drive/u/0/folders/1D1eIYlkNxNLfzHJLzkGeE9ymr7doXg_6

## IMPORTANT NOTE - FACTOR/RETURN TIME
- When adding factors make sure you add such that factor is reported at t-1, **RET** has **T** returns (in same row)
- This means you need to download data from the range **(1979-12 to 2019-11)**

***

- Treasury and CPI Rates: [Link](https://wrds-www.wharton.upenn.edu/pages/get-data/center-research-security-prices-crsp/annual-update/index-treasury-and-inflation/us-treasury-and-inflation-indexes/)
- Federal Reserve Data: [Link](https://wrds-www.wharton.upenn.edu/pages/get-data/federal-reserve-bank-reports/interest-rates/data/)
- SEC Filings: https://wrds-www.wharton.upenn.edu/pages/get-data/wrds-sec-analytics-suite/wrds-sec-filings-queries/list-of-filings-exhibits/
- Analyzed Data: https://wrds-www.wharton.upenn.edu/pages/get-data/wrds-sec-analytics-suite/wrds-sec-text-analysis/readability-and-sentiment/

**TBD**
- Other Factors: Downloaded from https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/

In [9]:
# Add new generated factors here
recession_factors = ["10M2", "volinc", "recession_affinity"]
tone_factors = ['10Q_tone_3m', '10Q_tone_6m', 
                '10K_tone_1y', 
                '8K_tone_1y',]
factors += recession_factors
factors += tone_factors

# SEC Filings Sentiment Factors

In [10]:
sec_analytics = pd.read_sas("sec_filing_analysis_wrds.sas7bdat", encoding = 'ISO-8859-1')

In [11]:
sec_analytics["PERMNO"] = sec_analytics["GVKEY"].map(gvkey_permno_dict)

In [12]:
# Import from Dylan's code
sec_analytics['yyyymm'] = (sec_analytics['FDATE'].astype('string').str[:4] + sec_analytics['FDATE'].astype('string').str[5:7]).astype('float64')
sec_analytics_subset_forms = sec_analytics[(sec_analytics['FORM'].isin(['10-K', '10-Q', '8-K']))][["PERMNO", "yyyymm", "FORM", "LM_POSITIVE_COUNT", "LM_NEGATIVE_COUNT", "WORD_COUNT"]]

sec_analytics_subset_forms['tone'] = (sec_analytics_subset_forms['LM_POSITIVE_COUNT'] - sec_analytics_subset_forms['LM_NEGATIVE_COUNT']) / sec_analytics_subset_forms['WORD_COUNT']

sec_analytics_subset_forms.head()

Unnamed: 0,PERMNO,yyyymm,FORM,LM_POSITIVE_COUNT,LM_NEGATIVE_COUNT,WORD_COUNT,tone
0,86594.0,201306.0,10-K,2387.0,2500.0,260142.0,-0.000434
9,86594.0,200703.0,10-Q,110.0,115.0,14978.0,-0.000334
11,86594.0,201209.0,8-K,35.0,31.0,3111.0,0.001286
14,86594.0,200502.0,8-K,279.0,931.0,53640.0,-0.012155
17,86594.0,200606.0,8-K,15.0,6.0,907.0,0.009923


In [13]:

def yyyymm_add(yyyymm, years, months):
    return yyyymm + ((years + math.floor(months / 12)) * 100) + (months % 12)

def yyyymm_sub(yyyymm, years, months):
    return yyyymm - ((years + math.floor(months / 12)) * 100) - (months % 12)

In [14]:
!pip install multiprocess



In [15]:
from multiprocess import cpu_count # You might have to change to multiprocessing if on windows
from multiprocess.pool import ThreadPool

In [16]:
# Threaded Approach. Takes estimate >2h
def calc_tone_factors(permno):
    results = []

    for date in all_monthly_data['yyyymm'].unique():
        sentiments = {}
        # when there is no relevant data. Saves time
        if sec_analytics_subset_forms.loc[(permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0))].shape[0] == 0:
            sentiments = {"permno": permno, 
                            "yyyymm": date,
                            '10Q_tone_3m': 0,
                            '10Q_tone_6m': 0,
                            '10K_tone_1y': 0,
                            '8K_tone_1y': 0,
                            }
        elif sec_analytics_subset_forms.loc[(permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 6))].shape[0] == 0:
            sentiments = {"permno": permno, 
                            "yyyymm": date,
                            '10Q_tone_3m': 0,
                            '10Q_tone_6m': 0,
                            '10K_tone_1y': sec_analytics_subset_forms.loc[('10-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            '8K_tone_1y': sec_analytics_subset_forms.loc[('8-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            }
        elif sec_analytics_subset_forms.loc[(permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 3))].shape[0] == 0:
            sentiments = {"permno": permno, 
                            "yyyymm": date,
                            '10Q_tone_3m': 0,
                            '10Q_tone_6m': sec_analytics_subset_forms.loc[('10-Q' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 6)), 'tone'].sum(),
                            '10K_tone_1y': sec_analytics_subset_forms.loc[('10-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            '8K_tone_1y': sec_analytics_subset_forms.loc[('8-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            }
        else:
            sentiments = {"permno": permno, 
                            "yyyymm": date,
                            '10Q_tone_3m': sec_analytics_subset_forms.loc[('10-Q' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 3)), 'tone'].sum(),
                            '10Q_tone_6m': sec_analytics_subset_forms.loc[('10-Q' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 0, 6)), 'tone'].sum(),
                            '10K_tone_1y': sec_analytics_subset_forms.loc[('10-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            '8K_tone_1y': sec_analytics_subset_forms.loc[('8-K' == sec_analytics_subset_forms['FORM']) & (permno == sec_analytics_subset_forms['PERMNO']) & (date > sec_analytics_subset_forms['yyyymm']) & (date <= yyyymm_add(sec_analytics_subset_forms['yyyymm'], 1, 0)), 'tone'].sum(),
                            }
        results.append(sentiments)
    
    return results

permnos = set(all_monthly_data["permno"])
with ThreadPool(cpu_count() - 1) as P:
    sentiments = P.map(calc_tone_factors, permnos)
    sentiments = [item for sublist in sentiments for item in sublist]
    sentiments = pd.DataFrame(sentiments)
    all_monthly_data = pd.merge(all_monthly_data, sentiments, on=["permno", "yyyymm"], how='outer')

In [17]:
all_monthly_data[all_monthly_data["permno"] == 86594.0].sort_values(by="monthid").head()

Unnamed: 0,permno,yyyymm,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,ret_f7,ret_f8,ret_f9,ret_f10,ret_f11,ret_f12,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y
578148,86594.0,199901.0,229.0,,,,,,,,...,0.223744,0.376866,-0.03523,0.016854,0.607735,-0.053265,0.0,0.0,0.0,0.0
578149,86594.0,199902.0,230.0,,,,,,,,...,0.376866,-0.03523,0.016854,0.607735,-0.053265,0.088929,0.0,0.0,0.0,0.0
578150,86594.0,199903.0,231.0,,,,,,,,...,-0.03523,0.016854,0.607735,-0.053265,0.088929,-0.206667,0.0,0.0,0.0,-0.023482
578151,86594.0,199904.0,232.0,,,,,,,,...,0.016854,0.607735,-0.053265,0.088929,-0.206667,-0.109244,0.000887,0.000887,0.0,-0.023482
578152,86594.0,199905.0,233.0,,,,,,,,...,0.607735,-0.053265,0.088929,-0.206667,-0.109244,-0.183962,0.000887,0.000887,0.0,-0.023482


# Recession Factor
- During recessions, companies that sell essential products/services typically outperform companies that offer products that are categorized as discretionary spending by consumers.
- Basically we want to come up with a factor such that during recessions, the factor is high for companies selling essential products/services and low for companies producing goods/services that are highly sensitive to recessions. Then we want the factor to be flipped when the company is out of a recession.
- We will categorize a recession as whenever the yield curve is inverted

### Proxy factors
- [10M2] Yield curve: US Treasury 10Y - US Treasury 2 Year: https://fred.stlouisfed.org/series/T10Y2YM
    - Recession if 10M2 < 0 [inverted yield curve]

- [volinc] Annual Income Volatility: Standard Deviation of annual net income growth [ni_g_ttm] for past 5 years, minimum past 1 year
    - Using trailing twelve month (TTM) measure because it avoids any seasonality considerations

In [18]:
all_monthly_data[all_monthly_data["permno"] == 86594.0][["ni_g_ttm", "yyyymm"]].dropna().head(12)

Unnamed: 0,ni_g_ttm,yyyymm
578160,-0.319421,200001.0
578161,-0.319421,200002.0
578162,-0.319421,200003.0
578163,-0.298619,200004.0
578164,-0.298619,200005.0
578165,-0.298619,200006.0
578166,0.319728,200007.0
578167,0.319728,200008.0
578168,0.319728,200009.0
578169,0.334539,200010.0


In [19]:
# Add annual income volatility - std of ni_g_ttm for past 5 years, minimum of past 1 year

volinc = {"yyyymm": [], "permno": [], "volinc": []}
for permno in permnos:
    ni_g_ttm = all_monthly_data[all_monthly_data["permno"] == permno][["ni_g_ttm", "yyyymm"]].dropna()
    date_range = sorted(list(ni_g_ttm["yyyymm"]))

    for i, yyyymm in enumerate(date_range):
        
        window = set(date_range[max(0, i-59):i+1]) # Look past 5 years (60 months)
        window_data = ni_g_ttm[ni_g_ttm["yyyymm"].isin(window)]
        
        if len(window_data) < 12:
            continue

        # Add std ni_g_ttm of past 5 years to volinc factor
        # Note this avoids lookahead bias because the data up to and including i
        # should be known (since ni_g_ttm is from i-1 as per data manual)
        volinc["yyyymm"].append(yyyymm)
        volinc["permno"].append(permno)
        volinc["volinc"].append(window_data["ni_g_ttm"].std())

volinc_df = pd.DataFrame(volinc)
volinc_df.head()

Unnamed: 0,yyyymm,permno,volinc
0,198012.0,49154.0,0.023537
1,198101.0,49154.0,0.02416
2,198102.0,49154.0,0.024472
3,198103.0,49154.0,0.024585
4,198104.0,49154.0,0.023754


In [20]:
all_monthly_data = pd.merge(all_monthly_data, volinc_df, on=["yyyymm", "permno"], how="outer")

## 10M2 Yield Curve

In [21]:
treasury_inflation = pd.read_sas("treasury_inflation.sas7bdat", encoding = 'ISO-8859-1')

In [22]:
fact_10M2 = treasury_inflation[["CALDT", "B2RET", "B10RET"]]

# Add one month to fit RET and factor time
fact_10M2["yyyymm"] = (fact_10M2["CALDT"] + pd.DateOffset(months=1)).dt.strftime("%Y%m").astype(float)

In [23]:
fact_10M2["10M2"] = fact_10M2["B10RET"] - fact_10M2["B2RET"]
fact_10M2

Unnamed: 0,CALDT,B2RET,B10RET,yyyymm,10M2
0,1979-12-31,0.005695,0.011951,198001.0,0.006256
1,1980-01-31,-0.000164,-0.037477,198002.0,-0.037313
2,1980-02-29,-0.036947,-0.050507,198003.0,-0.013560
3,1980-03-31,0.010329,0.048345,198004.0,0.038016
4,1980-04-30,0.084198,0.084375,198005.0,0.000177
...,...,...,...,...,...
477,2019-09-30,-0.001297,-0.013852,201910.0,-0.012555
478,2019-10-31,0.003274,-0.000742,201911.0,-0.004016
479,2019-11-29,-0.001038,-0.007410,201912.0,-0.006372
480,2019-12-31,0.002274,-0.011292,202001.0,-0.013566


In [24]:
all_monthly_data = pd.merge(all_monthly_data, fact_10M2[["yyyymm", "10M2"]], on="yyyymm")

## Recession Affinity
* Recession affinity is calculated as

- 1 / volinc **if 10M2 < 0** (recession)
- volinc $\times$ 1500 **if 10M2 > 0** (no recession)
    - TBH Times 1500 descision is arbitrary but it makes sense in "levelling" both sides of the variable, ie:
    - Values when 10M2 < 0 and 10M2 > 0 are relatively equal

In [25]:
all_monthly_data["recession_affinity"] = np.where(all_monthly_data["10M2"] < 0, 1 / all_monthly_data["volinc"], 1500 * all_monthly_data["volinc"])

In [26]:
test = all_monthly_data[["10M2", "volinc", "recession_affinity"]].dropna()

In [27]:
test[test["10M2"] < 0]["recession_affinity"].mean()

105.45824976257666

In [28]:
test[test["10M2"] > 0]["recession_affinity"].mean()

78.82993778650686

In [29]:
all_monthly_data

Unnamed: 0,permno,yyyymm,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,ret_f10,ret_f11,ret_f12,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y,volinc,10M2,recession_affinity
0,10026.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
1,10032.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
2,10051.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
3,10104.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
4,10107.0,198001.0,,,,,,,,,...,,,,0.000000,0.000000,0.000000,0.000000,,0.006256,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718555,93374.0,201912.0,480.0,-0.147586,0.010868,17.310472,0.015246,17.388666,0.000246,-0.019599,...,-0.124141,0.086342,0.075351,-0.010593,-0.020941,-0.011014,-0.068925,0.008777,-0.006372,113.930650
718556,93419.0,201912.0,480.0,-0.147586,0.012941,16.775314,0.015804,16.961089,0.008011,0.004462,...,-0.121751,0.349948,-0.066538,-0.001085,-0.003564,-0.002398,-0.050815,0.010539,-0.006372,94.884444
718557,93422.0,201912.0,480.0,-0.046054,0.059361,16.763782,0.067228,17.048398,0.015794,-0.033674,...,-0.003102,0.788889,0.484472,-0.018540,-0.037721,-0.017328,-0.178637,0.125235,-0.006372,7.984961
718558,93427.0,201912.0,480.0,0.260954,0.025465,16.473392,0.028009,16.544964,-0.005524,-0.006388,...,-0.047755,0.138121,0.135851,-0.005819,-0.005819,-0.004089,-0.018317,0.016243,-0.006372,61.566110


***
# Data Cleanup
Done creating all factors, will clean up data before training step 

In [30]:
# Inputation - as in ML Lecture 1

# Drop NA in all non-numerical columns
all_monthly_data.dropna(subset=non_data_cols, inplace=True)

grouped_med = all_monthly_data.groupby(by='monthid')
# the lambda function gets the median per group in the groupby object, and fills the NaN values with the median per group
imputed_grouped = grouped_med.transform(lambda y: y.fillna(y.median()))

# This line assigns the values of the medians 
all_monthly_data = all_monthly_data.assign(**imputed_grouped.to_dict(orient='series'))
all_monthly_data.dropna(inplace=True)

In [31]:
# Filtering data by min price and min market share for each year

# Commenting out for runtime - does not drop any rows

# all_monthly_data['yyyy'] = all_monthly_data['yyyymm'].astype(str).str[:4]
# all_monthly_data['MKTSHR'] = all_monthly_data['PRC'] * all_monthly_data['SHROUT'] * 1_000

# to_drop_indices = []

# for permno in all_monthly_data.permno.unique():
#     for year in all_monthly_data['yyyy'].unique():
#         mask = (all_monthly_data['permno'] == permno) & (all_monthly_data['yyyy'] == year)
#         if all_monthly_data[mask].shape[0] != 0 != 0 and (all_monthly_data[mask]['MKTSHR'].iloc[0] < 100_000_000 or all_monthly_data[mask]['PRC'].iloc[0] <= 5):
#             to_drop_indices += list(all_monthly_data[mask].index)
# all_monthly_data.drop(to_drop_indices, inplace=True)

In [32]:
# Winsorizing factors--should winsorize the variables by quarter
for column in factors:
    for date in set(list(all_monthly_data["monthid"])):
        mask = (all_monthly_data["monthid"] == date)
        
        std = all_monthly_data[column][mask].std()
        mean = all_monthly_data[column][mask].mean()

        upper = mean + 3 * std
        lower = mean - 3 * std
        
        all_monthly_data[column][mask].clip(lower, upper, inplace= True)

In [33]:
all_monthly_data

Unnamed: 0,permno,yyyymm,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,ret_f10,ret_f11,ret_f12,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y,volinc,10M2,recession_affinity
16474,10145.0,198012.0,12.0,0.374586,0.027456,15.617313,0.022920,15.145167,-0.045500,-0.051723,...,0.141141,0.002105,-0.066489,0.000000,0.000000,0.000000,0.000000,0.040110,0.013658,60.164922
16476,10241.0,198012.0,12.0,0.188068,0.018115,13.277532,0.019202,13.227823,-0.033571,-0.063374,...,0.152756,-0.034843,-0.007220,0.000000,0.000000,0.000000,0.000000,0.009316,0.013658,13.974516
16486,10460.0,198012.0,12.0,0.324759,0.036155,13.062007,0.027327,12.680238,0.077472,0.109122,...,0.106796,0.093860,0.085020,0.000000,0.000000,0.000000,0.000000,0.007682,0.013658,11.523463
16489,10516.0,198012.0,12.0,0.170149,0.031346,15.249913,0.031163,15.026468,-0.048646,-0.008587,...,0.079316,0.087302,0.124088,0.000000,0.000000,0.000000,0.000000,0.014242,0.013658,21.362453
16500,10866.0,198012.0,12.0,0.273820,0.008009,11.612416,0.013282,11.810300,0.002714,-0.027249,...,0.049107,0.004255,-0.023220,0.000000,0.000000,0.000000,0.000000,0.002783,0.013658,4.174269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718555,93374.0,201912.0,480.0,-0.147586,0.010868,17.310472,0.015246,17.388666,0.000246,-0.019599,...,-0.124141,0.086342,0.075351,-0.010593,-0.020941,-0.011014,-0.068925,0.008777,-0.006372,113.930650
718556,93419.0,201912.0,480.0,-0.147586,0.012941,16.775314,0.015804,16.961089,0.008011,0.004462,...,-0.121751,0.349948,-0.066538,-0.001085,-0.003564,-0.002398,-0.050815,0.010539,-0.006372,94.884444
718557,93422.0,201912.0,480.0,-0.046054,0.059361,16.763782,0.067228,17.048398,0.015794,-0.033674,...,-0.003102,0.788889,0.484472,-0.018540,-0.037721,-0.017328,-0.178637,0.125235,-0.006372,7.984961
718558,93427.0,201912.0,480.0,0.260954,0.025465,16.473392,0.028009,16.544964,-0.005524,-0.006388,...,-0.047755,0.138121,0.135851,-0.005819,-0.005819,-0.004089,-0.018317,0.016243,-0.006372,61.566110


## Factor Code

In [34]:
ff4_factors = pd.read_sas("ff4_factors.sas7bdat", encoding = 'ISO-8859-1')
ff4_factors["monthid"] = ff4_factors.index + 1
ff4_factors.head()

Unnamed: 0,DATEFF,SMB,HML,MKTRF,RF,UMD,monthid
0,1980-01-31,0.0162,0.0175,0.0551,0.008,0.0755,1
1,1980-02-29,-0.0185,0.0061,-0.0122,0.0089,0.0788,2
2,1980-03-31,-0.0664,-0.0101,-0.129,0.0121,-0.0955,3
3,1980-04-30,0.0105,0.0106,0.0397,0.0126,-0.0043,4
4,1980-05-30,0.0213,0.0038,0.0526,0.0081,-0.0112,5


In [35]:
dates = [int(x) for x in sorted(list(set(list(all_monthly_data["yyyymm"]))))]
dates[0], dates[-1]

(198012, 201912)

In [36]:
monthids = [int(x) for x in sorted(list(set(list(all_monthly_data["monthid"]))))]
monthids[0], monthids[-1], len(monthids)

(12, 480, 469)

In [37]:
testing_range = monthids[0:2*(len(monthids) // 3)]
validation_range = monthids[2 * len(monthids) // 3:]

# Validate that ranges have correct ratios
len(testing_range) / len(monthids), len(validation_range) / len(monthids), len(testing_range) + len(validation_range)

(0.6652452025586354, 0.3347547974413646, 469)

## Testing Factors

In [56]:
model_factors = ['IM', 'range_20', 'log_vol_dollar_20',
       'range_120', 'log_vol_dollar_120', 'xret_5', 'xret_10', 'xret_20',
       'xret_indsize_20', 'xret_indsize_std20', 'xret_40', 'xret_120',
       'xret_indsize_120', 'xret_indsize_std120', 'KDJ_20', 'deviation_pct20',
       'MoneyFlowIndex_20', 'RSI_20', 'KDJ_120', 'deviation_pct120',
       'MoneyFlowIndex_120', 'RSI_120', 'IV_capm', 'mdr', 'ami_3', 'beta_3y',
       'beta_5y', 'tail_2y', 'dp', 'leverage', 'BL', 'roe', 'roa',
       'profitability', 'sales_g_q', 'sales_g_ttm', 'op_income_g_q', 'ni_g_q',
       'op_income_g_ttm', 'ni_g_ttm', 'sue_NI', 'BM', 'AM', 'EP', 'SP',
       'roe_q', 'roa_q', 'Cto', 'pe_ttm', 'lag_log_size']

In [57]:
model_factors += recession_factors
model_factors += tone_factors

In [40]:
all_monthly_data = pd.merge(ff4_factors, all_monthly_data, on="monthid")

## [m, n, l] model for Fama-MacBeth Double Regression
We will use the technique employed during Assignment 2, utilizing a 36-month lookback for factor data to generate our betas (**First Stage**)
* For period $t_i$, we will use data starting at $t_{i-36} ... t_{i-1}$ if available. Worst case we look for 12 prior samples.

In [58]:
# Threaded Approach
def add_betas(permno):
    results = []
    for (i, monthid) in enumerate(testing_range): 
        window = set(testing_range[max(0, i-35):i+1]) # t_(i-36) to t_(i-1) returns. Compare to t_i returns
        window_data = all_monthly_data[(all_monthly_data["permno"] == permno) & (all_monthly_data["monthid"].isin(window))]
        
        if len(window_data) < 12:
            continue

        explanatory_vars = window_data[model_factors + ["monthid"]]
        explanatory_vars.sort_values(by="monthid", inplace=True)
        explanatory_vars.set_index("monthid", inplace=True)
    
        explained_var = window_data[["monthid", "RET"]] # Since factors are from t-1
        explained_var.sort_values(by="monthid", inplace=True)
        explained_var.set_index("monthid", inplace=True)
        
        model = linear_model.LinearRegression().fit(explanatory_vars, 
                                                    explained_var["RET"])
        
        results.append({"monthid": monthid, 
                        "permno": permno, 
                        "RET": explained_var["RET"].iloc[-1], 
                    **{f"{factor}": model.coef_[i] for i, factor in enumerate(model_factors)}
                       })  
    return results

# UNCOMMENT THIS AND ADD TO LINE BELOW INSTEAD OF `permnos`
# FOR DEVELOPMENT - THIS CODE BLOCK TAKES LIKE 30 MINS TO RUN

# smaller_permno_list = list(permnos)[:10]

# Only compute if not in files (delete local copy of file if code above if modifying factors or code above)
if os.path.isfile("first_stage_df.csv"):
    first_stage_df = pd.read_csv("first_stage_df.csv", index_col=0)
else:
    # Runs once basically
    summary_results = []
    with ThreadPool(cpu_count() - 1) as P:
        summary_results = P.map(add_betas, permnos)
        summary_results = [item for sublist in summary_results for item in sublist]
        first_stage_df = pd.DataFrame(summary_results)
        
        # Save first stage df for easy loading
        first_stage_df.to_csv("first_stage_df.csv")
first_stage_df

Unnamed: 0,monthid,permno,RET,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,Cto,pe_ttm,lag_log_size,10M2,volinc,recession_affinity,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y
0,23,49154.0,-0.044053,-0.625612,-0.029899,-0.215916,-0.000078,0.176263,0.208140,0.275639,...,-0.003265,-0.063603,0.077816,-0.085844,-0.000208,-0.002478,0.0,0.0,0.0,0.0
1,24,49154.0,0.105991,-0.623237,-0.026931,-0.204025,0.000630,0.231143,0.214112,0.272743,...,0.000618,-0.009450,0.073351,-0.088219,-0.000370,-0.001777,0.0,0.0,0.0,0.0
2,25,49154.0,-0.037500,-0.471871,-0.052081,-0.160436,-0.002516,0.602752,0.117984,0.308181,...,-0.018240,-0.108668,-0.156348,-0.203525,-0.000026,-0.009635,0.0,0.0,0.0,0.0
3,26,49154.0,0.065801,-0.531017,-0.043607,-0.104008,-0.000262,0.450023,0.100204,0.294732,...,-0.020370,-0.067640,-0.154103,-0.187905,-0.000079,-0.005512,0.0,0.0,0.0,0.0
4,27,49154.0,0.102459,-0.453954,-0.039963,-0.109926,0.011565,0.409227,0.445027,0.496351,...,-0.018164,0.006692,-0.135691,-0.259197,-0.001181,-0.011085,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218148,319,81917.0,0.008880,-0.233867,29.197616,-0.661912,3.984618,0.653929,0.468958,-12.452554,...,0.645369,-0.241367,0.826280,3.750750,-4.889649,-0.003301,0.0,0.0,0.0,0.0
218149,320,81917.0,0.063117,-1.346986,6.541533,-0.228248,3.344047,-0.360264,3.329775,4.154671,...,5.665293,-0.061930,-2.063183,0.814365,-1.706008,-0.006672,0.0,0.0,0.0,0.0
218150,321,81917.0,0.018998,-0.947802,9.570922,0.282730,0.746553,-0.274337,0.984295,14.885024,...,0.671193,0.090593,-1.614586,-1.629696,-1.120730,-0.008300,0.0,0.0,0.0,0.0
218151,322,81917.0,0.042373,-0.576227,9.272626,0.072836,0.440372,-0.039441,1.499367,19.390078,...,-3.309310,0.118523,-2.983006,-4.164838,-0.562649,-0.004929,0.0,0.0,0.0,0.0


In [59]:
# Second stage regression
lambdas = {"monthid": []}
for factor in model_factors:
    lambdas[f"{factor}"] = []
    
for monthid in testing_range:
    monthid_returns = first_stage_df.loc[first_stage_df["monthid"] == monthid]

    # If empty
    if monthid_returns.empty:
        continue
    
    explanatory_vars = monthid_returns[model_factors + ["permno"]]
    explanatory_vars.sort_values(by="permno", inplace=True)
    explanatory_vars.set_index("permno", inplace=True)

    explained_var = monthid_returns[["permno", "RET"]]
    explained_var.sort_values(by="permno", inplace=True)
    explained_var.set_index("permno", inplace=True)
    
    model = linear_model.LinearRegression(n_jobs=len(model_factors)).fit(explanatory_vars, 
                                                                         explained_var["RET"])

    lambdas["monthid"].append(monthid)

    for (i, factor) in enumerate(model_factors):
        lambdas[factor].append(model.coef_[i])

In [60]:
second_stage_df = pd.DataFrame(lambdas)
second_stage_df

Unnamed: 0,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,xret_20,xret_indsize_20,...,Cto,pe_ttm,lag_log_size,10M2,volinc,recession_affinity,10Q_tone_3m,10Q_tone_6m,10K_tone_1y,8K_tone_1y
0,23,-0.085602,-0.807203,-0.088980,2.145172,0.029114,-0.052883,0.004942,-0.159684,0.098861,...,-0.418945,0.009766,0.073242,0.525879,1.689026,0.880493,0.000000,0.000000,0.000000,0.000000
1,24,0.012421,-0.536728,0.006272,-1.288940,-0.006253,-0.021233,-0.004223,0.007023,-0.040352,...,-0.035172,0.020233,-0.007019,-0.043884,-0.733835,-1.756024,0.000000,0.000000,0.000000,0.000000
2,25,-0.006880,-0.138258,-0.012698,2.210381,-0.031583,-0.034765,0.019367,-0.012192,0.018813,...,0.078827,0.010437,-0.002716,-0.080566,1.828575,-1.594702,0.000000,0.000000,0.000000,0.000000
3,26,-0.001254,-0.166184,-0.023568,0.180650,0.035027,0.003068,-0.015250,0.007534,-0.050384,...,-0.192871,0.014160,-0.024902,-0.005371,0.864990,-2.716690,0.000000,0.000000,0.000000,0.000000
4,27,-0.034305,-0.224536,-0.036996,0.720723,-0.013081,0.012454,0.013125,0.002810,0.035872,...,-0.077972,-0.008713,-0.031586,-0.020630,0.623182,0.407532,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,319,-0.009238,0.001152,0.022948,0.000003,0.009135,0.001214,-0.001816,0.000456,0.000842,...,0.000150,0.012361,0.002696,-0.002230,-0.000788,0.474471,0.000498,0.003274,0.002294,0.006052
297,320,-0.001762,-0.000086,-0.016746,0.000710,0.002038,-0.002058,-0.002049,-0.003034,0.000216,...,0.000385,0.000760,-0.003976,-0.000420,-0.001638,-0.514351,0.005483,0.002841,0.002216,-0.003295
298,321,-0.000252,0.000363,-0.018266,0.000889,-0.006513,0.000370,-0.000524,-0.001237,-0.001462,...,0.000425,0.009634,-0.001832,0.000752,0.001492,-0.427796,-0.000752,-0.000449,-0.001784,-0.006287
299,322,0.000667,-0.000599,-0.006558,-0.000514,-0.001221,-0.003213,-0.003550,-0.002260,-0.002149,...,-0.000050,0.012942,-0.002421,-0.001403,-0.000056,0.112243,0.001962,0.000418,0.001460,-0.002947


In [61]:
# Get p values
p_value_dict = {"factor": [], "p-value": []}
for factor in model_factors:
    lambdas = second_stage_df[factor]
    ttest = stats.ttest_ind(lambdas, np.zeros(len(lambdas))) # Compare to see if any lambdas are significantly different from zero
    p_value_dict['factor'].append(factor)
    p_value_dict['p-value'].append(ttest[1])

results_df = pd.DataFrame.from_dict(p_value_dict, orient='index')
results_df.round(2).T.sort_values(by="p-value")

Unnamed: 0,factor,p-value
49,lag_log_size,0.017764
33,profitability,0.036061
42,AM,0.044332
51,volinc,0.051711
1,range_20,0.055326
44,SP,0.063329
4,log_vol_dollar_120,0.154361
55,10K_tone_1y,0.160247
46,roa_q,0.174638
21,RSI_120,0.179679


# TODO
use these results to determine which factors to keep (among other considerations like cross-correlation, if they are in the same category, etc)

# Machine Learning

We will attempt implementign **PLS** as our machine learning method as it provides the most potent return capabilities as outlined in Lecture 4's code example, and in the ML slides.

In [45]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid

In [46]:
# validation function
def validate_model(model_type, param_grid, x_train, y_train, x_validate, y_validate):
    # Special case for LinearRegression because it doesn't have hyperparameters to tune
    if model_type == LinearRegression:
        model = LinearRegression()
        model.fit(x_train, y_train)
        pred = model.predict(x_validate)
        r2 = r2_score(y_validate, pred)
        
        return r2
    else: # The other cases
        
        # Establishses the ParameterGrid
        model_param_grid = ParameterGrid(param_grid)
        
        # Initialize values
        best_MAE = 0
        best_r2 = 1
        best_config = None
        # Iterate through the parameter grid, fit models to the hyperparameters
        # and check for MAE and R2 values
        
        # each param_config in that validation function would represent 1 combination of the possible parameters.
        # for example in Lab 6, when I'm validating for the elastic net regression, I have 
        # 2 possible hyperparameters: alpha and l1_ratio. 
        #alpha can take on values 0.0001, 0.0005, etc, and l1_ratio can take on values 0, 1, 0.01. 
        #So each param_config in the for loop in validate_model would go over 1 possible 
        #combination of the hyperparameter and keep the one that gives us the best MAE/R2
        for param_config in model_param_grid:
            curr_config_MAEs = []
            model = model_type(**param_config)
            model.fit(x_train, y_train)
            pred = model.predict(x_validate)
            MAE = mean_squared_error(y_validate,pred)
            r2 = r2_score(y_validate, pred)
            curr_config_MAEs.append(MAE)
            if best_MAE == 0 or (MAE < best_MAE):
                best_MAE = MAE
                best_config = param_config
            if best_r2 == 1 or (r2 > best_r2):
                best_r2 = r2
        return best_config, best_MAE, best_r2

# Predictions
def pred(model_type, x_train, y_train, x_test, y_test):
    # Fit model and predict 
    model = model_type.fit(x_train, y_train)
    pred = model.predict(x_test)
    
    # Format prediction as DataFrame
    pred_df = pd.DataFrame(pred, columns = ['RET_pred'])
    pred_df.set_index(x_test.index, inplace = True)
    
    r2 = r2_score(y_test, pred)
    return pred_df, r2

In [47]:
# using a 60/20/20 split
# train, validate, test = \
#                         np.split(all_monthly_data.sample(frac=1, random_state=42), 
#                         [int(.6*len(all_monthly_data)), int(.8*len(all_monthly_data))])

# No subset
train, validate, test = \
                        np.split(all_monthly_data,
                        [int(.6*len(all_monthly_data)), int(.8*len(all_monthly_data))])

x_train = train[model_factors + ["yyyymm", "permno"]].set_index(["yyyymm", "permno"])
y_train = train[['RET', "yyyymm", "permno"]].set_index(["yyyymm", "permno"])

x_validate = validate[model_factors + ["yyyymm", "permno"]].set_index(["yyyymm", "permno"])
y_validate = validate[['RET', "yyyymm", "permno"]].set_index(["yyyymm", "permno"])

x_test = test[model_factors + ["yyyymm", "permno"]].set_index(["yyyymm", "permno"])
y_test = test[['RET', "yyyymm", "permno"]].set_index(["yyyymm", "permno"])

In [48]:
x_train

Unnamed: 0_level_0,Unnamed: 1_level_0,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,xret_20,xret_indsize_20,xret_indsize_std20,...,EP,SP,roe_q,roa_q,Cto,pe_ttm,lag_log_size,10M2,volinc,recession_affinity
yyyymm,permno,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
198012.0,10145.0,0.374586,0.027456,15.617313,0.022920,15.145167,-0.045500,-0.051723,-0.032998,-0.103709,0.024948,...,0.159071,3.106572,0.042556,0.015194,0.303363,6.286484,7.566481,0.013658,0.040110,60.164922
198012.0,10241.0,0.188068,0.018115,13.277532,0.019202,13.227823,-0.033571,-0.063374,-0.141331,-0.036481,0.009797,...,0.150070,7.809400,0.041306,0.008451,0.432945,6.663567,6.345893,0.013658,0.009316,13.974516
198012.0,10460.0,0.324759,0.036155,13.062007,0.027327,12.680238,0.077472,0.109122,0.094566,0.068687,0.029715,...,0.077183,1.424796,0.043730,0.022653,0.409445,12.956169,5.905587,0.013658,0.007682,11.523463
198012.0,10516.0,0.170149,0.031346,15.249913,0.031163,15.026468,-0.048646,-0.008587,-0.005626,0.084091,0.022241,...,0.092024,2.103178,0.046955,0.026372,0.580968,10.866713,7.397096,0.013658,0.014242,21.362453
198012.0,10866.0,0.273820,0.008009,11.612416,0.013282,11.810300,0.002714,-0.027249,-0.088989,-0.020212,0.009537,...,0.211710,5.921999,0.028797,0.016386,0.548834,4.723448,5.526573,0.013658,0.002783,4.174269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200812.0,44206.0,-0.256424,0.055367,17.200459,0.038970,17.022322,-0.053123,0.005273,0.104616,0.038293,0.019833,...,0.067112,1.410528,0.025202,0.005829,0.118567,14.900494,8.196378,0.073257,0.029574,44.361233
200812.0,44274.0,-0.299851,0.097975,15.410891,0.067219,15.254340,0.174990,0.091204,0.344395,0.258588,0.039614,...,0.054510,1.266940,0.021542,0.014311,0.253302,18.345311,6.115437,0.073257,0.032275,48.412342
200812.0,44329.0,-0.282099,0.056087,16.510059,0.038861,16.602987,-0.062214,-0.068661,-0.020760,-0.010352,0.023586,...,0.012323,0.854643,0.021105,0.007056,0.120839,81.149296,7.539904,0.073257,0.031923,47.884848
200812.0,44601.0,-0.282099,0.061162,17.503754,0.043618,17.687756,0.037796,-0.016867,-0.031714,-0.021307,0.016859,...,0.063949,1.459469,0.028512,0.009361,0.257517,15.637459,8.103341,0.073257,0.015421,23.131971


In [49]:
pls_grid = dict()
pls_grid['n_components'] = np.arange(1, len(model_factors)+1, 1)

pls_best_config, pls_best_MAE, pls_best_r2 = validate_model(PLSRegression, pls_grid, x_train, y_train, x_validate\
                                                            , y_validate)
print('Best config:' + str(pls_best_config))
print('Validation R2: ' + str(pls_best_r2))

Best config:{'n_components': 12}
Validation R2: 0.006075974149239438


In [50]:
pls_pred_df, pls_test_r2 = pred(PLSRegression(pls_best_config['n_components']), x_train, y_train, x_test, y_test)

In [51]:
x_test

Unnamed: 0_level_0,Unnamed: 1_level_0,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,xret_20,xret_indsize_20,xret_indsize_std20,...,EP,SP,roe_q,roa_q,Cto,pe_ttm,lag_log_size,10M2,volinc,recession_affinity
yyyymm,permno,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
201410.0,92948.0,0.071301,0.018533,16.795412,0.020295,16.793615,-0.003665,-0.012436,-0.019897,-0.017441,0.009343,...,0.046660,0.565663,0.030712,0.013243,0.197991,21.532204,7.899479,-0.010563,0.029160,34.293901
201410.0,92951.0,0.071301,0.018533,16.795412,0.020295,16.793615,-0.003665,-0.012436,-0.019897,-0.017441,0.009343,...,0.046660,0.565663,0.030712,0.013243,0.197991,21.532204,7.899479,-0.010563,0.029160,34.293901
201410.0,92976.0,0.071301,0.018533,16.795412,0.020295,16.793615,-0.003665,-0.012436,-0.019897,-0.017441,0.009343,...,0.046660,0.565663,0.030712,0.013243,0.197991,21.532204,7.899479,-0.010563,0.029160,34.293901
201410.0,93002.0,0.071301,0.018533,16.795412,0.020295,16.793615,-0.003665,-0.012436,-0.019897,-0.017441,0.009343,...,0.046660,0.565663,0.030712,0.013243,0.197991,21.532204,7.899479,-0.010563,0.029160,34.293901
201410.0,93014.0,0.071301,0.018533,16.795412,0.020295,16.793615,-0.003665,-0.012436,-0.019897,-0.017441,0.009343,...,0.046660,0.565663,0.030712,0.013243,0.197991,21.532204,7.899479,-0.010563,0.029160,34.293901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912.0,93374.0,-0.147586,0.010868,17.310472,0.015246,17.388666,0.000246,-0.019599,-0.007484,0.108797,0.025360,...,0.086933,0.890483,0.045823,0.016358,0.146043,11.503061,8.874422,-0.006372,0.008777,113.930650
201912.0,93419.0,-0.147586,0.012941,16.775314,0.015804,16.961089,0.008011,0.004462,-0.040163,0.076118,0.021830,...,0.008990,0.155746,0.017259,0.007888,0.027911,111.236507,8.621115,-0.006372,0.010539,94.884444
201912.0,93422.0,-0.046054,0.059361,16.763782,0.067228,17.048398,0.015794,-0.033674,-0.035357,0.016398,0.029444,...,-0.700006,1.470899,0.030119,0.014716,0.055865,200.000000,6.656186,-0.006372,0.125235,7.984961
201912.0,93427.0,0.260954,0.025465,16.473392,0.028009,16.544964,-0.005524,-0.006388,0.039414,0.042201,0.015712,...,0.061763,0.833342,0.030074,0.020678,0.318084,16.190940,7.713477,-0.006372,0.016243,61.566110


# Performance Analysis

In [52]:
def total_ret(port_ret):
    return port_ret.sum()
    # return np.prod(port_ret + 1) - 1

def tracking_error(port_ret, bench_ret):
    return (port_ret - bench_ret).std()

def information_ratio(port_ret, bench_ret):
    return (total_ret(port_ret) - total_ret(bench_ret)) / tracking_error(port_ret, bench_ret)

def sharpe_ratio(port_ret, rf_ret):
    return information_ratio(port_ret, rf_ret)

def sharpe_ratio(port_xret):
    return total_ret(port_xret) / port_xret.std()

In [53]:
# Write Permnos - for WRDS Queries

# with open("permnos.txt", "w") as file:
#     for permno in permnos:
#         file.write(f"{int(permno)},\n")