# Final Project

- Saige Belanger
    - (20951877)
- Dylan Faelker
    - (20960747)
- Ethan Liu
    - (20959615)
- Timothy Zheng
    - t54zheng (20939203)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn import linear_model
import statsmodels.api as sm
import scipy.stats as stats
from math import sqrt

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
import datetime as dt

import os.path

warnings.filterwarnings('ignore')

# Factors
We start with an initial list of factors from the provided list of 50 Factors in the ML examples.

TODO: Increase our breadth of factors to the category chosen by downloading and creating them, then merging them 

https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/

In [2]:
all_monthly_data = pd.read_sas("merged_df.sas7bdat", encoding = 'ISO-8859-1')

In [3]:
# Save all gvkeys - for WRDS Queries

# with open("gvkeys.txt", "w") as file:
#     for gvkey in set(all_monthly_data["gvkey"].dropna()):
#         file.write(f"{int(gvkey)},\n")

In [4]:
gvkey_permno_dict = {}
for gvkey in set(all_monthly_data["gvkey"].dropna()):
    permno = all_monthly_data[all_monthly_data["gvkey"] == gvkey]["permno"].dropna().iloc[0]
    gvkey_permno_dict[gvkey] = permno

In [5]:
all_monthly_data.drop(["ticker", "conm", "gvkey", "cusip", "naics", "gsubind"], axis=1, inplace=True) # We don't use these columns anyway, drop them

In [6]:
factors = ['IM', 'range_20', 'log_vol_dollar_20',
       'range_120', 'log_vol_dollar_120', 'xret_5', 'xret_10', 'xret_20',
       'xret_indsize_20', 'xret_indsize_std20', 'xret_40', 'xret_120',
       'xret_indsize_120', 'xret_indsize_std120', 'KDJ_20', 'deviation_pct20',
       'MoneyFlowIndex_20', 'RSI_20', 'KDJ_120', 'deviation_pct120',
       'MoneyFlowIndex_120', 'RSI_120', 'IV_capm', 'mdr', 'ami_3', 'beta_3y',
       'beta_5y', 'tail_2y', 'dp', 'leverage', 'BL', 'roe', 'roa',
       'profitability', 'sales_g_q', 'sales_g_ttm', 'op_income_g_q', 'ni_g_q',
       'op_income_g_ttm', 'ni_g_ttm', 'sue_NI', 'BM', 'AM', 'EP', 'SP',
       'roe_q', 'roa_q', 'Cto', 'pe_ttm', 'lag_log_size']

ret_cols = ['ret_f1', 'ret_f2', 'ret_f3', 'ret_f4', 'ret_f5', 'ret_f6', 
            'ret_f7', 'ret_f8', 'ret_f9', 'ret_f10', 'ret_f11', 'ret_f12']

In [7]:
non_data_cols = [x for x in all_monthly_data.columns if x not in factors and x not in ret_cols]
non_data_cols

['permno', 'yyyymm', 'monthid', 'PRC', 'VOL', 'RET', 'SHROUT']

# Adding New Factors
* When you add a factor, document it here: [link](https://docs.google.com/spreadsheets/d/1rs9633QSYLVY5Z5DoGNy3USP2MROGtqTIKcbLG68wpE/edit#gid=1579135478) and fill properly
* Download the data file, if it's too large add it to the drive
* Also download the other files that arent on github but on the drive before working on this part of the notebook
    * https://drive.google.com/drive/u/0/folders/1D1eIYlkNxNLfzHJLzkGeE9ymr7doXg_6

## IMPORTANT NOTE - FACTOR/RETURN TIME
- When adding factors make sure you add such that factor is reported at t-1, **RET** has **T** returns (in same row)
- This means you need to download data from the range **(1979-12 to 2019-11)**

***

- Treasury and CPI Rates: [Link](https://wrds-www.wharton.upenn.edu/pages/get-data/center-research-security-prices-crsp/annual-update/index-treasury-and-inflation/us-treasury-and-inflation-indexes/)
- Federal Reserve Data: [Link](https://wrds-www.wharton.upenn.edu/pages/get-data/federal-reserve-bank-reports/interest-rates/data/)
- SEC Filings: https://wrds-www.wharton.upenn.edu/pages/get-data/wrds-sec-analytics-suite/wrds-sec-filings-queries/list-of-filings-exhibits/
- Analyzed Data: https://wrds-www.wharton.upenn.edu/pages/get-data/wrds-sec-analytics-suite/wrds-sec-text-analysis/readability-and-sentiment/

**TBD**
- Other Factors: Downloaded from https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/

In [8]:
sec_analytics = pd.read_sas("sec_filing_analysis_wrds.sas7bdat", encoding = 'ISO-8859-1')

In [9]:
sec_analytics["PERMNO"] = sec_analytics["GVKEY"].map(gvkey_permno_dict)

In [10]:
sec_analytics[["FDATE", "PERMNO", "FORM", "WORD_COUNT", "LM_MASTER_DICTIONARY_COUNT", "LM_NEGATIVE_COUNT",	"LM_POSITIVE_COUNT",	"LM_MODAL_WEAK_COUNT",	"LM_MODAL_STRONG_COUNT",	"LM_LITIGIOUS_COUNT",	"LM_UNCERTAINTY_COUNT",]]

# TODO Merge this into all_monthly_data, keeping in mind weird t+1 offset
# Also split by form?

Unnamed: 0,FDATE,PERMNO,FORM,WORD_COUNT,LM_MASTER_DICTIONARY_COUNT,LM_NEGATIVE_COUNT,LM_POSITIVE_COUNT,LM_MODAL_WEAK_COUNT,LM_MODAL_STRONG_COUNT,LM_LITIGIOUS_COUNT,LM_UNCERTAINTY_COUNT
0,2013-06-25,86594.0,10-K,260142.0,207916.0,2500.0,2387.0,458.0,814.0,4079.0,1585.0
1,2003-07-16,86594.0,4,107.0,131.0,1.0,0.0,0.0,0.0,1.0,0.0
2,2005-07-11,86594.0,4,82.0,60.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2012-12-12,86594.0,S-8,2102.0,1762.0,13.0,4.0,4.0,1.0,78.0,8.0
4,2006-08-02,86594.0,4,61.0,44.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
906578,2019-05-13,18735.0,S-1,101132.0,92530.0,1603.0,1056.0,792.0,474.0,875.0,1375.0
906579,2019-05-02,18735.0,DRSLTR,2733.0,2492.0,16.0,17.0,3.0,6.0,47.0,34.0
906580,2019-05-02,18735.0,DRS/A,101883.0,93017.0,1590.0,1055.0,786.0,466.0,821.0,1367.0
906581,2019-04-25,18735.0,UPLOAD,2880.0,2580.0,28.0,22.0,10.0,4.0,48.0,34.0


In [11]:
sec_analytics[sec_analytics["PERMNO"] == 86594.0].sort_values(by="FDATE").head()

Unnamed: 0,CIK,FDATE,GVKEY,FORM,WORD_COUNT,COMPLEX_WORD_COUNT,ARI,COLEMAN_LIAU_INDEX,FLESCH_KINCAID_GRADE_LEVEL,FLESCH_READING_EASE,...,RIX,SMOG_INDEX,LM_MASTER_DICTIONARY_COUNT,LM_NEGATIVE_COUNT,LM_POSITIVE_COUNT,LM_MODAL_WEAK_COUNT,LM_MODAL_STRONG_COUNT,LM_LITIGIOUS_COUNT,LM_UNCERTAINTY_COUNT,PERMNO
173,56679,1998-08-18,118267,S-1,41172.0,10623.0,18.552631,17.113089,15.729576,27.472262,...,8.797905,17.744452,37549.0,322.0,411.0,206.0,220.0,338.0,417.0,86594.0
545,56679,1998-08-18,118267,S-1,41172.0,10623.0,18.552631,17.113089,15.729576,27.472262,...,8.797905,17.744452,37549.0,322.0,411.0,206.0,220.0,338.0,417.0,86594.0
544,56679,1998-09-04,118267,S-1/A,173837.0,39571.0,20.149471,14.896849,17.616691,26.687202,...,9.880636,18.672986,166236.0,2974.0,1202.0,857.0,792.0,6039.0,1014.0,86594.0
172,56679,1998-09-04,118267,S-1/A,173837.0,39571.0,20.149471,14.896849,17.616691,26.687202,...,9.880636,18.672986,166236.0,2974.0,1202.0,857.0,792.0,6039.0,1014.0,86594.0
512,56679,1998-09-22,118267,8-A12B,335.0,87.0,15.770003,17.433194,12.485647,38.69517,...,6.5,15.688483,292.0,0.0,2.0,1.0,0.0,5.0,1.0,86594.0


In [12]:
all_monthly_data[all_monthly_data["permno"] == 86594.0].sort_values(by="monthid").head()

Unnamed: 0,permno,yyyymm,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,ret_f3,ret_f4,ret_f5,ret_f6,ret_f7,ret_f8,ret_f9,ret_f10,ret_f11,ret_f12
382780,86594.0,199901.0,229.0,,,,,,,,...,-0.090476,0.109948,0.283019,-0.194853,0.223744,0.376866,-0.03523,0.016854,0.607735,-0.053265
382781,86594.0,199902.0,230.0,,,,,,,,...,0.109948,0.283019,-0.194853,0.223744,0.376866,-0.03523,0.016854,0.607735,-0.053265,0.088929
382782,86594.0,199903.0,231.0,,,,,,,,...,0.283019,-0.194853,0.223744,0.376866,-0.03523,0.016854,0.607735,-0.053265,0.088929,-0.206667
382783,86594.0,199904.0,232.0,,,,,,,,...,-0.194853,0.223744,0.376866,-0.03523,0.016854,0.607735,-0.053265,0.088929,-0.206667,-0.109244
382784,86594.0,199905.0,233.0,,,,,,,,...,0.223744,0.376866,-0.03523,0.016854,0.607735,-0.053265,0.088929,-0.206667,-0.109244,-0.183962


# Macro factors

In [13]:
macro_factors = ['FF_O', 'SL_Y20', 'MORTG_NA', 
                'PRIME_NA', 'CD_M1', 'CD_M3', 'CD_M6', 'ED_M1', 'ED_M3', 
                'ED_M6', 'TB_M3', 'TB_M6', 'TB_Y1',
                'TCMNOM_M3', 'TCMNOM_M6', 'TCMNOM_Y1', 'TCMNOM_Y2', 
                'TCMNOM_Y3', 'TCMNOM_Y5', 'TCMNOM_Y7', 'TCMNOM_Y10', 
                'TCMNOM_Y20', 'TCMNOM_Y30', 'AAA_NA', 'BAA_NA', 'B30RET', 'B20RET', 
                 'B10RET', 'B7RET', 'B5RET', 'B2RET', 'B1RET', 
                'T90RET', 'T30RET', 'CPIRET', ]
factors += macro_factors

In [14]:
treasury_inflation = pd.read_sas("treasury_inflation.sas7bdat", encoding = 'ISO-8859-1')
federal_reserve = pd.read_sas("federal_reserve.sas7bdat", encoding = 'ISO-8859-1')

In [15]:
#dropping factors with excess null values
federal_reserve.dropna(axis = 1, thresh = 300, inplace=True)

In [16]:
#adding yyyymm to merge taking into account 1 month shift
federal_reserve["yyyymm"] = (federal_reserve["date"] + pd.DateOffset(months=1)).dt.strftime("%Y%m").astype(float)
treasury_inflation["yyyymm"] = (treasury_inflation["CALDT"] + pd.DateOffset(months=1)).dt.strftime("%Y%m").astype(float)

In [17]:
#merging to all_monthly_data
all_monthly_data = pd.merge(all_monthly_data, federal_reserve, on="yyyymm")
all_monthly_data = pd.merge(all_monthly_data, treasury_inflation, on="yyyymm")

# Data Cleanup

In [18]:
# Inputation - as in ML Lecture 1

# Drop NA in all non-numerical columns
all_monthly_data.dropna(subset=non_data_cols, inplace=True)

grouped_med = all_monthly_data.groupby(by='monthid')
# the lambda function gets the median per group in the groupby object, and fills the NaN values with the median per group
imputed_grouped = grouped_med.transform(lambda y: y.fillna(y.median()))

# This line assigns the values of the medians 
all_monthly_data = all_monthly_data.assign(**imputed_grouped.to_dict(orient='series'))
all_monthly_data.dropna(inplace=True)

In [19]:
# Filtering data by min price and min market share for each year

# Commenting out for runtime - does not drop any rows

# all_monthly_data['yyyy'] = all_monthly_data['yyyymm'].astype(str).str[:4]
# all_monthly_data['MKTSHR'] = all_monthly_data['PRC'] * all_monthly_data['SHROUT'] * 1_000

# to_drop_indices = []

# for permno in all_monthly_data.permno.unique():
#     for year in all_monthly_data['yyyy'].unique():
#         mask = (all_monthly_data['permno'] == permno) & (all_monthly_data['yyyy'] == year)
#         if all_monthly_data[mask].shape[0] != 0 != 0 and (all_monthly_data[mask]['MKTSHR'].iloc[0] < 100_000_000 or all_monthly_data[mask]['PRC'].iloc[0] <= 5):
#             to_drop_indices += list(all_monthly_data[mask].index)
# all_monthly_data.drop(to_drop_indices, inplace=True)

In [20]:
# Winsorizing factors--should winsorize the variables by quarter
for column in factors:
    for date in set(list(all_monthly_data["monthid"])):
        mask = (all_monthly_data["monthid"] == date)
        
        std = all_monthly_data[column][mask].std()
        mean = all_monthly_data[column][mask].mean()

        upper = mean + 3 * std
        lower = mean - 3 * std
        
        all_monthly_data[column][mask].clip(lower, upper, inplace= True)

In [21]:
all_monthly_data

Unnamed: 0,permno,yyyymm,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,B30IND,B20IND,B10IND,B7IND,B5IND,B2IND,B1IND,T90IND,T30IND,CPIIND
3,10145.0,198601.0,73.0,0.060316,0.016503,16.650330,0.014017,16.371426,-0.008059,-0.004234,...,267.6089,258.6363,308.3518,311.6737,309.2294,327.9959,339.9697,321.6565,293.7668,257.1764
4,10241.0,198601.0,73.0,0.071812,0.014277,15.683749,0.015375,15.205596,-0.011443,-0.035607,...,267.6089,258.6363,308.3518,311.6737,309.2294,327.9959,339.9697,321.6565,293.7668,257.1764
5,10460.0,198601.0,73.0,0.093893,0.022592,13.462903,0.024872,13.226926,0.018046,0.046521,...,267.6089,258.6363,308.3518,311.6737,309.2294,327.9959,339.9697,321.6565,293.7668,257.1764
6,10516.0,198601.0,73.0,0.187840,0.017957,15.540080,0.018308,15.189420,-0.022566,0.011309,...,267.6089,258.6363,308.3518,311.6737,309.2294,327.9959,339.9697,321.6565,293.7668,257.1764
7,10517.0,198601.0,73.0,0.073529,0.048080,12.558268,0.024596,11.513491,0.062813,0.003209,...,267.6089,258.6363,308.3518,311.6737,309.2294,327.9959,339.9697,321.6565,293.7668,257.1764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440715,88664.0,198512.0,72.0,-0.306584,0.019174,13.342257,0.017969,13.162905,0.001480,0.003818,...,250.7871,244.8869,294.7147,299.5477,299.7958,323.3717,337.0134,319.3229,291.9745,256.4705
440716,90756.0,198512.0,72.0,-0.306584,0.019174,13.342257,0.017969,13.162905,0.001480,0.003818,...,250.7871,244.8869,294.7147,299.5477,299.7958,323.3717,337.0134,319.3229,291.9745,256.4705
440717,91556.0,198512.0,72.0,-0.306584,0.019174,13.342257,0.017969,13.162905,0.001480,0.003818,...,250.7871,244.8869,294.7147,299.5477,299.7958,323.3717,337.0134,319.3229,291.9745,256.4705
440718,91855.0,198512.0,72.0,0.099752,0.019174,13.342257,0.017969,13.162905,0.001480,0.003818,...,250.7871,244.8869,294.7147,299.5477,299.7958,323.3717,337.0134,319.3229,291.9745,256.4705


## Factor Code

In [22]:
ff4_factors = pd.read_sas("ff4_factors.sas7bdat", encoding = 'ISO-8859-1')
ff4_factors["monthid"] = ff4_factors.index + 1
ff4_factors.head()

Unnamed: 0,DATEFF,SMB,HML,MKTRF,RF,UMD,monthid
0,1980-01-31,0.0162,0.0175,0.0551,0.008,0.0755,1
1,1980-02-29,-0.0185,0.0061,-0.0122,0.0089,0.0788,2
2,1980-03-31,-0.0664,-0.0101,-0.129,0.0121,-0.0955,3
3,1980-04-30,0.0105,0.0106,0.0397,0.0126,-0.0043,4
4,1980-05-30,0.0213,0.0038,0.0526,0.0081,-0.0112,5


In [23]:
dates = [int(x) for x in sorted(list(set(list(all_monthly_data["yyyymm"]))))]
dates[0], dates[-1]

(198202, 201307)

In [24]:
monthids = [int(x) for x in sorted(list(set(list(all_monthly_data["monthid"]))))]
monthids[0], monthids[-1], len(monthids)

(26, 403, 216)

In [25]:
testing_range = monthids[0:2*(len(monthids) // 3)]
validation_range = monthids[2 * len(monthids) // 3:]

# Validate that ranges have correct ratios
len(testing_range) / len(monthids), len(validation_range) / len(monthids), len(testing_range) + len(validation_range)

(0.6666666666666666, 0.3333333333333333, 216)

## Testing Factors

In [26]:
model_factors = ['IM', 'range_20', 'log_vol_dollar_20',
       'range_120', 'log_vol_dollar_120', 'xret_5', 'xret_10', 'xret_20',
       'xret_indsize_20', 'xret_indsize_std20', 'xret_40', 'xret_120',
       'xret_indsize_120', 'xret_indsize_std120', 'KDJ_20', 'deviation_pct20',
       'MoneyFlowIndex_20', 'RSI_20', 'KDJ_120', 'deviation_pct120',
       'MoneyFlowIndex_120', 'RSI_120', 'IV_capm', 'mdr', 'ami_3', 'beta_3y',
       'beta_5y', 'tail_2y', 'dp', 'leverage', 'BL', 'roe', 'roa',
       'profitability', 'sales_g_q', 'sales_g_ttm', 'op_income_g_q', 'ni_g_q',
       'op_income_g_ttm', 'ni_g_ttm', 'sue_NI', 'BM', 'AM', 'EP', 'SP',
       'roe_q', 'roa_q', 'Cto', 'pe_ttm', 'lag_log_size']

In [27]:
model_factors += macro_factors

In [28]:
all_monthly_data = pd.merge(ff4_factors, all_monthly_data, on="monthid")

## [m, n, l] model for Fama-MacBeth Double Regression
We will use the technique employed during Assignment 2, utilizing a 36-month lookback for factor data to generate our betas (**First Stage**)
* For period $t_i$, we will use data starting at $t_{i-36} ... t_{i-1}$ if available. Worst case we look for 12 prior samples.

In [29]:
permnos = set(all_monthly_data["permno"])

In [30]:
#!pip install multiprocess

In [31]:
from multiprocess import Manager, cpu_count # You might have to change to multiprocessing if on windows
from multiprocess.pool import ThreadPool

In [32]:
# Threaded Approach
def add_betas(permno):
    results = []
    for (i, monthid) in enumerate(testing_range): 
        window = set(testing_range[max(0, i-35):i+1]) # t_(i-36) to t_(i-1) returns. Compare to t_i returns
        window_data = all_monthly_data[(all_monthly_data["permno"] == permno) & (all_monthly_data["monthid"].isin(window))]
        
        if len(window_data) < 12:
            continue

        explanatory_vars = window_data[model_factors + ["monthid"]]
        explanatory_vars.sort_values(by="monthid", inplace=True)
        explanatory_vars.set_index("monthid", inplace=True)
    
        explained_var = window_data[["monthid", "RET"]] # Since factors are from t-1
        explained_var.sort_values(by="monthid", inplace=True)
        explained_var.set_index("monthid", inplace=True)
        
        model = linear_model.LinearRegression().fit(explanatory_vars, 
                                                    explained_var["RET"])
        
        results.append({"monthid": monthid, 
                        "permno": permno, 
                        "RET": explained_var["RET"].iloc[-1], 
                    **{f"{factor}": model.coef_[i] for i, factor in enumerate(model_factors)}
                       })  
    return results

# UNCOMMENT THIS AND ADD TO LINE BELOW INSTEAD OF `permnos`
# FOR DEVELOPMENT - THIS CODE BLOCK TAKES LIKE 30 MINS TO RUN

# smaller_permno_list = list(permnos)[:10]

# Only compute if not in files (delete local copy of file if code above if modifying factors or code above)
if os.path.isfile("first_stage_df.csv"):
    first_stage_df = pd.read_csv("first_stage_df.csv", index_col=0)
else:
    # Runs once basically
    summary_results = []
    with ThreadPool(cpu_count() - 1) as P:
        summary_results = P.map(add_betas, permnos)
        summary_results = [item for sublist in summary_results for item in sublist]
        first_stage_df = pd.DataFrame(summary_results)
        
        # Save first stage df for easy loading
        first_stage_df.to_csv("first_stage_df.csv")
first_stage_df

Unnamed: 0,monthid,permno,RET,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,B30RET,B20RET,B10RET,B7RET,B5RET,B2RET,B1RET,T90RET,T30RET,CPIRET
0,37,49154.0,0.002268,0.015205,-0.010278,0.057982,0.000316,-0.000874,0.006516,0.003849,...,0.008083,0.008083,0.009508,0.008482,0.006178,0.004651,0.002910,0.000851,-0.000082,-0.000031
1,38,49154.0,0.089140,0.006667,-0.002853,0.085001,-0.000085,0.028916,0.005409,-0.000613,...,0.001326,0.001326,0.000023,0.003319,0.003015,0.003384,0.002412,0.000801,0.000019,0.000045
2,39,49154.0,0.141962,0.008693,0.002539,0.077274,0.001316,0.024903,0.001960,-0.001020,...,0.003328,0.003328,0.001547,0.004207,0.003502,0.003710,0.002539,0.000780,0.000027,-0.000053
3,40,49154.0,-0.005484,0.000377,-0.001762,0.066811,-0.000107,0.024276,0.007160,0.002094,...,0.003868,0.003868,0.003781,0.006843,0.005239,0.005257,0.003452,0.001112,-0.000069,0.000038
4,41,49154.0,0.020956,-0.004650,0.003615,0.074329,-0.000323,0.009708,0.027553,0.026784,...,-0.005145,-0.005145,-0.006536,0.009861,0.006597,0.006308,0.004356,0.001271,-0.000646,-0.004482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89610,246,81917.0,-0.019022,0.165596,0.085542,0.299908,-0.001883,0.221424,0.408155,0.472825,...,-0.267809,-0.301312,0.426651,0.265228,0.370042,0.099377,0.076332,-0.025255,-0.023801,0.024128
89611,247,81917.0,0.127535,0.182037,0.008028,0.285854,-0.117653,0.028721,-0.498885,0.342359,...,-0.244826,-0.293345,0.469481,0.385664,0.628472,0.206649,0.151574,0.019254,-0.000347,-0.121334
89612,248,81917.0,-0.060606,0.188030,-0.219003,0.281645,-0.161331,0.438866,-0.358221,-0.629045,...,-0.983907,-0.510824,0.929531,0.334558,0.733992,-0.035936,-0.001282,-0.056250,-0.125405,0.138543
89613,249,81917.0,0.005376,0.246716,0.271400,0.303642,-0.139960,0.236188,0.076774,-0.808842,...,-1.107713,-0.648234,0.954431,0.489960,0.876174,-0.014868,-0.001881,-0.061946,-0.123857,0.161784


In [33]:
# Second stage regression
lambdas = {"monthid": []}
for factor in model_factors:
    lambdas[f"{factor}"] = []
    
for monthid in testing_range:
    monthid_returns = first_stage_df.loc[first_stage_df["monthid"] == monthid]

    # If empty
    if monthid_returns.empty:
        continue
    
    explanatory_vars = monthid_returns[model_factors + ["permno"]]
    explanatory_vars.sort_values(by="permno", inplace=True)
    explanatory_vars.set_index("permno", inplace=True)

    explained_var = monthid_returns[["permno", "RET"]]
    explained_var.sort_values(by="permno", inplace=True)
    explained_var.set_index("permno", inplace=True)
    
    model = linear_model.LinearRegression(n_jobs=len(model_factors)).fit(explanatory_vars, 
                                                                         explained_var["RET"])

    lambdas["monthid"].append(monthid)

    for (i, factor) in enumerate(model_factors):
        lambdas[factor].append(model.coef_[i])

In [34]:
second_stage_df = pd.DataFrame(lambdas)
second_stage_df

Unnamed: 0,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,xret_20,xret_indsize_20,...,B30RET,B20RET,B10RET,B7RET,B5RET,B2RET,B1RET,T90RET,T30RET,CPIRET
0,37,0.827790,0.839122,-0.237946,-0.607570,0.269703,0.201543,-0.963455,0.035119,1.396578,...,266.196553,266.196553,-478.646586,-270.144597,726.966915,-839.528395,-329.344755,3737.673211,-5316.852575,2553.760266
1,38,0.061294,-0.333175,0.045037,0.886988,-0.004352,0.039709,-0.234459,-0.162749,0.339815,...,-106.211989,-106.211989,262.195354,-528.111254,216.402090,619.122951,-968.848940,2905.906176,-3211.945719,-527.315549
2,39,-0.049370,1.079391,0.191825,-0.684513,0.060417,-0.310639,0.173117,0.074601,0.115316,...,-27.278547,-27.278547,-74.276617,-118.134297,135.954852,430.461056,-21.145591,-1587.324963,3122.575738,-2013.638107
3,40,0.133378,1.609646,0.006297,-1.820092,0.260469,0.352478,0.281043,-0.038765,0.032611,...,186.190936,186.190936,-333.751659,40.120866,-472.327866,2107.580563,-3273.523395,4549.537117,-2879.851449,1840.240907
4,41,0.324921,-0.399031,0.095914,1.103005,-0.183758,0.267488,0.297225,0.225772,-0.100367,...,65.070732,65.070732,-103.393088,-83.368691,-26.629136,198.006102,-47.532286,1356.095769,-1967.521587,177.495925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,246,-0.052088,-0.044711,-0.061819,0.011504,-0.003668,-0.021084,-0.012048,0.027448,0.012328,...,-0.347525,0.506699,0.027536,-0.366006,0.997586,-2.005554,0.046938,-0.756892,5.033056,-0.151598
129,247,-0.019329,-0.006143,-0.036184,0.054631,0.007959,0.010389,0.000046,-0.016737,0.005592,...,-0.147572,0.154743,0.260359,-0.480232,0.549882,-0.286998,-1.148887,-2.108837,3.809643,0.127550
130,248,-0.026056,-0.013738,-0.036300,0.170055,-0.001604,-0.009146,-0.006693,0.003507,-0.001663,...,-0.095071,0.151012,-0.009927,0.088978,-0.116100,-0.905496,2.330768,-0.864669,0.730069,0.071539
131,249,0.014287,-0.013534,-0.051270,0.033502,-0.035874,-0.002285,-0.007239,0.001230,0.017800,...,-1.005614,1.098808,0.549502,0.587903,-0.714287,-0.723736,-0.503015,-0.150001,3.309338,-1.379498


In [35]:
# Get p values
p_value_dict = {"factor": [], "p-value": []}
for factor in model_factors:
    lambdas = second_stage_df[factor]
    ttest = stats.ttest_ind(lambdas, np.zeros(len(lambdas))) # Compare to see if any lambdas are significantly different from zero
    p_value_dict['factor'].append(factor)
    p_value_dict['p-value'].append(ttest[1])

results_df = pd.DataFrame.from_dict(p_value_dict, orient='index')
results_df.round(2).T.sort_values(by="p-value")

Unnamed: 0,factor,p-value
60,TB_M3,0.019302
63,TCMNOM_M3,0.022512
53,PRIME_NA,0.02968
32,roa,0.051993
73,AAA_NA,0.069691
...,...,...
7,xret_20,0.931331
79,B5RET,0.936548
38,op_income_g_ttm,0.980579
71,TCMNOM_Y20,0.989718


# TODO
use these results to determine which factors to keep (among other considerations like cross-correlation, if they are in the same category, etc)

# Machine Learning

frick

# Performance Analysis

In [None]:
def total_ret(port_ret):
    return port_ret.sum()
    # return np.prod(port_ret + 1) - 1

def tracking_error(port_ret, bench_ret):
    return (port_ret - bench_ret).std()

def information_ratio(port_ret, bench_ret):
    return (total_ret(port_ret) - total_ret(bench_ret)) / tracking_error(port_ret, bench_ret)

def sharpe_ratio(port_ret, rf_ret):
    return information_ratio(port_ret, rf_ret)

def sharpe_ratio(port_xret):
    return total_ret(port_xret) / port_xret.std()

In [None]:
# Write Permnos - for WRDS Queries

# with open("permnos.txt", "w") as file:
#     for permno in permnos:
#         file.write(f"{int(permno)},\n")