# Final Project

- Saige Belanger
    - (20951877)
- Dylan Faelker
    - (20960747)
- Ethan Liu
    - (20959615)
- Timothy Zheng
    - t54zheng (20939203)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn import linear_model
import statsmodels.api as sm
import scipy.stats as stats
from math import sqrt

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
import datetime as dt

import os.path

warnings.filterwarnings('ignore')

# Factors
We start with an initial list of factors from the provided list of 50 Factors in the ML examples.

TODO: Increase our breadth of factors to the category chosen by downloading and creating them, then merging them 

https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/

In [29]:
all_monthly_data = pd.read_sas("merged_df.sas7bdat", encoding = 'ISO-8859-1')

In [34]:
# Save all gvkeys - for WRDS Queries

# with open("gvkeys.txt", "w") as file:
#     for gvkey in set(all_monthly_data["gvkey"].dropna()):
#         file.write(f"{int(gvkey)},\n")

In [3]:
all_monthly_data.drop(["ticker", "conm", "gvkey", "cusip", "naics", "gsubind"], axis=1, inplace=True) # We don't use these columns anyway, drop them

In [4]:
factors = ['IM', 'range_20', 'log_vol_dollar_20',
       'range_120', 'log_vol_dollar_120', 'xret_5', 'xret_10', 'xret_20',
       'xret_indsize_20', 'xret_indsize_std20', 'xret_40', 'xret_120',
       'xret_indsize_120', 'xret_indsize_std120', 'KDJ_20', 'deviation_pct20',
       'MoneyFlowIndex_20', 'RSI_20', 'KDJ_120', 'deviation_pct120',
       'MoneyFlowIndex_120', 'RSI_120', 'IV_capm', 'mdr', 'ami_3', 'beta_3y',
       'beta_5y', 'tail_2y', 'dp', 'leverage', 'BL', 'roe', 'roa',
       'profitability', 'sales_g_q', 'sales_g_ttm', 'op_income_g_q', 'ni_g_q',
       'op_income_g_ttm', 'ni_g_ttm', 'sue_NI', 'BM', 'AM', 'EP', 'SP',
       'roe_q', 'roa_q', 'Cto', 'pe_ttm', 'lag_log_size']

ret_cols = ['ret_f1', 'ret_f2', 'ret_f3', 'ret_f4', 'ret_f5', 'ret_f6', 
            'ret_f7', 'ret_f8', 'ret_f9', 'ret_f10', 'ret_f11', 'ret_f12']

In [5]:
non_data_cols = [x for x in all_monthly_data.columns if x not in factors and x not in ret_cols]
non_data_cols

['permno', 'yyyymm', 'monthid', 'PRC', 'VOL', 'RET', 'SHROUT']

In [6]:
# Inputation - as in ML Lecture 1

# Drop NA in all non-numerical columns
all_monthly_data.dropna(subset=non_data_cols, inplace=True)

grouped_med = all_monthly_data.groupby(by='monthid')
# the lambda function gets the median per group in the groupby object, and fills the NaN values with the median per group
imputed_grouped = grouped_med.transform(lambda y: y.fillna(y.median()))

# This line assigns the values of the medians 
all_monthly_data = all_monthly_data.assign(**imputed_grouped.to_dict(orient='series'))
all_monthly_data.dropna(inplace=True)

In [7]:
# Filtering data by min price and min market share for each year

# Commenting out for runtime - does not drop any rows

# all_monthly_data['yyyy'] = all_monthly_data['yyyymm'].astype(str).str[:4]
# all_monthly_data['MKTSHR'] = all_monthly_data['PRC'] * all_monthly_data['SHROUT'] * 1_000

# to_drop_indices = []

# for permno in all_monthly_data.permno.unique():
#     for year in all_monthly_data['yyyy'].unique():
#         mask = (all_monthly_data['permno'] == permno) & (all_monthly_data['yyyy'] == year)
#         if all_monthly_data[mask].shape[0] != 0 != 0 and (all_monthly_data[mask]['MKTSHR'].iloc[0] < 100_000_000 or all_monthly_data[mask]['PRC'].iloc[0] <= 5):
#             to_drop_indices += list(all_monthly_data[mask].index)
# all_monthly_data.drop(to_drop_indices, inplace=True)

In [8]:
# Winsorizing factors--should winsorize the variables by quarter
for column in factors:
    for date in set(list(all_monthly_data["monthid"])):
        mask = (all_monthly_data["monthid"] == date)
        
        std = all_monthly_data[column][mask].std()
        mean = all_monthly_data[column][mask].mean()

        upper = mean + 3 * std
        lower = mean - 3 * std
        
        all_monthly_data[column][mask].clip(lower, upper, inplace= True)

In [9]:
all_monthly_data

Unnamed: 0,permno,yyyymm,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,ret_f3,ret_f4,ret_f5,ret_f6,ret_f7,ret_f8,ret_f9,ret_f10,ret_f11,ret_f12
2,10026.0,198603.0,75.0,-0.183465,0.021967,13.935166,0.020239,13.429522,0.001357,0.002957,...,-0.156250,-0.375000,-0.066667,-0.166667,0.114286,0.051282,-0.048780,0.615385,0.031746,0.030769
3,10026.0,198604.0,76.0,0.636488,0.023080,13.849557,0.020850,13.536907,-0.005400,0.000191,...,-0.375000,-0.066667,-0.166667,0.114286,0.051282,-0.048780,0.615385,0.031746,0.030769,-0.119403
4,10026.0,198605.0,77.0,0.354652,0.023095,13.798978,0.022117,13.612165,0.007211,0.006643,...,-0.066667,-0.166667,0.114286,0.051282,-0.048780,0.615385,0.031746,0.030769,-0.119403,-0.042373
5,10026.0,198606.0,78.0,0.308972,0.020076,13.643324,0.022047,13.697346,-0.009126,-0.005370,...,-0.166667,0.114286,0.051282,-0.048780,0.615385,0.031746,0.030769,-0.119403,-0.042373,0.159292
6,10026.0,198607.0,79.0,0.273834,0.020131,13.650006,0.022064,13.723655,-0.004666,-0.010382,...,0.114286,0.051282,-0.048780,0.615385,0.031746,0.030769,-0.119403,-0.042373,0.159292,0.114504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440715,93429.0,201908.0,476.0,0.043659,0.022145,17.094448,0.022864,17.153811,0.007966,0.013102,...,0.035693,0.009251,0.026833,-0.071904,-0.217105,0.113501,0.074864,-0.123802,-0.059820,0.051425
440716,93429.0,201909.0,477.0,0.017751,0.025800,17.123301,0.023830,17.149012,-0.000186,-0.005715,...,0.009251,0.026833,-0.071904,-0.217105,0.113501,0.074864,-0.123802,-0.059820,0.051425,-0.044122
440717,93429.0,201910.0,478.0,0.004530,0.024727,17.193758,0.024310,17.139314,0.004466,-0.006312,...,0.026833,-0.071904,-0.217105,0.113501,0.074864,-0.123802,-0.059820,0.051425,-0.044122,-0.073513
440718,93429.0,201911.0,479.0,0.005354,0.022660,17.061345,0.024140,17.111865,-0.004312,0.000399,...,-0.071904,-0.217105,0.113501,0.074864,-0.123802,-0.059820,0.051425,-0.044122,-0.073513,0.128552


## Factor Code

In [10]:
ff4_factors = pd.read_sas("ff4_factors.sas7bdat", encoding = 'ISO-8859-1')
ff4_factors["monthid"] = ff4_factors.index + 1
ff4_factors.head()

Unnamed: 0,DATEFF,SMB,HML,MKTRF,RF,UMD,monthid
0,1980-01-31,0.0162,0.0175,0.0551,0.008,0.0755,1
1,1980-02-29,-0.0185,0.0061,-0.0122,0.0089,0.0788,2
2,1980-03-31,-0.0664,-0.0101,-0.129,0.0121,-0.0955,3
3,1980-04-30,0.0105,0.0106,0.0397,0.0126,-0.0043,4
4,1980-05-30,0.0213,0.0038,0.0526,0.0081,-0.0112,5


In [11]:
dates = [int(x) for x in sorted(list(set(list(all_monthly_data["yyyymm"]))))]
dates[0], dates[-1]

(198004, 201912)

In [12]:
monthids = [int(x) for x in sorted(list(set(list(all_monthly_data["monthid"]))))]
monthids[0], monthids[-1], len(monthids)

(4, 480, 477)

In [13]:
testing_range = monthids[0:2*(len(monthids) // 3)]
validation_range = monthids[2 * len(monthids) // 3:]

# Validate that ranges have correct ratios
len(testing_range) / len(monthids), len(validation_range) / len(monthids), len(testing_range) + len(validation_range)

(0.6666666666666666, 0.3333333333333333, 477)

## Testing Factors

In [14]:
model_factors = ['IM', 'range_20', 'log_vol_dollar_20',
       'range_120', 'log_vol_dollar_120', 'xret_5', 'xret_10', 'xret_20',
       'xret_indsize_20', 'xret_indsize_std20', 'xret_40', 'xret_120',
       'xret_indsize_120', 'xret_indsize_std120', 'KDJ_20', 'deviation_pct20',
       'MoneyFlowIndex_20', 'RSI_20', 'KDJ_120', 'deviation_pct120',
       'MoneyFlowIndex_120', 'RSI_120', 'IV_capm', 'mdr', 'ami_3', 'beta_3y',
       'beta_5y', 'tail_2y', 'dp', 'leverage', 'BL', 'roe', 'roa',
       'profitability', 'sales_g_q', 'sales_g_ttm', 'op_income_g_q', 'ni_g_q',
       'op_income_g_ttm', 'ni_g_ttm', 'sue_NI', 'BM', 'AM', 'EP', 'SP',
       'roe_q', 'roa_q', 'Cto', 'pe_ttm', 'lag_log_size']

In [15]:
all_monthly_data = pd.merge(ff4_factors, all_monthly_data, on="monthid")

## [m, n, l] model for Fama-MacBeth Double Regression
We will use the technique employed during Assignment 2, utilizing a 36-month lookback for factor data to generate our betas (**First Stage**)
* For period $t_i$, we will use data starting at $t_{i-36} ... t_{i-1}$ if available. Worst case we look for 12 prior samples.

In [16]:
permnos = set(all_monthly_data["permno"])

In [17]:
!pip install multiprocess



In [18]:
from multiprocess import Manager, cpu_count # You might have to change to multiprocessing if on windows
from multiprocess.pool import ThreadPool

In [19]:
# Threaded Approach
def add_betas(permno):
    results = []
    for (i, monthid) in enumerate(testing_range): 
        window = set(testing_range[max(0, i-35):i+1]) # t_(i-36) to t_(i-1) returns. Compare to t_i returns
        window_data = all_monthly_data[(all_monthly_data["permno"] == permno) & (all_monthly_data["monthid"].isin(window))]
        
        if len(window_data) < 12:
            continue

        explanatory_vars = window_data[model_factors + ["monthid"]]
        explanatory_vars.sort_values(by="monthid", inplace=True)
        explanatory_vars.set_index("monthid", inplace=True)
    
        explained_var = window_data[["monthid", "RET"]] # Since factors are from t-1
        explained_var.sort_values(by="monthid", inplace=True)
        explained_var.set_index("monthid", inplace=True)
        
        model = linear_model.LinearRegression().fit(explanatory_vars, 
                                                    explained_var["RET"])
        
        results.append({"monthid": monthid, 
                        "permno": permno, 
                        "RET": explained_var["RET"].iloc[-1], 
                    **{f"{factor}": model.coef_[i] for i, factor in enumerate(model_factors)}
                       })  
    return results

# UNCOMMENT THIS AND ADD TO LINE BELOW INSTEAD OF `permnos`
# FOR DEVELOPMENT - THIS CODE BLOCK TAKES LIKE 30 MINS TO RUN

# smaller_permno_list = list(permnos)[:10]

# Only compute if not in files (delete local copy of file if code above if modifying factors or code above)
if os.path.isfile("first_stage_df.csv"):
    first_stage_df = pd.read_csv("first_stage_df.csv", index_col=0)
else:
    # Runs once basically
    summary_results = []
    with ThreadPool(cpu_count() - 1) as P:
        summary_results = P.map(add_betas, permnos)
        summary_results = [item for sublist in summary_results for item in sublist]
        first_stage_df = pd.DataFrame(summary_results)
        
        # Save first stage df for easy loading
        first_stage_df.to_csv("first_stage_df.csv")
first_stage_df

Unnamed: 0,monthid,permno,RET,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,sue_NI,BM,AM,EP,SP,roe_q,roa_q,Cto,pe_ttm,lag_log_size
0,15,49154.0,0.217742,-0.177922,-0.017326,-0.404999,0.001008,0.094084,0.162161,0.357050,...,-0.103432,-0.162681,0.226897,-0.000786,0.194951,0.006577,-0.006378,-0.030747,0.038512,0.001883
1,16,49154.0,0.037528,-0.166218,-0.016434,-0.357248,0.001120,0.151541,0.173540,0.401040,...,-0.177359,-0.161623,0.201678,-0.000921,0.181788,0.015621,0.009557,0.075181,-0.006746,0.006358
2,17,49154.0,0.040426,-0.307559,-0.019532,-0.228961,0.001087,0.254345,0.228074,0.380108,...,-0.020192,-0.188044,0.077337,-0.001099,0.045979,0.010966,0.005602,0.043739,0.036017,-0.035808
3,18,49154.0,-0.113402,-0.352726,-0.020887,-0.213851,0.000877,0.238876,0.219848,0.344600,...,-0.074884,-0.184293,0.155087,-0.001629,0.063590,0.019056,0.004786,0.047677,0.020254,-0.022703
4,19,49154.0,0.046512,-0.352828,-0.020891,-0.213848,0.000877,0.238716,0.219853,0.344728,...,-0.074928,-0.184306,0.155164,-0.001633,0.063437,0.019088,0.004799,0.047794,0.020266,-0.022781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218367,317,81917.0,-0.021578,0.489096,10.687724,-0.466841,3.707940,0.683761,4.434208,-8.056764,...,-0.082417,-6.634807,1.790930,-2.328958,2.991200,19.291439,12.913282,-3.652366,-0.056563,2.546052
218368,318,81917.0,0.041499,0.603172,19.804879,-0.496944,12.429544,0.671798,2.554879,-10.596175,...,0.391672,-4.027635,-2.818458,-2.271254,3.142411,-3.350635,0.841312,2.354425,-0.205835,2.699524
218369,319,81917.0,0.008880,0.605449,20.417885,-0.482653,11.000336,0.662651,3.324765,-9.955248,...,0.477191,-3.633941,-2.279511,-1.578396,3.011760,-6.184711,-0.532376,2.476481,-0.178907,2.775979
218370,320,81917.0,0.063117,-0.228668,31.117055,-0.507347,1.068001,0.524543,6.665282,-2.317012,...,-0.102278,-3.579858,3.059718,-0.709997,-1.842430,-15.868266,-7.148372,3.467508,-0.043580,0.209534


In [20]:
# Second stage regression
lambdas = {"monthid": []}
for factor in model_factors:
    lambdas[f"{factor}"] = []
    
for monthid in testing_range:
    monthid_returns = first_stage_df.loc[first_stage_df["monthid"] == monthid]

    # If empty
    if monthid_returns.empty:
        continue
    
    explanatory_vars = monthid_returns[model_factors + ["permno"]]
    explanatory_vars.sort_values(by="permno", inplace=True)
    explanatory_vars.set_index("permno", inplace=True)

    explained_var = monthid_returns[["permno", "RET"]]
    explained_var.sort_values(by="permno", inplace=True)
    explained_var.set_index("permno", inplace=True)
    
    model = linear_model.LinearRegression(n_jobs=len(model_factors)).fit(explanatory_vars, 
                                                                         explained_var["RET"])

    lambdas["monthid"].append(monthid)

    for (i, factor) in enumerate(model_factors):
        lambdas[factor].append(model.coef_[i])

In [21]:
second_stage_df = pd.DataFrame(lambdas)
second_stage_df

Unnamed: 0,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,xret_20,xret_indsize_20,...,sue_NI,BM,AM,EP,SP,roe_q,roa_q,Cto,pe_ttm,lag_log_size
0,15,-0.035745,-0.571039,-0.061605,-2.283169,-0.029881,-0.108176,-0.104386,0.149525,-0.212826,...,-0.041983,0.026534,0.015382,-0.889307,0.021992,-0.529210,3.357724,-0.393992,-0.057799,0.079260
1,16,0.001652,-0.048982,0.057267,-0.534534,0.026931,0.026047,0.052760,-0.061393,0.012480,...,-0.040266,0.112582,-0.021534,-0.343119,-0.059704,-0.261934,-0.299428,0.035351,0.346079,0.070998
2,17,-0.021220,-0.419154,-0.066717,0.775655,0.019364,0.004047,0.003952,-0.009959,-0.019862,...,-0.034248,0.083042,-0.006422,-0.377669,-0.092692,-0.554356,0.889185,0.121707,0.001101,0.029692
3,18,-0.021485,0.345735,-0.060198,-0.670695,0.009679,-0.006652,-0.001722,-0.016863,0.037531,...,0.006454,0.016666,0.015354,0.207710,0.023645,0.106906,-0.467556,-0.064735,-0.136481,0.051276
4,19,-0.030211,0.044757,-0.012854,0.110476,0.037228,0.007549,0.012964,0.001766,0.028058,...,0.018864,-0.043275,0.024022,-0.027254,-0.015462,0.399238,-0.343996,0.002513,0.093189,0.000007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302,317,-0.002052,0.000121,0.000777,0.000112,-0.001494,-0.000959,-0.002133,-0.003470,-0.003873,...,0.015909,-0.000528,0.000129,-0.000113,0.000229,-0.000227,-0.000908,0.000146,0.000216,-0.001342
303,318,0.002857,0.000061,-0.003151,0.000102,-0.002905,0.000159,0.000470,0.001164,0.000438,...,0.002612,0.000112,0.000263,0.000272,0.000095,-0.000557,0.000446,0.000588,0.012407,0.003445
304,319,-0.007282,0.000354,0.014312,0.000193,0.006701,0.000748,-0.001218,0.001316,0.001158,...,0.001495,-0.000300,-0.001778,-0.000241,0.000448,0.000159,-0.000518,-0.000001,-0.010411,-0.002123
305,320,0.000738,0.000232,-0.000996,0.000437,0.002546,-0.000941,-0.000438,-0.000859,0.002232,...,-0.013841,0.000209,0.000460,0.000309,-0.000146,0.000065,0.000935,0.000015,0.000445,-0.004901


In [22]:
# Get p values
p_value_dict = {"factor": [], "p-value": []}
for factor in model_factors:
    lambdas = second_stage_df[factor]
    ttest = stats.ttest_ind(lambdas, np.zeros(len(lambdas))) # Compare to see if any lambdas are significantly different from zero
    p_value_dict['factor'].append(factor)
    p_value_dict['p-value'].append(ttest[1])

results_df = pd.DataFrame.from_dict(p_value_dict, orient='index')
results_df.round(2).T.sort_values(by="p-value")

Unnamed: 0,factor,p-value
49,lag_log_size,0.001659
41,BM,0.03193
4,log_vol_dollar_120,0.057241
23,mdr,0.09065
32,roa,0.099273
16,MoneyFlowIndex_20,0.101928
37,ni_g_q,0.128012
28,dp,0.154698
9,xret_indsize_std20,0.160873
13,xret_indsize_std120,0.181982


# TODO
use these results to determine which factors to keep (among other considerations like cross-correlation, if they are in the same category, etc)

# Machine Learning

frick

# Performance Analysis

In [23]:
def total_ret(port_ret):
    return port_ret.sum()
    # return np.prod(port_ret + 1) - 1

def tracking_error(port_ret, bench_ret):
    return (port_ret - bench_ret).std()

def information_ratio(port_ret, bench_ret):
    return (total_ret(port_ret) - total_ret(bench_ret)) / tracking_error(port_ret, bench_ret)

def sharpe_ratio(port_ret, rf_ret):
    return information_ratio(port_ret, rf_ret)

def sharpe_ratio(port_xret):
    return total_ret(port_xret) / port_xret.std()

In [27]:
# Write Permnos - for WRDS Queries

# with open("permnos.txt", "w") as file:
#     for permno in permnos:
#         file.write(f"{int(permno)},\n")