# Final Project

- Saige Belanger
    - (20951877)
- Dylan Faelker
    - (20960747)
- Ethan Liu
    - (20959615)
- Timothy Zheng
    - t54zheng (20939203)

In [4]:
import pandas as pd
from pandasql import sqldf
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn import linear_model
import statsmodels.api as sm
import scipy.stats as stats
from math import sqrt

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
import datetime as dt

# NOTE(review): this silences every warning for the rest of the notebook, including
# pandas warnings about chained assignment -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [5]:
# Load the merged monthly dataset from a SAS7BDAT file; text is decoded as ISO-8859-1.
all_monthly_data = pd.read_sas("merged_df.sas7bdat", encoding = 'ISO-8859-1')

In [6]:
# These identifier/classification columns are never used below, so discard them.
all_monthly_data = all_monthly_data.drop(
    columns=["ticker", "conm", "gvkey", "cusip", "naics", "gsubind"]
)

In [7]:
# Names of the 50 candidate factor columns, in the order used throughout the notebook.
factors = [
    'IM', 'range_20', 'log_vol_dollar_20', 'range_120', 'log_vol_dollar_120',
    'xret_5', 'xret_10', 'xret_20', 'xret_indsize_20', 'xret_indsize_std20',
    'xret_40', 'xret_120', 'xret_indsize_120', 'xret_indsize_std120', 'KDJ_20',
    'deviation_pct20', 'MoneyFlowIndex_20', 'RSI_20', 'KDJ_120', 'deviation_pct120',
    'MoneyFlowIndex_120', 'RSI_120', 'IV_capm', 'mdr', 'ami_3',
    'beta_3y', 'beta_5y', 'tail_2y', 'dp', 'leverage',
    'BL', 'roe', 'roa', 'profitability', 'sales_g_q',
    'sales_g_ttm', 'op_income_g_q', 'ni_g_q', 'op_income_g_ttm', 'ni_g_ttm',
    'sue_NI', 'BM', 'AM', 'EP', 'SP',
    'roe_q', 'roa_q', 'Cto', 'pe_ttm', 'lag_log_size',
]

# Forward-return columns ret_f1 ... ret_f12.
ret_cols = [f"ret_f{horizon}" for horizon in range(1, 13)]

In [8]:
# Every column that is neither a factor nor a forward-return column, in frame order.
_data_cols = set(factors).union(ret_cols)
non_data_cols = [col for col in all_monthly_data.columns if col not in _data_cols]
non_data_cols

['permno', 'yyyymm', 'monthid', 'PRC', 'VOL', 'RET', 'SHROUT']

In [9]:
# Imputation - as in ML Lecture 1

# Rows missing any of the non-factor, non-return columns (non_data_cols) are discarded outright.
all_monthly_data = all_monthly_data.dropna(subset=non_data_cols)

grouped_med = all_monthly_data.groupby(by='monthid')
# Per-month median of every column, broadcast back to row shape, then used to
# fill each NaN with the median of its own month.
imputed_grouped = all_monthly_data.fillna(grouped_med.transform('median'))

# Anything still missing had no median available within its month; drop those rows.
all_monthly_data = imputed_grouped.dropna()

In [10]:
# Filtering data by min price and min market share for each year

# Commenting out for runtime - does not drop any rows

# all_monthly_data['yyyy'] = all_monthly_data['yyyymm'].astype(str).str[:4]
# all_monthly_data['MKTSHR'] = all_monthly_data['PRC'] * all_monthly_data['SHROUT'] * 1_000

# to_drop_indices = []

# for permno in all_monthly_data.permno.unique():
#     for year in all_monthly_data['yyyy'].unique():
#         mask = (all_monthly_data['permno'] == permno) & (all_monthly_data['yyyy'] == year)
#         if all_monthly_data[mask].shape[0] != 0 and (all_monthly_data[mask]['MKTSHR'].iloc[0] < 100_000_000 or all_monthly_data[mask]['PRC'].iloc[0] <= 5):
#             to_drop_indices += list(all_monthly_data[mask].index)
# all_monthly_data.drop(to_drop_indices, inplace=True)

In [11]:
# Winsorizing factors: within each month (monthid), clip every factor to
# mean +/- 3 standard deviations of that month's cross-section.
#
# The clipped values are assigned back to the column. The previous form,
# `all_monthly_data[column][mask].clip(..., inplace=True)`, clipped a temporary
# copy produced by chained indexing and left the data unchanged.
# NOTE(review): the original comment said "by quarter", but the grouping key has
# always been monthid (monthly); confirm which is intended.
_by_month = all_monthly_data.groupby("monthid")

for column in factors:
    mean = _by_month[column].transform("mean")
    std = _by_month[column].transform("std")

    upper = mean + 3 * std
    lower = mean - 3 * std

    all_monthly_data[column] = all_monthly_data[column].clip(lower=lower, upper=upper)

In [12]:
# Inspect the cleaned panel.
all_monthly_data

Unnamed: 0,permno,yyyymm,monthid,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,...,ret_f3,ret_f4,ret_f5,ret_f6,ret_f7,ret_f8,ret_f9,ret_f10,ret_f11,ret_f12
2,10026.0,198603.0,75.0,-0.183465,0.021967,13.935166,0.020239,13.429522,0.001357,0.002957,...,-0.156250,-0.375000,-0.066667,-0.166667,0.114286,0.051282,-0.048780,0.615385,0.031746,0.030769
3,10026.0,198604.0,76.0,0.636488,0.023080,13.849557,0.020850,13.536907,-0.005400,0.000191,...,-0.375000,-0.066667,-0.166667,0.114286,0.051282,-0.048780,0.615385,0.031746,0.030769,-0.119403
4,10026.0,198605.0,77.0,0.354652,0.023095,13.798978,0.022117,13.612165,0.007211,0.006643,...,-0.066667,-0.166667,0.114286,0.051282,-0.048780,0.615385,0.031746,0.030769,-0.119403,-0.042373
5,10026.0,198606.0,78.0,0.308972,0.020076,13.643324,0.022047,13.697346,-0.009126,-0.005370,...,-0.166667,0.114286,0.051282,-0.048780,0.615385,0.031746,0.030769,-0.119403,-0.042373,0.159292
6,10026.0,198607.0,79.0,0.273834,0.020131,13.650006,0.022064,13.723655,-0.004666,-0.010382,...,0.114286,0.051282,-0.048780,0.615385,0.031746,0.030769,-0.119403,-0.042373,0.159292,0.114504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440715,93429.0,201908.0,476.0,0.043659,0.022145,17.094448,0.022864,17.153811,0.007966,0.013102,...,0.035693,0.009251,0.026833,-0.071904,-0.217105,0.113501,0.074864,-0.123802,-0.059820,0.051425
440716,93429.0,201909.0,477.0,0.017751,0.025800,17.123301,0.023830,17.149012,-0.000186,-0.005715,...,0.009251,0.026833,-0.071904,-0.217105,0.113501,0.074864,-0.123802,-0.059820,0.051425,-0.044122
440717,93429.0,201910.0,478.0,0.004530,0.024727,17.193758,0.024310,17.139314,0.004466,-0.006312,...,0.026833,-0.071904,-0.217105,0.113501,0.074864,-0.123802,-0.059820,0.051425,-0.044122,-0.073513
440718,93429.0,201911.0,479.0,0.005354,0.022660,17.061345,0.024140,17.111865,-0.004312,0.000399,...,-0.071904,-0.217105,0.113501,0.074864,-0.123802,-0.059820,0.051425,-0.044122,-0.073513,0.128552


## Factor Code

In [13]:
# Load the factor-return table and number its rows 1..N as `monthid`.
# NOTE(review): this presumes one row per consecutive month with a default 0..N-1
# index, so that row position lines up with the monthid used elsewhere -- confirm.
ff4_factors = (
    pd.read_sas("ff4_factors.sas7bdat", encoding='ISO-8859-1')
    .assign(monthid=lambda frame: frame.index + 1)
)
ff4_factors.head()

Unnamed: 0,DATEFF,SMB,HML,MKTRF,RF,UMD,monthid
0,1980-01-31,0.0162,0.0175,0.0551,0.008,0.0755,1
1,1980-02-29,-0.0185,0.0061,-0.0122,0.0089,0.0788,2
2,1980-03-31,-0.0664,-0.0101,-0.129,0.0121,-0.0955,3
3,1980-04-30,0.0105,0.0106,0.0397,0.0126,-0.0043,4
4,1980-05-30,0.0213,0.0038,0.0526,0.0081,-0.0112,5


In [14]:
# Distinct yyyymm values as sorted ints; display the first and last month covered.
dates = sorted(int(value) for value in set(all_monthly_data["yyyymm"]))
dates[0], dates[-1]

(198004, 201912)

In [15]:
# Distinct monthid values as sorted ints; display the first, the last and how many there are.
monthids = sorted(int(value) for value in set(all_monthly_data["monthid"]))
monthids[0], monthids[-1], len(monthids)

(4, 480, 477)

In [16]:
# Chronological split: the first two thirds of the months go to testing_range,
# the remaining third to validation_range.
# A single split index is used for both slices. The earlier bounds, 2*(n//3) and
# 2*n//3, differ when n % 3 == 2, which would leave one month in neither range.
split_idx = 2 * len(monthids) // 3
testing_range = monthids[:split_idx]
validation_range = monthids[split_idx:]

# Every month must land in exactly one of the two ranges.
assert len(testing_range) + len(validation_range) == len(monthids)

# Validate that ranges have correct ratios
len(testing_range) / len(monthids), len(validation_range) / len(monthids), len(testing_range) + len(validation_range)

(0.6666666666666666, 0.3333333333333333, 477)

## Testing Factors

In [17]:
# Factors entering the regression model (currently all 50 candidate columns).
model_factors = [
    'IM', 'range_20', 'log_vol_dollar_20', 'range_120', 'log_vol_dollar_120',
    'xret_5', 'xret_10', 'xret_20', 'xret_indsize_20', 'xret_indsize_std20',
    'xret_40', 'xret_120', 'xret_indsize_120', 'xret_indsize_std120', 'KDJ_20',
    'deviation_pct20', 'MoneyFlowIndex_20', 'RSI_20', 'KDJ_120', 'deviation_pct120',
    'MoneyFlowIndex_120', 'RSI_120', 'IV_capm', 'mdr', 'ami_3',
    'beta_3y', 'beta_5y', 'tail_2y', 'dp', 'leverage',
    'BL', 'roe', 'roa', 'profitability', 'sales_g_q',
    'sales_g_ttm', 'op_income_g_q', 'ni_g_q', 'op_income_g_ttm', 'ni_g_ttm',
    'sue_NI', 'BM', 'AM', 'EP', 'SP',
    'roe_q', 'roa_q', 'Cto', 'pe_ttm', 'lag_log_size',
]

In [18]:
# Attach the factor-return columns to every stock-month row via an inner join on monthid.
all_monthly_data = ff4_factors.merge(all_monthly_data, on="monthid")

## [m, n, l] model for Fama-MacBeth Double Regression
We use the technique from Assignment 2: a rolling 36-month lookback of factor data is used to estimate each stock's betas (**First Stage**).
* For period $t_i$, we use data from $t_{i-36}, \dots, t_{i-1}$ when available; at a minimum, we require 12 prior monthly observations.

In [19]:
# Distinct stock identifiers present in the merged panel.
permnos = {*all_monthly_data["permno"]}

In [1]:
# Version of the multiprocessing library that works with Jupyter Notebooks
!pip install multiprocess

Collecting multiprocess
  Downloading multiprocess-0.70.16-py38-none-any.whl.metadata (7.1 kB)
Collecting dill>=0.3.8 (from multiprocess)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Downloading multiprocess-0.70.16-py38-none-any.whl (132 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill, multiprocess
Successfully installed dill-0.3.8 multiprocess-0.70.16


In [20]:
from multiprocess import Pool, Manager

In [None]:
# Fama-Macbeth
# using [m,n,l] model
#
# First stage: for every stock, run rolling time-series regressions of excess
# returns on the model factors, parallelised across stocks.

# Longest rolling window (in months) and fewest observations required for a fit.
MAX_WINDOW_MONTHS = 36
MIN_WINDOW_OBS = 12

# Plain dict of equally long lists, filled in the parent process from the workers'
# return values. Appending from the workers to lists held inside a Manager().dict()
# does not work: each lookup through the proxy hands back a copy of the list, so the
# appended values are discarded.
summary_results = {"monthid": [], "permno": []}
for factor in model_factors:
    summary_results[f"{factor}_beta"] = []


def add_betas(permno):
    """Estimate rolling first-stage betas for a single stock.

    For each month in ``testing_range`` (from the 12th month on), fit an OLS of
    the stock's excess return (``RET - RF``) on ``model_factors`` plus a constant,
    using the stock's rows from the window of up to ``MAX_WINDOW_MONTHS`` months
    that ends at, and includes, that month. Windows holding fewer than
    ``MIN_WINDOW_OBS`` rows are skipped.

    Parameters
    ----------
    permno
        Stock identifier to select from ``all_monthly_data["permno"]``.

    Returns
    -------
    list of dict
        One record per fitted month with keys ``"monthid"``, ``"permno"`` and
        ``"<factor>_beta"`` for every factor in ``model_factors``.
    """
    records = []

    # Loop-invariant: pick out this stock's rows once instead of once per month.
    permno_data = all_monthly_data[all_monthly_data["permno"] == permno]

    for (i, monthid) in enumerate(testing_range):
        # Before index MIN_WINDOW_OBS - 1 the window cannot hold enough months.
        if i < MIN_WINDOW_OBS - 1:
            continue

        # NOTE(review): the slice covers months i-35 .. i (current month included),
        # whereas the accompanying text describes t_(i-36) .. t_(i-1). The slice is
        # left as it was; confirm which window is intended.
        window = set(testing_range[max(0, i - (MAX_WINDOW_MONTHS - 1)):i + 1])
        window_data = permno_data[permno_data["monthid"].isin(window)]

        if len(window_data) < MIN_WINDOW_OBS:
            continue

        window_factors = window_data[model_factors]
        window_returns = window_data["RET"]  # Since factors are from t-1
        rf = window_data["RF"]

        window_excess_returns = window_returns - rf

        # NOTE(review): there are more regressors (len(model_factors) plus a constant)
        # than rows in any window, so the fit is under-determined -- confirm this is
        # the intended specification.
        model_x = sm.add_constant(window_factors)
        model_y = sm.OLS(window_excess_returns, model_x).fit()

        record = {"monthid": monthid, "permno": permno}
        for factor in model_factors:
            record[f"{factor}_beta"] = model_y.params[factor]
        records.append(record)

    return records


if __name__ == '__main__':
    with Pool() as P:
        result_list = list(P.map(add_betas, permnos))

    # Flatten the per-stock record lists into the column-oriented summary; rows
    # stay aligned because each record is appended to every column at once.
    for permno_records in result_list:
        for record in permno_records:
            for key, value in record.items():
                summary_results[key].append(value)
        

In [33]:
# Assemble the first-stage results into one frame indexed by month.
# NOTE(review): the recorded output of this cell shows `*_beta_t_stat` columns that
# the visible first-stage cell never creates -- presumably left over from an earlier
# version of that cell; re-run from a fresh kernel to confirm.
first_stage_df = pd.DataFrame(summary_results).set_index("monthid")
first_stage_df

Unnamed: 0_level_0,permno,IM_beta,IM_beta_t_stat,range_20_beta,range_20_beta_t_stat,log_vol_dollar_20_beta,log_vol_dollar_20_beta_t_stat,range_120_beta,range_120_beta_t_stat,log_vol_dollar_120_beta,...,roe_q_beta,roe_q_beta_t_stat,roa_q_beta,roa_q_beta_t_stat,Cto_beta,Cto_beta_t_stat,pe_ttm_beta,pe_ttm_beta_t_stat,lag_log_size_beta,lag_log_size_beta_t_stat
monthid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15,49154.0,-0.209131,,-0.016676,,-0.355592,,0.001417,,0.193001,...,0.002606,,-0.005120,,-0.025124,,0.032864,,0.081667,
16,49154.0,-0.192345,,-0.016097,,-0.327118,,0.001424,,0.214135,...,0.010954,,0.007743,,0.060946,,-0.004459,,0.064560,
17,49154.0,-0.311754,,-0.020052,,-0.224471,,0.001007,,0.217179,...,0.012494,,0.006269,,0.048171,,0.030910,,-0.081230,
18,49154.0,-0.355868,,-0.021481,,-0.209875,,0.000772,,0.195835,...,0.020728,,0.005618,,0.052972,,0.015798,,-0.077686,
19,49154.0,-0.313180,,-0.020913,,-0.210451,,0.000193,,0.216846,...,0.008997,,0.000176,,0.004429,,0.009629,,-0.118856,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,81917.0,0.468625,,10.812009,,-0.466755,,2.481337,,0.669191,...,18.777279,,12.303925,,-3.060220,,-0.077922,,2.475819,
318,81917.0,0.598238,,19.773484,,-0.496104,,12.260907,,0.667233,...,-3.416657,,0.784448,,2.396761,,-0.207145,,2.684361,
319,81917.0,0.598568,,20.297463,,-0.486032,,10.600855,,0.656997,...,-5.835022,,-0.420914,,2.567645,,-0.184414,,2.745798,
320,81917.0,-0.223139,,31.091282,,-0.503372,,1.458490,,0.523207,...,-16.179173,,-7.219454,,3.419905,,-0.041960,,0.230163,


In [None]:
# Second stage regression
# Containers for the per-month results: one list of estimates and one list of
# t-statistics per factor, plus the month they belong to.
lambdas = {"monthid": []}
for factor in model_factors:
    lambdas[f"{factor}"] = []
    lambdas[f"{factor}_t_stat"] = []

for monthid in testing_range:
    # Check a4
    # TODO: the second-stage regression is not implemented yet; `lambdas` stays empty.
    pass  # placeholder body so the cell parses and the notebook can run end to end

# Performance Analysis

In [None]:
def total_ret(port_ret):
    """Return the arithmetic sum of the periodic returns in ``port_ret``."""
    return port_ret.sum()
    # return np.prod(port_ret + 1) - 1


def tracking_error(port_ret, bench_ret):
    """Return the standard deviation of the active return ``port_ret - bench_ret``."""
    return (port_ret - bench_ret).std()


def information_ratio(port_ret, bench_ret):
    """Return the total-return difference divided by the tracking error.

    The numerator is ``total_ret(port_ret) - total_ret(bench_ret)`` and the
    denominator is ``tracking_error(port_ret, bench_ret)``.
    """
    return (total_ret(port_ret) - total_ret(bench_ret)) / tracking_error(port_ret, bench_ret)


def sharpe_ratio(port_xret, rf_ret=None):
    """Return the Sharpe ratio of a return series.

    This replaces two same-named definitions, of which the second silently
    shadowed the first, so both call styles now work.

    Parameters
    ----------
    port_xret
        With ``rf_ret`` omitted: the portfolio's excess returns.
        With ``rf_ret`` given: the portfolio's raw returns.
    rf_ret
        Optional risk-free returns. When given, the result is
        ``information_ratio(port_xret, rf_ret)``.

    Returns
    -------
    ``total_ret(port_xret) / port_xret.std()`` when ``rf_ret`` is None, otherwise
    the information ratio against the risk-free series.
    """
    if rf_ret is None:
        return total_ret(port_xret) / port_xret.std()
    return information_ratio(port_xret, rf_ret)