# Filing Sentiment Predictions with FinBERT
**Brett Bartol**

In [87]:
import pandas as pd 
import numpy as np
from numpy import mean, median, arange
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from regressors import stats
import warnings
warnings.filterwarnings('ignore')
import math
import os
import glob
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import statsmodels.formula.api as ols
import statsmodels.api as sm

**EDA & Data Cleaning**

In [88]:
df = pd.read_csv('Assignment1Data9.csv')

## dropping unneeded cols here
df = df.drop(columns = ['indfmt', 'consol', 'popsrc', 'datafmt', 'costat', 'curcd', 'wcapch', 'nim', 'xinst', 'xt', 'xintd'])
df = df.drop_duplicates(subset = ['gvkey', 'fyear'])

df['xad'] = df['xad'].fillna(0)
df['xrd'] = df['xrd'].fillna(0)

df['fyear'] = df.fyear.fillna(df.datadate.apply(lambda x : int(str(x)[:4])))

In [89]:
# df = pd.read_csv('Assignment1Data6.csv')

# ## dropping unneeded cols here
# df = df.drop(columns = ['indfmt', 'consol', 'popsrc', 'datafmt', 'costat', 'exchg', 'ggroup', 'tic', 'curcd', 'fyr', 'ajex'])
# df = df.drop_duplicates(subset = ['gvkey', 'fyear'])

# df['xad'] = df['xad'].fillna(0)
# df['xrd'] = df['xrd'].fillna(0)

# df['fyear'] = df.fyear.fillna(df.datadate.apply(lambda x : int(str(x)[:4])))


In [90]:
df.head()

Unnamed: 0,gvkey,datadate,fyear,ajex,act,ap,apalch,at,bkvlps,capx,...,revt,sale,wcap,xacc,xad,xint,xopr,xrd,xsga,prcc_f
0,1000,1961/12/31,1961.0,3.3418,,,,,2.4342,,...,0.9,0.9,,,0.0,,,0.0,,
1,1000,1962/12/31,1962.0,3.3418,,,,,3.0497,,...,1.6,1.6,,,0.0,0.01,,0.0,,
2,1000,1963/12/31,1963.0,3.2445,0.408,0.096,,,2.9731,,...,1.457,1.457,0.086,,0.0,0.02,1.411,0.0,0.346,
3,1000,1964/12/31,1964.0,3.09,0.718,0.146,,1.416,3.0969,,...,2.032,2.032,0.451,,0.0,0.033,1.905,0.0,0.431,
4,1000,1965/12/31,1965.0,3.09,0.725,0.272,,2.31,2.3835,,...,1.688,1.688,0.102,,0.0,0.062,1.848,0.0,0.506,


In [91]:
df.isna().sum()

gvkey            0
datadate         0
fyear            0
ajex           206
act         146941
ap          133949
apalch      349302
at           78809
bkvlps      136533
capx        139853
ceq          94826
ch          144252
chech       233068
cogs        125886
csho         74319
dd1         120204
dd2         248979
dd3         250097
dlc          87875
dltt         79092
dt          364749
dv          173912
dvt          81230
ebit        128538
ebitda      128449
fincf       273832
invt        120542
ivncf       273831
lct         140930
lt           85239
ni          123528
nopi        123759
oancf       273900
opeps       285198
ppegt       117814
ppent        86518
re          105995
rect        133378
revt         80078
sale        120059
wcap        149085
xacc        254686
xad              0
xint        101037
xopr        125895
xrd              0
xsga        204627
prcc_f      117434
dtype: int64

In [92]:
df.columns

Index(['gvkey', 'datadate', 'fyear', 'ajex', 'act', 'ap', 'apalch', 'at',
       'bkvlps', 'capx', 'ceq', 'ch', 'chech', 'cogs', 'csho', 'dd1', 'dd2',
       'dd3', 'dlc', 'dltt', 'dt', 'dv', 'dvt', 'ebit', 'ebitda', 'fincf',
       'invt', 'ivncf', 'lct', 'lt', 'ni', 'nopi', 'oancf', 'opeps', 'ppegt',
       'ppent', 're', 'rect', 'revt', 'sale', 'wcap', 'xacc', 'xad', 'xint',
       'xopr', 'xrd', 'xsga', 'prcc_f'],
      dtype='object')

In [93]:
# filling in missing values with median of the year
new_df = pd.DataFrame(columns = df.columns)
for i in range(2000, 2021):
    
    df2 = df[df['fyear'] == i]
    df2['act'] = df2['act'].fillna(df2['act'].median())
    df2['ap'] = df2['ap'].fillna(df2['ap'].median())
    df2['apalch'] = df2['apalch'].fillna(df2['apalch'].median())
    df2['at'] = df2['at'].fillna(df2['at'].median())
    df2['lct'] = df2['lct'].fillna(df2['lct'].median())
    df2['bkvlps'] = df2['bkvlps'].fillna(df2['bkvlps'].median())
    df2['capx'] = df2['capx'].fillna(df2['capx'].median())
    df2['ceq'] = df2['ceq'].fillna(df2['ceq'].median())
    df2['ch'] = df2['ch'].fillna(df2['ch'].median())
    df2['chech'] = df2['chech'].fillna(df2['chech'].median())
    df2['cogs'] = df2['cogs'].fillna(df2['cogs'].median())
    df2['dd1'] = df2['dd1'].fillna(df2['dd1'].median())
    df2['dd2'] = df2['dd2'].fillna(df2['dd2'].median())
    df2['dd3'] = df2['dd3'].fillna(df2['dd3'].median())
    df2['dlc'] = df2['dlc'].fillna(df2['dlc'].median())
    df2['dltt'] = df2['dltt'].fillna(df2['dltt'].median())
    df2['dt'] = df2['dt'].fillna(df2['dt'].median())
    df2['dv'] = df2['dv'].fillna(df2['dv'].median())
    df2['dvt'] = df2['dvt'].fillna(df2['dvt'].median())
    df2['ebit'] = df2['ebit'].fillna(df2['ebit'].median())
    df2['ebitda'] = df2['ebitda'].fillna(df2['ebitda'].median())
    df2['fincf'] = df2['fincf'].fillna(df2['fincf'].median())
    df2['ivncf'] = df2['ivncf'].fillna(df2['ivncf'].median())
    df2['invt'] = df2['invt'].fillna(df2['invt'].median())
    df2['lt'] = df2['lt'].fillna(df2['lt'].median())
    df2['ni'] = df2['ni'].fillna(df2['ni'].median())
    df2['nopi'] = df2['nopi'].fillna(df2['nopi'].median())
    df2['oancf'] = df2['oancf'].fillna(df2['oancf'].median())
    df2['opeps'] = df2['opeps'].fillna(df2['opeps'].median())
    df2['ppegt'] = df2['ppegt'].fillna(df2['ppegt'].median())
    df2['ppent'] = df2['ppent'].fillna(df2['ppent'].median())
    df2['re'] = df2['re'].fillna(df2['re'].median())
    df2['rect'] = df2['rect'].fillna(df2['rect'].median())
    df2['revt'] = df2['revt'].fillna(df2['revt'].median())
    df2['sale'] = df2['sale'].fillna(df2['sale'].median())
    df2['wcap'] = df2['wcap'].fillna(df2['wcap'].median())
    df2['xacc'] = df2['xacc'].fillna(df2['xacc'].median())
    df2['xad'] = df2['xad'].fillna(df2['xad'].median())
    # df2['xinst'] = df2['xinst'].fillna(df2['xinst'].median())
    df2['xint'] = df2['xint'].fillna(df2['xint'].median())
    # df2['xintd'] = df2['xintd'].fillna(df2['xintd'].median())
    df2['xopr'] = df2['xopr'].fillna(df2['xopr'].median())
    df2['xrd'] = df2['xrd'].fillna(df2['xrd'].median())
    df2['xsga'] = df2['xsga'].fillna(df2['xsga'].median())
    # df2['xt'] = df2['xt'].fillna(df2['xt'].median())
    
    new_df = new_df.append(df2)
    
    new_df['pm'] = new_df['ni'] / new_df['revt']
    new_df['atr'] = new_df['revt'] / new_df['at']
    new_df['fl'] = new_df['at'] / new_df['ceq']
    new_df['cr'] = new_df['act'] / new_df['lct']
    new_df['de'] = new_df['dt'] / new_df['ceq']
    new_df['roa'] = new_df['ni'] / new_df['at']

In [94]:
new_df.isna().sum()

gvkey           0
datadate        0
fyear           0
ajex          205
act             0
ap              0
apalch          0
at              0
bkvlps          0
capx            0
ceq             0
ch              0
chech           0
cogs            0
csho        35493
dd1             0
dd2             0
dd3             0
dlc             0
dltt            0
dt              0
dv              0
dvt             0
ebit            0
ebitda          0
fincf           0
invt            0
ivncf           0
lct             0
lt              0
ni              0
nopi            0
oancf           0
opeps           0
ppegt           0
ppent           0
re              0
rect            0
revt            0
sale            0
wcap            0
xacc            0
xad             0
xint            0
xopr            0
xrd             0
xsga            0
prcc_f      25017
pm            102
atr           775
fl             93
cr            131
de            169
roa            61
dtype: int64

In [95]:
df = new_df
df = df.dropna(subset = ['csho'], how = 'any')

df = df[df['at'] != 0]
df = df[df['lct'] != 0]
df = df[df['act'] != 0]


df['ajex'] = df['ajex'].replace(0, 1)
df['ajex'] = df['ajex'].fillna(1)
df['csho_adj'] = df['csho'] / df['ajex']

# dropping csho = 0 or NaN
df = df[df['csho_adj'] != 0]
# dropping revt = 0 or NaN
# df = df[df['revt'] != 0]

df['ajex'] = df['ajex'].replace(0, 1)
df['ajex'] = df['ajex'].fillna(1)
df['csho_adj'] = df['csho'] / df['ajex']

df['prcc_f_adj'] = df['prcc_f'] * df['ajex']


# converting variables to per share
df['actps'] = df['act'] / df['csho_adj']
df['apps'] = df['ap'] / df['csho_adj']
df['apalchps'] = df['apalch'] / df['csho_adj']
df['atps'] = df['at'] / df['csho_adj']
df['lctps'] = df['lct'] / df['csho_adj']
df['capxps'] = df['capx'] / df['csho_adj']
df['ceqps'] = df['ceq'] / df['csho_adj']
df['chps'] = df['ch'] / df['csho_adj']
df['chechps'] = df['chech'] / df['csho_adj']
df['cogsps'] = df['cogs'] / df['csho_adj']
df['dd1ps'] = df['dd1'] / df['csho_adj']
df['dd2ps'] = df['dd2'] / df['csho_adj']
df['dd3ps'] = df['dd3'] / df['csho_adj']
df['dlcps'] = df['dlc'] / df['csho_adj']
df['dlttps'] = df['dltt'] / df['csho_adj']
df['dtps'] = df['dt'] / df['csho_adj']
df['dvps'] = df['dv'] / df['csho_adj']
df['dvtps'] = df['dvt'] / df['csho_adj']
df['ebitps'] = df['ebit'] / df['csho_adj']
df['ebitdaps'] = df['ebitda'] / df['csho_adj']
df['fincfps'] = df['fincf'] / df['csho_adj']
df['invtps'] = df['invt'] / df['csho_adj']
df['ivncfps'] = df['ivncf'] / df['csho_adj']
df['ltps'] = df['lt'] / df['csho_adj']
df['nips'] = df['ni'] / df['csho_adj']
df['nopips'] = df['nopi'] /df['csho_adj']
df['oancfps'] = df['oancf'] / df['csho_adj']
df['ppegtps'] = df['ppegt'] / df['csho_adj']
df['ppentps'] = df['ppent'] /df['csho_adj']
df['rectps'] = df['rect'] / df['csho_adj']
df['revtps'] = df['revt'] / df['csho_adj']
df['saleps'] = df['sale'] / df['csho_adj']
df['wcapps'] = df['wcap'] / (df['csho_adj'])
df['xaccps'] = df['xacc'] / df['csho_adj']
df['xadps'] = df['xad'] / df['csho_adj']
df['xintps'] = df['xint'] / df['csho_adj']
df['xoprps'] = df['xopr'] / df['csho_adj']
df['xrdps'] = df['xrd'] / df['csho_adj']
df['xsgaps'] = df['xsga'] / df['csho_adj']
df['reps'] = df['re'] / df['csho_adj']

#df

In [96]:
## cleaning 

# # dropping csho = 0 or NaN
# df = df[df['csho_adj'] != 0]
# # dropping revt = 0 or NaN
# df = df[df['revt'] != 0]

# dropping when csho < .1 bc having less 100k shares outstanding skews our findings drastically
#df = df[df['csho'] > .1]

df = df.reset_index(drop = True)

In [97]:
df

Unnamed: 0,gvkey,datadate,fyear,ajex,act,ap,apalch,at,bkvlps,capx,...,revtps,saleps,wcapps,xaccps,xadps,xintps,xoprps,xrdps,xsgaps,reps
0,1004,2001/05/31,2000.0,1.0000,485.856,73.975,0.3990,701.854,12.6299,13.134,...,32.455544,32.455544,13.381743,1.325537,0.000000,0.812526,30.066006,0.000000,3.566730,7.575639
1,1010,2000/12/31,2000.0,1.0000,49.719,37.400,1.2000,3794.500,65680.0000,198.400,...,29586.666667,29586.666667,706.800000,3240.000000,0.000000,11753.333333,17673.333333,0.000000,2160.000000,58980.000000
2,1013,2000/10/31,2000.0,0.1429,2650.900,211.300,306.3000,3970.500,3.7813,375.300,...,0.609945,0.609945,0.298600,0.080828,0.000000,0.000761,0.485634,0.062703,0.194918,0.342047
3,1019,2000/12/31,2000.0,1.0000,8.840,2.235,0.3990,28.638,79.4217,1.870,...,256.048193,256.048193,20.783133,11.126506,0.000000,1.343373,223.259036,0.000000,75.626506,76.361446
4,1021,2000/06/30,2000.0,0.0020,7.884,1.165,-0.2920,11.608,0.4805,0.139,...,0.005472,0.005472,0.001209,0.000198,0.000000,0.000102,0.005327,0.000103,0.001767,-0.001550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199234,332115,2020/12/31,2020.0,1.0000,10.846,0.956,-0.2160,39.516,1.0090,0.824,...,0.044039,0.044039,0.221586,0.082192,0.000000,0.033604,1.139555,0.772902,2.774829,-9.615903
199235,339965,2021/01/31,2020.0,1.0000,4300.652,5.647,0.1925,5921.739,17.1454,40.330,...,2.056311,2.056311,12.195792,0.286349,0.142402,0.000000,3.911391,0.826437,3.102960,-4.303246
199236,345920,2020/12/31,2020.0,1.0000,199.917,22.638,0.1925,275.795,6.2961,1.447,...,10.215075,10.215075,4.513940,0.627612,0.008239,0.302716,9.859254,0.000000,1.746030,-4.577104
199237,345980,2020/12/31,2020.0,1.0000,2314.000,434.000,0.1925,2397.000,1.7496,2.000,...,4.328790,4.328790,1.672913,0.538330,2.725724,0.010547,5.383305,0.378194,3.790460,-3.718910


In [98]:
df['EarningsPerShare'] = df['nips'] #/ df['csho']

df = df.sort_values(by = ['gvkey', 'fyear'])

# converting par share variables to percent changes
df['EarningsPerShare_pct'] = (df['EarningsPerShare'] - df['EarningsPerShare'].shift(1)) / df.prcc_f.shift(1)
df['actps_pct'] = (df['actps'] - df['actps'].shift(1)) / df.prcc_f.shift(1)
df['apps_pct'] = (df['apps'] - df['apps'].shift(1)) / df.prcc_f.shift(1)
df['apalchps_pct'] = (df['apalchps'] - df['apalchps'].shift(1)) / df.prcc_f.shift(1)
df['atps_pct'] = (df['atps'] - df['atps'].shift(1)) / df.prcc_f.shift(1)
df['lctps_pct'] = (df['lctps'] - df['lctps'].shift(1)) / df.prcc_f.shift(1)
df['capxps_pct'] = (df['capxps'] - df['capxps'].shift(1)) / df.prcc_f.shift(1)
df['ceqps_pct'] = (df['ceqps'] - df['ceqps'].shift(1)) / df.prcc_f.shift(1)
df['chps_pct'] = (df['chps'] - df['chps'].shift(1)) / df.prcc_f.shift(1)
df['chechps_pct'] = (df['chechps'] - df['chechps'].shift(1)) / df.prcc_f.shift(1)
df['cogsps_pct'] = (df['cogsps'] - df['cogsps'].shift(1)) / df.prcc_f.shift(1)
df['dd1ps_pct'] = (df['dd1ps'] - df['dd1ps'].shift(1)) / df.prcc_f.shift(1)
df['dd2ps_pct'] = (df['dd2ps'] - df['dd2ps'].shift(1)) / df.prcc_f.shift(1)
df['dd3ps_pct'] = (df['dd3ps'] - df['dd3ps'].shift(1)) / df.prcc_f.shift(1)
df['dlcps_pct'] = (df['dlcps'] - df['dlcps'].shift(1)) / df.prcc_f.shift(1)
df['dlttps_pct'] = (df['dlttps'] - df['dlttps'].shift(1)) / df.prcc_f.shift(1)
df['dtps_pct'] = (df['dtps'] - df['dtps'].shift(1)) / df.prcc_f.shift(1)
df['dvps_pct'] = (df['dvps'] - df['dvps'].shift(1)) / df.prcc_f.shift(1)
df['dvtps_pct'] = (df['dvtps'] - df['dvtps'].shift(1)) / df.prcc_f.shift(1)
df['ebitps_pct'] = (df['ebitps'] - df['ebitps'].shift(1)) / df.prcc_f.shift(1)
df['ebitdaps_pct'] = (df['ebitdaps'] - df['ebitdaps'].shift(1)) / df.prcc_f.shift(1)
df['fincfps_pct'] = (df['fincfps'] - df['fincfps'].shift(1)) / df.prcc_f.shift(1)
df['invtps_pct'] = (df['invtps'] - df['invtps'].shift(1)) / df.prcc_f.shift(1)
df['ivncfps_pct'] = (df['ivncfps'] - df['ivncfps'].shift(1)) / df.prcc_f.shift(1)
df['ltps_pct'] = (df['ltps'] - df['ltps'].shift(1)) / df.prcc_f.shift(1)
df['nips_pct'] = (df['nips'] - df['nips'].shift(1)) / df.prcc_f.shift(1)
df['nopips_pct'] = (df['nopips'] - df['nopips'].shift(1)) / df.prcc_f.shift(1)
df['oancfps_pct'] = (df['oancfps'] - df['oancfps'].shift(1)) / df.prcc_f.shift(1)
df['opeps_pct'] = (df['opeps'] - df['opeps'].shift(1)) / df.prcc_f.shift(1)
df['ppegtps_pct'] = (df['ppegtps'] - df['ppegtps'].shift(1)) / df.prcc_f.shift(1)
df['ppentps_pct'] = (df['ppentps'] - df['ppentps'].shift(1)) / df.prcc_f.shift(1)
df['rectps_pct'] = (df['rectps'] - df['rectps'].shift(1)) / df.prcc_f.shift(1)
df['reps_pct'] = (df['reps'] - df['reps'].shift(1)) / df.prcc_f.shift(1)
df['revtps_pct'] = (df['revtps'] - df['revtps'].shift(1)) / df.prcc_f.shift(1)
df['saleps_pct'] = (df['saleps'] - df['saleps'].shift(1)) / df.prcc_f.shift(1)
df['wcapps_pct'] = (df['wcapps'] - df['wcapps'].shift(1)) / df.prcc_f.shift(1)
df['xaccps_pct'] = (df['xaccps'] - df['xaccps'].shift(1)) / df.prcc_f.shift(1)
df['xadps_pct'] = (df['xadps'] - df['xadps'].shift(1)) / df.prcc_f.shift(1)
df['xintps_pct'] = (df['xintps'] - df['xintps'].shift(1)) / df.prcc_f.shift(1)
df['xoprps_pct'] = (df['xoprps'] - df['xoprps'].shift(1)) / df.prcc_f.shift(1)
df['xrdps_pct'] = (df['xrdps'] - df['xrdps'].shift(1)) / df.prcc_f.shift(1)
df['xsgaps_pct'] = (df['xsgaps'] - df['xsgaps'].shift(1)) / df.prcc_f.shift(1)
df['bkvlps_pct'] = (df['bkvlps'] - df['bkvlps'].shift(1)) / df.prcc_f.shift(1)
df['opeps_pct'] = (df['opeps'] - df['opeps'].shift(1)) / df.prcc_f.shift(1)
df['ret'] = (df['prcc_f'] - df['prcc_f'].shift(1)) / df.prcc_f.shift(1)

# shifting predictor variables
df['EarningsPerShare_pct_ny'] = df['EarningsPerShare_pct'].shift(-1)
df['EarningsPerShare_pct_prev'] = df['EarningsPerShare_pct'].shift(1)
df['actps2'] = df['actps'].shift(1)
df['apps2'] = df['apps'].shift(1)
df['apalchps2'] = df['apalchps'].shift(1)
df['atps2'] = df['atps'].shift(1)
df['bkvlps2'] = df['bkvlps'].shift(1)
df['lctps2'] = df['lctps'].shift(1)
df['capxps2'] = df['capxps'].shift(1)
df['ceqps2'] = df['ceqps'].shift(1)
df['chps2'] = df['chps'].shift(1)
df['chechps2'] = df['chechps'].shift(1)
df['cogsps2'] = df['cogsps'].shift(1) 
df['dlcps2'] = df['dlcps'].shift(1)
df['dlttps2'] = df['dlttps'].shift(1) 
df['dtps2'] = df['dtps'].shift(1) 
df['dvps2'] = df['dvps'].shift(1) 
df['dvtps2'] = df['dvtps'].shift(1)
df['ebitps2'] = df['ebitps'].shift(1) 
df['ebitdaps2'] = df['ebitdaps'].shift(1) 
df['fincfps2'] = df['fincfps'].shift(1) 
df['invtps2'] = df['invtps'].shift(1)
df['ivncfps2'] = df['ivncfps'].shift(1)
df['ltps2'] = df['ltps'].shift(1)
#df['nips'] = 
df['nopips2'] = df['nopips'].shift(1)
df['oancfps2'] = df['oancfps'].shift(1)
df['opeps2'] = df['opeps'].shift(1)
df['ppegtps2'] = df['ppegtps'].shift(1) 
df['ppentps2'] = df['ppentps'].shift(1) 
df['rectps2'] = df['rectps'] .shift(1)
df['revtps2'] = df['revtps'].shift(1) 
df['saleps2'] = df['saleps'].shift(1) 
df['wcapps2'] = df['wcapps'].shift(1) 
df['xaccps2'] = df['xaccps'].shift(1)
df['xadps2'] = df['xadps'].shift(1)
df['xintps2'] = df['xintps'].shift(1)
df['xoprps2'] = df['xoprps'].shift(1) 
df['xrdps2'] = df['xrdps'].shift(1)
df['xsgaps2'] = df['xsgaps'].shift(1) 
df['reps2'] = df['reps'].shift(1)
df['pm2'] = df['pm'].shift(1) 
df['atr2'] = df['atr'].shift(1) 
df['fl2'] = df['fl'].shift(1)
df['cr2'] = df['cr'].shift(1) 
df['de2'] = df['de'].shift(1) 
df['roa2'] = df['roa'].shift(1) 

df['actps_pct2'] = df['actps_pct'].shift(1)
df['apps_pct2'] = df['apps_pct'].shift(1) 
df['apalchps_pct2'] = df['apalchps_pct'].shift(1)
df['atps_pct2'] = df['atps_pct'].shift(1)
df['lctps_pct2'] = df['lctps_pct'].shift(1)
df['capxps_pct2'] = df['capxps_pct'].shift(1)
df['ceqps_pct2'] = df['ceqps_pct'].shift(1)
df['chps_pct2'] = df['chps_pct'].shift(1)
df['chechps_pct2'] = df['chechps_pct'].shift(1)
df['cogsps_pct2'] = df['cogsps_pct'].shift(1)
df['dlcps_pct2'] = df['dlcps_pct'].shift(1) 
df['dlttps_pct2'] = df['dlttps_pct'].shift(1)
df['dtps_pct2'] = df['dtps_pct'].shift(1)
df['dvps_pct2'] = df['dvps_pct'].shift(1)
df['dvtps_pct2'] = df['dvtps_pct'].shift(1) 
df['ebitps_pct2'] = df['ebitps_pct'].shift(1)
df['ebitdaps_pct2'] = df['ebitdaps_pct'].shift(1)
df['fincfps_pct2'] = df['fincfps_pct'].shift(1)
df['invtps_pct2'] = df['invtps_pct'].shift(1)
df['ivncfps_pct2'] = df['ivncfps_pct'].shift(1)
df['ltps_pct2'] = df['ltps_pct'].shift(1) 
#df['nips_pct'] = 
df['nopips_pct2'] = df['nopips_pct'].shift(1)
df['oancfps_pct2'] = df['oancfps_pct'].shift(1) 
df['opeps_pct2'] = df['opeps_pct'].shift(1)
df['ppegtps_pct2'] = df['ppegtps_pct'].shift(1)
df['ppentps_pct2'] = df['ppentps_pct'].shift(1)
df['rectps_pct2'] = df['rectps_pct'].shift(1)
df['reps_pct2'] = df['reps_pct'].shift(1)
df['revtps_pct2'] = df['revtps_pct'].shift(1)
df['saleps_pct2'] = df['saleps_pct'].shift(1) 
df['wcapps_pct2'] = df['wcapps_pct'].shift(1)
df['xaccps_pct2'] = df['xaccps_pct'].shift(1)
df['xadps_pct2'] = df['xadps_pct'].shift(1)
df['xintps_pct2'] = df['xintps_pct'].shift(1)
df['xoprps_pct2'] = df['xoprps_pct'].shift(1)
df['xrdps_pct2'] = df['xrdps_pct'].shift(1)
df['xsgaps_pct2'] = df['xsgaps_pct'].shift(1) 
df['bkvlps_pct2'] = df['bkvlps_pct'].shift(1)
df['opeps_pct2'] = df['opeps_pct'].shift(1)
df['ret2'] = df['ret'].shift(1) 

df = df[df.gvkey == df.gvkey.shift(-1)]
df = df[df.gvkey == df.gvkey.shift(1)]

df = df.dropna()
#df

In [99]:
for col in df.columns:
    m = df.loc[df[col] != np.inf, col].max()
    m2 = df.loc[df[col] != -np.inf, col].min()
    df[col].replace([np.inf], m, inplace = True)
    df[col].replace([-np.inf], m2, inplace = True)

In [100]:
var1 = ['fyear', 'actps', 'apps', 'apalchps','atps', 'bkvlps', 'capxps', 'ceqps', 'chps','chechps', 'cogsps','dd1ps',
       'dd2ps', 'dd3ps', 'dlcps','dlttps','dtps', 'dvps', 'dvtps', 'ebitps', 'ebitdaps', 'fincfps', 'invtps', 'ivncfps',
       'lctps', 'ltps', 'nips', 'nopips', 'opeps', 'oancfps', 'ppegtps', 'ppentps', 'rectps', 'reps', 'revtps', 'saleps',
       'wcapps', 'xaccps', 'xadps', 'xintps', 'xoprps', 'xrdps', 'xsgaps', 'actps_pct', 'apps_pct', 'apalchps_pct',
       'atps_pct', 'bkvlps_pct', 'capxps_pct', 'ceqps_pct', 'chps_pct', 'chechps_pct', 'cogsps_pct', 'dd1ps_pct',
       'dd2ps_pct', 'dd3ps_pct', 'dlcps_pct', 'dlttps_pct', 'dtps_pct', 'dvps_pct', 'dvtps_pct', 'ebitps_pct', 
       'ebitdaps_pct', 'fincfps_pct','invtps_pct', 'ivncfps_pct', 'lctps_pct', 'ltps_pct', 'nips_pct','nopips_pct', 'opeps_pct',
       'oancfps_pct', 'ppegtps_pct', 'ppentps_pct', 'rectps_pct', 'reps_pct', 'revtps_pct', 'saleps_pct',
       'wcapps_pct', 'xaccps_pct', 'xadps_pct', 'xintps_pct', 'xoprps_pct', 'xrdps_pct', 'xsgaps_pct', 'ret', 'pm', 'atr', 'fl', 
       'cr', 'de', 'roa', 'EarningsPerShare_pct', 'EarningsPerShare_pct_prev']

In [101]:
var2 = ['actps2', 'apps2', 'apalchps2','atps2', 'bkvlps2', 'capxps2', 'ceqps2', 'chps2','chechps2', 'cogsps2',
         'dlcps2','dlttps2','dtps2', 'dvps2', 'dvtps2', 'ebitps2', 'ebitdaps2', 'fincfps2', 'invtps2', 'ivncfps2',
       'lctps2', 'ltps2', 'nopips2', 'opeps2', 'oancfps2', 'ppegtps2', 'ppentps2', 'rectps2', 'reps2', 'revtps2', 'saleps2',
       'wcapps2', 'xaccps2', 'xadps2', 'xintps2', 'xoprps2', 'xrdps2', 'xsgaps2', 'actps_pct2', 'apps_pct2', 'apalchps_pct2',
       'atps_pct2', 'bkvlps_pct2', 'capxps_pct2', 'ceqps_pct2', 'chps_pct2', 'chechps_pct2', 'cogsps_pct2',
    'dlcps_pct2', 'dlttps_pct2', 'dtps_pct2', 'dvps_pct2', 'dvtps_pct2', 'ebitps_pct2', 
       'ebitdaps_pct2', 'fincfps_pct2','invtps_pct2', 'ivncfps_pct2', 'lctps_pct2', 'ltps_pct2', 'nopips_pct2', 'opeps_pct2',
       'oancfps_pct2', 'ppegtps_pct2', 'ppentps_pct2', 'rectps_pct2', 'reps_pct2', 'revtps_pct2', 'saleps_pct2',
       'wcapps_pct2', 'xaccps_pct2', 'xadps_pct2', 'xintps_pct2', 'xoprps_pct2', 'xrdps_pct2', 'xsgaps_pct2', 'ret2', 'pm2', 'atr2', 'fl2', 
       'cr2', 'de2', 'roa2']

In [102]:
var = var1 + var2

**LassoCV**

In [103]:
results = []
yearly_coefs = []
lasso_rmse = []
for i in range(2010, 2020):
    train_range = range(i - 10, i)
    train_data = df[df['fyear'].isin(train_range)].copy()[var]
    train_labels = df[df['fyear'].isin(train_range)].copy()['EarningsPerShare_pct_ny']
    val_data = df[df['fyear'] == i].copy()[var]
    val_labels = df[df['fyear'] == i].copy()['EarningsPerShare_pct_ny']


    model_yearly = LassoCV(max_iter = 10000).fit(train_data, train_labels)
    
    print('\n', 'Year', i, 'RMSE: ', np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    print('\n', 'Year', i, 'R2: ', r2_score(val_labels, model_yearly.predict(val_data)))
    
    yearly_coefs.append(model_yearly.coef_)
    lasso_rmse.append(np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    results.append([i, model_yearly.predict(val_data), val_labels])


 Year 2010 RMSE:  548.3030391875903

 Year 2010 R2:  0.002776641895664178

 Year 2011 RMSE:  2318.6573783068916

 Year 2011 R2:  -0.014663860691209729

 Year 2012 RMSE:  197.2427471817744

 Year 2012 R2:  -0.05608541883926388

 Year 2013 RMSE:  1295.1053755695418

 Year 2013 R2:  -3.245587034784675

 Year 2014 RMSE:  2865.8659240421816

 Year 2014 R2:  -0.0003685040209544521

 Year 2015 RMSE:  59.70949738413803

 Year 2015 R2:  -0.006179057501203289

 Year 2016 RMSE:  3944.231133366639

 Year 2016 R2:  -0.00022697180915476345

 Year 2017 RMSE:  44.86142654953527

 Year 2017 R2:  -0.04148072332934505

 Year 2018 RMSE:  276.41217854469414

 Year 2018 R2:  0.0006041681696149492

 Year 2019 RMSE:  2771.271412596657

 Year 2019 R2:  -0.00022029753783670003


In [498]:
years = range(2010, 2020)
for i, x in enumerate(yearly_coefs):
    non_zeros = []
    for j, coef in enumerate(x):
        if coef != 0:
            non_zeros.append(var[j])
    print('Year', years[i], ':', non_zeros)

Year 2010 : ['atps', 'revtps']
Year 2011 : ['atps', 'revtps']
Year 2012 : ['atps']
Year 2013 : ['atps']
Year 2014 : ['atps']
Year 2015 : ['atps']
Year 2016 : []
Year 2017 : ['atps']
Year 2018 : ['atps']
Year 2019 : ['actps', 'atps']


**Decision Tree**

In [18]:
results_dt = []
#yearly_coefs_dt = []
dt_rmse = []
for i in range(2010, 2020):
    train_range = range(i - 10, i)
    train_data = df[df['fyear'].isin(train_range)].copy()[var]
    train_labels = df[df['fyear'].isin(train_range)].copy()['EarningsPerShare_pct_ny']
    val_data = df[df['fyear'] == i].copy()[var]
    val_labels = df[df['fyear'] == i].copy()['EarningsPerShare_pct_ny']


    model_yearly = DecisionTreeRegressor().fit(train_data, train_labels)
    
    print('\n', 'Year', i, 'RMSE: ', np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    print('\n', 'Year', i, 'R2: ', r2_score(val_labels, model_yearly.predict(val_data)))
    #print(stats.summary(model_yearly, val_data, val_labels))
    #yearly_coefs_dt.append(model_yearly.coef_)
    dt_rmse.append(np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    results_dt.append([i, model_yearly.predict(val_data), val_labels])


 Year 2010 RMSE:  2279.406163270339

 Year 2010 R2:  -16.2343366739023

 Year 2011 RMSE:  2307.711781061285

 Year 2011 R2:  -0.005106702322758672

 Year 2012 RMSE:  239.61838303562863

 Year 2012 R2:  -0.5586091711289458

 Year 2013 RMSE:  2064.292928220762

 Year 2013 R2:  -9.78624264685677

 Year 2014 RMSE:  2833.8540870782394

 Year 2014 R2:  0.021854992587417965

 Year 2015 RMSE:  693.9089120659265

 Year 2015 R2:  -134.89183822235694

 Year 2016 RMSE:  5988.095308908474

 Year 2016 R2:  -1.3054263679287956

 Year 2017 RMSE:  2270.914974438924

 Year 2017 R2:  -2667.7434406779253

 Year 2018 RMSE:  498.97489908811406

 Year 2018 R2:  -2.256721221241701

 Year 2019 RMSE:  5514.855504739031

 Year 2019 R2:  -2.961008877872633


**XGBoost Regressor**

In [34]:
results_xgb = []
#yearly_coefs_dt = []
xgb_rmse = []
for i in range(2010, 2020):
    train_range = range(i - 10, i)
    train_data = df[df['fyear'].isin(train_range)].copy()[var]
    train_labels = df[df['fyear'].isin(train_range)].copy()['EarningsPerShare_pct_ny']
    val_data = df[df['fyear'] == i].copy()[var]
    val_labels = df[df['fyear'] == i].copy()['EarningsPerShare_pct_ny']


    model_yearly = xgb.XGBRegressor().fit(train_data, train_labels)
    
    print('\n', 'Year', i, 'RMSE: ', np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    print('\n', 'Year', i, 'R2: ', r2_score(val_labels, model_yearly.predict(val_data)))
    #print(stats.summary(model_yearly, val_data, val_labels))
    #yearly_coefs_dt.append(model_yearly.coef_)
    xgb_rmse.append(np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    results_xgb.append([i, model_yearly.predict(val_data), val_labels])


 Year 2010 RMSE:  1865.4545460813304

 Year 2010 R2:  -10.543047400845033

 Year 2011 RMSE:  2393.4505808269437

 Year 2011 R2:  -0.08117990225930671

 Year 2012 RMSE:  595.376131387169

 Year 2012 R2:  -8.622320108686624

 Year 2013 RMSE:  1809.3490205381947

 Year 2013 R2:  -7.2865210072921105

 Year 2014 RMSE:  2853.923832514992

 Year 2014 R2:  0.007951216424114893

 Year 2015 RMSE:  242.7608218179751

 Year 2015 R2:  -15.632044600059036

 Year 2016 RMSE:  5492.728766286632

 Year 2016 R2:  -0.9397696299402434

 Year 2017 RMSE:  425.4358043789865

 Year 2017 R2:  -92.66409707760326

 Year 2018 RMSE:  360.91773205362205

 Year 2018 R2:  -0.7038824331895186

 Year 2019 RMSE:  4753.419730286211

 Year 2019 R2:  -1.942726288013795


**Reading in Sentiment Data**

In [17]:
# reading in sentiment data

path = os.getcwd()[:-1] + '2'
csv_files = glob.glob(os.path.join(path + '/filing_sentiments/', "*.csv"))

sentiment_data = pd.DataFrame()

for f in csv_files:
    csv = pd.read_csv(f)
    sentiment_data = sentiment_data.append(csv)   
    
sentiment_data

Unnamed: 0.1,Unnamed: 0,gvkey,date,report_type,sentiment_score,tot_pos,tot_neu,tot_neg,raw_text
0,0,10005,2000-12-31,10-K,0.491296,47,414,56,10-K 1 sr10k.htm STANDARD REGISTER FORM 10-K ...
1,0,10005,2001-12-30,10-K,0.530717,40,231,22,10-K 1 sr10k02.htm THE STANDARD REGISTER COMP...
2,0,10005,2002-12-29,10-K,0.473976,129,853,190,10-K 1 sr10k2002.htm UNITED STATES SECURITIES...
3,0,10005,2003-12-28,10-K,0.468210,148,897,229,10-K 1 sr10k20032.htm FORM 10-K UNITED STATES...
4,0,10005,2005-01-02,10-K,0.481028,165,887,213,10-K 1 sr10k2004.htm FORM 10-K UNITED STATES ...
...,...,...,...,...,...,...,...,...,...
12,0,9906,2011-12-31,10-K,0.449349,72,823,181,10-K 1 form10k.htm SOUTHWESTERN PUBLIC SERVIC...
13,0,9906,2012-12-31,10-K,0.444085,61,839,182,10-K 1 form10k.htm SOUTHWESTERN PUBLIC SERVIC...
14,0,9906,2013-12-31,10-K,0.440252,49,602,144,10-K 1 sps1231201310-k.htm 10-K SPS 12.31.201...
15,0,9906,2014-12-31,10-K,0.455610,72,595,144,10-K 1 sps1231201410-k.htm 10-K SPS 12.31.201...


In [18]:
sentiment_data.tot_pos = sentiment_data.tot_pos.replace(0, 1)
sentiment_data.tot_neu = sentiment_data.tot_neu.replace(0, 1)
sentiment_data.tot_neg = sentiment_data.tot_neg.replace(0, 1)

In [19]:
sentiment_data['pos_ratio'] = sentiment_data.tot_pos / (sentiment_data.tot_neg + sentiment_data.tot_neu + sentiment_data.tot_pos)
sentiment_data['subjectivity'] = (sentiment_data.tot_pos - sentiment_data.tot_neg) / (sentiment_data.tot_pos + sentiment_data.tot_neg)

sentiment_data['fyear'] = sentiment_data.date.apply(lambda x : int(str(x[:4])))

In [20]:
sentiment_data['year1'] = sentiment_data.fyear + 1
sentiment_data['year2'] = sentiment_data.fyear + 2
sentiment_data['year3'] = sentiment_data.fyear + 3

In [21]:
sentiment_data

Unnamed: 0.1,Unnamed: 0,gvkey,date,report_type,sentiment_score,tot_pos,tot_neu,tot_neg,raw_text,pos_ratio,subjectivity,fyear,year1,year2,year3
0,0,10005,2000-12-31,10-K,0.491296,47,414,56,10-K 1 sr10k.htm STANDARD REGISTER FORM 10-K ...,0.090909,-0.087379,2000,2001,2002,2003
1,0,10005,2001-12-30,10-K,0.530717,40,231,22,10-K 1 sr10k02.htm THE STANDARD REGISTER COMP...,0.136519,0.290323,2001,2002,2003,2004
2,0,10005,2002-12-29,10-K,0.473976,129,853,190,10-K 1 sr10k2002.htm UNITED STATES SECURITIES...,0.110068,-0.191223,2002,2003,2004,2005
3,0,10005,2003-12-28,10-K,0.468210,148,897,229,10-K 1 sr10k20032.htm FORM 10-K UNITED STATES...,0.116170,-0.214854,2003,2004,2005,2006
4,0,10005,2005-01-02,10-K,0.481028,165,887,213,10-K 1 sr10k2004.htm FORM 10-K UNITED STATES ...,0.130435,-0.126984,2005,2006,2007,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,0,9906,2011-12-31,10-K,0.449349,72,823,181,10-K 1 form10k.htm SOUTHWESTERN PUBLIC SERVIC...,0.066914,-0.430830,2011,2012,2013,2014
13,0,9906,2012-12-31,10-K,0.444085,61,839,182,10-K 1 form10k.htm SOUTHWESTERN PUBLIC SERVIC...,0.056377,-0.497942,2012,2013,2014,2015
14,0,9906,2013-12-31,10-K,0.440252,49,602,144,10-K 1 sps1231201310-k.htm 10-K SPS 12.31.201...,0.061635,-0.492228,2013,2014,2015,2016
15,0,9906,2014-12-31,10-K,0.455610,72,595,144,10-K 1 sps1231201410-k.htm 10-K SPS 12.31.201...,0.088779,-0.333333,2014,2015,2016,2017


In [22]:
all_data = pd.merge(df, sentiment_data[['gvkey', 'year1', 'sentiment_score', 'pos_ratio', 'subjectivity']], left_on = ['fyear', 'gvkey'], right_on = ['year1', 'gvkey'], suffixes = ('', '_1'))
all_data = pd.merge(all_data, sentiment_data[['gvkey', 'year2', 'sentiment_score', 'pos_ratio', 'subjectivity']], left_on = ['fyear', 'gvkey'], right_on = ['year2', 'gvkey'], suffixes = ('', '_2'))
all_data = pd.merge(all_data, sentiment_data[['gvkey', 'year3', 'sentiment_score', 'pos_ratio', 'subjectivity']], left_on = ['fyear', 'gvkey'], right_on = ['year3', 'gvkey'], suffixes = ('', '_3'))

In [23]:
all_data.columns

Index(['gvkey', 'datadate', 'fyear', 'ajex', 'act', 'ap', 'apalch', 'at',
       'bkvlps', 'capx',
       ...
       'pos_ratio', 'subjectivity', 'year2', 'sentiment_score_2',
       'pos_ratio_2', 'subjectivity_2', 'year3', 'sentiment_score_3',
       'pos_ratio_3', 'subjectivity_3'],
      dtype='object', length=238)

In [27]:
var3 = ['sentiment_score', 'pos_ratio', 'subjectivity', 'year2', 'sentiment_score_2', 'pos_ratio_2', 'subjectivity_2', 'year3',
           'sentiment_score_3', 'pos_ratio_3', 'subjectivity_3']
    
var_all = var + var3

**LassoCV with All Data**

In [28]:
results = []
yearly_coefs_all = []
lasso_rmse = []
for i in range(2010, 2017):
    
    train_range = range(i - 10, i)
    train_data = all_data[all_data['fyear'].isin(train_range)].copy()[var_all]
    train_labels = all_data[all_data['fyear'].isin(train_range)].copy()['EarningsPerShare_pct_ny']
    val_data = all_data[all_data['fyear'] == i].copy()[var_all]
    val_labels = all_data[all_data['fyear'] == i].copy()['EarningsPerShare_pct_ny']


    model_yearly = LassoCV().fit(train_data, train_labels)

    print('\n', 'Year', i, 'RMSE: ', np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    print('\n', 'Year', i, 'R2: ', r2_score(val_labels, model_yearly.predict(val_data)))
    #print(stats.summary(model_yearly, val_data, val_labels))
    yearly_coefs_all.append(model_yearly.coef_)
    lasso_rmse.append(np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    results.append([i, model_yearly.predict(val_data), val_labels])


 Year 2010 RMSE:  51.50458006201397

 Year 2010 R2:  -0.015733750469619556

 Year 2011 RMSE:  3.2337342154098994

 Year 2011 R2:  -158.26892089142564

 Year 2012 RMSE:  14.588170047038883

 Year 2012 R2:  -0.057555134353940884

 Year 2013 RMSE:  2.613575000866915

 Year 2013 R2:  -27.94849080926492

 Year 2014 RMSE:  105.46410052441013

 Year 2014 R2:  -0.005233743830030502

 Year 2015 RMSE:  971.7413615171031

 Year 2015 R2:  -119.04816949888536

 Year 2016 RMSE:  6.645638184788176

 Year 2016 R2:  -0.12700517826437685


In [36]:
list1, list2 = zip(*sorted(zip(yearly_coefs_all[0], var)))
list2

('atps',
 'ltps',
 'EarningsPerShare_pct',
 'EarningsPerShare_pct_prev',
 'actps',
 'actps2',
 'actps_pct',
 'actps_pct2',
 'apalchps',
 'apalchps2',
 'apalchps_pct',
 'apalchps_pct2',
 'apps',
 'apps2',
 'apps_pct',
 'apps_pct2',
 'atps2',
 'atps_pct',
 'atps_pct2',
 'atr',
 'atr2',
 'bkvlps',
 'bkvlps2',
 'bkvlps_pct',
 'bkvlps_pct2',
 'capxps',
 'capxps2',
 'capxps_pct',
 'capxps_pct2',
 'ceqps',
 'ceqps2',
 'ceqps_pct',
 'ceqps_pct2',
 'chechps',
 'chechps2',
 'chechps_pct',
 'chechps_pct2',
 'chps',
 'chps2',
 'chps_pct',
 'chps_pct2',
 'cogsps',
 'cogsps2',
 'cogsps_pct',
 'cogsps_pct2',
 'cr',
 'cr2',
 'dd1ps',
 'dd1ps_pct',
 'dd2ps',
 'dd2ps_pct',
 'dd3ps',
 'dd3ps_pct',
 'de',
 'de2',
 'dlcps',
 'dlcps2',
 'dlcps_pct',
 'dlcps_pct2',
 'dlttps',
 'dlttps2',
 'dlttps_pct',
 'dlttps_pct2',
 'dtps',
 'dtps2',
 'dtps_pct',
 'dtps_pct2',
 'dvps',
 'dvps2',
 'dvps_pct',
 'dvps_pct2',
 'dvtps',
 'dvtps2',
 'dvtps_pct',
 'dvtps_pct2',
 'ebitdaps',
 'ebitdaps2',
 'ebitdaps_pct',
 'ebitd

In [37]:
true_val = np.array([])
pred_val = np.array([])
for i in range(len(results)):
    true_val = np.concatenate((true_val, results[i][2]))
    pred_val = np.concatenate((pred_val, results[i][1]))

In [38]:
print('Total LassoCV Results:')
print('\n', 'RMSE:', np.sqrt(mse(true_val, pred_val)))
print('\n', 'R2:', r2_score(true_val, pred_val))

Total LassoCV Results:

 RMSE: 385.8945037769819

 R2: -42.555739766867795


In [494]:
years = range(2010, 2017)
for i, x in enumerate(yearly_coefs_all):
    non_zeros = []
    for j, coef in enumerate(x):
        if coef != 0:
            non_zeros.append(var_all[j])
    print('Year', years[i], ':', non_zeros)

Year 2010 : ['atps', 'ltps', 'revtps', 'saleps']
Year 2011 : ['atps', 'ltps', 'revtps', 'saleps']
Year 2012 : ['atps', 'ltps', 'revtps', 'saleps']
Year 2013 : ['atps', 'ltps', 'revtps']
Year 2014 : ['atps', 'ltps', 'revtps', 'saleps']
Year 2015 : ['atps', 'ltps', 'revtps', 'saleps']
Year 2016 : ['atps', 'ltps', 'ppegtps', 'revtps', 'saleps']


**RidgeCV with All Data**

In [39]:
results = []
yearly_coefs_all_rcv = []
ridge_rmse = []
for i in range(2010, 2017):
    
    train_range = range(i - 10, i)
    train_data = all_data[all_data['fyear'].isin(train_range)].copy()[var_all]
    train_labels = all_data[all_data['fyear'].isin(train_range)].copy()['EarningsPerShare_pct_ny']
    val_data = all_data[all_data['fyear'] == i].copy()[var_all]
    val_labels = all_data[all_data['fyear'] == i].copy()['EarningsPerShare_pct_ny']


    model_yearly = RidgeCV().fit(train_data, train_labels)

    print('\n', 'Year', i, 'RMSE: ', np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    print('\n', 'Year', i, 'R2: ', r2_score(val_labels, model_yearly.predict(val_data)))
    #print(stats.summary(model_yearly, val_data, val_labels))
    yearly_coefs_all_rcv.append(model_yearly.coef_)
    ridge_rmse.append(np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    results.append([i, model_yearly.predict(val_data), val_labels])


 Year 2010 RMSE:  1295.9611660996616

 Year 2010 R2:  -642.0905948596023

 Year 2011 RMSE:  863.0207714403476

 Year 2011 R2:  -11343964.342766624

 Year 2012 RMSE:  362.6350420125087

 Year 2012 R2:  -652.492971129864

 Year 2013 RMSE:  116.58823598659822

 Year 2013 R2:  -57604.69957741808

 Year 2014 RMSE:  419.65586413792903

 Year 2014 R2:  -14.916381673608047

 Year 2015 RMSE:  45683.475181949354

 Year 2015 R2:  -265320.4929943046

 Year 2016 RMSE:  20.88940772980272

 Year 2016 R2:  -10.135373261437767


In [42]:
list1, list2 = zip(*sorted(zip(yearly_coefs_all_rcv[0], var_all)))
list2

('xadps_pct2',
 'chps_pct2',
 'apps_pct',
 'sentiment_score_3',
 'fincfps2',
 'oancfps2',
 'ivncfps2',
 'dlcps_pct2',
 'invtps_pct2',
 'ret2',
 'dlcps_pct',
 'ret',
 'subjectivity_3',
 'actps_pct',
 'dtps_pct2',
 'pos_ratio_2',
 'cogsps_pct',
 'ceqps_pct',
 'xaccps_pct',
 'apalchps_pct2',
 'wcapps_pct',
 'dvtps_pct2',
 'chechps',
 'ppegtps_pct',
 'dlttps_pct',
 'ltps_pct2',
 'atps_pct',
 'sentiment_score_2',
 'lctps_pct',
 'xoprps',
 'xsgaps_pct2',
 'dvps_pct2',
 'sentiment_score',
 'xoprps_pct2',
 'subjectivity',
 'subjectivity_2',
 'opeps_pct',
 'capxps_pct',
 'ebitdaps',
 'ebitdaps2',
 'opeps_pct2',
 'cr',
 'atps_pct2',
 'xoprps2',
 'dd1ps',
 'reps_pct2',
 'atr',
 'ltps_pct',
 'apalchps',
 'ebitps',
 'xintps_pct',
 'dd2ps',
 'xoprps_pct',
 'dvps',
 'nopips2',
 'lctps2',
 'apps',
 'nopips_pct2',
 'invtps',
 'wcapps2',
 'ltps',
 'xadps2',
 'xsgaps',
 'nopips',
 'reps',
 'ppentps',
 'xrdps',
 'dlttps',
 'nips',
 'actps',
 'chps',
 'bkvlps2',
 'dtps2',
 'atps2',
 'nips_pct',
 'EarningsP

In [43]:
years = np.arange(2010, 2017)
for x, coef in enumerate(yearly_coefs_all_rcv):
    list1, list2 = zip(*sorted(zip(coef, var_all)))
    print(years[x], ':')
    print('Important Positive Variables:', list2[:5])
    print('Important Negative Variables:', list2[-5:], '\n')

2010 :
Important Positive Variables: ('xadps_pct2', 'chps_pct2', 'apps_pct', 'sentiment_score_3', 'fincfps2')
Important Negative Variables: ('chechps_pct', 'xrdps_pct2', 'xaccps_pct2', 'invtps_pct', 'chps_pct') 

2011 :
Important Positive Variables: ('xadps_pct2', 'chps_pct2', 'xrdps_pct', 'atps_pct', 'sentiment_score_3')
Important Negative Variables: ('xrdps_pct2', 'xaccps_pct2', 'invtps_pct', 'nopips_pct', 'chps_pct') 

2012 :
Important Positive Variables: ('xadps_pct2', 'chps_pct2', 'xrdps_pct', 'ltps_pct2', 'fincfps2')
Important Negative Variables: ('xaccps_pct2', 'xrdps_pct2', 'invtps_pct', 'nopips_pct', 'chps_pct') 

2013 :
Important Positive Variables: ('xadps_pct2', 'chps_pct2', 'xrdps_pct', 'atps_pct', 'fincfps2')
Important Negative Variables: ('xrdps_pct2', 'invtps_pct', 'xaccps_pct2', 'chps_pct', 'nopips_pct') 

2014 :
Important Positive Variables: ('xadps_pct2', 'chps_pct2', 'xrdps_pct', 'ltps_pct2', 'ceqps_pct2')
Important Negative Variables: ('pos_ratio', 'xrdps_pct2', 'x

In [44]:
true_val = np.array([])
pred_val = np.array([])
for i in range(len(results)):
    true_val = np.concatenate((true_val, results[i][2]))
    pred_val = np.concatenate((pred_val, results[i][1]))

In [45]:
print('Total RidgeCV Results:')
print('\n', 'RMSE:', np.sqrt(mse(true_val, pred_val)))
print('\n', 'R2:', r2_score(true_val, pred_val))

Total RidgeCV Results:

 RMSE: 18019.308519106315

 R2: -94968.56539922472


**Decision Tree Regressor with All Data**

In [46]:
results_dt = []
#yearly_coefs = []
dt_rmse = []
for i in range(2010, 2017):
    
    train_range = range(i - 10, i)
    train_data = all_data[all_data['fyear'].isin(train_range)].copy()[var_all]
    train_labels = all_data[all_data['fyear'].isin(train_range)].copy()['EarningsPerShare_pct_ny']
    val_data = all_data[all_data['fyear'] == i].copy()[var_all]
    val_labels = all_data[all_data['fyear'] == i].copy()['EarningsPerShare_pct_ny']


    model_yearly = DecisionTreeRegressor().fit(train_data, train_labels)

    print('\n', 'Year', i, 'RMSE: ', np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    print('\n', 'Year', i, 'R2: ', r2_score(val_labels, model_yearly.predict(val_data)))
    #print(stats.summary(model_yearly, val_data, val_labels))
    #yearly_coefs.append(model_yearly.coef_)
    dt_rmse.append(np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    results_dt.append([i, model_yearly.predict(val_data), val_labels])


 Year 2010 RMSE:  38.34765029797617

 Year 2010 R2:  0.4369257088723756

 Year 2011 RMSE:  0.49563300007913164

 Year 2011 R2:  -2.7414747902294394

 Year 2012 RMSE:  14.184848099555547

 Year 2012 R2:  0.00011336616841706171

 Year 2013 RMSE:  1.2618886929169286

 Year 2013 R2:  -5.74835763391316

 Year 2014 RMSE:  105.34752487128499

 Year 2014 R2:  -0.003012684464965165

 Year 2015 RMSE:  347.232233347686

 Year 2015 R2:  -14.328308498466047

 Year 2016 RMSE:  7.0985677685977935

 Year 2016 R2:  -0.28586090230963856


In [47]:
true_val = np.array([])
pred_val = np.array([])
for i in range(len(results)):
    true_val = np.concatenate((true_val, results_dt[i][2]))
    pred_val = np.concatenate((pred_val, results_dt[i][1]))

In [48]:
print('Total Decision Tree Results:')
print('\n', 'RMSE:', np.sqrt(mse(true_val, pred_val)))
print('\n', 'R2:', r2_score(true_val, pred_val))

Total Decision Tree Results:

 RMSE: 143.9872795296624

 R2: -5.0639618571379375


**XGBoost Regressor with All Data**

In [49]:
results_xgb = []
#yearly_coefs = []
xgb_rmse = []
for i in range(2010, 2017):
    
    train_range = range(i - 10, i)
    train_data = all_data[all_data['fyear'].isin(train_range)].copy()[var_all]
    train_labels = all_data[all_data['fyear'].isin(train_range)].copy()['EarningsPerShare_pct_ny']
    val_data = all_data[all_data['fyear'] == i].copy()[var_all]
    val_labels = all_data[all_data['fyear'] == i].copy()['EarningsPerShare_pct_ny']


    model_yearly = xgb.XGBRegressor().fit(train_data, train_labels)

    print('\n', 'Year', i, 'RMSE: ', np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    print('\n', 'Year', i, 'R2: ', r2_score(val_labels, model_yearly.predict(val_data)))
    #print(stats.summary(model_yearly, val_data, val_labels))
    #yearly_coefs.append(model_yearly.coef_)
    xgb_rmse.append(np.sqrt(mse(val_labels, model_yearly.predict(val_data))))
    results_xgb.append([i, model_yearly.predict(val_data), val_labels])


 Year 2010 RMSE:  36.755709731268325

 Year 2010 R2:  0.4827055624448361

 Year 2011 RMSE:  0.8812629512265846

 Year 2011 R2:  -10.828601992275335

 Year 2012 RMSE:  14.060699096874348

 Year 2012 R2:  0.017539241865560884

 Year 2013 RMSE:  2.261138486405115

 Year 2013 R2:  -20.667575407149684

 Year 2014 RMSE:  867.590815104981

 Year 2014 R2:  -67.02799072847576

 Year 2015 RMSE:  344.7777016630531

 Year 2015 R2:  -14.112367483993257

 Year 2016 RMSE:  6.596192514506027

 Year 2016 R2:  -0.11029701057572572


In [50]:
true_val = np.array([])
pred_val = np.array([])
for i in range(len(results)):
    true_val = np.concatenate((true_val, results_xgb[i][2]))
    pred_val = np.concatenate((pred_val, results_xgb[i][1]))

In [51]:
print('Total XGBoost Results:')
print('\n', 'RMSE:', np.sqrt(mse(true_val, pred_val)))
print('\n', 'R2:', r2_score(true_val, pred_val))

Total XGBoost Results:

 RMSE: 367.24428458186776

 R2: -38.44739233093091


**Classification Model**

In [104]:
class_data = all_data
class_data['target'] = class_data.EarningsPerShare_pct_ny.apply(lambda x : 1 if x > 0 else 0)
training_mask = pd.Series([random.random() for i in range(len(class_data))])

training_set = class_data[class_data['fyear'] <= 2010]
val_set = class_data[class_data['fyear'] > 2010]

In [105]:
lr_model_1 = LogisticRegression().fit(training_set[var_all], training_set['target'])

In [106]:
lr_model_1.score(val_set[var_all], val_set['target'])

0.5697624190064795

In [107]:
confusion_matrix(val_set['target'], lr_model_1.predict(val_set[var_all]))

array([[  71,  904],
       [  92, 1248]], dtype=int64)

In [108]:
rf_model_1 = RandomForestClassifier().fit(training_set[var_all], training_set['target'])

In [109]:
confusion_matrix(val_set['target'], rf_model_1.predict(val_set[var_all]))

array([[ 202,  773],
       [ 146, 1194]], dtype=int64)

**Building a Portfolio**

In [250]:
port = df = pd.read_csv('Assignment1Data6.csv')[['gvkey', 'fyear', 'prcc_f']]
port['year_shift'] = port.fyear + 1
port = pd.merge(port[['gvkey', 'fyear', 'prcc_f']], port[['gvkey', 'year_shift', 'prcc_f']].rename(columns = {'prcc_f' : 'prcc_o'}), left_on = ['gvkey', 'fyear'], right_on = ['gvkey', 'year_shift'])
port.drop('year_shift', axis = 1, inplace = True)

In [251]:
port

Unnamed: 0,gvkey,fyear,prcc_f,prcc_o
0,1000,1970.0,10.000,
1,1000,1971.0,5.750,10.000
2,1000,1972.0,5.125,5.750
3,1000,1973.0,1.750,5.125
4,1000,1974.0,2.125,1.750
...,...,...,...,...
581847,345980,2019.0,,
581848,345980,2020.0,18.240,
581849,347085,2019.0,,
581850,347085,2020.0,,


In [252]:
all_data.columns

Index(['gvkey', 'datadate', 'fyear', 'ajex', 'act', 'ap', 'apalch', 'at',
       'bkvlps', 'capx',
       ...
       'subjectivity', 'year2', 'sentiment_score_2', 'pos_ratio_2',
       'subjectivity_2', 'year3', 'sentiment_score_3', 'pos_ratio_3',
       'subjectivity_3', 'target'],
      dtype='object', length=239)

In [373]:
fama = pd.read_csv('FFannual.csv')

fama = fama[fama['Year'].isin(range(2010, 2017))]
fama['Mkt-RF'] = fama['Mkt-RF'] / 100
fama['SMB'] = fama['SMB'] / 100
fama['HML'] = fama['HML'] / 100
fama['RF'] = fama['RF'] / 100

fama

Unnamed: 0,Year,Mkt-RF,SMB,HML,RF
83,2010,0.1737,0.1429,-0.051,0.0012
84,2011,0.0044,-0.0579,-0.0854,0.0004
85,2012,0.1627,-0.0132,0.0985,0.0006
86,2013,0.352,0.0763,0.0265,0.0002
87,2014,0.1171,-0.078,-0.0146,0.0002
88,2015,0.0007,-0.0393,-0.0953,0.0002
89,2016,0.133,0.0663,0.2264,0.002


**Monthly Portfolios**

In [325]:
from dateutil.relativedelta import relativedelta

In [326]:
monthly = pd.read_csv('monthlydata.csv')
monthly = monthly.drop(columns = ['gsector', 'iid'])
monthly['year'] = monthly.datadate.apply(lambda x : int(x[:4]))
monthly['month'] = monthly.datadate.apply(lambda x : int(x[5:7]))
monthly.datadate = pd.to_datetime(monthly.datadate)

In [327]:
monthly2 = monthly.copy()
monthly2.datadate = monthly2.datadate.apply(lambda x : x + relativedelta(months = 1))

In [328]:
monthly2.rename(columns = {'prccm' : 'prcc_o'}, inplace = True)

In [329]:
monthly_final = pd.merge(monthly, monthly2[['gvkey', 'datadate', 'prcc_o']], on = ['gvkey', 'datadate'])

In [330]:
monthly_final['pct_change'] = (monthly_final.prccm - monthly_final.prcc_o) / monthly_final.prcc_o

In [331]:
monthly_final.dropna(inplace = True)

In [332]:
monthly_final

Unnamed: 0,gvkey,datadate,prccm,year,month,prcc_o,pct_change
0,1003,2002-08-31,0.040,2002,8,0.0010,39.000000
1,1003,2002-09-30,0.040,2002,9,0.0400,0.000000
2,1003,2002-11-30,0.040,2002,11,0.0400,0.000000
3,1003,2003-01-31,0.040,2003,1,0.0400,0.000000
4,1003,2003-02-28,0.040,2003,2,0.0400,0.000000
...,...,...,...,...,...,...,...
3265785,351371,2021-04-30,1.420,2021,4,0.0550,24.818182
3265786,351458,2022-02-28,0.090,2022,2,0.0850,0.058824
3265787,351491,2022-02-28,8.000,2022,2,10.5100,-0.238820
3265788,351590,2022-01-31,17.589,2022,1,18.4389,-0.046093


In [333]:
FFmonthly = pd.read_csv('FFmonthly.csv')

In [334]:
FFmonthly['year'] = FFmonthly.date.apply(lambda x : int(str(x)[:4]))
FFmonthly = FFmonthly[FFmonthly['year'].isin(range(2010, 2017))]

FFmonthly['Mkt-RF'] = FFmonthly['Mkt-RF'] / 100
FFmonthly['SMB'] = FFmonthly['SMB'] / 100
FFmonthly['HML'] = FFmonthly['HML'] / 100
FFmonthly['RF'] = FFmonthly['RF'] / 100

**Portfolio from Lasso with All Data**

In [451]:
years = np.arange(2010, 2017)
inv_df = pd.DataFrame()
for x, coefs in enumerate(yearly_coefs_all):
    year = years[x]
    data = all_data[all_data['fyear'] == year]
    gvkey = data['gvkey']
    data = data[var_all]
    result = data @ coefs
    data['gvkey'] = gvkey
    data['result_lassocv_all'] = result
    
    inv_df = inv_df.append(data[['gvkey', 'fyear', 'result_lassocv_all']])    

In [452]:
port_sorted = pd.DataFrame()
for i in range(int(inv_df.fyear.min()), int(inv_df.fyear.max() + 1)):
    yearly_sorted = inv_df[inv_df['fyear'] == i].sort_values(by = 'result_lassocv_all', ascending = False)
    port_sorted = port_sorted.append(yearly_sorted)

port_sorted = port_sorted[['gvkey', 'fyear', 'result_lassocv_all']]

In [455]:
monthly_port_returns = []

for i in range(2010, 2017):
    cur_top = port_sorted[port_sorted['fyear'] == i]['gvkey'].iloc[:20]
    cur_bot = port_sorted[port_sorted['fyear'] == i]['gvkey'].iloc[-20:]
    
    for j in range(1, 13):
        top_ret = monthly_final[(monthly_final['month'] == j) & (monthly_final['year'] == i) & (monthly_final['gvkey'].isin(cur_top))]['pct_change'].mean()
        bot_ret = monthly_final[(monthly_final['month'] == j) & (monthly_final['year'] == i) & (monthly_final['gvkey'].isin(cur_bot))]['pct_change'].mean()
        
        ret = top_ret - bot_ret
        
        monthly_port_returns.append(ret)

In [456]:
FFmonthly['monthly_ret'] = monthly_port_returns
FFmonthly['monthly_ret'] = FFmonthly['monthly_ret'].fillna(0)

In [457]:
y = FFmonthly['monthly_ret']
x = FFmonthly['Mkt-RF'].copy().dropna()
x = sm.add_constant(x)

model = sm.OLS(y, x)
results = model.fit()

alpha = results.params[0]
alpha = float(alpha)
results.summary()

0,1,2,3
Dep. Variable:,monthly_ret,R-squared:,0.117
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,10.89
Date:,"Mon, 18 Apr 2022",Prob (F-statistic):,0.00143
Time:,15:56:08,Log-Likelihood:,185.07
No. Observations:,84,AIC:,-366.1
Df Residuals:,82,BIC:,-361.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0011,0.003,0.353,0.725,-0.005,0.007
Mkt-RF,0.2576,0.078,3.300,0.001,0.102,0.413

0,1,2,3
Omnibus:,6.852,Durbin-Watson:,1.834
Prob(Omnibus):,0.033,Jarque-Bera (JB):,7.165
Skew:,0.453,Prob(JB):,0.0278
Kurtosis:,4.107,Cond. No.,26.4


In [458]:
print('Alpha:', alpha * 12)

Alpha: 0.013018038766075508


In [459]:
#sharpe ratio
np.mean(FFmonthly['monthly_ret']) / np.std(FFmonthly['monthly_ret']) * 12 ** (.5)

0.47438769038404494

In [460]:
y = FFmonthly['monthly_ret']
x = FFmonthly[['Mkt-RF','SMB','HML']].copy().dropna()
x = sm.add_constant(x)

model = sm.OLS(y, x)
results = model.fit()

alpha = results.params[0]
alpha = float(alpha)
results.summary()

0,1,2,3
Dep. Variable:,monthly_ret,R-squared:,0.149
Model:,OLS,Adj. R-squared:,0.118
Method:,Least Squares,F-statistic:,4.686
Date:,"Mon, 18 Apr 2022",Prob (F-statistic):,0.00457
Time:,15:56:08,Log-Likelihood:,186.63
No. Observations:,84,AIC:,-365.3
Df Residuals:,80,BIC:,-355.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0014,0.003,0.467,0.642,-0.005,0.008
Mkt-RF,0.2029,0.086,2.361,0.021,0.032,0.374
SMB,0.2489,0.144,1.726,0.088,-0.038,0.536
HML,-0.0447,0.137,-0.327,0.744,-0.317,0.227

0,1,2,3
Omnibus:,6.731,Durbin-Watson:,1.838
Prob(Omnibus):,0.035,Jarque-Bera (JB):,7.679
Skew:,0.394,Prob(JB):,0.0215
Kurtosis:,4.254,Cond. No.,51.1


In [461]:
print('Alpha:', alpha * 12)

Alpha: 0.017164093871480993


In [462]:
#sharpe ratio
np.mean(FFmonthly['monthly_ret']) / np.std(FFmonthly['monthly_ret']) * 12 ** (.5)

0.47438769038404494

**Portfolio from Lasso with Financial Data**

In [463]:
years = np.arange(2010, 2017)
inv_df = pd.DataFrame()
for x, coefs in enumerate(yearly_coefs[:7]):
    year = years[x]
    data = all_data[all_data['fyear'] == year]
    gvkey = data['gvkey']
    data = data[var]
    result = data @ coefs
    data['gvkey'] = gvkey
    data['result_lassocv'] = result
    
    inv_df = inv_df.append(data[['gvkey', 'fyear', 'result_lassocv']])    

In [464]:
port_sorted = pd.DataFrame()
for i in range(int(inv_df.fyear.min()), int(inv_df.fyear.max() + 1)):
    yearly_sorted = inv_df[inv_df['fyear'] == i].sort_values(by = 'result_lassocv', ascending = False)
    port_sorted = port_sorted.append(yearly_sorted)

port_sorted = port_sorted[['gvkey', 'fyear', 'result_lassocv']]

In [467]:
monthly_port_returns = []

for i in range(2010, 2017):
    cur_top = port_sorted[port_sorted['fyear'] == i]['gvkey'].iloc[:20]
    cur_bot = port_sorted[port_sorted['fyear'] == i]['gvkey'].iloc[-20:]
    
    for j in range(1, 13):
        top_ret = monthly_final[(monthly_final['month'] == j) & (monthly_final['year'] == i) & (monthly_final['gvkey'].isin(cur_top))]['pct_change'].mean()
        bot_ret = monthly_final[(monthly_final['month'] == j) & (monthly_final['year'] == i) & (monthly_final['gvkey'].isin(cur_bot))]['pct_change'].mean()
        
        ret = top_ret - bot_ret
        
        monthly_port_returns.append(ret)

In [468]:
FFmonthly['monthly_ret'] = monthly_port_returns
FFmonthly['monthly_ret'] = FFmonthly['monthly_ret'].fillna(0)

In [469]:
y = FFmonthly['monthly_ret']
x = FFmonthly['Mkt-RF'].copy().dropna()
x = sm.add_constant(x)

model = sm.OLS(y, x)
results = model.fit()

alpha = results.params[0]
alpha = float(alpha)
results.summary()

0,1,2,3
Dep. Variable:,monthly_ret,R-squared:,0.112
Model:,OLS,Adj. R-squared:,0.101
Method:,Least Squares,F-statistic:,10.31
Date:,"Mon, 18 Apr 2022",Prob (F-statistic):,0.00189
Time:,15:56:35,Log-Likelihood:,181.61
No. Observations:,84,AIC:,-359.2
Df Residuals:,82,BIC:,-354.4
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0067,0.003,-2.101,0.039,-0.013,-0.000
Mkt-RF,0.2611,0.081,3.211,0.002,0.099,0.423

0,1,2,3
Omnibus:,3.635,Durbin-Watson:,2.13
Prob(Omnibus):,0.162,Jarque-Bera (JB):,2.972
Skew:,-0.325,Prob(JB):,0.226
Kurtosis:,3.653,Cond. No.,26.4


In [470]:
print('Alpha:', alpha * 12)

Alpha: -0.08071711651634389


In [471]:
#sharpe ratio
np.mean(FFmonthly['monthly_ret']) / np.std(FFmonthly['monthly_ret']) * 12 ** (.5)

-0.45449838868636416

In [472]:
y = FFmonthly['monthly_ret']
x = FFmonthly[['Mkt-RF','SMB','HML']].copy().dropna()
x = sm.add_constant(x)

model = sm.OLS(y, x)
results = model.fit()

alpha = results.params[0]
alpha = float(alpha)
results.summary()

0,1,2,3
Dep. Variable:,monthly_ret,R-squared:,0.138
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,4.277
Date:,"Mon, 18 Apr 2022",Prob (F-statistic):,0.00748
Time:,15:56:35,Log-Likelihood:,182.88
No. Observations:,84,AIC:,-357.8
Df Residuals:,80,BIC:,-348.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0064,0.003,-1.991,0.050,-0.013,-1.87e-06
Mkt-RF,0.2005,0.090,2.232,0.028,0.022,0.379
SMB,0.2175,0.151,1.442,0.153,-0.083,0.518
HML,0.0757,0.143,0.530,0.598,-0.209,0.360

0,1,2,3
Omnibus:,4.593,Durbin-Watson:,2.125
Prob(Omnibus):,0.101,Jarque-Bera (JB):,4.134
Skew:,-0.357,Prob(JB):,0.127
Kurtosis:,3.819,Cond. No.,51.1


In [473]:
print('Alpha:', alpha * 12)

Alpha: -0.07644375112338962


In [474]:
#sharpe ratio
np.mean(FFmonthly['monthly_ret']) / np.std(FFmonthly['monthly_ret']) * 12 ** (.5)

-0.45449838868636416

**Portfolio from Classification Model with All Data**

In [475]:
predictions = rf_model_1.predict_proba(all_data[var_all])
inv_df = all_data[['gvkey', 'fyear']]
inv_df['class_results'] = predictions[:, 1]

In [476]:
port_sorted = pd.DataFrame()
for i in range(int(inv_df.fyear.min()), int(inv_df.fyear.max() + 1)):
    yearly_sorted = inv_df[inv_df['fyear'] == i].sort_values(by = 'class_results', ascending = False)
    port_sorted = port_sorted.append(yearly_sorted)

port_sorted = port_sorted[['gvkey', 'fyear', 'class_results']]

In [479]:
monthly_port_returns = []

for i in range(2010, 2017):
    cur_top = port_sorted[port_sorted['fyear'] == i]['gvkey'].iloc[:20]
    cur_bot = port_sorted[port_sorted['fyear'] == i]['gvkey'].iloc[-20:]
    
    for j in range(1, 13):
        top_ret = monthly_final[(monthly_final['month'] == j) & (monthly_final['year'] == i) & (monthly_final['gvkey'].isin(cur_top))]['pct_change'].mean()
        bot_ret = monthly_final[(monthly_final['month'] == j) & (monthly_final['year'] == i) & (monthly_final['gvkey'].isin(cur_bot))]['pct_change'].mean()
        
        ret = top_ret - bot_ret
        
        monthly_port_returns.append(ret)

In [480]:
FFmonthly['monthly_ret'] = monthly_port_returns
FFmonthly['monthly_ret'] = FFmonthly['monthly_ret'].fillna(0)

In [481]:
y = FFmonthly['monthly_ret']
x = FFmonthly['Mkt-RF'].copy().dropna()
x = sm.add_constant(x)

model = sm.OLS(y, x)
results = model.fit()

alpha = results.params[0]
alpha = float(alpha)
results.summary()

0,1,2,3
Dep. Variable:,monthly_ret,R-squared:,0.036
Model:,OLS,Adj. R-squared:,0.024
Method:,Least Squares,F-statistic:,3.038
Date:,"Mon, 18 Apr 2022",Prob (F-statistic):,0.0851
Time:,15:57:03,Log-Likelihood:,186.66
No. Observations:,84,AIC:,-369.3
Df Residuals:,82,BIC:,-364.4
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0036,0.003,1.192,0.237,-0.002,0.010
Mkt-RF,-0.1335,0.077,-1.743,0.085,-0.286,0.019

0,1,2,3
Omnibus:,8.574,Durbin-Watson:,1.791
Prob(Omnibus):,0.014,Jarque-Bera (JB):,14.499
Skew:,0.308,Prob(JB):,0.00071
Kurtosis:,4.94,Cond. No.,26.4


In [482]:
print('Alpha:', alpha * 12)

Alpha: 0.04312795946904327


In [483]:
#sharpe ratio
np.mean(FFmonthly['monthly_ret']) / np.std(FFmonthly['monthly_ret']) * 12 ** (.5)

0.2772619878931217

In [484]:
y = FFmonthly['monthly_ret']
x = FFmonthly[['Mkt-RF','SMB','HML']].copy().dropna()
x = sm.add_constant(x)

model = sm.OLS(y, x)
results = model.fit()

alpha = results.params[0]
alpha = float(alpha)
results.summary()

0,1,2,3
Dep. Variable:,monthly_ret,R-squared:,0.043
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,1.192
Date:,"Mon, 18 Apr 2022",Prob (F-statistic):,0.318
Time:,15:57:03,Log-Likelihood:,186.96
No. Observations:,84,AIC:,-365.9
Df Residuals:,80,BIC:,-356.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0034,0.003,1.124,0.264,-0.003,0.009
Mkt-RF,-0.1055,0.086,-1.233,0.221,-0.276,0.065
SMB,-0.1059,0.144,-0.738,0.463,-0.392,0.180
HML,-0.0229,0.136,-0.168,0.867,-0.294,0.248

0,1,2,3
Omnibus:,7.562,Durbin-Watson:,1.781
Prob(Omnibus):,0.023,Jarque-Bera (JB):,12.891
Skew:,0.216,Prob(JB):,0.00159
Kurtosis:,4.87,Cond. No.,51.1


In [485]:
print('Alpha:', alpha * 12)

Alpha: 0.04112501614285259


In [486]:
#sharpe ratio
np.mean(FFmonthly['monthly_ret']) / np.std(FFmonthly['monthly_ret']) * 12 ** (.5)

0.2772619878931217