# Imports

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import svm, preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings("ignore")
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential

# Class Setup

In [2]:
class model():

    def __init__(self):
        self.r2_oos     = object()
        self.r2_is      = object()
        self.predictors = object()
        self.summary    = object()
        self.model      = object()
        self.fitted_model = object()
        self.X_train    = object()
        self.y_train    = object()
        self.X_test     = object()
        self.y_test     = object()
        self.pred       = object()

# Data

In [3]:
df = pd.DataFrame(np.load(r'../data/master_prepared.npy', allow_pickle=True))
cols = np.load(r'../data/columns_prepared.npy', allow_pickle=True)
df.columns = cols

In [4]:
# Market Value
df['mv_^0.5'] = df['mv'] ** 0.5

# Book to Market Value
df['bvtmv_^2'] = df['bvtmv'] ** 2
df['bvtmv_^3'] = df['bvtmv'] ** 3

# Operating Margin
df['opmarg_^2'] = df['opmarg'] ** 2
df['opmarg_^3'] = df['opmarg'] ** 3

# Free cash flow
df['fcf_^2'] = df['fcf'] ** 2
df['fcf_^3'] = df['fcf'] ** 3

# Return on invested capital
df['roic_^2'] = df['roic'] ** 2
df['roic_^3'] = df['roic'] ** 3

# Volatility
df['std_12m_^2'] = df['std_12m'] ** 2
df['std_12m_^3'] = df['std_12m'] ** 3

# Return
df['retxs_12m_^2'] = df['retxs_12m'] ** 2
df['retxs_12m_^3'] = df['retxs_12m'] ** 3
df['retxs_3m_^2'] = df['retxs_3m'] ** 2
df['retxs_3m_^3'] = df['retxs_3m'] ** 3

# Dividend yield
df['dy_^2'] = df['dy'] ** 2
df['dy_^3'] = df['dy'] ** 3

# Beta
df['beta_^2'] = df['beta'] ** 2
df['beta_^3'] = df['beta'] ** 3

# Shares Outstanding
df['so_^2'] = df['so'] ** 2
df['so_^3'] = df['so'] ** 3


In [5]:
df['mv_bvtmv'] = df['bvtmv'] / df['mv']

df['ret12m_bvtmv'] = df['ret_12m'] * df['mv']

In [7]:
drop_cols = [
    'date',
    'eligible',
    'companyid', 
    'ret_1f1',
    'ret_1f3',
    'ret_1f6',
    'ret_1f12',
    'retxs_1f1',
    'retxs_1f3',
    'retxs_1f6',
    'retxs_1f12'
]

y_col = 'retxs_1f1'

# X cols are the independent variables
x_cols = [i for i in list(df.columns) if i not in drop_cols]

# Only use eligible data
df_eligible = df[df['eligible']==1]
X = df_eligible[x_cols].astype('float64')
y = df_eligible[y_col].astype('float64')


In [8]:
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.dropna(how='any', inplace=True)
y = y.loc[X.index]


In [63]:
def prepare_data(start_index, train_range):

    # Dates
    global dates
    dates = df['date'].unique()
    train_dates = dates[start_index:start_index+train_range+1]
    test_dates = dates[start_index+train_range+1]

    # Training Data
    X_train = X[df_eligible['date'].isin(train_dates)].astype('float64')
    X_train = sm.add_constant(X_train)
    y_train = y[df_eligible['date'].isin(train_dates)].astype('float64')

    # Test Data
    X_test = X[df_eligible['date']==test_dates].astype('float64')
    X_test.insert(0, 'const', 1)
    y_test = y[df_eligible['date']==test_dates].astype('float64')
        
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = prepare_data(60, 36)

print("Input Shapes")
print("============")
print('X_train: ', X_train.values.shape)
print('y_train: ', y_train.values.shape)
print('X_test:  ', X_test.values.shape)
print('y_test:  ', y_test.values.shape)

Input Shapes
X_train:  (25712, 80)
y_train:  (25712,)
X_test:   (668, 80)
y_test:   (668,)


# Models

## *Linear Regression*

In [18]:
class LinearReg_sm(model):
    
    def fit_model(self, X_train, y_train):

        predictors = [
            'mv',
            'bvtmv',
            'ret_12m',
            'retxs_12m',
            'ret_6m',
            'std_12m',
            'fcf',
            'roic',
            'dy',
            'beta',
            'mom_3m12m',
            'mom_12m36m'
        ]
        self.predictors = predictors
        self.X_train = X_train[predictors]
        self.y_train = y_train

        model = sm.OLS(y_train, X_train[predictors]).fit()
        self.model = model
        self.summary = model.summary()
    
    def test_model(self, X_test, y_test):
        self.X_test = X_test[self.predictors]
        self.y_test = y_test
        pred = self.model.predict(self.X_test)
        self.r2_oos  = r2_score(y_test, pred)
        self.r2_is   = r2_score(self.y_train, self.model.predict(self.X_train))

LR= LinearReg_sm()
LR.fit_model(X_train, y_train)
LR.test_model(X_test, y_test)

print("Linear       ", np.mean(cross_val_score(estimator=LinearRegression(), X=X[LR.predictors], y=y, scoring='r2', cv=10)))
print(LR.summary)

Linear        -0.01653885571103586
                                 OLS Regression Results                                
Dep. Variable:              retxs_1f1   R-squared (uncentered):                   0.105
Model:                            OLS   Adj. R-squared (uncentered):              0.105
Method:                 Least Squares   F-statistic:                              252.3
Date:                Thu, 13 Feb 2020   Prob (F-statistic):                        0.00
Time:                        10:51:22   Log-Likelihood:                          14175.
No. Observations:               25712   AIC:                                 -2.833e+04
Df Residuals:                   25700   BIC:                                 -2.823e+04
Df Model:                          12                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
------

## *Polynomial Regression*

In [11]:
class PolynomialReg_sm(model):

    def fit_model(self, X_train, y_train):
        
        # Same as linear regression but with non-linear predictor set (eg. X^2 terms)
        predictors = [
            'bvtmv','bvtmv_^2','bvtmv_^3', 
            'retxs_12m','retxs_12m_^2','retxs_12m_^3', 
            'std_12m','std_12m_^2','std_12m_^3',
            'fcf','fcf_^2','fcf_^3', 
            'mv','mv_^0.5', 
            'roic','roic_^2','roic_^3', 
            'dy','dy_^2','dy_^3', 
            'beta','beta_^2','beta_^3', 
            'so', 'so_^2'
        ]
        
        self.X_train = X_train[predictors]
        self.y_train = y_train
        
        self.predictors = predictors

        model = sm.OLS(y_train, self.X_train).fit()
        self.model = model
        self.summary = self.model.summary()

    def test_model(self, X_test, y_test):
        self.X_test = X_test[self.predictors]
        self.y_test = y_test
        pred = self.model.predict(self.X_test)
        self.r2_oos  = r2_score(self.y_test, pred)
        self.r2_is   = r2_score(self.y_train, self.model.predict(self.X_train))

PR = PolynomialReg_sm()
PR.fit_model(X_train, y_train)
PR.test_model(X_test, y_test)

print("Polynomial   ", np.mean(cross_val_score(estimator=LinearRegression(), X=X[PR.predictors], y=y, scoring='r2', cv=10)))

# print(PR.r2_oos)
# print(PR.summary)

Polynomial    -0.07638820538320282


## *Regression Tree*

In [12]:
class RegTree(model):
    
    def __init__(self, max_depth):
        self.max_depth = max_depth
        self.model = DecisionTreeRegressor(max_depth=self.max_depth)

    def fit_model(self, X_train, y_train):
        
        # Same as linear regression but with non-linear predictor set (eg. X^2 terms)
        predictors = PR.predictors
        
        self.X_train = X_train[predictors]
        self.y_train = y_train
        
        self.predictors = predictors

        self.fitted_model = self.model.fit(self.X_train, self.y_train)
        self.summary = "Regression tree has no summary"

    def test_model(self, X_test, y_test):
        self.X_test = X_test[self.predictors]
        self.y_test = y_test
        pred = self.model.predict(self.X_test)
        self.r2_oos  = r2_score(y_test, pred)
        self.r2_is   = r2_score(self.y_train, self.model.predict(self.X_train))


RT = RegTree(max_depth=2)
# RT.fit_model(X_train, y_train)
# RT.test_model(X_test, y_test)

## *Neural Networks*

In [46]:


def norm(x):
    return (x-x.describe().transpose()['mean'])/x.describe().transpose()['std']

normXtrain = norm(X_train.drop('const', axis=1))
normXtest = norm(X_test.drop('const', axis=1))

model = Sequential()

model.add(Dense(8, input_dim=len(normXtrain.columns), activation='relu'))
model.add(Dense(1, activation='softmax'))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(normXtrain, y_train, epochs=5, batch_size=2, verbose=2)

model.predict(normXtest)

Train on 25712 samples
Epoch 1/5
25712/25712 - 8s - loss: 1.0013
Epoch 2/5
25712/25712 - 8s - loss: 1.0013
Epoch 3/5
25712/25712 - 8s - loss: 1.0013
Epoch 4/5
25712/25712 - 8s - loss: 1.0013
Epoch 5/5
25712/25712 - 8s - loss: 1.0013


array([[nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
       [nan],
      

# Model Comparison

In [75]:
cv = 10

print("Linear       ", np.mean(cross_val_score(estimator=LinearRegression(), X=X[LR.predictors], y=y, scoring='r2', cv=cv)))
print("Tree         ", np.mean(cross_val_score(estimator=DecisionTreeRegressor(max_depth=2), X=X[PR.predictors], y=y, scoring='r2', cv=cv)))
print("Polynomial   ", np.mean(cross_val_score(estimator=LinearRegression(), X=X[PR.predictors], y=y, scoring='r2', cv=cv)))
print("Lasso        ", np.mean(cross_val_score(estimator=Lasso(alpha=1.5), X=X[PR.predictors], y=y, scoring='r2', cv=cv)))
print("Ridge        ", np.mean(cross_val_score(estimator=Ridge(alpha=1.5), X=X[PR.predictors], y=y, scoring='r2', cv=cv)))


Linear        -0.01653885571103586
Tree          0.0014704875934970385
Polynomial    -0.07638820538320282
Lasso         -0.009879764986742434
Ridge         -0.07638031526471364


In [None]:
def describe_r2(model, predictors):
    r2_ts = pd.DataFrame(columns=['model_R2'], index=dates)

    for i in range(1,len(dates)-37):
        X_train, y_train, X_test, y_test = prepare_data(i, 36)
        X_train, X_test = X_train[predictors], X_test[predictors]

        try:
            model.fit(X_train, y_train)
            pred = model.predict(X_test)
            r2_ts['model_R2'].iloc[i] = r2_score(y_test, pred)

        except ValueError as err:
            print(err)

    fig, ax = plt.subplots()
    ax.set_ylim(-5, 1)
    ax.plot(r2_ts)
    plt.legend(r2_ts.columns)
    print("Mean:  ", r2_ts.mean())

describe_r2(model=DecisionTreeRegressor(max_depth=2), predictors=LR.predictors)
