# Imports

In [317]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import svm, preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings("ignore")
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential

# Class Setup

In [307]:
class model():

    def __init__(self):
        self.r2_oos     = object()
        self.r2_is      = object()
        self.predictors = object()
        self.summary    = object()
        self.model      = object()
        self.fitted_model = object()
        self.X_train    = object()
        self.y_train    = object()
        self.X_test     = object()
        self.y_test     = object()
        self.pred       = object()

# Data

In [308]:
df = pd.DataFrame(np.load(r'../data/master_prepared.npy', allow_pickle=True))
cols = np.load(r'../data/columns_prepared.npy', allow_pickle=True)
df.columns = cols

In [325]:
drop_cols = [
    'date',
    'eligible',
    'companyid', 
    'ret_1f1',
    'ret_1f3',
    'ret_1f6',
    'ret_1f12',
    'retxs_1f1',
    'retxs_1f3',
    'retxs_1f6',
    'retxs_1f12'
]

y_col = 'retxs_1f1'

# X cols are the independent variables
x_cols = [i for i in list(df.columns) if i not in drop_cols]

# Only use eligible data
df_eligible = df[df['eligible']==1]
X = df_eligible[x_cols].astype('float64')
y = df_eligible[y_col].astype('float64')

In [319]:
def prepare_data(start_index, train_range):

    # Dates
    global dates
    dates = df['date'].unique()
    train_dates = dates[start_index:start_index+train_range+1]
    test_dates = dates[start_index+train_range+1]

    # Training Data
    X_train = X[df_eligible['date'].isin(train_dates)].astype('float64')
    X_train = sm.add_constant(X_train)
    y_train = y[df_eligible['date'].isin(train_dates)].astype('float64')

    # Test Data
    X_test = X[df_eligible['date']==test_dates].astype('float64')
    X_test.insert(0, 'const', 1)
    y_test = y[df_eligible['date']==test_dates].astype('float64')
        
    return X_train, y_train, X_test, y_test

#X_train, y_train, X_test, y_test = prepare_data(60, 36)

# Models

In [332]:
linear_predictors = [
    'mv',
    'bvtmv',
#     'retxs_6m',
#     'retxs_12m',
#     'std_12m',
#     'fcf',
#     'roic',
#     'dy',
    'beta'
]

nonlinear_predictors = [
    'mv',
    'mv_^2',
    'mv_^3',
    'bvtmv',
    'bvtmv_^2',
    'bvtmv_^3',
#     'retxs_3m',
#     'retxs_3m_^2',
#     'retxs_6m',
#     'retxs_12m',
#     'retxs_12m_^2',
#     'std_6m',
#     'std_12m',
#     'std_12m_^2',
#     'std_24m',
#     'debtpct',
#     'fcf',
#     'roic',
#     'roic_^2',
#     'dy',
#     'dy_^2',
    'beta',
    'beta_^2',
    'beta_^3'
#     'mv_bvtmv',
#     'retxs12m_bvtmv',
#     'ind_technology',
#     'ind_utilities',
#     'ind_financials',
#     'ind_energy',
#     'ind_realestate'
]

In [326]:
X_train = X.iloc[:int(0.7 * len(X.index))]
y_train = y.iloc[:int(0.7 * len(y.index))]
X_test = X.iloc[int(0.7 * len(X.index)):]
y_test = y.iloc[int(0.7 * len(y.index)):]


PR = sm.OLS(y_train, X_train[nonlinear_predictors]).fit()

print(r2_score(y_test, PR.predict(X_test[nonlinear_predictors])))
print(PR.summary())

-0.014523745496226148
                            OLS Regression Results                            
Dep. Variable:              retxs_1f1   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     112.6
Date:                Tue, 25 Feb 2020   Prob (F-statistic):          3.04e-188
Time:                        11:15:41   Log-Likelihood:                 55059.
No. Observations:               95615   AIC:                        -1.101e+05
Df Residuals:                   95606   BIC:                        -1.100e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
mv         -3.083e-07   1.64e-

# Model Comparison

In [333]:
cv = 10

# X_ = X[df_eligible['date'].isin(dates)].astype('float64')

print("Linear       ", np.mean(cross_val_score(estimator=LinearRegression(), X=X[linear_predictors], y=y, scoring='r2', cv=cv)))
print("Tree         ", np.mean(cross_val_score(estimator=DecisionTreeRegressor(max_depth=2), X=X[nonlinear_predictors], y=y, scoring='r2', cv=cv)))
print("Polynomial   ", np.mean(cross_val_score(estimator=LinearRegression(), X=X[nonlinear_predictors], y=y, scoring='r2', cv=cv)))
print("Lasso        ", np.mean(cross_val_score(estimator=Lasso(alpha=2), X=X[nonlinear_predictors], y=y, scoring='r2', cv=cv)))
print("Ridge        ", np.mean(cross_val_score(estimator=Ridge(alpha=2), X=X[nonlinear_predictors], y=y, scoring='r2', cv=cv)))


Linear        -0.0011746632198770257
Tree          0.00041611458410838463
Polynomial    0.00016015131100776124
Lasso         -0.0012421215581979728
Ridge         0.00016027718504232346


In [None]:
def describe_r2(model, predictors):
    r2_ts = pd.DataFrame(columns=['model_R2'], index=dates)

    for i in range(1,len(dates)-37):
        X_train, y_train, X_test, y_test = prepare_data(i, 36)
        X_train, X_test = X_train[predictors], X_test[predictors]

        try:
            model.fit(X_train, y_train)
            pred = model.predict(X_test)
            r2_ts['model_R2'].iloc[i] = r2_score(y_test, pred)

        except ValueError as err:
            print(err)

    fig, ax = plt.subplots()
    ax.plot(r2_ts)
    plt.legend(r2_ts.columns)
    print("Mean:  ", r2_ts.mean())
    print("Median:", np.median(r2_ts))

describe_r2(model=DecisionTreeRegressor(max_depth=3), predictors=nonlinear_predictors)


## *Neural Networks*

In [182]:


# def norm(x):
#     return (x-x.describe().transpose()['mean'])/x.describe().transpose()['std']

# normXtrain = norm(X_train.drop('const', axis=1))
# normXtest = norm(X_test.drop('const', axis=1))

# model = Sequential()

# model.add(Dense(8, input_dim=len(normXtrain.columns), activation='relu'))
# model.add(Dense(1, activation='softmax'))
# model.compile(loss='mean_squared_error', optimizer='adam')
# model.fit(normXtrain, y_train, epochs=5, batch_size=2, verbose=2)

# model.predict(normXtest)