# Imports

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import seaborn as sns
from sklearn import svm, preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

# Read Data

In [3]:
# Load the column labels and the prepared data matrix, then combine them into
# a single labelled DataFrame.
# NOTE: allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only point this at trusted files.
cols = np.load(r'../data/columns_prepared.npy', allow_pickle=True)
df = pd.DataFrame(
    np.load(r'../data/master_prepared.npy', allow_pickle=True),
    columns=cols,
)

In [17]:
# Derived regressors: square root of 'mv' and square of 'bvtmv'.
df['mv_sqrt'] = df['mv'].pow(0.5)
df['bvtmv_sq'] = df['bvtmv'].pow(2)

In [51]:
# Inspect the 'ret_3m' column (the trailing expression is rendered as the cell output).
df.loc[:, 'ret_3m']

0               NaN
1               NaN
2               NaN
3               NaN
4               NaN
            ...    
444558      0.16915
444559     0.332366
444560    0.0147474
444561    -0.199849
444562    0.0369658
Name: ret_3m, Length: 444563, dtype: object

In [24]:
# Dependent variable: next period's return.
y_col = 'r_1f1'

# Columns excluded from the regressors, including the target itself.
drop_cols = [
    'date',
    'eligible',
    'companyid',
    'ret_3m',
    'ret_6m',
    'std_3m',
    'std_6m',
    'map_6m',
    y_col,
]

# Distinct dates, in order of first appearance in df.
# NOTE(review): later cells index into `dates` positionally, which presumes df
# is already sorted chronologically -- confirm upstream.
dates = df['date'].unique()

# Independent variables: every column not excluded above, in frame order.
# (Narrower alternative tried previously: x_cols = ['bvtmv', 'bvtmv_sq'])
x_cols = [name for name in df.columns if name not in drop_cols]

# Restrict to eligible rows, then cast features and target to float.
eligible_mask = df['eligible'] == 1
df_eligible = df[eligible_mask]
X = df_eligible[x_cols].astype('float')
y = df_eligible[y_col].astype('float')


In [45]:
# Window configuration: position in `dates` of the first training date, and the
# window length. The training slice covers train_range + 1 consecutive entries
# of `dates`; the single entry immediately after it is held out for testing.
start_index = 1
train_range = 20

test_position = start_index + train_range + 1
if test_position >= len(dates):
    # Fail with a clear message instead of an opaque out-of-bounds error.
    raise IndexError(
        f'test date position {test_position} is out of range for '
        f'{len(dates)} available dates'
    )

train_dates = dates[start_index:test_position]
test_dates = dates[test_position]

# Build each row mask once and reuse it for both features and target.
train_mask = df_eligible['date'].isin(train_dates)
test_mask = df_eligible['date'] == test_dates

# has_constant='add' always prepends the intercept column. The default
# ('skip') silently omits it whenever a regressor happens to be constant inside
# the window, which would leave X_train and X_test with different columns.
# X and y are already float, so no further casting is needed here.
X_train = sm.add_constant(X[train_mask], has_constant='add')
y_train = y[train_mask]

X_test = sm.add_constant(X[test_mask], has_constant='add')
y_test = y[test_mask]

print('X_train: ', X_train.values.shape)
print('y_train: ', y_train.values.shape)
print('X_test:  ', X_test.values.shape)
print('y_test:  ', y_test.values.shape)

# Feature standardisation (preprocessing.scale on X_train / X_test) is
# intentionally left disabled.

X_train:  (8448, 35)
y_train:  (8448,)
X_test:   (861, 35)
y_test:   (861,)


# Model

In [46]:
# Fit ordinary least squares on the training window, then predict the
# held-out date.
ols_spec = sm.OLS(endog=y_train, exog=X_train)
model = ols_spec.fit()
pred = model.predict(exog=X_test)

# Full regression table, followed by out-of-sample and in-sample R^2.
print(model.summary())

oos_r2 = r2_score(y_true=y_test, y_pred=pred)
print('R2_oos:       ', oos_r2)
print('R2_is:        ', model.rsquared)

                            OLS Regression Results                            
Dep. Variable:                  r_1f1   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     5.460
Date:                Wed, 05 Feb 2020   Prob (F-statistic):           1.70e-22
Time:                        10:33:18   Log-Likelihood:                 7530.3
No. Observations:                8448   AIC:                        -1.499e+04
Df Residuals:                    8413   BIC:                        -1.474e+04
Df Model:                          34                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 