In [191]:
## run following in termminal (or in notebook) if needed to get packages to work
# !conda create -n tensorflow
# !source activate tensorflow
# !pip install jupyter notebook
# !jupyter-notebook
# !which pip
# !pip install tensorflow
# !pip install keras
# !pip install sklearn pandas numpy seaborn
import tensorflow
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.pipeline import Pipeline, FeatureUnion
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score as r2
from sklearn.metrics import explained_variance_score as evs
from sklearn.metrics import mean_absolute_percentage_error as mape

In [179]:
def load_df():
    stocks_df = pd.read_pickle('./data/stocks_df_2021-02-16.pickle')
    stocks_df.sort_values(by=['Date'], inplace=True)
    return stocks_df
# stocks_df.head()

def get_data(stocks_df):
    data = stocks_df.Close
    return data
def get_train_length(data):
    train_length = int(np.ceil(len(data.values)*0.8))
    return train_length
def get_train_test_split(data, train_length):
    train_length = get_train_length(data)
    training_set = data.iloc[:train_length].values
    test_set = data.iloc[train_length:].values
    return training_set, test_set
def get_x_y_split(data_set):
    # Creating a data structure with 60 time-steps and 1 output
    length=len(data_set)
    X = []
    y = []
    for i in range(60, length):
        X.append(data_set[i-60:i])
        y.append(data_set[i])
    X, y = np.array(X), np.array(y)
    X = np.reshape(X, (X.shape[0], X.shape[1]))
    return X, y
def timeseries_test_train_split():
    stocks_df = load_df()
    data = get_data(stocks_df)
    train_length = get_train_length(data)
    training_set, test_set = get_train_test_split(data, train_length)
    X_train, y_train = get_x_y_split(training_set)
    X_test, y_test = get_x_y_split(test_set)
    return X_train, y_train, X_test, y_test
# scaler = MinMaxScaler(feature_range=(0,1))
# scaled_data = scaler.fit_transform(data)
# pipe = Pipeline(('scaler', MinMaxScaler(feature_range=(0,1)))) 

In [180]:
# def run():
X_train, y_train, X_test, y_test = timeseries_test_train_split()

In [200]:
transformers = FeatureUnion([('scaler', MinMaxScaler()), ('norm', Normalizer())])
pipe = Pipeline([('preprocess', StandardScaler()), ('predictor', LinearRegression())])

In [201]:
lr_grid = {'predictor': [LinearRegression()]}
svr_grid = {
    'predictor': [SVR()],
    'predictor__kernel': ['linear', 'poly', 'rbf', 'sigmoid']   
}
d_tree_grid = {
    'predictor': [DecisionTreeRegressor()],
    'predictor__criterion': ['mse', 'friedman_mse', 'mae', 'poisson']  
}
mlp_grid = {
    'predictor': [MLPRegressor()],
    'predictor__learning_rate_init': [0.001],
    'predictor__random_state': [1],
    'predictor__max_iter': [500],
    'predictor__activation': ['relu','tanh','logistic'],
    'predictor__hidden_layer_sizes': [(100,),(50,),(125,)]
}

param_grid = [
#     {**lr_grid},
#     {**mlp_grid},
#     {**svr_grid}
    {**d_tree_grid}
    
]

In [202]:
pipe.fit(X_train, y_train)
grid = GridSearchCV(pipe, param_grid, verbose=4, cv = 3, scoring='r2')
grid.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END predictor=DecisionTreeRegressor(), predictor__criterion=mse; total time=   0.1s
[CV 2/3] END predictor=DecisionTreeRegressor(), predictor__criterion=mse; total time=   0.1s
[CV 3/3] END predictor=DecisionTreeRegressor(), predictor__criterion=mse; total time=   0.1s
[CV 1/3] END predictor=DecisionTreeRegressor(), predictor__criterion=friedman_mse; total time=   0.1s
[CV 2/3] END predictor=DecisionTreeRegressor(), predictor__criterion=friedman_mse; total time=   0.1s
[CV 3/3] END predictor=DecisionTreeRegressor(), predictor__criterion=friedman_mse; total time=   0.1s
[CV 1/3] END predictor=DecisionTreeRegressor(), predictor__criterion=mae; total time=   1.6s
[CV 2/3] END predictor=DecisionTreeRegressor(), predictor__criterion=mae; total time=   1.2s
[CV 3/3] END predictor=DecisionTreeRegressor(), predictor__criterion=mae; total time=   0.8s
[CV 1/3] END predictor=DecisionTreeRegressor(), predictor__criterion=poisson

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocess', StandardScaler()),
                                       ('predictor', LinearRegression())]),
             param_grid=[{'predictor': [DecisionTreeRegressor()],
                          'predictor__criterion': ['mse', 'friedman_mse', 'mae',
                                                   'poisson']}],
             scoring='r2', verbose=4)

In [203]:
grid_est = grid.best_estimator_
y_pred = pipe.predict(X_test)
grid_pred = grid_est.predict(X_test)
grid.score(X_test, y_test)
print(grid.best_params_)
print(grid_est.score(X_test, y_test), pipe.score(X_test, y_test))
print(grid.scorer_)
print('r2_score as r2;','explained_variance_score as evs;','mean_absolute_percentage_error as mape;')
print('grid', r2(y_test, y_pred), evs(y_test, y_pred), mape(y_test, y_pred))
print('linreg', r2(y_test, grid_pred), evs(y_test, grid_pred), mape(y_test, grid_pred))

{'predictor': DecisionTreeRegressor(), 'predictor__criterion': 'mse'}
-0.8412628060537997 0.07412783835393799
make_scorer(r2_score)
r2_score as r2; explained_variance_score as evs; mean_absolute_percentage_error as mape;
grid 0.07412783835393799 0.07524062458810554 2.7930824629715723
linreg -0.8412628060537997 -0.707814528307984 4.662569308422645


In [185]:
DecisionTreeRegressor().get_params()

{'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}