## Import

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics

from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor


In [2]:
df = pd.read_csv('data/Cleaned_amzn_df.csv',index_col='AMZN')
df.dropna(inplace=True)

In [3]:
df = df.iloc[-500:]
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,sentiment,next_day_close_percent,percent_increase,percent_increase_close,four_percent,...,ROC,ROC_diff,tr,ATR,ATR_diff,pos_dx,neg_dx,dx,ADX,ADX_diff
AMZN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-02-13,1647.00,1656.38,1637.110,1640.00,3560321.0,Sell,-1.057927,-0.128049,-0.757317,0,...,-4.580708,-2.639890,19.27,47.920000,-0.443571,0.00,38.23,25.018348,29.017135,-0.391278
2019-02-14,1624.50,1637.90,1606.060,1622.65,4120524.0,Sell,-0.905925,0.757403,0.549102,0,...,-0.220141,4.360567,33.94,48.305000,0.385000,0.00,0.00,41.299223,27.570711,-1.446424
2019-02-15,1627.86,1628.91,1604.500,1607.95,4343893.0,Buy,1.220809,2.901210,1.557884,0,...,-1.552675,-1.332534,24.41,46.014286,-2.290714,0.00,0.00,41.299223,26.966062,-0.604649
2019-02-19,1601.00,1634.00,1600.560,1627.58,3681656.0,Sell,-0.336696,1.660133,0.541909,0,...,-1.882675,-0.330000,33.44,45.033571,-0.980714,5.09,0.00,42.575272,24.074783,-2.891279
2019-02-20,1630.00,1634.93,1610.120,1622.10,3337589.0,Sell,-0.163985,2.003576,1.170705,0,...,-1.107142,0.775533,24.81,40.872143,-4.161429,0.00,9.56,22.878377,22.709063,-1.365721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-01,3242.36,3350.26,3235.025,3342.88,4160212.0,Buy,1.110420,2.725793,1.110420,0,...,7.117497,3.833290,144.06,81.959714,5.097143,113.27,0.00,32.956101,18.797755,0.513129
2021-02-02,3380.00,3427.74,3361.125,3380.00,6183716.0,Sell,-1.996154,1.597633,-0.823964,0,...,3.573595,-3.543902,84.86,84.011014,2.051300,0.00,126.10,7.195686,17.470854,-1.326901
2021-02-03,3425.01,3434.00,3308.620,3312.53,7088781.0,Buy,0.557580,1.946247,1.196065,0,...,0.167524,-3.406072,125.38,88.029586,4.018571,6.26,0.00,0.867212,17.325803,-0.145051
2021-02-04,3330.00,3347.00,3277.750,3331.00,3670661.0,Buy,0.634944,1.380967,0.634944,0,...,1.177621,1.010097,69.25,88.875307,0.845721,0.00,0.00,0.867212,17.273923,-0.051880


## Train/Test Split

In [4]:
y = df['next_day_close_percent']
x = df[['SMA', 'SMA_diff', 'Stochastic', 'Stochastic_diff', 'RSI', 'RSI_diff', 'ROC', 'ROC_diff', 'ATR', 'ATR_diff', 'ADX', 'ADX_diff']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 123)

In [6]:
scaler = StandardScaler()

scaled_x_train = scaler.fit_transform(x_train)
scaled_x_test = scaler.fit_transform(x_test)

## Grid Search CV

In [7]:
dtr = DecisionTreeRegressor()
svr = SVR()
knr = KNeighborsRegressor()
forest = RandomForestRegressor()
ada_reg = AdaBoostRegressor()
grad_boost_reg = GradientBoostingRegressor()

models = [dtr, svr, knr, forest, ada_reg, grad_boost_reg]

In [None]:
for model in models:
    
    if model == dtr:
        param_grid = {
               'criterion': ['mse', 'friedman_mse', 'mae'],
               'max_depth': [None, 2, 3, 4, 5, 6],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 3, 4, 5, 6],
               'max_features': ['auto', 'sqrt', 'log2']}
        
        
    if model == svr:
        param_grid = {
                'kernel': ['linear', 'rbf'],
                'degree' : [1,3,5],
                'gamma': [0.001, 0.01, 0.1, 1],
                'C': [0.001, 0.01, 0.1, 1, 10]}
 

    if model == knr:
        param_grid = {
               'n_neighbors': [3,5,7,11,19],
               'weights': ['uniform', 'distance'],
               'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
               'metric' : ['minkowski', 'euclidean', 'manhattan']}
    
    
    if model == forest:
        param_grid = {
                'n_estimators': [20, 50, 100, 300, 500],
                'criterion': ['mse', 'mae'],
                'max_depth': [5, 8, 15, 25, 30],
                'min_samples_split': [2, 5, 10, 15, 100],
                'min_samples_leaf': [1, 2, 5, 10],
                'max_features' : ['auto', 'sqrt', 'log2']}
    
    
    if model == ada_reg:
        param_grid = {
                'n_estimators': [50, 100, 150, 200, 250],
                'learning_rate': [0.01, 0.05, 0.1,  0.2, 0.5, 1],
                'loss' : ['linear', 'square', 'exponential']}
    
        
    if model == grad_boost_reg:
        param_grid = {
                'loss' : ['ls', 'lad', 'huber', 'quantile'],
                'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5],
                'n_estimators': [25, 50, 75, 100, 150, 200, 250],
                'criterion' : ['friedman_mse', 'mse']}
                
    
    grid_search = GridSearchCV(model, param_grid, cv=3, return_train_score=True, scoring='r2') #scoring='r2'

    if model == KNeighborsRegressor() or model == SVR():
        grid_search.fit(scaled_x_train, y_train)

        #Mean training score
        gs_training_score = np.mean(grid_search.cv_results_['mean_train_score'])

        #Mean test score
        gs_testing_score = grid_search.score(scaled_x_test, y_test)

    else:
        grid_search.fit(x_train, y_train)

        #Mean training score
        gs_training_score = np.mean(grid_search.cv_results_['mean_train_score'])

        #Mean test score
        gs_testing_score = grid_search.score(x_test, y_test)

    print(model, '\n')
    print(f"Mean Training Score: {gs_training_score :.2%}")
    print(f"Mean Test Score: {gs_testing_score :.2%}")
    print("Best Parameter Combination Found During Grid Search:")
    print(grid_search.best_params_)
    print('\n\n')

DecisionTreeRegressor() 

Mean Training Score: 25.38%
Mean Test Score: -15.12%
Best Parameter Combination Found During Grid Search:
{'criterion': 'mse', 'max_depth': 2, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 5}



SVR() 

Mean Training Score: 1.53%
Mean Test Score: -0.08%
Best Parameter Combination Found During Grid Search:
{'C': 0.001, 'degree': 1, 'gamma': 0.001, 'kernel': 'rbf'}



KNeighborsRegressor() 

Mean Training Score: 58.93%
Mean Test Score: -1.75%
Best Parameter Combination Found During Grid Search:
{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'distance'}





### test

In [None]:
y = df['percent_increase_close'].astype(int)
x = df[['SMA', 'SMA_diff', 'Stochastic', 'Stochastic_diff', 'RSI', 'RSI_diff', 'ROC', 'ROC_diff', 'ATR', 'ATR_diff', 'ADX', 'ADX_diff']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 123)

scaler = StandardScaler()

scaled_x_train = scaler.fit_transform(x_train)
# scaled_y_train = scaler.fit_transform(y_train)

scaled_x_test = scaler.fit_transform(x_test)
# scaled_y_test = scaler.fit_transform(y_test)

In [None]:
dtr = DecisionTreeRegressor()
svr = SVR()
knr = KNeighborsRegressor()
forest = RandomForestRegressor()
ada_reg = AdaBoostRegressor()
grad_boost_reg = GradientBoostingRegressor()

models = [dtr, svr, knr, forest, ada_reg, grad_boost_reg]


for model in models:
    
    if model == dtr:
        param_grid = {
               'criterion': ['mse', 'friedman_mse', 'mae'],
               'max_depth': [None, 2, 3, 4, 5, 6],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 3, 4, 5, 6],
               'max_features': ['auto', 'sqrt', 'log2']}
        
        
    if model == svr:
        param_grid = {
                'kernel': ['linear', 'rbf'],
                'degree' : [1,3,5],
                'gamma': [0.001, 0.01, 0.1, 1],
                'C': [0.001, 0.01, 0.1, 1, 10]}
 

    if model == knr:
        param_grid = {
               'n_neighbors': [3,5,7,11,19],
               'weights': ['uniform', 'distance'],
               'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
               'metric' : ['minkowski', 'euclidean', 'manhattan']}
    
    
    if model == forest:
        param_grid = {
                'n_estimators': [20, 50, 100, 300, 500],
                'criterion': ['mse', 'mae'],
                'max_depth': [5, 8, 15, 25, 30],
                'min_samples_split': [2, 5, 10, 15, 100],
                'min_samples_leaf': [1, 2, 5, 10],
                'max_features' : ['auto', 'sqrt', 'log2']}
    
    
    if model == ada_reg:
        param_grid = {
                'n_estimators': [50, 100, 150, 200, 250],
                'learning_rate': [0.01, 0.05, 0.1,  0.2, 0.5, 1],
                'loss' : ['linear', 'square', 'exponential']}
    
        
    if model == grad_boost_reg:
        param_grid = {
                'loss' : ['ls', 'lad', 'huber', 'quantile'],
                'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5],
                'n_estimators': [25, 50, 75, 100, 150, 200, 250],
                'criterion' : ['friedman_mse', 'mse']}
                
    
    grid_search = GridSearchCV(model, param_grid, cv=3, return_train_score=True, scoring='r2') #scoring='r2'

    if model == KNeighborsRegressor() or model == SVR():
        grid_search.fit(scaled_x_train, y_train)

        #Mean training score
        gs_training_score = np.mean(grid_search.cv_results_['mean_train_score'])

        #Mean test score
        gs_testing_score = grid_search.score(scaled_x_test, y_test)

    else:
        grid_search.fit(x_train, y_train)

        #Mean training score
        gs_training_score = np.mean(grid_search.cv_results_['mean_train_score'])

        #Mean test score
        gs_testing_score = grid_search.score(x_test, y_test)

    print(model, '\n')
    print(f"Mean Training Score: {gs_training_score :.2%}")
    print(f"Mean Test Score: {gs_testing_score :.2%}")
    print("Best Parameter Combination Found During Grid Search:")
    print(grid_search.best_params_)
    print('\n\n')