## Import

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import f_classif
from sklearn.metrics import precision_score, recall_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('../data/amzn_final_dataset.csv',index_col='AMZN')
df.dropna(inplace=True)
df['c_four_percent_high'] = df['c_four_percent_high'].map({'Buy': 1, '0': 0})

## Grid Search CV

In [3]:
knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier()
forest = RandomForestClassifier()
ada = AdaBoostClassifier()
grad_boost =  GradientBoostingClassifier()
xgb = XGBClassifier()
svm = SVC()

models = [knn, dtc, xgb, svm]

In [4]:
scaler = StandardScaler()
#indexes = [-2500,-2000,-1500,-1000,-500,-300,-200]
indexes = [-1500]

for index in indexes:
    a = df.iloc[index: ]


    y = a['c_four_percent_high']

    x = a[['SMA', 'Stochastic','RSI', 'ROC', 'ATR', 'ADX',
           'ADX_diff', 'SMA_diff', 'Stochastic_diff', 'RSI_diff', 'ROC_diff', 'ATR_diff',
           'rsi_over_80','rsi_over_70','rsi_over_60','rsi_under_40','rsi_under_30','rsi_under_20',
           'Open', 'High', 'Low', 'Close'
           ]]
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)    
    
    scaled_x_train = scaler.fit_transform(x_train)
    scaled_x_test = scaler.fit_transform(x_test)
    

    for model in models:

        if model == knn:
            param_grid = {
                   'n_neighbors': [3,5,7,11,19],
                   'weights': ['uniform', 'distance'],
                   'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                   'metric' : ['minkowski', 'euclidean', 'manhattan']}

        if model == dtc:
            param_grid = {
                   'criterion': ['gini', 'entropy'],
                   'max_depth': [2, 3, 4, 5, 6],
                   'min_samples_split': [2, 5, 10],
                   'min_samples_leaf': [1, 2, 3, 4, 5, 6]}



        if model == xgb:
            param_grid = {
                    'n_estimators': [100, 150, 200, 250],
                    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5],
                    'max_depth': [3,4,5,6,7,8,9]}

        if model == svm:
            param_grid = {
                    'C': [0.001, 0.01, 0.1, 1, 10],
                    'gamma': [0.001, 0.01, 0.1, 1],
                    'kernel': ['linear']}


    #     for score in scores:

        grid_search = GridSearchCV(model, param_grid, cv=3, return_train_score=True, scoring = 'precision')

        if model == knn or model == svm:
            grid_search.fit(scaled_x_train, y_train)
            best_model = grid_search.best_estimator_

            #Mean training score
            train_model = best_model.predict(scaled_x_train)
            train_score = precision_score(y_train, train_model)

            #Mean test score
            test_model = best_model.predict(scaled_x_test)
            test_score = precision_score(y_test, test_model)



        else:
            grid_search.fit(x_train, y_train)
            best_model = grid_search.best_estimator_

            #Mean training score
            train_model = best_model.predict(x_train)
            train_score = precision_score(y_train, train_model)

            #Mean test score
            test_model = best_model.predict(x_test)
            test_score = precision_score(y_test, test_model)

        print(model, '\n')
        print(f"Precision Training Score: {train_score :.2%}")
        print(f"Precision Test Score: {test_score :.2%}")
        print("Best Parameter Combination Found During Grid Search:")
        print(grid_search.best_params_)
        print('====================================================== \n\n')

KNeighborsClassifier() 

Precision Training Score: 100.00%
Precision Test Score: 54.55%
Best Parameter Combination Found During Grid Search:
{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'distance'}


DecisionTreeClassifier() 

Precision Training Score: 61.08%
Precision Test Score: 57.38%
Best Parameter Combination Found During Grid Search:
{'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2}


XGBClassifier() 

Precision Training Score: 100.00%
Precision Test Score: 66.67%
Best Parameter Combination Found During Grid Search:
{'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100}


SVC() 

Precision Training Score: 0.00%
Precision Test Score: 0.00%
Best Parameter Combination Found During Grid Search:
{'C': 0.001, 'gamma': 0.001, 'kernel': 'linear'}




> 1500 rows seems to be good starting place on how many rows to incorporate

> knn has overfitting problems
> svc has to have very small data set to work

> Decision Tree and XGB seems to be a good starting place on models

### Decision Tree

Fine tuning the grid search: I am running grid-searches over differ train/test subsets, deleting any hyper parameters that might lead to over fitting or low precision scores

changed x

In [5]:
a = df.iloc[-1500: ]

y = a['c_four_percent_high']

x = a[['SMA', 'ROC', 'ATR', 'ADX',
        'High', 'Low', 'Close'
       ]]


In [6]:
dtc = DecisionTreeClassifier()
b = pd.DataFrame(columns = ['train', 'test', 'criterion', 'max_depth', 'min_samples_leaf', 'min_samples_split'])

for i in range(25): 
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

    param_grid = {
                'criterion': ['gini', 'entropy'],
                'max_depth': [2, 3, 4],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 3, 4, 5, 6]}

    grid_search = GridSearchCV(dtc, param_grid, cv=3, return_train_score=True, scoring = 'precision')


    grid_search.fit(x_train, y_train)


    best_model = grid_search.best_estimator_

    train_model = best_model.predict(x_train)

    test_model = best_model.predict(x_test)

    train_score = precision_score(y_train, train_model)
    test_score = precision_score(y_test, test_model)


    b.loc[i] = [train_score,
                test_score,
                grid_search.best_params_['criterion'],
                grid_search.best_params_['max_depth'],
                grid_search.best_params_['min_samples_leaf'],
                grid_search.best_params_['min_samples_split']]
    
b.sort_values(by='test', ascending=False)

Unnamed: 0,train,test,criterion,max_depth,min_samples_leaf,min_samples_split
2,0.978723,0.8,entropy,4,6,2
4,0.79661,0.777778,gini,3,1,2
3,0.8,0.769231,gini,3,1,2
10,0.737179,0.685714,gini,4,4,10
21,0.669118,0.676471,gini,3,1,2
0,0.586957,0.675,gini,2,1,2
16,0.627737,0.647059,gini,2,1,2
8,0.592391,0.636364,gini,2,1,2
11,0.681159,0.636364,entropy,4,1,5
17,0.656522,0.625,gini,4,2,2


In [7]:
for i in range(790, 800): 
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=i)

    param_grid = {
                'criterion': ['gini', 'entropy'],
                'max_depth': [2, 3, 4],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 3, 4, 5, 6]}

    grid_search = GridSearchCV(dtc, param_grid, cv=3, return_train_score=True, scoring = 'precision', n_jobs=-1)


    grid_search.fit(x_train, y_train)


    best_model = grid_search.best_estimator_

    train_model = best_model.predict(x_train)

    test_model = best_model.predict(x_test)

    train_score = precision_score(y_train, train_model)
    test_score = precision_score(y_test, test_model)
    
    recall_test = recall_score(y_test, test_model)


    diff = abs(train_score - test_score)
    
    if test_score > .69 and diff < .05 and recall_test > .11:
        print('random_state :',i)
        print('train precision score :', round(train_score,2))
        print('test precision score :', round(test_score,2))
        print(grid_search.best_params_)
        print('test recall score :', round(recall_test,2))
        print()

random_state : 790
train precision score : 0.86
test precision score : 0.84
{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 6, 'min_samples_split': 2}
test recall score : 0.21



Best Choices:
_________

random_state : 790  
train precision score : 0.86  
test precision score : 0.84  
{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 6, 'min_samples_split': 2}  
test recall score : 0.21  

    ------------
    | 93 | 354 |
    ------------
    | 16 | 1040|  
    ------------
  
  
  
random_state : 711  
train precision score : 0.75  
test precision score : 0.74  
{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2}  
test recall score : 0.27   

    -------------
    | 141 | 306 |
    -------------
    | 47 | 1030 |  
    -------------
    
random_state : 253  
train precision score : 0.8  
test precision score : 0.76  
{'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 10}  
test recall score : 0.24  

    -------------
    | 130 | 317 |
    -------------
    | 33  | 1020|  
    -------------

> I am moving foward with the random state of 790

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=896)

### ADA Boost

In [9]:
y = df['c_four_percent_high']

x = df[['SMA', 'ROC', 'ATR', 'ADX',
        'High', 'Low', 'Close'
       ]]

In [10]:
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=1, min_samples_split=2)
ada = AdaBoostClassifier(base_estimator=dtc)

param_grid = {
        'n_estimators': [100,150, 200, 250],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5]}

grid_search = GridSearchCV(ada, param_grid, cv=3, return_train_score=True, scoring = 'precision', n_jobs=-1)


grid_search.fit(x_train, y_train)
best_model = grid_search.best_estimator_

#Mean training score
train_model = best_model.predict(x_train)
train_score = precision_score(y_train, train_model)

#Mean test score
test_model = best_model.predict(x_test)
test_score = precision_score(y_test, test_model)

print(f"Precision Training Score: {train_score :.2%}")
print(f"Precision Test Score: {test_score :.2%}")
print("Best Parameter Combination Found During Grid Search:")
print(grid_search.best_params_)
print('====================================================== \n\n')

Precision Training Score: 100.00%
Precision Test Score: 61.18%
Best Parameter Combination Found During Grid Search:
{'learning_rate': 0.5, 'n_estimators': 200}




### Gradient Boosting

In [11]:
grad3 = GradientBoostingClassifier(init=dtc)

param_grid = {
    'n_estimators': [375],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'loss': ['deviance', 'exponential'],
    'criterion': ['friedman_mse', 'mse']
}


grid_search = GridSearchCV(grad_boost, param_grid, cv=3, return_train_score=True, scoring = 'precision')



grid_search.fit(x_train, y_train)
best_model = grid_search.best_estimator_

#Mean training score
train_model = best_model.predict(x_train)
train_score = precision_score(y_train, train_model)

#Mean test score
test_model = best_model.predict(x_test)
test_score = precision_score(y_test, test_model)

print(f"Precision Training Score: {train_score :.2%}")
print(f"Precision Test Score: {test_score :.2%}")
print("Best Parameter Combination Found During Grid Search:")
print(grid_search.best_params_)
print('====================================================== \n\n')
   

Precision Training Score: 88.36%
Precision Test Score: 67.86%
Best Parameter Combination Found During Grid Search:
{'criterion': 'friedman_mse', 'learning_rate': 0.01, 'loss': 'deviance', 'n_estimators': 375}


