## Import

In [76]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import f_classif
from sklearn.metrics import precision_score, recall_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the dataset (indexed by its 'AMZN' column), drop every row that has a
# missing value, then encode the target label as integers: 'Buy' -> 1, '0' -> 0.
# Any label not listed in the mapping becomes NaN.
df = (
    pd.read_csv('../data/amzn_final_dataset.csv', index_col='AMZN')
    .dropna()
    .assign(
        c_four_percent_high=lambda frame: frame['c_four_percent_high'].map({'Buy': 1, '0': 0})
    )
)

## Best estimator models

### Decision Tree

Fine-tuning the grid search: I am running grid searches over different train/test subsets, removing any hyperparameter values that might lead to overfitting or low precision scores.

changed x

In [5]:
# Work only with the last 1,500 rows of the dataset.
a = df.tail(1500)

# Target: the integer-encoded 'c_four_percent_high' label.
y = a.loc[:, 'c_four_percent_high']

# Feature matrix used for the decision-tree search.
x = a.loc[:, ['SMA', 'ROC', 'ATR', 'ADX', 'High', 'Low', 'Close']]


In [6]:
# Repeat a decision-tree grid search over 25 different train/test splits of
# (x, y). For every split, record the precision of the best estimator on the
# training set and on the held-out set together with the winning
# hyperparameters, so that settings which only win on some splits (or which
# overfit) can be spotted in the resulting table.
dtc = DecisionTreeClassifier()

# The search space is the same for every split, so it is built once.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6],
}

# Collect one record per split and build the DataFrame once at the end
# instead of growing it row by row.
records = []

for i in range(25):
    # Seed the split with the loop index: row i of the table can then be
    # traced back to, and re-created from, the exact split that produced it.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=i
    )

    grid_search = GridSearchCV(
        dtc,
        param_grid,
        cv=3,
        return_train_score=True,
        scoring='precision',
        n_jobs=-1,  # same parallel setting as the other grid searches in this notebook
    )
    grid_search.fit(x_train, y_train)

    # best_estimator_ is the winning configuration refit on all of x_train.
    best_model = grid_search.best_estimator_

    train_model = best_model.predict(x_train)
    test_model = best_model.predict(x_test)

    train_score = precision_score(y_train, train_model)
    test_score = precision_score(y_test, test_model)

    records.append({
        'train': train_score,
        'test': test_score,
        'criterion': grid_search.best_params_['criterion'],
        'max_depth': grid_search.best_params_['max_depth'],
        'min_samples_leaf': grid_search.best_params_['min_samples_leaf'],
        'min_samples_split': grid_search.best_params_['min_samples_split'],
    })

b = pd.DataFrame(
    records,
    columns=['train', 'test', 'criterion', 'max_depth', 'min_samples_leaf', 'min_samples_split'],
)

# Best held-out precision first.
b.sort_values(by='test', ascending=False)

Unnamed: 0,train,test,criterion,max_depth,min_samples_leaf,min_samples_split
23,0.848485,0.68,entropy,4,6,2
16,0.787097,0.677419,gini,4,1,2
13,0.589888,0.666667,gini,2,1,2
19,0.72,0.666667,entropy,4,5,2
7,0.654321,0.666667,gini,3,1,2
15,0.59116,0.651163,gini,2,1,2
21,0.594118,0.645833,gini,2,1,2
12,0.59322,0.627451,gini,2,1,2
24,0.763158,0.608696,entropy,4,6,2
2,0.668874,0.605263,gini,3,1,2


In [7]:
# Scan a range of split seeds and report only those where the tuned decision
# tree has high held-out precision, a small train/test precision gap and a
# minimum level of recall on the held-out set.

# Identical search space for every seed, so it is defined once up front.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 4],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6],
}

for i in range(790, 800):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=i
    )

    grid_search = GridSearchCV(
        dtc,
        param_grid,
        cv=3,
        scoring='precision',
        return_train_score=True,
        n_jobs=-1,
    )
    grid_search.fit(x_train, y_train)
    best_model = grid_search.best_estimator_

    # Predictions of the refit best estimator on both partitions.
    train_model = best_model.predict(x_train)
    test_model = best_model.predict(x_test)

    train_score = precision_score(y_train, train_model)
    test_score = precision_score(y_test, test_model)
    recall_test = recall_score(y_test, test_model)

    # Gap between training and held-out precision (overfitting indicator).
    diff = abs(train_score - test_score)

    # Skip every seed that does not satisfy all three acceptance thresholds.
    if not (test_score > .69 and diff < .05 and recall_test > .11):
        continue

    print('random_state :', i)
    print('train precision score :', round(train_score, 2))
    print('test precision score :', round(test_score, 2))
    print(grid_search.best_params_)
    print('test recall score :', round(recall_test, 2))
    print()

random_state : 790
train precision score : 0.86
test precision score : 0.84
{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 6, 'min_samples_split': 2}
test recall score : 0.21



Best Choices:
_________

random_state : 790  
train precision score : 0.86  
test precision score : 0.84  
{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 6, 'min_samples_split': 2}  
test recall score : 0.21  

    ------------
    | 93 | 354 |
    ------------
    | 16 | 1040|  
    ------------
      
    
    
random_state : 711  
train precision score : 0.75  
test precision score : 0.74  
{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2}  
test recall score : 0.27   

    -------------
    | 141 | 306 |
    -------------
    | 47 | 1030 |  
    -------------
        
          
          
random_state : 253  
train precision score : 0.8  
test precision score : 0.76  
{'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 10}  
test recall score : 0.24  

    -------------
    | 130 | 317 |
    -------------
    | 33  | 1020|  
    -------------

> I am moving forward with the random state of 790

## Ensemble Models

In [53]:
# Shared inputs for the ensemble models: target, feature matrix, a fixed
# train/test split, and the tuned decision tree used as a base estimator.
y = a.loc[:, 'c_four_percent_high']

# NOTE(review): 'Low' is not part of this feature list although the
# decision-tree search earlier in the notebook included it; confirm that
# dropping it here is intentional.
x = a.loc[:, ['SMA', 'ROC', 'ATR', 'ADX', 'High', 'Close']]

# Split seed 790 is the random state selected in the decision-tree section.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=790
)

# Decision tree configured with the hyperparameters reported for seed 790.
dtc = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    min_samples_leaf=6,
    min_samples_split=2,
)


### Random Forest

In [75]:
# Random forest evaluated with the hyperparameters of the tuned decision tree.
# The forest is seeded so that the reported precision is the same on every
# run; without a seed the scores change from run to run. Any fixed integer
# works; 790 simply matches the split seed used above.
forest = RandomForestClassifier(random_state=790)

# Single-candidate grid: GridSearchCV is used here only to cross-validate and
# refit this one configuration, mirroring the structure of the other cells.
param_grid = {
            'criterion': ['gini'],
            'max_depth': [4],
            'min_samples_leaf': [6],
            'min_samples_split': [2],
            'n_estimators': [800]}

grid_search = GridSearchCV(forest, param_grid, cv=3, return_train_score=True, scoring = 'precision', n_jobs=-1)


grid_search.fit(x_train, y_train)
best_model = grid_search.best_estimator_

# Precision of the refit best estimator on the training set
train_model = best_model.predict(x_train)
train_score = precision_score(y_train, train_model)

# Precision of the refit best estimator on the held-out test set
test_model = best_model.predict(x_test)
test_score = precision_score(y_test, test_model)

print(f"Precision Training Score: {train_score :.2%}")
print(f"Precision Test Score: {test_score :.2%}")
print("Best Parameter Combination Found During Grid Search:")
print(grid_search.best_params_)

Precision Training Score: 80.53%
Precision Test Score: 65.62%
Best Parameter Combination Found During Grid Search:
{'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 800}


### AdaBoost

In [54]:
# AdaBoost built on the tuned decision tree `dtc`; only the learning rate is
# searched, with the number of boosting rounds fixed at 25.
# NOTE(review): scikit-learn 1.2 renamed `base_estimator` to `estimator`;
# the keyword is left unchanged here because the notebook appears to target
# an older release - confirm before upgrading.
ada = AdaBoostClassifier(base_estimator=dtc)

param_grid = {
    'n_estimators': [25],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5],
}

grid_search = GridSearchCV(
    ada,
    param_grid,
    cv=3,
    scoring='precision',
    return_train_score=True,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

# Winning configuration, refit on the whole training set.
best_model = grid_search.best_estimator_

# Predictions on both partitions, then precision for each.
train_model = best_model.predict(x_train)
test_model = best_model.predict(x_test)

train_score = precision_score(y_train, train_model)
test_score = precision_score(y_test, test_model)

print(f"Precision Training Score: {train_score :.2%}")
print(f"Precision Test Score: {test_score :.2%}")
print("Best Parameter Combination Found During Grid Search:")
print(grid_search.best_params_)

Precision Training Score: 85.71%
Precision Test Score: 84.00%
Best Parameter Combination Found During Grid Search:
{'learning_rate': 0.001, 'n_estimators': 25}


### Gradient Boosting

In [51]:
# Gradient boosting that uses the tuned decision tree `dtc` as its initial
# estimator; the search covers learning rate, loss and split criterion with
# the number of boosting stages fixed at 30.
# NOTE(review): the 'deviance' loss and 'mse' criterion names were removed in
# later scikit-learn releases - confirm the installed version accepts them.
grad_boost = GradientBoostingClassifier(init=dtc)

param_grid = {
    'n_estimators': [30],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'loss': ['deviance', 'exponential'],
    'criterion': ['friedman_mse', 'mse'],
}

grid_search = GridSearchCV(
    grad_boost,
    param_grid,
    cv=3,
    scoring='precision',
    return_train_score=True,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

# Winning configuration, refit on the whole training set.
best_model = grid_search.best_estimator_

# Predictions on both partitions, then precision for each.
train_model = best_model.predict(x_train)
test_model = best_model.predict(x_test)

train_score = precision_score(y_train, train_model)
test_score = precision_score(y_test, test_model)

print(f"Precision Training Score: {train_score :.2%}")
print(f"Precision Test Score: {test_score :.2%}")
print("Best Parameter Combination Found During Grid Search:")
print(grid_search.best_params_)
   

Precision Training Score: 85.71%
Precision Test Score: 84.00%
Best Parameter Combination Found During Grid Search:
{'criterion': 'friedman_mse', 'learning_rate': 0.001, 'loss': 'deviance', 'n_estimators': 30}
