# Stock Price Predictor

The first step is to load the required modules to make the predictions we need.

In [1]:
%matplotlib notebook

import warnings
warnings.filterwarnings('ignore')

### TODO: comment out the line below once you've run it once
%run -i './src/download.py'

import sys, os, pdb
import uuid, json, time
import pandas as pd

# import predictions algorithms
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor

sys.path.append(os.getcwd() + '/src')
# import main stocks predictor / data preprocessing file
import lib.stocks as st
import lib.visualizer as vzr

#### Configurations & Parameters

Below we set the tickers we would like to train on and the dates for starting predictions.

In [20]:
# Date boundaries (ISO 'YYYY-MM-DD' strings) handed to the `st` helpers in the
# cells below: start of the training span, start of the testing span, and the
# last date of data to use.
DATE_TRAIN_START, DATE_TEST_START, DATE_END = (
    '2016-01-01',
    '2018-01-01',
    '2018-06-01',
)

# Window / horizon values to try; one transformed data file is generated per
# combination. NOTE(review): units are presumably trading days — confirm in
# lib.stocks.
WINDOWS, HORIZONS = [5], [7]

# Tickers whose data is merged for training, and the tickers to predict.
TICKERS_TRAIN = [
    'AMZN',
    'GOOGL',
    'AAPL',
    'NVDA',
    'NFLX',
]
TICKERS_PREDICT = [
    'NFLX',
    'AMZN',
]

#### Downloaded CSV file preview - AMZN ticker

In [31]:
# Preview the last ten rows of the raw downloaded AMZN file, ordered by row index.
amzn_raw_tail = pd.read_csv('_data/tickers/AMZN.csv').tail(10)
amzn_raw_tail.sort_index()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
550,2018-03-14,1597.0,1606.44,1590.89,1591.0,4164395.0,0.0,1.0,1597.0,1606.44,1590.89,1591.0,4164395.0
551,2018-03-15,1595.0,1596.91,1578.11,1582.32,4026744.0,0.0,1.0,1595.0,1596.91,1578.11,1582.32,4026744.0
552,2018-03-16,1583.45,1589.44,1567.5,1571.68,5145054.0,0.0,1.0,1583.45,1589.44,1567.5,1571.68,5145054.0
553,2018-03-19,1554.53,1561.66,1525.35,1544.93,6376619.0,0.0,1.0,1554.53,1561.66,1525.35,1544.93,6376619.0
554,2018-03-20,1550.34,1587.0,1545.41,1586.51,4507049.0,0.0,1.0,1550.34,1587.0,1545.41,1586.51,4507049.0
555,2018-03-21,1586.45,1590.0,1563.17,1581.86,4667291.0,0.0,1.0,1586.45,1590.0,1563.17,1581.86,4667291.0
556,2018-03-22,1565.47,1573.85,1542.4,1544.1,6177737.0,0.0,1.0,1565.47,1573.85,1542.4,1544.1,6177737.0
557,2018-03-23,1539.01,1549.02,1495.36,1495.56,7843966.0,0.0,1.0,1539.01,1549.02,1495.36,1495.56,7843966.0
558,2018-03-26,1530.0,1556.99,1499.25,1555.86,5547618.0,0.0,1.0,1530.0,1556.99,1499.25,1555.86,5547618.0
559,2018-03-27,1572.4,1575.96,1482.32,1497.05,6793279.0,0.0,1.0,1572.4,1575.96,1482.32,1497.05,6793279.0


#### Processed CSV file preview - AMZN ticker

In [33]:
# Load the processed AMZN data between the training start date and the testing
# start date; the helper returns one entry per requested ticker.
tickers_datafiles = st.getStockDataFromCSV(
    ['AMZN'],
    DATE_TRAIN_START,
    DATE_TEST_START,
)

# Show the final ten rows of the single (AMZN) frame, ordered by date index.
amzn_processed = tickers_datafiles[0]
amzn_processed.tail(10).sort_index()

Unnamed: 0_level_0,adj_close_AMZN,volume_AMZN,returns_AMZN
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-12-15,1179.14,0.478797,0.004156
2017-12-18,1190.58,-0.390455,0.009702
2017-12-19,1187.38,-0.076623,-0.002688
2017-12-20,1177.62,-0.092484,-0.00822
2017-12-21,1174.76,-0.119223,-0.002429
2017-12-22,1168.36,-0.23915,-0.005448
2017-12-26,1176.76,0.264398,0.00719
2017-12-27,1182.26,-0.094403,0.004674
2017-12-28,1186.1,0.013532,0.003248
2017-12-29,1169.47,0.455692,-0.014021


The next step is to create a directory where we will save the transformed data. This avoids holding many data files in memory at once, since our algorithm may apply multiple windows and horizons (one file for each combination).

Once we've created the directory, we load a single merged dataset containing the needed information about all the specified stocks __before__ transformation.

In [21]:
# Create a directory with a unique ID to hold this trial's generated files.
# uuid4 (random) is used instead of uuid1 because uuid1 builds the ID from the
# host's network address and the current time, which would then be exposed in
# every saved path and in the notebook's printed output.
TRIAL_ID = uuid.uuid4()
DIRECTORY = "_trials/{}".format(TRIAL_ID)
os.makedirs(DIRECTORY)

print("Loading data for {}...".format(', '.join(TICKERS_TRAIN)))

# Merge the tickers' data (this call also renders some visualizations) and
# write the transformed files into DIRECTORY. The printed result below is a
# list of (horizon, window, file path) tuples, consumed in that order by the
# training loop later in the notebook.
data_files = st.loadMergedData(
    WINDOWS, HORIZONS, TICKERS_TRAIN, TICKERS_PREDICT,
    DATE_TRAIN_START, DATE_END, DATE_TEST_START, DIRECTORY
)

print("A new trial started with ID: {}\n".format(TRIAL_ID))
print("The data files generated are:")
print(data_files)

Loading data for AMZN, GOOGL, AAPL, NVDA, NFLX...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

A new trial started with ID: 9b940872-2b62-11e9-b524-9cb6d06b490b

The data files generated are:
[(7, 5, '_trials/9b940872-2b62-11e9-b524-9cb6d06b490b/finance_w7_h5.csv')]


Now we create a list of regressors which we would like to use for making predictions. We will be comparing all the models we choose to test by using metrics as well as visually through graphs below:

In [22]:
import lib.tpot_stock_pipeline as tp

# Candidate models, each wrapped in MultiOutputRegressor. They are instantiated
# in the same order in which they are listed (and later trained).
gradient_boosted_model = MultiOutputRegressor(GradientBoostingRegressor())
tpot_model = MultiOutputRegressor(tp.get_tpot_pipeline())

# (label, estimator) pairs iterated by the training loop. Despite the variable
# name, every entry is a regressor.
# Note: an 'AdaBoost' entry, MultiOutputRegressor(AdaBoostRegressor()), is
# currently disabled.
classifiers = [
    ('GradientBoosted', gradient_boosted_model),
    ('TPot', tpot_model),
]

In [23]:
import seaborn as sns
from IPython.display import display

import warnings
warnings.filterwarnings('ignore')

# Rolling-mean width applied to the predictions before plotting.
SMOOTHING_WINDOW = 5

# Results of every (horizon, window) run, keyed as 'H<horizon>_W<window>'.
all_results = {}

# For each generated data file: load it, split it into train/test sets, then
# cross-validate, fit and evaluate every model in `classifiers`.
# NOTE(review): trainPredictStocks receives DIRECTORY, so it presumably
# persists model artifacts there — confirm in lib.stocks.
for h, w, file_path in data_files:
    # Start measuring time for this (horizon, window) iteration
    time_start = time.time()

    # Load data: the first CSV column holds the dates and becomes the index.
    finance = pd.read_csv(file_path, encoding='utf-8', header=0)
    finance = finance.set_index(finance.columns[0])
    finance.index.name = 'Date'
    finance.index = pd.to_datetime(finance.index)
    # sort_index() returns a new frame; the result must be assigned back,
    # otherwise the call has no effect.
    finance = finance.sort_index()

    # perform preprocessing
    X_train, y_train, X_test, y_test = st.prepareDataForClassification(
        finance, DATE_TEST_START, DATE_END, TICKERS_PREDICT, h, w
    )

    results = {}

    print("Starting an iteration with a horizon of {} and a window of {}...".format(h, w))

    for name, model in classifiers:
        print("Training and testing the {} model...".format(name))

        # Perform k-fold cross validation (third argument is presumably the
        # number of folds — confirm in lib.stocks).
        results['cross_validation_%s' % name] = st.performCV(
            X_train, y_train, 10, model, name, visualize_folds=True
        )

        # Perform predictions with the testing data and record the score.
        preds, accuracy = st.trainPredictStocks(
            X_train, y_train, X_test, y_test, model, DIRECTORY
        )
        results['accuracy_%s' % name] = accuracy

        # Smooth every prediction column for plotting only; the score above
        # was returned before this smoothing is applied.
        for c in preds.columns:
            preds[c] = preds[c].rolling(window=SMOOTHING_WINDOW).mean()

        # plot results
        vzr.visualize_predictions(preds, title='Testing Data Results')

    results['window'] = w
    results['horizon'] = h

    # Stop time counter (covers loading, training and plotting)
    time_end = time.time()
    results['time_lapsed'] = time_end - time_start

    all_results['H%s_W%s' % (h, w)] = results

print(json.dumps(all_results, indent=4))

Starting an iteration with a horizon of 7 and a window of 5...
Training and testing the GradientBoosted model...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Training and testing the TPot model...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{
    "H7_W5": {
        "cross_validation_GradientBoosted": 3.583,
        "accuracy_GradientBoosted": 23.88557779309265,
        "cross_validation_TPot": 0.148,
        "accuracy_TPot": 0.024944011161745067,
        "window": 5,
        "horizon": 7,
        "time_lapsed": 17.575018644332886
    }
}
