### Temporary YAML loader

In [1]:
import yaml
def load_yaml(path):
    """Parse the YAML file at ``path`` and return the resulting Python object.

    Args:
        path: Filesystem path of the YAML document to read.

    Returns:
        Whatever ``yaml.load`` builds from the document when using
        ``yaml.FullLoader``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
        yaml.YAMLError: If the document is not valid YAML.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so the same config file parses identically on every OS.
    with open(path, mode='r', encoding='utf-8') as file:
        # SECURITY: FullLoader is kept for backward compatibility, but it
        # resolves more than the plain YAML tags. Only use this on trusted
        # config files; prefer yaml.safe_load if the file may come from an
        # untrusted source.
        return yaml.load(file, Loader=yaml.FullLoader)
    
# Load the pipeline configuration once; the cells below read their settings
# from this dict. The path is relative, so it is resolved against the kernel's
# current working directory.
config = load_yaml('extra/dataextractor.yaml')

In [2]:
# Echo the parsed configuration to confirm it loaded as expected.
config

{'data': {'useYfinance': True,
  'yfinance': {'stock': 'NVDA',
   'start': datetime.date(2019, 1, 8),
   'end': datetime.date(2020, 1, 8),
   'interval': '1d'},
  'binance': {'coin': 'BTCUSDT', 'interval': '1d'}},
 'processing': {'resample': True,
  'sampling': {'time': 'D',
   'aggregate': {'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum'}},
  'label': {'source': 'Close', 'shift': -1}},
 'features': {'window': 14,
  'type': 0,
  'filter': ['Label',
   'Open',
   'High',
   'Low',
   'Volume',
   'stocK',
   'stocD',
   'stocSD',
   'Momentum',
   'ROC',
   'LWR',
   'AOosci',
   'Disp5',
   'Disp10',
   'OSCP',
   'CCI',
   'RSI',
   'OBV',
   'MA',
   'BIAS6',
   'PSY12',
   'ASY5',
   'ASY4',
   'ASY3',
   'ASY2',
   'ASY1']},
 'split': {'scalerName': 'SS',
  'trainTestPercentage': 0.8,
  'trainTestValidate': {'train': 0.7, 'test': 0.15, 'validate': 0.15}}}

### Imports (run helper notebooks)

In [3]:
# Execute the helper notebooks so the names they define become available in
# this kernel. get_data, add_features, train_test_split and
# train_test_validate_split are used below but not defined in this notebook;
# presumably they come from these three notebooks — confirm which one
# provides which.
%run dataProcessing.ipynb
%run dataFeatures.ipynb
%run dataSplitScale.ipynb

## STEP 1: Import data 

In [4]:
# Fetch the raw data described by the configuration.
# NOTE(review): get_data is not defined in this notebook; presumably it comes
# from dataProcessing.ipynb (run above) — confirm.
df = get_data(config)

[*********************100%***********************]  1 of 1 completed


In [5]:
# Preview the first rows (up to 16) of the loaded data.
df.head(16)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-07,138.5,144.889999,136.429993,143.399994,17729000,139.830002
2019-01-08,146.690002,146.779999,136.899994,139.830002,19650400,142.580002
2019-01-09,141.899994,144.490005,139.860001,142.580002,15431500,145.229996
2019-01-10,141.800003,145.580002,139.360001,145.229996,13078900,148.830002
2019-01-11,144.330002,149.75,143.210007,148.830002,21869100,150.440002
2019-01-14,146.720001,151.460007,145.770004,150.440002,18254200,149.869995
2019-01-15,151.759995,153.350006,149.130005,149.869995,15425300,148.839996
2019-01-16,150.970001,152.300003,148.619995,148.839996,11752600,151.720001
2019-01-17,147.509995,153.330002,146.410004,151.720001,12335900,156.929993
2019-01-18,153.729996,157.979996,151.649994,156.929993,16283400,148.770004


## STEP 2: Create features

In [6]:
# Build the feature columns using the 'features' section of the config.
# NOTE(review): this rebinds df, so the cell is not idempotent — re-running it
# on its own passes the already-augmented frame back into add_features.
# Re-run the get_data cell first when iterating on this step.
df = add_features(df, config['features'])

In [7]:
# Preview the first rows of the feature frame.
df.head()

Unnamed: 0_level_0,Label,Open,High,Low,Volume,stocK,stocD,stocSD,Momentum,ROC,...,RSI,OBV,MA,BIAS6,PSY12,ASY5,ASY4,ASY3,ASY2,ASY1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-31,144.729996,137.259995,145.190002,136.380005,21071300,42.670676,22.021422,26.155683,-1.479996,-1.01907,...,48.256657,49040500,142.179999,-0.007183,66.7,-1.870123,-2.700879,1.358319,4.415431,4.525209
2019-02-01,149.179993,144.5,146.789993,142.580002,15626200,45.950447,36.668886,24.769444,-4.100006,-2.754825,...,49.183378,64666700,139.095999,0.014901,75.0,-2.024819,1.188595,3.170095,2.602316,0.679423
2019-02-04,149.949997,145.369995,150.679993,144.479996,13214800,60.843339,49.821487,36.170598,-1.26001,-0.83755,...,53.275596,77881500,141.329999,0.059693,75.0,1.556549,3.134663,2.744333,1.853895,3.028367
2019-02-05,153.0,149.660004,151.429993,148.300003,13560600,63.420327,56.738038,47.742804,0.080002,0.053381,...,53.966386,91442100,144.999997,0.050315,75.0,2.610696,2.186957,1.40754,1.771599,0.51483
2019-02-06,147.419998,151.289993,155.600006,151.070007,17561600,73.627833,65.963833,57.507786,4.160004,2.79495,...,56.697306,109003700,148.121997,0.045558,75.0,2.152287,1.559056,1.852267,1.264217,2.013604


## STEP 3: Split data

In [8]:
# Two-way split (train / test) driven by the 'split' section of the config.
# NOTE(review): the next cell reassigns x_train, x_test, y_train, y_test and
# scaler, so on a top-to-bottom run these results are overwritten before use.
x_train, x_test, y_train, y_test, scaler = train_test_split(df, config['split'])

In [9]:
# Three-way split (train / test / validate) driven by the 'split' section of
# the config.
# NOTE(review): this rebinds x_train, x_test, y_train, y_test and scaler from
# the previous cell; the cells below inspect the values assigned here.
x_train, x_test, x_validate, y_train, y_test, y_validate, scaler = train_test_validate_split(df, config['split'])

In [10]:
# Show which scaler object the split step returned.
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [11]:
# Inspect the training inputs returned by the split step.
# NOTE(review): this displays the whole array; x_train[:5] together with
# x_train.shape would keep the output short.
x_train

array([[-1.95761469, -1.57990882, -1.87260748, ...,  0.84001851,
         2.45573497,  1.82620289],
       [-1.45044821, -1.46676253, -1.43310165, ...,  2.09913897,
         1.41038765,  0.22505295],
       [-1.38950456, -1.19167407, -1.29841475, ...,  1.80324947,
         0.97888739,  1.20300972],
       ...,
       [ 0.81918969,  0.72970335,  0.67227384, ..., -1.20167752,
        -1.28191536, -1.07004558],
       [ 0.52497753,  0.64625692,  0.69141392, ..., -0.77517334,
        -0.43415497,  0.45732533],
       [ 0.7631501 ,  0.69010101,  0.58862643, ..., -0.68865294,
        -0.11669365, -0.61155285]])

In [12]:
# Inspect the training targets returned by the split step.
# NOTE(review): this displays the whole array; y_train[:5] together with
# y_train.shape would keep the output short.
y_train

array([-1.45546979, -1.14091229, -1.08648289, -0.87088694, -1.26532124,
       -1.2123059 , -1.33388784, -1.00024451, -0.87936905, -0.76273572,
       -0.56410506, -0.61358583, -0.47857316, -0.67508329, -0.43333344,
       -0.46867701, -0.58106928, -0.7005307 , -0.78182154, -0.62701656,
       -0.60368968, -0.62206794, -0.93803949, -1.13525719, -1.0377086 ,
       -0.29549376, -0.19794517,  0.23324566,  0.01694322,  0.31736352,
        0.25657254,  0.73441822,  0.64181718,  1.31617298,  0.8609477 ,
        0.5979915 ,  0.81641447,  0.79026058,  0.84327592,  1.00656302,
        1.19883209,  1.24972691,  1.64698823,  1.62154082,  1.81168936,
        1.87106629,  1.69222794,  1.89298021,  1.85339451,  1.74524329,
        1.36989481,  1.61800732,  1.55297422,  1.48299465,  1.63638559,
        1.79189705,  1.82724062,  1.52611384,  0.90265285,  0.99030528,
        1.10835265,  1.07088856,  1.26315764,  1.25043339,  1.02706289,
        0.55063125,  0.60788766,  0.34422497,  0.24738395, -0.48