## Import Data and Libraries

In [15]:
from math import sqrt
from time import time

import numpy as np
import pandas as pd
from pandas import DataFrame
from matplotlib import pyplot
import plotly.offline as py
import plotly.graph_objs as go
from pyspark import SparkContext
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

from Functions_6 import normalize_data, time_series_CV_split, wrapper_feature_selector, train_and_predict, warn

py.init_notebook_mode(connected=True)
%matplotlib inline

# Train set: read the post-EDA CSV and discard the 'date' column.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
train_data = pd.read_csv('/Users/Andy/Desktop/Bitcoin/Data/Train_data_after_EDA.csv').drop(columns=['date'])
# .shape gives (rows, columns) directly; unlike indexing the first row it also
# works when the frame is empty.
print('n_features:', train_data.shape[1])
print('n_samples:', train_data.shape[0])
train_data.head()

n_features: 10
n_samples: 365


Unnamed: 0,close,volume,BCHAIN-DIFF,BCHAIN-AVBLS,BCHAIN-MIREV,BCHAIN-CPTRA,BCHAIN-NTRAN,BCHAIN-HRATE,BCHAIN-CPT,BCHAIN-NTRBL
0,13657.2,10291200000,1870000000000.0,1.037057,35435185.86,125.970638,290422,15177350.25,122.012747,1781.730061
1,14982.1,16846600192,1920000000000.0,1.043383,32334328.99,138.227164,241757,14975580.96,133.74723,1549.724359
2,15201.0,16871900160,1930000000000.0,1.041368,40553327.64,122.635624,340980,16415540.67,118.93169,1994.035088
3,15599.2,21783199744,1930000000000.0,1.065513,39612658.08,103.108719,395963,15071578.27,100.041312,2522.057325
4,17429.5,23840899072,1930000000000.0,1.065833,42527795.29,102.933856,425008,16127548.73,100.063517,2529.809524


In [2]:
# Test set: read the post-EDA CSV and discard the 'date' column.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
test_data = pd.read_csv('/Users/Andy/Desktop/Bitcoin/Data/Test_data_after_EDA.csv').drop(columns=['date'])
# .shape gives (rows, columns) directly; unlike indexing the first row it also
# works when the frame is empty.
print('n_features:', test_data.shape[1])
print('n_samples:', test_data.shape[0])
test_data.head()

n_features: 10
n_samples: 181


Unnamed: 0,close,volume,BCHAIN-DIFF,BCHAIN-AVBLS,BCHAIN-MIREV,BCHAIN-CPTRA,BCHAIN-NTRAN,BCHAIN-HRATE,BCHAIN-CPT,BCHAIN-NTRBL
0,3843.52,4324200990,5250000000000.0,0.888394,7406437.589,28.520962,259684,43291796.76,28.520962,1675.380645
1,3943.41,5244856836,5620000000000.0,0.801779,7030739.129,29.95309,234725,41615985.27,29.95309,1575.33557
2,3836.74,4530215219,5620000000000.0,0.947861,7368988.356,27.122182,271696,42174589.1,27.122182,1799.311258
3,3857.72,4847965467,5620000000000.0,0.966222,7564081.637,25.991979,291016,43291796.76,25.991979,1877.522581
4,3845.19,5137609824,5620000000000.0,0.959199,7194486.33,25.533007,281772,41615985.27,25.533007,1891.087248


In [3]:
# Stack train rows followed by test rows under a fresh 0..n-1 index.
# (Original note: use data from 01/2018 onwards only - best stationarity.)
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Dependent variable Y: each row's 'Price' is the 'close' of the row that follows it.
combined_data['Price'] = combined_data['close'].shift(-1)

# The final row has no following close (its shifted Price is NaN), so it is removed.
# The same-row 'close' is dropped by name rather than by column position, so a
# change in column order cannot silently remove the wrong feature.
combined_data = combined_data.iloc[:-1].drop(columns=['close'])

print('n_features:', combined_data.shape[1])
print('n_samples:', combined_data.shape[0])
combined_data.head()

n_features: 10
n_samples: 545


Unnamed: 0,volume,BCHAIN-DIFF,BCHAIN-AVBLS,BCHAIN-MIREV,BCHAIN-CPTRA,BCHAIN-NTRAN,BCHAIN-HRATE,BCHAIN-CPT,BCHAIN-NTRBL,Price
0,10291200000,1870000000000.0,1.037057,35435185.86,125.970638,290422,15177350.25,122.012747,1781.730061,14982.1
1,16846600192,1920000000000.0,1.043383,32334328.99,138.227164,241757,14975580.96,133.74723,1549.724359,15201.0
2,16871900160,1930000000000.0,1.041368,40553327.64,122.635624,340980,16415540.67,118.93169,1994.035088,15599.2
3,21783199744,1930000000000.0,1.065513,39612658.08,103.108719,395963,15071578.27,100.041312,2522.057325,17429.5
4,23840899072,1930000000000.0,1.065833,42527795.29,102.933856,425008,16127548.73,100.063517,2529.809524,17527.0


## Data Preparation

In [5]:
# Feature meta subset: column indices 8 down to 0.
subset = list(range(8, -1, -1))

# Everything except the final 181 rows is used for training; the last column of
# `combined_data` is the dependent variable, the remaining columns are features.
data = combined_data.values
X_train = data[:-181, :-1]
Y_train = data[:-181, -1:]  # slicing with -1: keeps Y as a single-column 2-D array

# Training validation sample size (1/6/18 - 31/12/18)
n_validation = 214

### Parameter Tuning

In [19]:
# Two-step estimator: min-max scale the features, then fit a support vector regressor.
# The step names ('normalize', 'svr') are what the `svr__*` grid keys refer to.
pipe = Pipeline([
    ('normalize', MinMaxScaler()),
    ('svr', SVR())
])
pipe

Pipeline(memory=None,
     steps=[('normalize', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))])

In [21]:
%%time
sc = SparkContext.getOrCreate()

param_grid = [{"svr__kernel": ['rbf','linear','poly','sigmoid'],
              "svr__gamma": [0.001, 0.01, 0.1, 1, 'auto'],
              "svr__C": [0.001, 0.01, 0.1, 1, 10],
              "svr__shrinking": [True,False]
        }]
split = time_series_CV_split(len(X_train),214,0)
gs = GridSearchCV(sc, pipe, param_grid, cv=split, scoring=make_scorer(mean_squared_error,greater_is_better=False), verbose=1) 
gs.fit(X_train,Y_train.reshape(-1,))
print("Best MSE:", str(gs.cv_results_['mean_test_score'][gs.best_index_]))
print('Best Parameters: ',gs.cv_results_['params'][gs.best_index_])
sc.stop()

TypeError: __init__() got multiple values for argument 'scoring'

### Training

Substitute the best parameters found above into the `SVR()` call below.

In [7]:
%%time
# Feature Selection For Train Set
selected_features,rmse = wrapper_feature_selector(X_train,Y_train,SVR(C=10,gamma=1,kernel='poly',shrinking=True),np.arange(0,35).tolist())
print('Train RMSE: {:0.2f}'.format(rmse))
print('Selected Features:',selected_features)

Train RMSE: 426.28
Selected Features: [0, 1, 2, 6, 7, 8, 9, 12, 13, 14, 15, 18, 20, 21, 22, 23, 25, 26, 27, 28, 29, 32, 33, 34]
CPU times: user 41 s, sys: 138 ms, total: 41.1 s
Wall time: 41.6 s


### Prediction

Substitute the best parameters found above into the `SVR()` call below.

In [5]:
%%time
# Feature Meta Subset
subset = [34, 28, 30, 32, 1, 27, 2, 8, 25, 12, 22, 0, 17, 6, 18, 24, 9, 5, 31, 10, 19, 33, 23, 14] 

# Split train data into X (features) and Y (dependent variable)
data = combined_data.values 
Y_train = data[:-92,-1].reshape(-1,1) 
X_train = data[:-92,:-1]

# Test samples size (01/07/18 - 30/09/18)
n_validation = 90

# Split test data into X (features) and Y (dependent variable)
Y_test = data[:,-1].reshape(-1,1) # including train data for fitting the model
X_test = data[:,:-1]

rmse,Y_test,Y_pred= train_and_predict(X_test,Y_test,SVR(C=10,gamma=1,kernel='poly',shrinking=True),n_validation,predict=True)
print('Test RMSE: {:0.2f}'.format(rmse))


Test Sample 1 - RMSE: 3.56, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 33, 34]
Test Sample 1 - RMSE: 244.56, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 33, 34]
Test Sample 2 - RMSE: 0.30, Selected Features: [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21, 22, 23, 26, 27, 28, 29, 32, 33, 34]
Test Sample 2 - RMSE: 51.25, Selected Features: [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21, 22, 23, 26, 27, 28, 29, 32, 33, 34]
Test Sample 3 - RMSE: 0.57, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
Test Sample 3 - RMSE: 129.78, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
Test Sample 4 - RMSE: 0.47, Selecte

Test Sample 27 - RMSE: 2.54, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34]
Test Sample 27 - RMSE: 346.39, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34]
Test Sample 28 - RMSE: 0.40, Selected Features: [3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
Test Sample 28 - RMSE: 48.42, Selected Features: [3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
Test Sample 29 - RMSE: 0.09, Selected Features: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34]
Test Sample 29 - RMSE: 388.52, Selected Features: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34]
Test 

Test Sample 52 - RMSE: 0.53, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34]
Test Sample 52 - RMSE: 105.30, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34]
Test Sample 53 - RMSE: 0.44, Selected Features: [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34]
Test Sample 53 - RMSE: 153.52, Selected Features: [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34]
Test Sample 54 - RMSE: 0.32, Selected Features: [0, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
Test Sample 54 - RMSE: 124.25, Selected Features: [0, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34

Test Sample 77 - RMSE: 0.75, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34]
Test Sample 77 - RMSE: 363.13, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34]
Test Sample 78 - RMSE: 151.74, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 22, 23, 24, 28, 30, 33]
Test Sample 78 - RMSE: 160.15, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 22, 23, 24, 28, 30, 33]
Test Sample 79 - RMSE: 0.41, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34]
Test Sample 79 - RMSE: 53.85, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34]
Test Sample 80 - RMSE: 0.44, Se

In [8]:
# Write the flattened predictions to a single-column CSV, without the index column.
output = DataFrame({'Y_pred': Y_pred.ravel()})
output.to_csv('SVR.csv', index=False)


### Visualization

In [7]:
# Predicted series, plotted against a 0-based sample index.
trace1 = go.Scatter(
    x=np.arange(len(Y_pred)),
    y=Y_pred.ravel(),
    mode='lines',
    name='Predicted labels',
    line={'color': 'rgb(244, 146, 65)', 'width': 2},
)
# True series, plotted against its own 0-based sample index.
trace2 = go.Scatter(
    x=np.arange(len(Y_test)),
    y=Y_test.ravel(),
    mode='lines',
    name='True labels',
    line={'color': 'rgb(66, 244, 155)', 'width': 2},
)

layout = {
    'title': 'Comparison of true prices (on the test dataset) with prices our model predicted',
    'xaxis': {'title': 'Day number'},
    'yaxis': {'title': 'Price, USD'},
}
fig = {'data': [trace1, trace2], 'layout': layout}
py.iplot(fig, filename='results_demonstrating0')

print('Test RMSE:', str(rmse))

Test RMSE: 237.7914693479136
