## Import Data and Libraries

In [1]:
from Functions import normalize_data, time_series_CV_split, wrapper_feature_selector, train_and_predict, warn
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.preprocessing import MinMaxScaler 
from time import time
from math import sqrt
from sklearn.svm import SVR

from matplotlib import pyplot
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
%matplotlib inline

# Load the training set and discard the columns that are not used downstream.
data = pd.read_csv('bitcoin_train.csv').drop(
    columns=['date', 'low', 'open', 'high', 'marketcap']
)

# Report the frame's dimensions straight from `shape`: this avoids building a
# row Series just to count columns and also works if the frame has no rows.
print('n_features:', data.shape[1])
print('n_samples:', data.shape[0])
data.tail()

n_features: 36
n_samples: 2735


Unnamed: 0,close,volatility,volume,google_trends,gold,silver,platinum,palladium,oil,usd_eur,...,TXN_per_block,est_TXN_vol,cost_per_TXN,total_TXN_fees,usd_trade_vol,hash_rate,avg_block_size,difficulty,num_unique_addr,miners_revenue
2730,6093.67,0.031736,3279760000.0,6.0,1260.3,16.225,864.0,939.0,75.23,1.1672,...,1274.987261,675604827.0,61.446716,213583.5164,501623219.9,39627403.83,0.920948,5080000000000.0,415602,12092550.0
2731,6157.13,0.020789,3296220000.0,6.0,1254.6,16.21,860.0,946.0,77.41,1.1616,...,1338.383562,609799528.3,59.349571,201953.0378,363193880.4,36850961.53,1.067592,5080000000000.0,410397,11395190.0
2732,5903.44,0.049391,3467800000.0,6.0,1251.55,16.11,852.0,945.0,73.45,1.1583,...,1557.933884,795615808.3,50.289212,197368.2529,345979167.6,30540865.37,1.088043,5080000000000.0,397865,9288788.0
2733,6218.3,0.070443,3966230000.0,8.0,1250.45,16.03,851.0,953.0,74.13,1.1658,...,1195.057325,703945479.4,65.127555,166798.5735,262900494.4,39627403.83,0.68943,5080000000000.0,396405,12052690.0
2734,6404.0,0.039642,4543860000.0,7.0,1250.45,16.03,851.0,953.0,74.13,1.1658,...,1310.470199,656285943.9,57.609894,163181.8222,414797814.1,38112980.76,0.746513,5080000000000.0,453050,11236720.0


In [2]:
# Load the held-out test set and drop the same columns that were removed from
# the training frame, so both frames share one column layout.
test_data = pd.read_csv('bitcoin_test.csv').drop(
    columns=['date', 'low', 'open', 'high', 'marketcap']
)

# Report the frame's dimensions straight from `shape`: this avoids building a
# row Series just to count columns and also works if the frame has no rows.
print('n_features:', test_data.shape[1])
print('n_samples:', test_data.shape[0])
test_data.head()

n_features: 36
n_samples: 92


Unnamed: 0,close,volatility,volume,google_trends,gold,silver,platinum,palladium,oil,usd_eur,...,TXN_per_block,est_TXN_vol,cost_per_TXN,total_TXN_fees,usd_trade_vol,hash_rate,avg_block_size,difficulty,num_unique_addr,miners_revenue
0,6385.82,0.022569,4788259840,62,1250.45,16.03,851.0,953,74.13,1.1658,...,1284.148936,493208885.1,58.735614,140545.4062,489427084.1,35588942.3,0.98734,5077500000000.0,368307,10494418.5
1,6614.18,0.058242,4396930048,72,1247.8,15.98,839.0,941,73.89,1.1639,...,1062.904762,991949405.1,75.393733,106682.7087,224327884.6,37103365.37,0.705291,5077500000000.0,341861,11673361.88
2,6529.59,0.034094,4672309760,69,1251.75,15.93,838.0,954,74.19,1.1665,...,1413.432624,720058629.1,57.309569,150160.0866,281178824.1,37594816.26,0.98755,5172890000000.0,433257,11271293.25
3,6597.55,0.048633,4176689920,65,1255.65,16.045,834.0,948,74.19,1.1665,...,1868.508621,872392592.0,45.224241,185064.5374,356469308.6,30929068.69,1.113817,5363680000000.0,456149,9617154.0
4,6639.14,0.030521,4999240192,69,1255.5,15.95,845.5,947,73.05,1.1709,...,1543.19403,665699858.5,54.318173,142640.2951,345023639.3,35728406.94,0.935353,5363680000000.0,435401,11089706.0


In [3]:
# Combined train and test sets: the training rows from position 2613 onwards
# followed by every test row, re-indexed from 0.
# NOTE: this cell needs `data` to still be the training DataFrame; a later cell
# rebinds `data` to a NumPy array, so run this cell before that one.
combined_data = pd.concat([data[2613:], test_data], ignore_index=True)

# Target column: the next row's closing price.
combined_data['Price'] = combined_data['close'].shift(-1)

# These sizes describe the frame BEFORE the trimming below (they still count
# the 'close' column and the final row).
print('n_features:', combined_data.shape[1])
print('n_samples:', combined_data.shape[0])

# Drop the last row (its shifted 'Price' is NaN) and the first column.
combined_data = combined_data.iloc[:-1, 1:]
combined_data.tail()

n_features: 37
n_samples: 214


Unnamed: 0,volatility,volume,google_trends,gold,silver,platinum,palladium,oil,usd_eur,usd_jpy,...,est_TXN_vol,cost_per_TXN,total_TXN_fees,usd_trade_vol,hash_rate,avg_block_size,difficulty,num_unique_addr,miners_revenue,Price
208,0.034161,4726180000.0,61.0,1201.9,14.29,827.0,1065.0,73.4,1.1777,0.00905,...,503494967.4,52.112921,102965.2698,273203917.3,52622781.21,0.807114,7152630000000.0,469028,12385677.98,6495.0
209,0.028964,4437300000.0,59.0,1194.25,14.475,824.0,1059.0,72.22,1.1737,0.00904,...,842041019.0,54.247914,130284.3136,390313648.8,60089527.19,0.768241,7152630000000.0,490588,13676604.3,6676.75
210,0.037517,4606810000.0,58.0,1185.4,14.42,812.0,1067.0,72.18,1.1707,0.00905,...,748444892.4,46.610199,130238.5658,312748554.4,49422747.22,0.898005,7152630000000.0,474079,11369486.09,6644.13
211,0.027904,5014430000.0,58.0,1187.25,14.305,815.0,1094.0,73.16,1.1576,0.00906,...,824319235.8,47.597177,154586.939,283381555.4,50844984.55,0.877031,7152630000000.0,481907,11836751.48,6601.96
212,0.019986,4363690000.0,48.0,1187.25,14.305,815.0,1094.0,73.16,1.1576,0.00902,...,913410388.9,51.317781,193097.0923,457654461.3,54756137.2,0.855169,7152630000000.0,527057,13046981.4,6625.56


## Data Preparation

In [4]:
# Feature Meta Subset (column indices; not referenced again inside this cell)
subset = [34, 28, 30, 32, 1, 27, 2, 8, 25, 12, 22, 0, 17, 6, 18, 24, 9, 5, 31, 10, 19, 33, 23, 14]

# Work on the raw NumPy values of the combined frame.
# NOTE: this rebinds `data`, which previously held the training DataFrame.
data = combined_data.values

# Training split: every row except the final 92.
# X = all columns but the last (features), Y = last column as an (n, 1) array.
X_train, Y_train = data[:-92, :-1], data[:-92, -1].reshape(-1, 1)

# Training Validation samples size (1/4/18 - 30/6/18)
n_validation = 91


### Parameters Tuning

In [6]:
from sklearn.pipeline import Pipeline

# Two-stage estimator: min-max scaling followed by support vector regression.
# The step names matter: the 'svr' name is the prefix of the `svr__*` keys used
# in the tuning grid.
pipe = Pipeline([
    ('normalize', MinMaxScaler()),
    ('svr', SVR()),
])
pipe

Pipeline(memory=None,
     steps=[('normalize', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))])

In [7]:
%%time
sc = SparkContext()

param_grid = [{"svr__kernel": ['rbf','linear','poly','sigmoid'],
              "svr__gamma": [0.001, 0.01, 0.1, 1, 'auto'],
              "svr__C": [0.001, 0.01, 0.1, 1, 10],
              "svr__shrinking": [True,False]
        }]
split = TimeSeriesCVSplit(len(X),91,0)
gs = GridSearchCV(sc, pipe, param_grid, cv=split, scoring=make_scorer(RMSE,greater_is_better=False), verbose=1) 
gs.fit(X,Y.reshape(-1,))
print("Best RMSE:", str(gs.cv_results_['mean_test_score'][gs.best_index_]))
print('Best Parameters: ',gs.cv_results_['params'][gs.best_index_])
sc.stop()

Fitting 91 folds for each of 200 candidates, totalling 18200 fits
Best RMSE: -361.8919588280773
Best Parameters:  {'svr__C': 10, 'svr__gamma': 1, 'svr__kernel': 'poly', 'svr__shrinking': True}
CPU times: user 1.44 s, sys: 487 ms, total: 1.93 s
Wall time: 2min 40s


### Training

Substitute the best parameters found above into the `SVR()` call below.

In [7]:
%%time
# Feature Selection For Train Set
selected_features,rmse = wrapper_feature_selector(X_train,Y_train,SVR(C=10,gamma=1,kernel='poly',shrinking=True),np.arange(0,35).tolist())
print('Train RMSE: {:0.2f}'.format(rmse))
print('Selected Features:',selected_features)

Train RMSE: 426.28
Selected Features: [0, 1, 2, 6, 7, 8, 9, 12, 13, 14, 15, 18, 20, 21, 22, 23, 25, 26, 27, 28, 29, 32, 33, 34]
CPU times: user 41 s, sys: 138 ms, total: 41.1 s
Wall time: 41.6 s


### Prediction

Substitute the best parameters found above into the `SVR()` call below.

In [5]:
%%time
# Feature Meta Subset
subset = [34, 28, 30, 32, 1, 27, 2, 8, 25, 12, 22, 0, 17, 6, 18, 24, 9, 5, 31, 10, 19, 33, 23, 14] 

# Split train data into X (features) and Y (dependent variable)
data = combined_data.values 
Y_train = data[:-92,-1].reshape(-1,1) 
X_train = data[:-92,:-1]

# Test samples size (01/07/18 - 30/09/18)
n_validation = 90

# Split test data into X (features) and Y (dependent variable)
Y_test = data[:,-1].reshape(-1,1) # including train data for fitting the model
X_test = data[:,:-1]

rmse,Y_test,Y_pred= train_and_predict(X_test,Y_test,SVR(C=10,gamma=1,kernel='poly',shrinking=True),n_validation,predict=True)
print('Test RMSE: {:0.2f}'.format(rmse))


Test Sample 1 - RMSE: 3.56, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 33, 34]
Test Sample 1 - RMSE: 244.56, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 33, 34]
Test Sample 2 - RMSE: 0.30, Selected Features: [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21, 22, 23, 26, 27, 28, 29, 32, 33, 34]
Test Sample 2 - RMSE: 51.25, Selected Features: [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21, 22, 23, 26, 27, 28, 29, 32, 33, 34]
Test Sample 3 - RMSE: 0.57, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
Test Sample 3 - RMSE: 129.78, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
Test Sample 4 - RMSE: 0.47, Selecte

Test Sample 27 - RMSE: 2.54, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34]
Test Sample 27 - RMSE: 346.39, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34]
Test Sample 28 - RMSE: 0.40, Selected Features: [3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
Test Sample 28 - RMSE: 48.42, Selected Features: [3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
Test Sample 29 - RMSE: 0.09, Selected Features: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34]
Test Sample 29 - RMSE: 388.52, Selected Features: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34]
Test 

Test Sample 52 - RMSE: 0.53, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34]
Test Sample 52 - RMSE: 105.30, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34]
Test Sample 53 - RMSE: 0.44, Selected Features: [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34]
Test Sample 53 - RMSE: 153.52, Selected Features: [0, 1, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34]
Test Sample 54 - RMSE: 0.32, Selected Features: [0, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
Test Sample 54 - RMSE: 124.25, Selected Features: [0, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34

Test Sample 77 - RMSE: 0.75, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34]
Test Sample 77 - RMSE: 363.13, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34]
Test Sample 78 - RMSE: 151.74, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 22, 23, 24, 28, 30, 33]
Test Sample 78 - RMSE: 160.15, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 22, 23, 24, 28, 30, 33]
Test Sample 79 - RMSE: 0.41, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34]
Test Sample 79 - RMSE: 53.85, Selected Features: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34]
Test Sample 80 - RMSE: 0.44, Se

In [8]:
# Output prediction to csv file: one flat 'Y_pred' column, no index column.
output = DataFrame({'Y_pred': Y_pred.reshape(-1)})
output.to_csv('SVR.csv', index=False)


### VISUALIZATION

In [7]:
def _line_trace(values, label, rgb):
    """Return a 2px-wide Plotly line trace of `values` plotted against 0..len(values)-1."""
    return go.Scatter(
        x=np.arange(len(values)),
        y=values.reshape(-1),
        mode='lines',
        name=label,
        line=dict(color=rgb, width=2),
    )


# Predicted and true series, each drawn against its own positional index.
trace1 = _line_trace(Y_pred, 'Predicted labels', 'rgb(244, 146, 65)')
trace2 = _line_trace(Y_test, 'True labels', 'rgb(66, 244, 155)')

layout = dict(
    title='Comparison of true prices (on the test dataset) with prices our model predicted',
    xaxis=dict(title='Day number'),
    yaxis=dict(title='Price, USD'),
)
fig = dict(data=[trace1, trace2], layout=layout)
py.iplot(fig, filename='results_demonstrating0')

print('Test RMSE:', rmse)

Test RMSE: 237.7914693479136
