## Import Data and Libraries

In [1]:
from Functions import normalize_data, time_series_CV_split, wrapper_feature_selector, train_and_predict, fine_tune_KNN, warn
import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.preprocessing import MinMaxScaler 
from time import time
from math import sqrt
from sklearn.neighbors import KNeighborsRegressor
from matplotlib import pyplot
import plotly.offline as py
import plotly.graph_objs as go
import warnings
# Rebind warnings.warn to the `warn` callable imported from Functions, so that
# every later warnings.warn(...) call made in this kernel is routed through it.
# NOTE(review): presumably `warn` is a no-op used to silence library warnings -
# confirm against the Functions module.
warnings.warn = warn
# Initialise plotly's offline notebook mode; py.iplot is used in the
# visualization cell at the end of the notebook.
py.init_notebook_mode(connected=True)
%matplotlib inline

# Train set: read the CSV and discard the date, low, open, high and
# marketcap columns in a single expression, so re-running the cell always
# starts again from the file on disk.
train_data = pd.read_csv('bitcoin_train.csv').drop(
    columns=['date', 'low', 'open', 'high', 'marketcap']
)
# DataFrame.shape is (n_rows, n_columns). Unlike len(train_data.iloc[0]) it
# does not need to materialise the first row, so it also works when the file
# holds a header but no data rows.
print('n_features:', train_data.shape[1])
print('n_samples:', train_data.shape[0])
train_data.head()

n_features: 36
n_samples: 2735


Unnamed: 0,close,volatility,volume,google_trends,gold,silver,platinum,palladium,oil,usd_eur,...,TXN_per_block,est_TXN_vol,cost_per_TXN,total_TXN_fees,usd_trade_vol,hash_rate,avg_block_size,difficulty,num_unique_addr,miners_revenue
0,0.2989,0.037022,548.33,0.019074,1388.5,30.67,1753.0,784.0,89.39,1.3421,...,7.0,8950.0,1.714398,0.023855,789.544097,0.115112,0.002296,16307.42094,943,2129.2829
1,0.299,0.030563,106.19,0.019074,1368.0,29.21,1722.0,754.0,90.3,1.3213,...,8.0,4746.0,1.881279,0.017457,57.813954,0.121597,0.002129,16307.42094,1064,2242.485
2,0.298,0.030563,1031.28,0.019074,1368.5,29.08,1731.0,766.0,88.37,1.3091,...,7.0,5052.0,2.585471,0.026185,890.419309,0.119165,0.00173,16307.42094,950,2197.65
3,0.32,0.079137,13152.15,0.019074,1367.0,28.39,1735.0,754.0,88.07,1.2961,...,7.0,5304.0,2.634545,0.023675,13047.28158,0.116733,0.001584,16307.42094,887,2318.4
4,0.3229,0.044001,512.04,0.019074,1367.0,28.39,1735.0,754.0,88.07,1.2961,...,6.0,9216.0,2.226883,0.006458,1273.388302,0.155644,0.001875,16307.42094,1267,3099.8208


In [2]:
# Test set: read the CSV and discard the same five columns that are removed
# from the train set (date, low, open, high, marketcap), so both frames keep
# an identical column layout.
test_data = pd.read_csv('bitcoin_test.csv').drop(
    columns=['date', 'low', 'open', 'high', 'marketcap']
)
# DataFrame.shape is (n_rows, n_columns). Unlike len(test_data.iloc[0]) it
# does not need to materialise the first row, so it also works when the file
# holds a header but no data rows.
print('n_features:', test_data.shape[1])
print('n_samples:', test_data.shape[0])
test_data.head()

n_features: 36
n_samples: 92


Unnamed: 0,close,volatility,volume,google_trends,gold,silver,platinum,palladium,oil,usd_eur,...,TXN_per_block,est_TXN_vol,cost_per_TXN,total_TXN_fees,usd_trade_vol,hash_rate,avg_block_size,difficulty,num_unique_addr,miners_revenue
0,6385.82,0.022569,4788259840,62,1250.45,16.03,851.0,953,74.13,1.1658,...,1284.148936,493208885.1,58.735614,140545.4062,489427084.1,35588942.3,0.98734,5077500000000.0,368307,10494418.5
1,6614.18,0.058242,4396930048,72,1247.8,15.98,839.0,941,73.89,1.1639,...,1062.904762,991949405.1,75.393733,106682.7087,224327884.6,37103365.37,0.705291,5077500000000.0,341861,11673361.88
2,6529.59,0.034094,4672309760,69,1251.75,15.93,838.0,954,74.19,1.1665,...,1413.432624,720058629.1,57.309569,150160.0866,281178824.1,37594816.26,0.98755,5172890000000.0,433257,11271293.25
3,6597.55,0.048633,4176689920,65,1255.65,16.045,834.0,948,74.19,1.1665,...,1868.508621,872392592.0,45.224241,185064.5374,356469308.6,30929068.69,1.113817,5363680000000.0,456149,9617154.0
4,6639.14,0.030521,4999240192,69,1255.5,15.95,845.5,947,73.05,1.1709,...,1543.19403,665699858.5,54.318173,142640.2951,345023639.3,35728406.94,0.935353,5363680000000.0,435401,11089706.0


In [3]:
# Combined train and test sets.
# Only the train rows from positional index 2613 onwards are kept
# (data from 03/2018 onwards only - best stationarity), followed by every
# test row; ignore_index=True renumbers the result from 0.
combined_data = pd.concat([train_data[2613:], test_data], ignore_index=True)

# Dependent variable Y: the `close` value of the following row.
# shift(-1) leaves NaN in the final row, which has no following row.
combined_data['Price'] = combined_data['close'].shift(-1)

# These counts are reported BEFORE the trim below, so they still include the
# last row and the first column. DataFrame.shape is used instead of
# len(combined_data.iloc[0]) so the report does not depend on row 0 existing.
print('n_features:', combined_data.shape[1])
print('n_samples:', combined_data.shape[0])

# Drop the last row (its Price is NaN) and, positionally, the first column.
# NOTE(review): judging by the displayed tail the dropped column is `close` -
# confirm, since the drop is by position rather than by name.
combined_data = combined_data.iloc[:-1, 1:]
combined_data.tail()

n_features: 37
n_samples: 214


Unnamed: 0,volatility,volume,google_trends,gold,silver,platinum,palladium,oil,usd_eur,usd_jpy,...,est_TXN_vol,cost_per_TXN,total_TXN_fees,usd_trade_vol,hash_rate,avg_block_size,difficulty,num_unique_addr,miners_revenue,Price
208,0.034161,4726180000.0,61.0,1201.9,14.29,827.0,1065.0,73.4,1.1777,0.00905,...,503494967.4,52.112921,102965.2698,273203917.3,52622781.21,0.807114,7152630000000.0,469028,12385677.98,6495.0
209,0.028964,4437300000.0,59.0,1194.25,14.475,824.0,1059.0,72.22,1.1737,0.00904,...,842041019.0,54.247914,130284.3136,390313648.8,60089527.19,0.768241,7152630000000.0,490588,13676604.3,6676.75
210,0.037517,4606810000.0,58.0,1185.4,14.42,812.0,1067.0,72.18,1.1707,0.00905,...,748444892.4,46.610199,130238.5658,312748554.4,49422747.22,0.898005,7152630000000.0,474079,11369486.09,6644.13
211,0.027904,5014430000.0,58.0,1187.25,14.305,815.0,1094.0,73.16,1.1576,0.00906,...,824319235.8,47.597177,154586.939,283381555.4,50844984.55,0.877031,7152630000000.0,481907,11836751.48,6601.96
212,0.019986,4363690000.0,48.0,1187.25,14.305,815.0,1094.0,73.16,1.1576,0.00902,...,913410388.9,51.317781,193097.0923,457654461.3,54756137.2,0.855169,7152630000000.0,527057,13046981.4,6625.56


## Parameter Tuning

In [4]:
# Feature meta subset: the column indices handed to both the tuner and the
# wrapper feature selector below.
subset = [
    34, 28, 30, 32, 1, 27, 2, 8, 25, 12, 22, 0,
    17, 6, 18, 24, 9, 5, 31, 10, 19, 33, 23, 14,
]

# Split train data into X (features) and Y (dependent variable).
# The final 92 rows are held back from tuning; the last column is the target
# and is reshaped into a single-column 2-D array.
data = combined_data.values
X_train = data[:-92, :-1]
Y_train = data[:-92, -1].reshape(-1, 1)

# Training validation sample size (1/4/18 - 30/6/18).
# NOTE(review): this value is not passed to any call in this cell - confirm
# whether it is still needed here.
n_validation = 91

# Fine-tune k, p and weights for the KNN regressor.
k, p, weights = fine_tune_KNN(X_train, Y_train, subset)
print('Best k:', k)  # 7
print('Best p:', p)  # 1
print('Best weights:', weights)  # distance

# Feature selection with a KNN built from the tuned hyper-parameters; only the
# first element of the selector's result is kept.
tuned_knn = KNeighborsRegressor(n_neighbors=k, p=p, weights=weights)
selected_features = wrapper_feature_selector(X_train, Y_train, tuned_knn, subset)[0]
print('Selected Features:', selected_features)

### Training

In [40]:
%%time
# Score the tuned KNN on the training window, restricted to the columns
# chosen by the feature selector (predict=False).
train_knn = KNeighborsRegressor(n_neighbors=k, p=p, weights=weights)
rmse, Y_train_test, Y_train_pred = train_and_predict(
    X_train[:, selected_features],
    Y_train,
    train_knn,
    predict=False,
)
print('Train RMSE: {:0.2f}'.format(rmse))


382.2145873704225
CPU times: user 76.1 ms, sys: 1.93 ms, total: 78 ms
Wall time: 82.8 ms


### Prediction

In [5]:
%%time
# Test samples size (01/07/18 - 30/09/18)
# NOTE(review): that date range spans 92 days, the test CSV has 92 rows and
# the tuning cell holds back 92 rows - confirm that 90 is intended here.
n_validation = 90

# Split test data into X (features) and Y (dependent variable).
# Both arrays cover ALL rows of `data`, i.e. they include the train data used
# for fitting the model; Y_test is rebound below to the value returned by
# train_and_predict.
Y_test = data[:, -1].reshape(-1, 1)
X_test = data[:, :-1]

# Build the estimator from the tuned k / p / weights instead of repeating
# them as literals (7, 1, 'distance'), so this cell stays consistent with the
# tuning and training cells if fine_tune_KNN ever returns different values.
test_knn = KNeighborsRegressor(n_neighbors=k, p=p, weights=weights)
rmse, Y_test, Y_pred = train_and_predict(
    X_test,
    Y_test,
    test_knn,
    n_validation,
    subset,
    predict=True,
)
print('Test RMSE: {:0.2f}'.format(rmse))

Test Sample 1 - RMSE: 383.97, Selected Features: [32, 8, 12, 17, 6, 18, 9, 5, 10, 23, 14]
Test Sample 2 - RMSE: 383.43, Selected Features: [32, 8, 12, 17, 6, 18, 9, 5, 10, 23, 14]
Test Sample 3 - RMSE: 382.59, Selected Features: [32, 8, 12, 17, 6, 18, 9, 5, 10, 19, 23, 14]
Test Sample 4 - RMSE: 380.90, Selected Features: [32, 8, 12, 17, 6, 18, 9, 5, 10, 19, 23, 14]
Test Sample 5 - RMSE: 379.62, Selected Features: [32, 8, 12, 17, 6, 18, 9, 5, 10, 19, 23, 14]
Test Sample 6 - RMSE: 380.53, Selected Features: [32, 8, 12, 17, 6, 18, 9, 5, 10, 19, 23, 14]
Test Sample 7 - RMSE: 380.35, Selected Features: [32, 8, 12, 17, 6, 18, 9, 5, 10, 19, 23, 14]
Test Sample 8 - RMSE: 379.79, Selected Features: [32, 8, 12, 17, 6, 18, 9, 5, 10, 19, 23, 14]
Test Sample 9 - RMSE: 381.89, Selected Features: [32, 8, 12, 17, 6, 18, 9, 5, 10, 19, 23, 14]
Test Sample 10 - RMSE: 382.41, Selected Features: [32, 8, 12, 17, 6, 18, 9, 5, 10, 19, 23, 14]
Test Sample 11 - RMSE: 370.55, Selected Features: [32, 8, 12, 17, 6

In [None]:
# Output prediction to csv file: a single 'Y_pred' column holding the
# flattened predictions, written without the index column.
output = DataFrame({'Y_pred': Y_pred.reshape(-1,)})
output.to_csv('KNN.csv', index=False)


### Visualization

In [None]:
def _price_trace(values, label, rgb):
    """Return a 2px-wide plotly line trace of `values` against 0..len(values)-1."""
    return go.Scatter(
        x=np.arange(0, len(values), 1),
        y=values.reshape(-1,),
        mode='lines',
        name=label,
        line={'color': rgb, 'width': 2},
    )


# One trace per series: model predictions and the true prices.
trace1 = _price_trace(Y_pred, 'Predicted labels', 'rgb(244, 146, 65)')
trace2 = _price_trace(Y_test, 'True labels', 'rgb(66, 244, 155)')

layout = {
    'title': 'Comparison of true prices (on the test dataset) with prices our model predicted',
    'xaxis': {'title': 'Day number'},
    'yaxis': {'title': 'Price, USD'},
}
fig = {'data': [trace1, trace2], 'layout': layout}
py.iplot(fig, filename='results_demonstrating0')

# RMSE between the flattened true and predicted series shown in the chart.
print(np.sqrt(mean_squared_error(Y_test.reshape(-1,), Y_pred.reshape(-1,))))
