# Import libraries

In [1]:
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,explained_variance_score

# read dataset

In [2]:
data=pd.read_csv(r'C:\Users\amrit\Downloads\bitstampUSD_1-min_data_2012-01-01_to_2018-11-11.csv')
data

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_BTC,Volume_Currency,Weighted_Price
0,1325317920,4.39,4.39,4.39,4.39,0.455581,2.000000,4.390000
1,1325317980,,,,,,,
2,1325318040,,,,,,,
3,1325318100,,,,,,,
4,1325318160,,,,,,,
...,...,...,...,...,...,...,...,...
1048570,1388232120,734.60,734.60,730.00,734.55,1.789687,1313.640757,734.005698
1048571,1388232180,734.55,734.55,730.71,730.71,0.110236,80.802051,732.991499
1048572,1388232240,734.40,734.40,730.51,730.51,0.554786,407.247985,734.063488
1048573,1388232300,730.51,733.63,730.51,731.10,0.620446,453.777190,731.372707


# Drop nan values

In [3]:
data.dropna(inplace=True)

# Dispaly maximum columns in the dataset

In [4]:
pd.set_option('display.max_columns',None)

In [5]:
data

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_BTC,Volume_Currency,Weighted_Price
0,1325317920,4.39,4.39,4.39,4.39,0.455581,2.000000,4.390000
478,1325346600,4.39,4.39,4.39,4.39,48.000000,210.720000,4.390000
547,1325350740,4.50,4.57,4.50,4.57,37.862297,171.380337,4.526411
548,1325350800,4.58,4.58,4.58,4.58,9.000000,41.220000,4.580000
1224,1325391360,4.58,4.58,4.58,4.58,1.502000,6.879160,4.580000
...,...,...,...,...,...,...,...,...
1048570,1388232120,734.60,734.60,730.00,734.55,1.789687,1313.640757,734.005698
1048571,1388232180,734.55,734.55,730.71,730.71,0.110236,80.802051,732.991499
1048572,1388232240,734.40,734.40,730.51,730.51,0.554786,407.247985,734.063488
1048573,1388232300,730.51,733.63,730.51,731.10,0.620446,453.777190,731.372707


# Check data shape

In [6]:
data.shape

(342169, 8)

In [7]:
data.head()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_BTC,Volume_Currency,Weighted_Price
0,1325317920,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
478,1325346600,4.39,4.39,4.39,4.39,48.0,210.72,4.39
547,1325350740,4.5,4.57,4.5,4.57,37.862297,171.380337,4.526411
548,1325350800,4.58,4.58,4.58,4.58,9.0,41.22,4.58
1224,1325391360,4.58,4.58,4.58,4.58,1.502,6.87916,4.58


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 342169 entries, 0 to 1048574
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Timestamp        342169 non-null  int64  
 1   Open             342169 non-null  float64
 2   High             342169 non-null  float64
 3   Low              342169 non-null  float64
 4   Close            342169 non-null  float64
 5   Volume_BTC       342169 non-null  float64
 6   Volume_Currency  342169 non-null  float64
 7   Weighted_Price   342169 non-null  float64
dtypes: float64(7), int64(1)
memory usage: 23.5 MB


# Cleaning process

# Convert Timestamp column in to date format

In [9]:
data['Timestamp'] = pd.to_datetime(data['Timestamp'], unit='s')
data['Timestamp'] = pd.to_datetime(data['Timestamp']).dt.to_period('m')
data.set_index(data['Timestamp'], inplace=True)
data.drop(['Timestamp'],axis=1,inplace=True)
data=data.to_timestamp()

In [10]:
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume_BTC,Volume_Currency,Weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-12-01,4.39,4.39,4.39,4.39,0.455581,2.000000,4.390000
2011-12-01,4.39,4.39,4.39,4.39,48.000000,210.720000,4.390000
2011-12-01,4.50,4.57,4.50,4.57,37.862297,171.380337,4.526411
2011-12-01,4.58,4.58,4.58,4.58,9.000000,41.220000,4.580000
2012-01-01,4.58,4.58,4.58,4.58,1.502000,6.879160,4.580000
...,...,...,...,...,...,...,...
2013-12-01,734.60,734.60,730.00,734.55,1.789687,1313.640757,734.005698
2013-12-01,734.55,734.55,730.71,730.71,0.110236,80.802051,732.991499
2013-12-01,734.40,734.40,730.51,730.51,0.554786,407.247985,734.063488
2013-12-01,730.51,733.63,730.51,731.10,0.620446,453.777190,731.372707


In [11]:
data=data.reset_index()


In [12]:
data

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_BTC,Volume_Currency,Weighted_Price
0,2011-12-01,4.39,4.39,4.39,4.39,0.455581,2.000000,4.390000
1,2011-12-01,4.39,4.39,4.39,4.39,48.000000,210.720000,4.390000
2,2011-12-01,4.50,4.57,4.50,4.57,37.862297,171.380337,4.526411
3,2011-12-01,4.58,4.58,4.58,4.58,9.000000,41.220000,4.580000
4,2012-01-01,4.58,4.58,4.58,4.58,1.502000,6.879160,4.580000
...,...,...,...,...,...,...,...,...
342164,2013-12-01,734.60,734.60,730.00,734.55,1.789687,1313.640757,734.005698
342165,2013-12-01,734.55,734.55,730.71,730.71,0.110236,80.802051,732.991499
342166,2013-12-01,734.40,734.40,730.51,730.51,0.554786,407.247985,734.063488
342167,2013-12-01,730.51,733.63,730.51,731.10,0.620446,453.777190,731.372707


# Split Timestamp column into day, month and year

In [13]:
data['day']=pd.to_datetime(data['Timestamp'],format='%d-%m-%Y').dt.day
data['month']=pd.to_datetime(data['Timestamp'],format='%d-%m-%Y').dt.month
data['year']=pd.to_datetime(data['Timestamp'],format='%d-%m-%Y').dt.year


# Drop Timestamp column

In [14]:
data.drop(['Timestamp'],axis=1,inplace=True)

In [15]:
data.head()

Unnamed: 0,Open,High,Low,Close,Volume_BTC,Volume_Currency,Weighted_Price,day,month,year
0,4.39,4.39,4.39,4.39,0.455581,2.0,4.39,1,12,2011
1,4.39,4.39,4.39,4.39,48.0,210.72,4.39,1,12,2011
2,4.5,4.57,4.5,4.57,37.862297,171.380337,4.526411,1,12,2011
3,4.58,4.58,4.58,4.58,9.0,41.22,4.58,1,12,2011
4,4.58,4.58,4.58,4.58,1.502,6.87916,4.58,1,1,2012


In [16]:
x=data.drop(['Weighted_Price'],axis=1)
y=data.Weighted_Price

# Split data into xtrain, xtest and ytrain and ytest

In [17]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,random_state=0,test_size=0.2)

In [18]:
print(len(xtrain))
print(len(xtest))
print(len(ytrain))
print(len(ytest))

273735
68434
273735
68434


# Modelling

In [19]:
model=LinearRegression()

# Model fitting

In [20]:
model.fit(xtrain,ytrain)

LinearRegression()

# Check accuracy

In [21]:
model.score(xtest,ytest)

0.9999950054355766

# Prediction

In [22]:
model.predict([[6349.17,6349.32,6349.17,6349.32,0.038261,242.927410,1,11,2018]])

array([6349.58465521])

In [23]:
y.tail(3)

342166    734.063488
342167    731.372707
342168    733.369303
Name: Weighted_Price, dtype: float64

In [24]:
x.tail(3)

Unnamed: 0,Open,High,Low,Close,Volume_BTC,Volume_Currency,day,month,year
342166,734.4,734.4,730.51,730.51,0.554786,407.247985,1,12,2013
342167,730.51,733.63,730.51,731.1,0.620446,453.77719,1,12,2013
342168,733.0,734.0,733.0,734.0,9.214205,6757.415227,1,12,2013


In [25]:
y_pred=model.predict(xtest)

In [26]:
evs=explained_variance_score(ytest,y_pred)
print('EVS:',evs)
mae=mean_absolute_error(ytest,y_pred)
print('MAE:',mae)
mse=mean_squared_error(ytest,y_pred)
print('MSE:',mse)
rmse=sqrt(mse)
print('RMSE:',rmse)

EVS: 0.9999950054356913
MAE: 0.17771323373711945
MSE: 0.3678413307553557
RMSE: 0.6064992421721198


In [27]:
from sklearn import metrics

metrics.r2_score(ytest,y_pred)

0.9999950054355766

In [28]:
x.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume_BTC', 'Volume_Currency', 'day',
       'month', 'year'],
      dtype='object')

# save it

In [31]:
import pickle
file=open('bitcoin.pkl','wb')
pickle.dump(model,file)

In [32]:
models=open('bitcoin.pkl','rb')
forest=pickle.load(models)

In [33]:
y_prediction=forest.predict(xtest)

In [34]:
metrics.r2_score(ytest,y_prediction)

0.9999950054355766