In [10]:
import pandas as pd

import time

import matplotlib
import matplotlib.pyplot as plt

from datetime import datetime

import numpy as np
from sklearn.model_selection import train_test_split

In [11]:
btc_min_data = pd.read_csv('/content/drive/MyDrive/bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv') 

print(btc_min_data)

          Timestamp      Open      High       Low     Close  Volume_(BTC)  \
0        1325317920      4.39      4.39      4.39      4.39      0.455581   
1        1325317980       NaN       NaN       NaN       NaN           NaN   
2        1325318040       NaN       NaN       NaN       NaN           NaN   
3        1325318100       NaN       NaN       NaN       NaN           NaN   
4        1325318160       NaN       NaN       NaN       NaN           NaN   
...             ...       ...       ...       ...       ...           ...   
4857372  1617148560  58714.31  58714.31  58686.00  58686.00      1.384487   
4857373  1617148620  58683.97  58693.43  58683.97  58685.81      7.294848   
4857374  1617148680  58693.43  58723.84  58693.43  58723.84      1.705682   
4857375  1617148740  58742.18  58770.38  58742.18  58760.59      0.720415   
4857376  1617148800  58767.75  58778.18  58755.97  58778.18      2.712831   

         Volume_(Currency)  Weighted_Price  
0                 2.000000    

In [12]:

print(btc_min_data.columns)

Index(['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume_(BTC)',
       'Volume_(Currency)', 'Weighted_Price'],
      dtype='object')


**1) Conversion of Unix time to Datetime - Timestamp column** - 
The timestamp column is provided in unix time which means number of seconds that have elapsed since January 1, 1970. We have converted this column in datetime format which is more readable and helps in visualising the data according to date and corresponding time in minutes.

In [13]:
# 1) Conversion of Unix time to Datetime - Timestamp column
btc_min_data['Timestamp']=pd.to_datetime(btc_min_data['Timestamp'],unit='s')

In [14]:
btc_min_data=btc_min_data.rename(columns={'Volume_(BTC)':"Volume_BTC", 'Volume_(Currency)':"Volume_Currency"})

**2) Dropping all NAN or null values** - Few columns contained NAN values which are of no significant use for analysis. Hence the rows with such values are deleted. The pandas function dropna is used which determines the NAN values and drops them from the dataset.

In [15]:
# 2) Dropping all NAN or null values
btc_min_data=btc_min_data.dropna()

In [16]:
btc_min_data

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_BTC,Volume_Currency,Weighted_Price
0,2011-12-31 07:52:00,4.39,4.39,4.39,4.39,0.455581,2.000000,4.390000
478,2011-12-31 15:50:00,4.39,4.39,4.39,4.39,48.000000,210.720000,4.390000
547,2011-12-31 16:59:00,4.50,4.57,4.50,4.57,37.862297,171.380338,4.526411
548,2011-12-31 17:00:00,4.58,4.58,4.58,4.58,9.000000,41.220000,4.580000
1224,2012-01-01 04:16:00,4.58,4.58,4.58,4.58,1.502000,6.879160,4.580000
...,...,...,...,...,...,...,...,...
4857372,2021-03-30 23:56:00,58714.31,58714.31,58686.00,58686.00,1.384487,81259.372187,58692.753339
4857373,2021-03-30 23:57:00,58683.97,58693.43,58683.97,58685.81,7.294848,428158.146640,58693.226508
4857374,2021-03-30 23:58:00,58693.43,58723.84,58693.43,58723.84,1.705682,100117.070370,58696.198496
4857375,2021-03-30 23:59:00,58742.18,58770.38,58742.18,58760.59,0.720415,42332.958633,58761.866202


In [17]:
# Create new dataframe for final data
bitcoin_daily_data = pd.DataFrame({})
temp_group_data = btc_min_data.groupby([pd.Grouper(key="Timestamp", freq="D")])

**3) Grouping open column -** The data is for every minute, we have grouped it according to day by providing the parameter frequency= “D”. Thus all the data of a particular date are grouped together. The first available data in the dataset for that particular day is considered for the open column.

In [18]:
# 3) Grouping open column
bitcoin_daily_data["Open"] = pd.DataFrame(temp_group_data.Open.first())["Open"]


**4) Grouping Close column -** The last available data in the dataset for that particular day is considered for the close column.

In [19]:
# 4) Grouping Close column
bitcoin_daily_data["Close"] = pd.DataFrame(temp_group_data.Close.last())["Close"]


**5) Grouping High column -** The data for the same day are grouped together and the max value for the high column that is the highest of the day is taken.

In [20]:
# 5) Grouping High column
bitcoin_daily_data["High"] = pd.DataFrame(temp_group_data.High.max())["High"]


**6) Grouping Low column -** The minimum value available in the dataset for each day is taken for the low column.

In [21]:
# 6) Grouping Low column
bitcoin_daily_data["Low"] = pd.DataFrame(temp_group_data.Low.min())["Low"]


**7)Grouping Volume_BTC, Volume_Currency, and Weighted_price-** The mean data for volume_BTC, volume currency, and weighted price for each day are taken in respective columns.

In [22]:
# 7) Get mean value for Volume_BTC, Volume_Currency, Weighted_Price from datewise group
bitcoin_daily_data["Volume_BTC"] = pd.DataFrame(temp_group_data.Volume_BTC.mean())['Volume_BTC']
bitcoin_daily_data["Volume_Currency"] = pd.DataFrame(temp_group_data.Volume_Currency.mean())["Volume_Currency"]
bitcoin_daily_data["Weighted_Price"] = pd.DataFrame(temp_group_data.Weighted_Price.mean())["Weighted_Price"]


In [23]:
bitcoin_daily_data

Unnamed: 0_level_0,Open,Close,High,Low,Volume_BTC,Volume_Currency,Weighted_Price
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-12-31,4.39,4.58,4.58,4.39,23.829470,106.330084,4.471603
2012-01-01,4.58,5.00,5.00,4.58,7.200667,35.259720,4.806667
2012-01-02,5.00,5.00,5.00,5.00,19.048000,95.240000,5.000000
2012-01-03,5.32,5.29,5.32,5.14,11.004660,58.100651,5.252500
2012-01-04,4.93,5.57,5.57,4.93,11.914807,63.119577,5.208159
...,...,...,...,...,...,...,...
2021-03-27,55081.26,55839.42,56686.15,53948.35,1.823877,100884.732367,55193.357260
2021-03-28,55817.85,55790.92,56573.04,54677.51,1.447939,80632.115263,55832.958824
2021-03-29,55790.28,57600.10,58402.68,54892.42,3.732887,213754.555988,56913.993819
2021-03-30,57623.66,58760.59,59388.66,57011.00,2.363999,138231.241926,58346.912268


**8)Compute and add a column for the change percentage daily-**We computed the change percentage for each day by comparing the closing price of the current day with the closing price of the previous day. This column will contribute to analyzing the change and dividing the day into a particular category.

In [24]:
# 8) Compute and add a column for the change percentage daily
bitcoin_daily_data["Change_percentage"]=bitcoin_daily_data["Close"].pct_change()*100

In [25]:
bitcoin_daily_data

Unnamed: 0_level_0,Open,Close,High,Low,Volume_BTC,Volume_Currency,Weighted_Price,Change_percentage
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-12-31,4.39,4.58,4.58,4.39,23.829470,106.330084,4.471603,
2012-01-01,4.58,5.00,5.00,4.58,7.200667,35.259720,4.806667,9.170306
2012-01-02,5.00,5.00,5.00,5.00,19.048000,95.240000,5.000000,0.000000
2012-01-03,5.32,5.29,5.32,5.14,11.004660,58.100651,5.252500,5.800000
2012-01-04,4.93,5.57,5.57,4.93,11.914807,63.119577,5.208159,5.293006
...,...,...,...,...,...,...,...,...
2021-03-27,55081.26,55839.42,56686.15,53948.35,1.823877,100884.732367,55193.357260,1.376237
2021-03-28,55817.85,55790.92,56573.04,54677.51,1.447939,80632.115263,55832.958824,-0.086856
2021-03-29,55790.28,57600.10,58402.68,54892.42,3.732887,213754.555988,56913.993819,3.242786
2021-03-30,57623.66,58760.59,59388.66,57011.00,2.363999,138231.241926,58346.912268,2.014736


In [26]:
bitcoin_daily_data.describe()

Unnamed: 0,Open,Close,High,Low,Volume_BTC,Volume_Currency,Weighted_Price,Change_percentage
count,3376.0,3376.0,3376.0,3376.0,3376.0,3376.0,3376.0,3378.0
mean,4602.417399,4619.68726,4750.700598,4442.507965,10.355675,31790.810259,4605.576442,0.38648
std,8193.870228,8245.987435,8497.261901,7874.336609,8.897358,62753.976425,8207.03162,4.564743
min,3.8,4.23,4.38,1.5,0.25,1.2275,4.331667,-48.518519
25%,244.7925,244.94,249.7775,239.9525,4.671673,1916.186042,244.95258,-1.206425
50%,696.02,697.12,716.465,668.265,7.622244,6832.004108,697.948484,0.215306
75%,7249.76,7257.85,7430.2675,7058.395,13.114867,36074.508308,7242.892623,1.970597
max,61177.03,61165.19,61781.83,58959.57,119.522868,950995.602917,60455.844831,40.142025


**9)Convert data frame to NumPy-**Now that the data is cleaned, to make it ready for preprocessing and apply machine learning models, we convert the DataFrame from pandas to NumPy. 

In [27]:
# 9) Convert the DataFrame to NumPy arrays for input/output to training model
x = bitcoin_daily_data.values
x = np.delete(arr=x, obj=1, axis=1) # dropping Close column from input x
print("x:",type(x),x.shape)

y = bitcoin_daily_data["Close"].values
print("y:",type(y),y.shape)

x: <class 'numpy.ndarray'> (3379, 7)
y: <class 'numpy.ndarray'> (3379,)


**10)Divide the data columns into test and train data-**
The entire data is divided into train data and test data. After processing the model with train data, our model will be tested using the test data and hence make predictions.


In [28]:
# 10) Split data into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)