# Exercise project 3 (CNN time-series classification)
### **Step 0:** Importing necessary libraries

In [66]:
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler

### **Step 1**: Reading the dataset

In [67]:
# Load the BTC-USD price history from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="BTC-USD.csv")
# Show the first five rows as a quick sanity check of the columns that were read.
df.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2014-09-17,465.864014,468.174011,452.421997,457.334015,457.334015,21056800
1,2014-09-18,456.859985,456.859985,413.104004,424.440002,424.440002,34483200
2,2014-09-19,424.102997,427.834991,384.532013,394.79599,394.79599,37919700
3,2014-09-20,394.673004,423.29599,389.882996,408.903992,408.903992,36863600
4,2014-09-21,408.084991,412.425995,393.181,398.821014,398.821014,26580100


Let's check whether the dataset has any null values, and then use `describe()` to get an overview of the numerical columns.

In [68]:
# Print each column's dtype and non-null count to confirm nothing is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2691 entries, 0 to 2690
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       2691 non-null   object 
 1   Open       2691 non-null   float64
 2   High       2691 non-null   float64
 3   Low        2691 non-null   float64
 4   Close      2691 non-null   float64
 5   Adj Close  2691 non-null   float64
 6   Volume     2691 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 147.3+ KB


In [69]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,2691.0,2691.0,2691.0,2691.0,2691.0,2691.0
mean,11067.257301,11365.906054,10735.76901,11078.838895,11078.838895,14644490000.0
std,15942.26054,16372.505178,15442.789057,15944.078311,15944.078311,20081510000.0
min,176.897003,211.731003,171.509995,178.102997,178.102997,5914570.0
25%,603.274506,607.488006,598.279511,605.21051,605.21051,78542800.0
50%,6253.549805,6388.629883,6119.680176,6274.580078,6274.580078,5014430000.0
75%,10346.236817,10572.645996,10111.598144,10347.236817,10347.236817,24576290000.0
max,67549.734375,68789.625,66382.0625,67566.828125,67566.828125,350967900000.0


### **Step 2:** Data cleanup, sequencing and scaling

For this exercise, I will use the *Close* and *Volume* columns as the 2 classes.

In [70]:
# Keep only Date, Close and Volume by discarding the other price columns.
# errors="ignore" makes the cell safe to re-run: when the columns are already
# gone, the drop is a no-op instead of raising a KeyError.
df = df.drop(columns=['Open', 'High', 'Low', 'Adj Close'], errors='ignore')

I scale the numeric columns to the [0, 1] range using *scikit-learn*'s *MinMaxScaler()*.

In [71]:
# Rescale Close and Volume to the [0, 1] range so both columns share a
# comparable scale. MinMaxScaler is imported in the import cell at the top of
# the notebook, which keeps all dependencies in one place.
# NOTE(review): the scaler is fitted on the whole dataset. If a train/test split
# is made later, fit on the training rows only to avoid leakage - confirm.
scaler = MinMaxScaler()

num_cols = ['Close', 'Volume']
df[num_cols] = scaler.fit_transform(df[num_cols])
df.head()

Unnamed: 0,Date,Close,Volume
0,2014-09-17,0.004144,4.3e-05
1,2014-09-18,0.003655,8.1e-05
2,2014-09-19,0.003216,9.1e-05
3,2014-09-20,0.003425,8.8e-05
4,2014-09-21,0.003275,5.9e-05


The Date column should be converted to a sequence column so that the model can understand it.

In [72]:
# Replace the calendar date with a 1-based integer position for each row.
# The frame still carries its default 0-based index, so index + 1 yields 1..N.
df['Sequence'] = df.index + 1
# errors="ignore" keeps the cell re-runnable: once Date has been removed, a
# second run is a no-op instead of raising a KeyError.
df = df.drop(columns=['Date'], errors='ignore')
df.head()

Unnamed: 0,Close,Volume,Sequence
0,0.004144,4.3e-05,1
1,0.003655,8.1e-05,2
2,0.003216,9.1e-05,3
3,0.003425,8.8e-05,4
4,0.003275,5.9e-05,5
