In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import requests, json

from datetime import datetime
from sklearn.metrics import mean_squared_error
from math import sqrt

from IPython.display import Markdown

from acquire import *
from prepare import *

import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting  import register_matplotlib_converters

import statsmodels.api as sm
from statsmodels.tsa.api import Holt

In [2]:
def time_split(df, train_size = .5, validate_size = .3):
    '''
    Split time series data into train, validate and test by position.

    THE DATA MUST BE CHRONOLOGICALLY SORTED! Rows are never shuffled: the first
    chunk becomes train, the next chunk validate, and everything left over test.

    Parameters
    ----------
    df : any object supporting len() and df[a:b] slicing (e.g. a DataFrame)
    train_size : fraction of the rows assigned to train
    validate_size : fraction of the rows assigned to validate

    Returns
    -------
    (train, validate, test) tuple of slices of df
    '''
    n_rows = len(df)
    # int() truncates, so any rounding remainder falls into the test slice.
    train_end = int(n_rows * train_size)
    validate_end = train_end + int(n_rows * validate_size)
    # Open-ended final slice: "everything after validate" without a hand-computed stop.
    return df[:train_end], df[train_end:validate_end], df[validate_end:]

def plot_samples(target_var):
    '''
    Draw one column of the train, validate and test splits on a single figure.

    Reads the module-level names train, validate and test, plots
    target_var from each as its own labelled line on a new 12x4 figure,
    and titles the figure with target_var.title().
    '''
    plt.figure(figsize=(12, 4))
    # Same draw order as the splits themselves: train, then validate, then test.
    for split_label, split_frame in (('train', train), ('validate', validate), ('test', test)):
        sns.lineplot(data=split_frame[target_var], label=split_label)
    plt.title(target_var.title())
    plt.legend()
    
def evaluate(target_var):
    '''
    Return the rounded RMSE of the current forecast for one target column.

    The actual values come from validate[target_var] and the predictions from
    yhat_df[target_var]; both frames are read from module-level names, not
    passed in. The mean squared error is square-rooted and rounded to
    0 decimal places.
    '''
    actual = validate[target_var]
    predicted = yhat_df[target_var]
    mse = mean_squared_error(actual, predicted)
    return round(sqrt(mse), 0)

def plot_and_eval(target_var):
    '''
    Plot the train/validate/test samples with the forecast on top and print the RMSE.

    Draws the three splits via plot_samples, overlays yhat_df[target_var],
    prints the RMSE returned by evaluate, then shows the figure.
    Returns nothing.
    '''
    plot_samples(target_var)
    forecast = yhat_df[target_var]
    # NOTE(review): this line is the forecast, yet its legend label is 'RMSE' — confirm that is intended.
    sns.lineplot(data=forecast, label='RMSE')
    # Replaces the title set by plot_samples (which uses target_var.title()).
    plt.title(target_var)
    rmse = evaluate(target_var)
    rmse_text = f'--RMSE: {rmse:.0f}'
    print(target_var, rmse_text)
    plt.show()

def append_eval_df(model_type, target_var):
    '''
    Return eval_df with one extra row describing the current forecast.

    Parameters
    ----------
    model_type : label for the model, stored in the 'model_type' column
    target_var : name of the target column, passed to evaluate and stored
        in the 'target_var' column

    Returns
    -------
    A new DataFrame: the module-level eval_df followed by a row holding
    model_type, target_var and the rmse from evaluate(target_var).
    eval_df itself is not modified; callers must rebind the result.
    '''
    rmse = evaluate(target_var)
    new_row = pd.DataFrame({'model_type': [model_type], 'target_var': [target_var], 'rmse': [rmse]})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    return pd.concat([eval_df, new_row], ignore_index=True)

def previous_plot_and_eval(target_var):
    '''
    Print and return the RMSE of the current forecast for target_var.

    Despite the name, nothing is plotted: the function only calls evaluate,
    prints the result and hands it back to the caller.
    '''
    rmse = evaluate(target_var)
    rmse_text = f'--RMSE: {rmse:.0f}'
    print(target_var, rmse_text)
    return rmse


In [3]:
# Load the Bitstamp BTC/USD 1-minute data from the local CSV file.
df = pd.read_csv('bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv')
# Timestamp holds epoch seconds; convert it to datetime64.
# Bracket indexing is used because attribute-style assignment only works when
# the column already exists and silently sets a plain attribute otherwise.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
# Render the data dictionary for the raw frame as the cell output.
Markdown(fetch_data_dict(df))


### Data Dict

| Feature           | Datatype                         | Definition                                                 |
|:------------------|:---------------------------------|:-----------------------------------------------------------|
| Timestamp         | 4857377 non-null: datetime64[ns] | Start time of time window (60s window), in Unix time       |
| Open              | 3613769 non-null: float64        | Open price at start time window                            |
| High              | 3613769 non-null: float64        | High price within the time window                          |
| Low               | 3613769 non-null: float64        | Low price within the time window                           |
| Close             | 3613769 non-null: float64        | Close price at the end of the time window                  |
| Volume_(BTC)      | 3613769 non-null: float64        | Volume of BTC transacted in this window                    |
| Volume_(Currency) | 3613769 non-null: float64        | Volume of corresponding currency transacted in this window |
| Weighted_Price    | 3613769 non-null: float64        | VWAP - Volume Weighted Average Price                       |

In [4]:
# Move Timestamp from a regular column to the index so rows are keyed by time.
# NOTE: not re-runnable as written — once Timestamp is the index the column no
# longer exists, so running this cell a second time fails on the lookup.
df = df.set_index('Timestamp')
# Show the last five rows as a markdown table.
Markdown(df.tail().to_markdown())

| Timestamp           |    Open |    High |     Low |   Close |   Volume_(BTC) |   Volume_(Currency) |   Weighted_Price |
|:--------------------|--------:|--------:|--------:|--------:|---------------:|--------------------:|-----------------:|
| 2021-03-30 23:56:00 | 58714.3 | 58714.3 | 58686   | 58686   |       1.38449  |             81259.4 |          58692.8 |
| 2021-03-30 23:57:00 | 58684   | 58693.4 | 58684   | 58685.8 |       7.29485  |            428158   |          58693.2 |
| 2021-03-30 23:58:00 | 58693.4 | 58723.8 | 58693.4 | 58723.8 |       1.70568  |            100117   |          58696.2 |
| 2021-03-30 23:59:00 | 58742.2 | 58770.4 | 58742.2 | 58760.6 |       0.720415 |             42333   |          58761.9 |
| 2021-03-31 00:00:00 | 58767.8 | 58778.2 | 58756   | 58778.2 |       2.71283  |            159418   |          58764.3 |

## Calculate other metrics from data
> - Calculate month names, day names and price variation from minute to minute

In [5]:
# prep_bitcoin_data is not defined in this notebook; it arrives through one of
# the star imports (presumably prepare — TODO confirm).
# Judging by the rendered output it adds day_of_week, month, minute_price_diff,
# price_delta, day_num and percent_change; the null counts also change, so it
# presumably fills missing rows as well — verify in the helper's source.
df = prep_bitcoin_data(df)
# Show the last five prepared rows as a markdown table.
Markdown(df.tail().to_markdown())

| Timestamp           |    Open |    High |     Low |   Close |   Volume_(BTC) |   Volume_(Currency) |   Weighted_Price | day_of_week   | month   |   minute_price_diff |   price_delta |   day_num |   percent_change |
|:--------------------|--------:|--------:|--------:|--------:|---------------:|--------------------:|-----------------:|:--------------|:--------|--------------------:|--------------:|----------:|-----------------:|
| 2021-03-30 23:56:00 | 58714.3 | 58714.3 | 58686   | 58686   |       1.38449  |             81259.4 |          58692.8 | Tuesday       | 03_Mar  |              -28.31 |         28.31 |        30 |      -0.0482165  |
| 2021-03-30 23:57:00 | 58684   | 58693.4 | 58684   | 58685.8 |       7.29485  |            428158   |          58693.2 | Tuesday       | 03_Mar  |                1.84 |          9.46 |        30 |       0.00313544 |
| 2021-03-30 23:58:00 | 58693.4 | 58723.8 | 58693.4 | 58723.8 |       1.70568  |            100117   |          58696.2 | Tuesday       | 03_Mar  |               30.41 |         30.41 |        30 |       0.0518116  |
| 2021-03-30 23:59:00 | 58742.2 | 58770.4 | 58742.2 | 58760.6 |       0.720415 |             42333   |          58761.9 | Tuesday       | 03_Mar  |               18.41 |         28.2  |        30 |       0.0313403  |
| 2021-03-31 00:00:00 | 58767.8 | 58778.2 | 58756   | 58778.2 |       2.71283  |            159418   |          58764.3 | Wednesday     | 03_Mar  |               10.43 |         22.21 |        31 |       0.0177478  |

In [7]:
# Re-render the data dictionary, this time for the prepared frame with its derived columns.
Markdown(fetch_data_dict(df))

### Data Dict

| Feature           | Datatype                  | Definition                                                 |
|:------------------|:--------------------------|:-----------------------------------------------------------|
| Open              | 4857377 non-null: float64 | Open price at start time window                            |
| High              | 4857377 non-null: float64 | High price within the time window                          |
| Low               | 4857377 non-null: float64 | Low price within the time window                           |
| Close             | 4857377 non-null: float64 | Close price at the end of the time window                  |
| Volume_(BTC)      | 4857377 non-null: float64 | Volume of BTC transacted in this window                    |
| Volume_(Currency) | 4857377 non-null: float64 | Volume of corresponding currency transacted in this window |
| Weighted_Price    | 4857377 non-null: float64 | VWAP - Volume Weighted Average Price                       |
| day_of_week       | 4857377 non-null: object  | Verbose name of the day of the week                        |
| month             | 4857377 non-null: object  | Month number and month name                                |
| minute_price_diff | 4857377 non-null: float64 | Delta between the Close and Open                           |
| price_delta       | 4857377 non-null: float64 | Delta between the High and Low                             |
| day_num           | 4857377 non-null: int64   | The numeric number of the day of the month                 |
| percent_change    | 4857377 non-null: float64 | Price difference / Open price represented as a percentage  |