<a href="https://colab.research.google.com/github/1pawn0/time-series-forecasting-lab/blob/main/preprocessing_the_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import SplineTransformer, MinMaxScaler, RobustScaler, StandardScaler
from statsmodels import tsa
from urllib.request import urlretrieve
from pathlib import Path
from tqdm.notebook import tqdm


### Download the CSV dataset

In [None]:
from urllib.request import urlretrieve
from pathlib import Path

# Remote source of the Gemini BTC/USD 1h CSV and the local path it is cached at.
csv_url: str = "https://www.cryptodatadownload.com/cdd/Gemini_BTCUSD_1h.csv"
data_dir: Path = Path("./data")
data_dir.mkdir(exist_ok=True, parents=True)
csv_file: Path = data_dir / "Gemini_BTCUSD_1h.csv"

if not csv_file.exists():
    # Download to a temporary sibling and rename only after the transfer
    # succeeded. Writing straight to `csv_file` would leave a truncated file
    # behind on an interrupted download, and every later run would then treat
    # that broken file as "already exists".
    partial_file: Path = csv_file.with_name(csv_file.name + ".part")
    try:
        urlretrieve(csv_url, partial_file)
        partial_file.replace(csv_file)
    finally:
        # No-op after a successful rename; removes leftovers after a failure.
        partial_file.unlink(missing_ok=True)
    print(f"CSV file downloaded to {csv_file}")
else:
    print(f"CSV file already exists at {csv_file}")

### Read the CSV file into a Polars DataFrame

In [None]:
import polars as pl

# Columns to load from the CSV, mapped to the dtype each one is parsed as.
df_schema: dict = {
    "date": pl.Datetime("ms"),
    "close": pl.Float64,
}
# Only the columns named in the schema are read from the file.
cols = list(df_schema)
# Read the CSV (skipping its first line), order rows chronologically and
# expose the closing price under the name `price`.
df = (
    pl.read_csv(csv_file, columns=cols, skip_lines=1, schema_overrides=df_schema)
    .sort("date")
    .rename({"close": "price"})
)
# Build a gap-free hourly grid spanning the first to the last timestamp.
full_date_range = pl.datetime_range(
    start=df["date"][0],
    end=df["date"][-1],
    interval="1h",
    time_unit="ms",
    eager=True,
).to_frame(name="date")
# Left-join the prices onto the grid so missing hours show up as nulls, then
# fill them by interpolation. Interpolation is positional, so the rows are
# sorted explicitly first instead of relying on the row order the join
# happens to produce.
df = (
    full_date_range.join(df, on="date", how="left")
    .sort("date")
    .interpolate()
)
# Filter the dataset to only include data after a specific start date
df = df.filter(pl.col("date") > datetime(2016, 10, 30))
# Hourly relative price change; the first row has no predecessor (null
# change), so it is dropped.
price_pct_changes_df = df.with_columns(
    pl.col("price").pct_change().alias("price_pct_change")
)[1:]
price_pct_changes_df

## Exploratory Data Analysis

In [None]:
import plotly.express as px

# 1) Price history on a logarithmic y-axis (first row of `df` is skipped).
price_line_kwargs = dict(
    x="date",
    y="price",
    log_y=True,
    title="BTC/USD Price Over Time (Log Scale)",
    labels={"date": "Date", "price": "Price (USD) - Log Scale"},
)
price_line_fig = px.line(df[1:], **price_line_kwargs)
price_line_fig.show()

# 2) Hourly percentage change as a time series.
returns_line_kwargs = dict(
    x="date",
    y="price_pct_change",
    title="BTC/USD Price Percentage Change Over Time",
    labels={"date": "Date", "price_pct_change": "Price Percentage Change"},
)
returns_line_fig = px.line(price_pct_changes_df, **returns_line_kwargs)
returns_line_fig.show()

# 3) Distribution of the percentage changes: violin with an embedded box plot
#    and no individual points.
returns_violin_kwargs = dict(
    y="price_pct_change",
    box=True,
    points=False,
    title="BTC/USD Price Percentage Change Distribution",
    labels={"price_pct_change": "Price Percentage Change"},
)
returns_violin_fig = px.violin(price_pct_changes_df, **returns_violin_kwargs)
returns_violin_fig.show()

# 4) Histogram of the percentage changes with log-scaled counts and a marginal
#    box plot.
returns_hist_kwargs = dict(
    x="price_pct_change",
    nbins=40,
    marginal="box",
    log_y=True,
    title="Histogram of Hourly Price Returns (BTC/USD)",
    labels={"price_pct_change": "Hourly Percentage Change"},
)
returns_hist_fig = px.histogram(price_pct_changes_df, **returns_hist_kwargs)
returns_hist_fig.show()


In [None]:
from statsmodels.tsa.stattools import acf, pacf

# Number of lags to evaluate and the significance level of the confidence
# intervals (alpha = 0.001 -> 99.9% intervals).
N_LAGS, ALPHA_VALUE = 40, 0.001

# Autocorrelation of the hourly returns, with confidence intervals and the
# Ljung-Box Q statistics / p-values.
acf_values, acf_conf_intervals, q_stat, p_values = acf(
    x=price_pct_changes_df["price_pct_change"],
    adjusted=False,
    nlags=N_LAGS,
    qstat=True,
    fft=False,
    alpha=ALPHA_VALUE,
)
# `acf` reports the correlations and their intervals for lags 0..N_LAGS; lag 0
# is trivially 1, so it is dropped to keep only lags 1..N_LAGS.
acf_values = acf_values[1:]
acf_conf_intervals = acf_conf_intervals[1:]
# `q_stat` and `p_values` already exclude lag 0 (they cover lags 1..N_LAGS),
# so they must NOT be sliced: slicing would drop the lag-1 entry and shift
# every remaining statistic one lag out of alignment with `acf_values`.


# Partial autocorrelation of the same series with matching intervals.
pacf_values, pacf_conf_intervals = pacf(
    x=price_pct_changes_df["price_pct_change"],
    nlags=N_LAGS,
    method="ols-inefficient",
    alpha=ALPHA_VALUE,
)
# As with the ACF, drop the lag-0 entry so index i corresponds to lag i + 1.
pacf_values = pacf_values[1:]
pacf_conf_intervals = pacf_conf_intervals[1:]


In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def _add_correlogram(
    fig: go.Figure,
    values: np.ndarray,
    conf_intervals: np.ndarray,
    *,
    row: int,
    label: str,
    ci_label: str,
) -> None:
    """Add one correlogram (confidence band + bars) to a subplot row.

    Args:
        fig: Figure created with ``make_subplots`` that receives the traces.
        values: Correlation per lag; element ``i`` is plotted at lag ``i + 1``.
        conf_intervals: Array whose column 0 holds the lower and column 1 the
            upper interval bound for each lag in ``values``.
        row: 1-based subplot row to draw into (column is always 1).
        label: Trace name for the bars, also used as prefix of the band name.
        ci_label: Confidence level text (e.g. ``"99.9%"``) used in the band name.
    """
    lags = np.arange(1, len(values) + 1)
    ci_lower = conf_intervals[:, 0]
    ci_upper = conf_intervals[:, 1]

    # The band is a closed polygon: upper bound left-to-right, then the lower
    # bound right-to-left, filled with "toself".
    # NOTE(review): the intervals are plotted exactly as returned by
    # statsmodels, which appear to be centered on the estimates rather than on
    # zero -- confirm that this is the intended band.
    fig.add_trace(
        go.Scatter(
            x=np.concatenate([lags, lags[::-1]]),
            y=np.concatenate([ci_upper, ci_lower[::-1]]),
            fill="toself",
            fillcolor="RoyalBlue",
            opacity=0.2,
            line=dict(color="rgba(255,255,255,0)"),
            name=f"{label} {ci_label} CI",
            showlegend=True,
        ),
        row=row,
        col=1,
    )
    fig.add_trace(go.Bar(x=lags, y=values, name=label), row=row, col=1)


# Create a figure with one row per correlogram
fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=("Autocorrelation (ACF)", "Partial Autocorrelation (PACF)"),
)

# Derive the confidence level shown in the trace names from ALPHA_VALUE so the
# label cannot drift from the level the intervals were computed with
# (alpha = 0.001 -> "99.9%").
ci_level_label = f"{100 * (1 - ALPHA_VALUE):g}%"

# ACF in the first row, PACF in the second row
_add_correlogram(
    fig, acf_values, acf_conf_intervals, row=1, label="ACF", ci_label=ci_level_label
)
_add_correlogram(
    fig, pacf_values, pacf_conf_intervals, row=2, label="PACF", ci_label=ci_level_label
)

# The layout-level `showlegend=False` hides the legend for the whole figure,
# regardless of the per-trace `showlegend=True` flags.
fig.update_layout(
    title_text="ACF and PACF of Price Returns",
    showlegend=False,
)

fig.update_xaxes(title_text="Lag", row=2, col=1)
fig.update_yaxes(title_text="Correlation", row=1, col=1)
fig.update_yaxes(title_text="Correlation", row=2, col=1)

fig.show()


## Preprocessing