In [33]:
# Imports

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

%matplotlib inline
plt.style.use("ggplot")
sns.set_theme()

In [3]:
#load dataset
df = pd.read_csv('Binance_WBTCETH_d.csv', skiprows=1)
df

Unnamed: 0,Unix,Date,Symbol,Open,High,Low,Close,Volume WBTC,Volume ETH,tradecount
0,1759536000000,2025-10-04,WBTCETH,27.1600,27.3900,27.1600,27.2200,0.40975,11.164314,169
1,1759449600000,2025-10-03,WBTCETH,26.8600,27.2700,26.4500,27.0500,1.59839,43.162561,502
2,1759363200000,2025-10-02,WBTCETH,27.2400,27.3200,26.7300,26.8900,0.59010,15.916575,386
3,1759276800000,2025-10-01,WBTCETH,27.6000,27.7300,26.9200,27.2800,0.90510,24.810029,223
4,1759190400000,2025-09-30,WBTCETH,27.1300,27.6500,27.0500,27.6200,0.80532,22.023941,293
...,...,...,...,...,...,...,...,...,...,...
1856,1599177600000,2020-09-04,WBTCETH,26.6279,27.6672,26.3214,27.1388,11.43740,305.619449,544
1857,1599091200000,2020-09-03,WBTCETH,26.0149,27.4065,25.4458,26.6862,21.85060,578.793301,1069
1858,1599004800000,2020-09-02,WBTCETH,24.8566,26.8284,24.8243,25.9359,28.12000,719.895339,814
1859,1598918400000,2020-09-01,WBTCETH,26.9386,27.4292,24.6384,25.0915,13.44950,348.936288,650


In [4]:
#standardize our cols to lowercase and a snake type format
df.columns = df.columns.str.lower().str.replace(' ', '_') 


In [5]:
df.dtypes

unix             int64
date            object
symbol          object
open           float64
high           float64
low            float64
close          float64
volume_wbtc    float64
volume_eth     float64
tradecount       int64
dtype: object

In [6]:
# convert date to datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')

In [7]:
df["date_dow"] = df["date"].dt.dayofweek  # 0=Monday, 6=Sunday

In [8]:
print(df['symbol'].nunique())

1


In [9]:
#standardize the symbols 
df['symbol'] = df['symbol'].str.lower()    


In [10]:
#checking for missing values
df.isnull().sum()

unix           0
date           0
symbol         0
open           0
high           0
low            0
close          0
volume_wbtc    0
volume_eth     0
tradecount     0
date_dow       0
dtype: int64

In [11]:
#stats
df[['open','close','high','low']].describe()

Unnamed: 0,open,close,high,low
count,1861.0,1861.0,1861.0,1861.0
mean,21.650056,21.651482,22.133693,21.272929
std,9.280784,9.284769,9.983165,9.077203
min,11.38,11.39,11.62,3.35
25%,14.62,14.62,14.83,14.38
50%,18.08,18.09,18.33,17.79
75%,27.08,27.07,27.67,26.73
max,55.5,55.39,150.0,52.87


In [None]:
df['return'] = df['close'].pct_change()

In [None]:
df['target'] = df['close'].shift(-1)

In [42]:
df

Unnamed: 0,unix,date,symbol,open,high,low,close,volume_wbtc,volume_eth,tradecount,date_dow,return,target
1,1759449600000,2025-10-03,wbtceth,26.8600,27.2700,26.4500,27.0500,1.59839,43.162561,502,4,-0.006245,26.8900
2,1759363200000,2025-10-02,wbtceth,27.2400,27.3200,26.7300,26.8900,0.59010,15.916575,386,3,-0.005915,27.2800
3,1759276800000,2025-10-01,wbtceth,27.6000,27.7300,26.9200,27.2800,0.90510,24.810029,223,2,0.014504,27.6200
4,1759190400000,2025-09-30,wbtceth,27.1300,27.6500,27.0500,27.6200,0.80532,22.023941,293,1,0.012463,27.1400
5,1759104000000,2025-09-29,wbtceth,27.0800,27.4800,27.0100,27.1400,5.27933,143.270248,1238,0,-0.017379,27.1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1855,1599264000000,2020-09-05,wbtceth,27.2410,47.7579,26.8395,30.2455,18.87670,552.907778,1159,5,0.037535,27.1388
1856,1599177600000,2020-09-04,wbtceth,26.6279,27.6672,26.3214,27.1388,11.43740,305.619449,544,4,-0.102716,26.6862
1857,1599091200000,2020-09-03,wbtceth,26.0149,27.4065,25.4458,26.6862,21.85060,578.793301,1069,3,-0.016677,25.9359
1858,1599004800000,2020-09-02,wbtceth,24.8566,26.8284,24.8243,25.9359,28.12000,719.895339,814,2,-0.028116,25.0915


In [28]:
df = df.dropna()

In [29]:
n = len(df)
idx = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)

# Apply shuffled index before splitting
df_shuffled = df.iloc[idx].reset_index(drop=True)
df_shuffled

Unnamed: 0,unix,date,symbol,open,high,low,close,volume_wbtc,volume_eth,tradecount,date_dow,return,target
0,1611100800000,2021-01-20,wbtceth,26.318,27.5193,25.6211,25.8122,100.46190,2657.536702,13187,2,-0.068669,26.312
1,1663459200000,2022-09-18,wbtceth,13.710,14.6500,13.7100,14.5600,24.91601,354.853353,2137,6,0.024631,13.680
2,1759276800000,2025-10-01,wbtceth,27.600,27.7300,26.9200,27.2800,0.90510,24.810029,223,2,0.014504,27.620
3,1717459200000,2024-06-04,wbtceth,18.300,18.5600,18.2800,18.5200,0.77235,14.264129,462,1,0.007069,18.270
4,1646956800000,2022-03-11,wbtceth,15.140,15.2000,14.9700,15.1500,22.63624,341.567795,1707,4,0.002647,15.130
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1854,1754611200000,2025-08-08,wbtceth,30.080,30.0800,28.6200,29.0600,2.26962,66.393487,663,4,0.065249,29.990
1855,1716163200000,2024-05-20,wbtceth,21.590,21.8400,19.0400,19.4900,10.81639,214.793859,3298,0,0.052944,21.570
1856,1704326400000,2024-01-04,wbtceth,19.400,19.5700,19.2100,19.4800,1.89160,36.659874,899,3,0.001542,19.350
1857,1729382400000,2024-10-20,wbtceth,25.790,25.8500,25.0800,25.1100,9.27017,234.877845,10071,6,-0.006332,25.730


In [30]:
# slit dataset
n_val = int(n * 0.2)                #20% of data for validation
n_test = int(n * 0.2)               #20% of data for testing.
n_train = n - n_val - n_test        #whatever is left for training (about 60%).

n_val, n_test, n_train

(371, 371, 1117)

In [35]:
df_train = df.iloc[: n_train ] #take rows from start (0) up to n_train. That’s the first 60% of the dataset.
df_val = df.iloc[n_train : n_train + n_val] #Starts at n_train (where training stopped).Goes up to n_train + n_val.That’s the next 20% of the dataset           
df_test = df.iloc[n_train + n_val : ] #Starts where validation stopped (n_train + n_val). Goes until the end of the DataFrame.That’s the last 20% of the dataset 
     

In [43]:
features = df[['open', 'high','low','close','volume_wbtc', 'volume_eth', 'return']]
target = df['target']