In [244]:
import pandas as pd
import numpy as np

In [245]:
# Load the raw Cardano price CSV and parse the open_time column as datetimes.
df = pd.read_csv(
    filepath_or_buffer='../data/raw/cardano_5y.csv',
    parse_dates=['open_time'],
)

In [246]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,open_time,open,high,low,close,volume
0,2020-11-18 06:00:00,0.10447,0.1064,0.10408,0.10587,30552966.4
1,2020-11-18 07:00:00,0.10587,0.10696,0.10467,0.10687,27057619.6
2,2020-11-18 08:00:00,0.10692,0.10704,0.10632,0.10656,17147608.0
3,2020-11-18 09:00:00,0.10658,0.10666,0.1055,0.10636,22508213.3
4,2020-11-18 10:00:00,0.10639,0.1075,0.10594,0.10747,12270594.2


In [247]:
# Summarize column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43830 entries, 0 to 43829
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   open_time  43830 non-null  datetime64[ns]
 1   open       43810 non-null  float64       
 2   high       43810 non-null  float64       
 3   low        43810 non-null  float64       
 4   close      43810 non-null  float64       
 5   volume     43810 non-null  float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 2.0 MB


In [248]:
# Descriptive statistics (count, mean, quantiles, std) of the raw frame.
df.describe()

Unnamed: 0,open_time,open,high,low,close,volume
count,43830,43810.0,43810.0,43810.0,43810.0,43810.0
mean,2023-05-20 08:30:00,0.73961,0.745498,0.733378,0.739617,9636120.0
min,2020-11-18 06:00:00,0.10447,0.10494,0.10337,0.10449,0.0
25%,2022-02-17 19:15:00,0.3689,0.371,0.3666,0.3689,2842061.0
50%,2023-05-20 08:30:00,0.5621,0.5664,0.55765,0.5622,5131130.0
75%,2024-08-18 21:45:00,0.934697,0.943,0.926,0.93465,10356470.0
max,2025-11-18 11:00:00,3.095,3.101,3.048,3.095,447599600.0
std,,0.522906,0.527874,0.517603,0.522904,14679760.0


In [249]:
# Count missing values in each column of the raw frame.
df.isna().sum()

open_time     0
open         20
high         20
low          20
close        20
volume       20
dtype: int64

In [250]:
# Count entries that are exactly zero in each column.
zero_counts = df.eq(0).sum()
print("Number of zeros per column:\n", zero_counts)

Number of zeros per column:
 open_time    0
open         0
high         0
low          0
close        0
volume       2
dtype: int64


In [251]:
# Index the frame by bar open time and put it in chronological order.
# The guard keeps the cell safe to re-run: once open_time has become the
# index it is no longer a column, and an unguarded set_index raises KeyError.
if "open_time" in df.columns:
    df = df.set_index("open_time")
df = df.sort_index()

In [252]:
# Preview the first five rows now that open_time is the sorted index.
df.head(n=5)

Unnamed: 0_level_0,open,high,low,close,volume
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-18 06:00:00,0.10447,0.1064,0.10408,0.10587,30552966.4
2020-11-18 07:00:00,0.10587,0.10696,0.10467,0.10687,27057619.6
2020-11-18 08:00:00,0.10692,0.10704,0.10632,0.10656,17147608.0
2020-11-18 09:00:00,0.10658,0.10666,0.1055,0.10636,22508213.3
2020-11-18 10:00:00,0.10639,0.1075,0.10594,0.10747,12270594.2


In [253]:
# Record which rows have no open price before any gaps get filled
# (1 = missing, 0 = present).
df["missing_flag"] = pd.isna(df["open"]).astype(int)

In [254]:
# Re-inspect dtypes and non-null counts now that missing_flag has been added.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43830 entries, 2020-11-18 06:00:00 to 2025-11-18 11:00:00
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   open          43810 non-null  float64
 1   high          43810 non-null  float64
 2   low           43810 non-null  float64
 3   close         43810 non-null  float64
 4   volume        43810 non-null  float64
 5   missing_flag  43830 non-null  int64  
dtypes: float64(5), int64(1)
memory usage: 2.3 MB


In [255]:
# Preview the first five rows, including the new missing_flag column.
df.head(n=5)

Unnamed: 0_level_0,open,high,low,close,volume,missing_flag
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-11-18 06:00:00,0.10447,0.1064,0.10408,0.10587,30552966.4,0
2020-11-18 07:00:00,0.10587,0.10696,0.10467,0.10687,27057619.6,0
2020-11-18 08:00:00,0.10692,0.10704,0.10632,0.10656,17147608.0,0
2020-11-18 09:00:00,0.10658,0.10666,0.1055,0.10636,22508213.3,0
2020-11-18 10:00:00,0.10639,0.1075,0.10594,0.10747,12270594.2,0


In [256]:
# Missing-value count per column before the gaps are filled.
df.isna().sum()

open            20
high            20
low             20
close           20
volume          20
missing_flag     0
dtype: int64

In [257]:
# NOTE(review): same assignment as the earlier missing_flag cell, so this one
# is redundant. It has to run before the forward-fill that follows; afterwards
# open has no NaNs left and every flag would come out as 0.
df["missing_flag"] = pd.isna(df["open"]).astype(int)

In [258]:
# Carry the last observed values forward into rows whose OHLCV fields are missing.
ohlcv_cols = ["open", "high", "low", "close", "volume"]
df[ohlcv_cols] = df[ohlcv_cols].ffill()

In [259]:
# Verify that the forward-fill left no missing values behind.
df.isna().sum()

open            0
high            0
low             0
close           0
volume          0
missing_flag    0
dtype: int64

# Feature Engineering

In [260]:
# Row-over-row relative change of the close; the first row has no predecessor
# and is set to 0.
df["return_1h"] = df["close"].pct_change(periods=1).fillna(value=0)
# Standard deviation of the trailing 24 returns; rows without a complete
# 24-row window are set to 0.
df["volatility_24h"] = df["return_1h"].rolling(window=24).std().fillna(value=0)

In [261]:
# Summary statistics after adding return_1h and volatility_24h.
df.describe()

Unnamed: 0,open,high,low,close,volume,missing_flag,return_1h,volatility_24h
count,43830.0,43830.0,43830.0,43830.0,43830.0,43830.0,43830.0,43830.0
mean,0.739742,0.745633,0.73351,0.739751,9638699.0,0.000456,9.7e-05,0.009425
std,0.523045,0.528019,0.517744,0.523048,14678650.0,0.021357,0.011234,0.00615
min,0.10447,0.10494,0.10337,0.10449,0.0,0.0,-0.215392,0.0
25%,0.3689,0.371,0.3666,0.3689,2842061.0,0.0,-0.004541,0.005724
50%,0.5622,0.5665,0.55775,0.56225,5132363.0,0.0,0.0,0.007849
75%,0.935,0.943615,0.926475,0.935,10365380.0,0.0,0.004632,0.011299
max,3.095,3.101,3.048,3.095,447599600.0,1.0,0.337091,0.085205


In [262]:
# Mean close over the trailing 24 and 168 rows (NaN until each window is full),
# plus the ratio of the short average to the long one.
df["ma_24"] = df["close"].rolling(window=24).mean()
df["ma_168"] = df["close"].rolling(window=168).mean()
df["ma_ratio"] = df["ma_24"].div(df["ma_168"])

In [263]:
# Row-over-row relative change in volume.
# A preceding volume of 0 makes the ratio infinite (or NaN for 0 -> 0), and
# fillna alone leaves infinities in place. Map them to NaN first so that every
# undefined change, including the first row, is recorded as 0.
df["vol_change"] = (
    df["volume"]
    .pct_change()
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

In [264]:
# Preview the first five rows with the engineered feature columns.
df.head(n=5)

Unnamed: 0_level_0,open,high,low,close,volume,missing_flag,return_1h,volatility_24h,ma_24,ma_168,ma_ratio,vol_change
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-11-18 06:00:00,0.10447,0.1064,0.10408,0.10587,30552966.4,0,0.0,0.0,,,,0.0
2020-11-18 07:00:00,0.10587,0.10696,0.10467,0.10687,27057619.6,0,0.009446,0.0,,,,-0.114403
2020-11-18 08:00:00,0.10692,0.10704,0.10632,0.10656,17147608.0,0,-0.002901,0.0,,,,-0.366256
2020-11-18 09:00:00,0.10658,0.10666,0.1055,0.10636,22508213.3,0,-0.001877,0.0,,,,0.312615
2020-11-18 10:00:00,0.10639,0.1075,0.10594,0.10747,12270594.2,0,0.010436,0.0,,,,-0.454839


In [265]:
# Fill the warm-up rows where the rolling windows were not yet complete.
# A back-fill would copy the first complete-window value (built from later
# closes) into earlier rows, leaking future prices into those features.
# Instead use the mean of all closes seen so far, which only looks backwards.
close_expanding_mean = df["close"].expanding().mean()
df["ma_24"] = df["ma_24"].fillna(close_expanding_mean)
df["ma_168"] = df["ma_168"].fillna(close_expanding_mean)
# Recompute the ratio from the filled averages wherever it was undefined.
df["ma_ratio"] = df["ma_ratio"].fillna(df["ma_24"] / df["ma_168"])

In [266]:
# Check that no feature column still contains missing values.
df.isna().sum()

open              0
high              0
low               0
close             0
volume            0
missing_flag      0
return_1h         0
volatility_24h    0
ma_24             0
ma_168            0
ma_ratio          0
vol_change        0
dtype: int64

In [267]:
# Confirm dtypes and non-null counts across the original and engineered columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43830 entries, 2020-11-18 06:00:00 to 2025-11-18 11:00:00
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   open            43830 non-null  float64
 1   high            43830 non-null  float64
 2   low             43830 non-null  float64
 3   close           43830 non-null  float64
 4   volume          43830 non-null  float64
 5   missing_flag    43830 non-null  int64  
 6   return_1h       43830 non-null  float64
 7   volatility_24h  43830 non-null  float64
 8   ma_24           43830 non-null  float64
 9   ma_168          43830 non-null  float64
 10  ma_ratio        43830 non-null  float64
 11  vol_change      43830 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 4.3 MB


In [268]:
# Final missing-value count before the infinity clean-up and export.
# Note that infinities are not counted as missing by this check.
df.isna().sum()

open              0
high              0
low               0
close             0
volume            0
missing_flag      0
return_1h         0
volatility_24h    0
ma_24             0
ma_168            0
ma_ratio          0
vol_change        0
dtype: int64

In [269]:
# Turn any +/-inf into NaN, then fill those gaps from the previous row.
# Written as one chained expression instead of two inplace=True calls: the
# resulting values are the same, and the cell no longer mutates the frame
# that earlier cells displayed.
df = df.replace([np.inf, -np.inf], np.nan).ffill()

In [270]:
# Write the processed frame to disk; index=True keeps the open_time index
# as the first CSV column.
df.to_csv(
    path_or_buf='../data/processed/cardano_processed.csv',
    index=True,
)