In [102]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [131]:
df = pd.read_csv('../data/raw/solana_5y.csv')

In [132]:
df.head()

Unnamed: 0,open_time,open,high,low,close,volume
0,2020-11-18 06:00:00,2.0818,2.1248,2.0772,2.1085,202719.77
1,2020-11-18 07:00:00,2.1065,2.1365,2.0932,2.1289,123056.45
2,2020-11-18 08:00:00,2.1263,2.1423,2.1209,2.1243,27228.63
3,2020-11-18 09:00:00,2.1265,2.1296,2.0851,2.0957,76816.4
4,2020-11-18 10:00:00,2.0957,2.1152,2.0801,2.1152,63006.25


In [133]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43830 entries, 0 to 43829
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   open_time  43830 non-null  object 
 1   open       43810 non-null  float64
 2   high       43810 non-null  float64
 3   low        43810 non-null  float64
 4   close      43810 non-null  float64
 5   volume     43810 non-null  float64
dtypes: float64(5), object(1)
memory usage: 2.0+ MB


In [134]:
df.describe()

Unnamed: 0,open,high,low,close,volume
count,43810.0,43810.0,43810.0,43810.0,43810.0
mean,95.509218,96.244494,94.743311,95.512204,190979.9
std,72.863244,73.362662,72.337607,72.862214,215069.3
min,1.1843,1.2182,1.0301,1.1832,0.0
25%,24.54,24.72,24.39,24.54,75819.04
50%,90.69,91.44,89.885,90.7,127234.5
75%,157.01,158.05,155.96,157.0,225889.9
max,286.23,295.83,284.2,286.24,4974114.0


In [135]:
df.isnull().sum()

open_time     0
open         20
high         20
low          20
close        20
volume       20
dtype: int64

In [136]:
df.index=pd.to_datetime(df.open_time)
df = df.sort_index()

In [137]:
df.head()

Unnamed: 0_level_0,open_time,open,high,low,close,volume
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-11-18 06:00:00,2020-11-18 06:00:00,2.0818,2.1248,2.0772,2.1085,202719.77
2020-11-18 07:00:00,2020-11-18 07:00:00,2.1065,2.1365,2.0932,2.1289,123056.45
2020-11-18 08:00:00,2020-11-18 08:00:00,2.1263,2.1423,2.1209,2.1243,27228.63
2020-11-18 09:00:00,2020-11-18 09:00:00,2.1265,2.1296,2.0851,2.0957,76816.4
2020-11-18 10:00:00,2020-11-18 10:00:00,2.0957,2.1152,2.0801,2.1152,63006.25


In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43830 entries, 2020-11-18 06:00:00 to 2025-11-18 11:00:00
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   open_time  43830 non-null  object 
 1   open       43810 non-null  float64
 2   high       43810 non-null  float64
 3   low        43810 non-null  float64
 4   close      43810 non-null  float64
 5   volume     43810 non-null  float64
dtypes: float64(5), object(1)
memory usage: 2.3+ MB


In [139]:
df.drop(['open_time'],axis=1, inplace=True)

In [140]:
df.head()

Unnamed: 0_level_0,open,high,low,close,volume
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-18 06:00:00,2.0818,2.1248,2.0772,2.1085,202719.77
2020-11-18 07:00:00,2.1065,2.1365,2.0932,2.1289,123056.45
2020-11-18 08:00:00,2.1263,2.1423,2.1209,2.1243,27228.63
2020-11-18 09:00:00,2.1265,2.1296,2.0851,2.0957,76816.4
2020-11-18 10:00:00,2.0957,2.1152,2.0801,2.1152,63006.25


In [141]:
df.isnull().sum()

open      20
high      20
low       20
close     20
volume    20
dtype: int64

In [142]:
df["missing_flag"] = df["open"].isna().astype(int)

In [143]:
df[["open","high","low","close","volume"]] = (
    df[["open","high","low","close","volume"]].ffill()
)

In [144]:
df.isnull().sum()

open            0
high            0
low             0
close           0
volume          0
missing_flag    0
dtype: int64

# Feature Engineering

In [145]:
df["return_1h"] = df["close"].pct_change().fillna(0)

In [146]:
df["volatility_24h"] = df["return_1h"].rolling(24).std().fillna(0)

In [147]:
df.describe()

Unnamed: 0,open,high,low,close,volume,missing_flag,return_1h,volatility_24h
count,43830.0,43830.0,43830.0,43830.0,43830.0,43830.0,43830.0,43830.0
mean,95.481162,96.216293,94.715492,95.484251,190937.9,0.000456,0.000179,0.010986
std,72.863212,73.362546,72.337478,72.862063,215049.0,0.021357,0.012941,0.006874
min,1.1843,1.2182,1.0301,1.1832,0.0,0.0,-0.19284,0.0
25%,24.54,24.71225,24.39,24.537,75801.37,0.0,-0.005372,0.006701
50%,90.66,91.365,89.815,90.66,127194.7,0.0,0.0,0.009242
75%,156.9775,158.0275,155.9475,156.9775,225882.0,0.0,0.005421,0.013142
max,286.23,295.83,284.2,286.24,4974114.0,1.0,0.186463,0.072218


In [148]:
df["ma_24"] = df["close"].rolling(24).mean()
df["ma_168"] = df["close"].rolling(168).mean()
df["ma_ratio"] = df["ma_24"] / df["ma_168"]

In [149]:
df["vol_change"] = df["volume"].pct_change().fillna(0)

In [150]:
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,missing_flag,return_1h,volatility_24h,ma_24,ma_168,ma_ratio,vol_change
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-11-18 06:00:00,2.0818,2.1248,2.0772,2.1085,202719.77,0,0.0,0.0,,,,0.0
2020-11-18 07:00:00,2.1065,2.1365,2.0932,2.1289,123056.45,0,0.009675,0.0,,,,-0.392973
2020-11-18 08:00:00,2.1263,2.1423,2.1209,2.1243,27228.63,0,-0.002161,0.0,,,,-0.778731
2020-11-18 09:00:00,2.1265,2.1296,2.0851,2.0957,76816.4,0,-0.013463,0.0,,,,1.821163
2020-11-18 10:00:00,2.0957,2.1152,2.0801,2.1152,63006.25,0,0.009305,0.0,,,,-0.179781


In [151]:
df.isnull().sum()

open                0
high                0
low                 0
close               0
volume              0
missing_flag        0
return_1h           0
volatility_24h      0
ma_24              23
ma_168            167
ma_ratio          167
vol_change          0
dtype: int64

In [152]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43830 entries, 2020-11-18 06:00:00 to 2025-11-18 11:00:00
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   open            43830 non-null  float64
 1   high            43830 non-null  float64
 2   low             43830 non-null  float64
 3   close           43830 non-null  float64
 4   volume          43830 non-null  float64
 5   missing_flag    43830 non-null  int64  
 6   return_1h       43830 non-null  float64
 7   volatility_24h  43830 non-null  float64
 8   ma_24           43807 non-null  float64
 9   ma_168          43663 non-null  float64
 10  ma_ratio        43663 non-null  float64
 11  vol_change      43830 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 4.3 MB


In [153]:
df["ma_24"] = df["ma_24"].bfill()
df["ma_168"] = df["ma_168"].bfill()
df["ma_ratio"] = df["ma_ratio"].bfill()

In [154]:
df.isnull().sum()

open              0
high              0
low               0
close             0
volume            0
missing_flag      0
return_1h         0
volatility_24h    0
ma_24             0
ma_168            0
ma_ratio          0
vol_change        0
dtype: int64

In [155]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.ffill(inplace=True)

# Scaling

In [156]:
features=['open','high','low','close',
          'volume','missing_flag','return_1h','volatility_24h',
          'ma_24','ma_168','ma_ratio','vol_change']
scaler=MinMaxScaler()
df[features] = scaler.fit_transform(df[features])

In [157]:
df.head()

Unnamed: 0_level_0,open,high,low,close,volume,missing_flag,return_1h,volatility_24h,ma_24,ma_168,ma_ratio,vol_change
open_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-11-18 06:00:00,0.003149,0.003077,0.003698,0.003246,0.040755,0.0,0.508406,0.0,0.00308,0.003226,0.573355,0.01427
2020-11-18 07:00:00,0.003235,0.003117,0.003754,0.003318,0.024739,0.0,0.533914,0.0,0.00308,0.003226,0.573355,0.008662
2020-11-18 08:00:00,0.003305,0.003137,0.003852,0.003301,0.005474,0.0,0.50271,0.0,0.00308,0.003226,0.573355,0.003158
2020-11-18 09:00:00,0.003305,0.003094,0.003726,0.003201,0.015443,0.0,0.472912,0.0,0.00308,0.003226,0.573355,0.040258
2020-11-18 10:00:00,0.003197,0.003045,0.003708,0.00327,0.012667,0.0,0.532937,0.0,0.00308,0.003226,0.573355,0.011705


In [163]:
df.to_csv('../data/processed/solana_5y.csv', index=False)