## Data Cleaning and Preprocessing

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler


### Initial Data Inspection

In [12]:
# Load the raw BTC price data (OHLCV columns plus precomputed indicators)
# from the CSV that sits next to this notebook.
df = pd.read_csv('btc_2015_2024.csv')

print("--- Initial Data Head ---")
print(df.head())
print("\n--- Data Info ---")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()



--- Initial Data Head ---
         date        open        high         low       close      volume  \
0  2015-01-02  314.079010  315.838989  313.565002  315.032013   7860650.0   
1  2015-01-03  314.846008  315.149994  281.082001  281.082001  33054400.0   
2  2015-01-04  281.145996  287.230011  257.612000  264.195007  55629100.0   
3  2015-01-05  265.084015  278.341003  265.084015  274.473999  43962800.0   
4  2015-01-06  274.610992  287.553009  272.696014  286.188995  23245700.0   

        rsi_7      rsi_14       cci_7      cci_14      sma_50      ema_50  \
0  100.000000  100.000000  -66.666667  -66.666667  314.640503  314.648333   
1    1.938583    2.096744 -100.000000 -100.000000  303.454336  303.009081   
2    1.235506    1.375421 -110.693896 -110.693896  293.639503  292.715747   
3   21.462825   19.523695  -76.487357  -76.487357  289.806403  288.769813   
4   38.272356   34.350787  -37.070244  -37.070244  289.203501  288.295540   

      sma_100     ema_100      macd   bollinger 

### Data Preprocessing

In [13]:
# Convert 'date' to datetime objects.
# The column holds ISO-style dates such as "2015-01-02", hence '%Y-%m-%d'.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# Sort by date (Oldest -> Newest) is CRITICAL for time-series
df = df.sort_values('date').reset_index(drop=True)

# Drop the 'next_day_close' column.
# NOTE(review): presumably a precomputed look-ahead value that must not be
# used as an input feature — confirm against the dataset description.
# errors='ignore' keeps this cell re-runnable: without it, a second run
# raises KeyError because the column has already been removed from df.
df = df.drop(columns=['next_day_close'], errors='ignore')

display(df.head())

Unnamed: 0,date,open,high,low,close,volume,rsi_7,rsi_14,cci_7,cci_14,sma_50,ema_50,sma_100,ema_100,macd,bollinger,TrueRange,atr_7,atr_14
0,2015-01-02,314.07901,315.838989,313.565002,315.032013,7860650.0,100.0,100.0,-66.666667,-66.666667,314.640503,314.648333,314.640503,314.644418,0.017568,314.640503,2.273987,4.193073,4.275997
1,2015-01-03,314.846008,315.149994,281.082001,281.082001,33054400.0,1.938583,2.096744,-100.0,-100.0,303.454336,303.009081,303.454336,303.232458,-1.030403,303.454336,34.067993,15.719617,14.951008
2,2015-01-04,281.145996,287.230011,257.612,264.195007,55629100.0,1.235506,1.375421,-110.693896,-110.693896,293.639503,292.715747,293.639503,293.178382,-2.091487,293.639503,29.618011,20.033779,19.034849
3,2015-01-05,265.084015,278.341003,265.084015,274.473999,43962800.0,21.462825,19.523695,-76.487357,-76.487357,289.806403,288.769813,289.806403,289.286389,-2.215008,289.806403,14.145996,18.468441,17.90707
4,2015-01-06,274.610992,287.553009,272.696014,286.188995,23245700.0,38.272356,34.350787,-37.070244,-37.070244,289.203501,288.29554,289.203501,288.744005,-1.746686,289.203501,14.856995,17.613461,17.300126


In [14]:
# Announce the end of the preprocessing stage and preview the first rows of
# the resulting frame.
print("\n--- Data Preprocessing Completed ---")
preprocessed_preview = df.head()
display(preprocessed_preview)


--- Data Preprocessing Completed ---


Unnamed: 0,date,open,high,low,close,volume,rsi_7,rsi_14,cci_7,cci_14,sma_50,ema_50,sma_100,ema_100,macd,bollinger,TrueRange,atr_7,atr_14
0,2015-01-02,314.07901,315.838989,313.565002,315.032013,7860650.0,100.0,100.0,-66.666667,-66.666667,314.640503,314.648333,314.640503,314.644418,0.017568,314.640503,2.273987,4.193073,4.275997
1,2015-01-03,314.846008,315.149994,281.082001,281.082001,33054400.0,1.938583,2.096744,-100.0,-100.0,303.454336,303.009081,303.454336,303.232458,-1.030403,303.454336,34.067993,15.719617,14.951008
2,2015-01-04,281.145996,287.230011,257.612,264.195007,55629100.0,1.235506,1.375421,-110.693896,-110.693896,293.639503,292.715747,293.639503,293.178382,-2.091487,293.639503,29.618011,20.033779,19.034849
3,2015-01-05,265.084015,278.341003,265.084015,274.473999,43962800.0,21.462825,19.523695,-76.487357,-76.487357,289.806403,288.769813,289.806403,289.286389,-2.215008,289.806403,14.145996,18.468441,17.90707
4,2015-01-06,274.610992,287.553009,272.696014,286.188995,23245700.0,38.272356,34.350787,-37.070244,-37.070244,289.203501,288.29554,289.203501,288.744005,-1.746686,289.203501,14.856995,17.613461,17.300126


### Target Variable Derivation (The Bullish/Bearish Label)

In [15]:
# NOTE: this cell is currently disabled — every statement below is commented
# out, so running it does not create a 'Target' column.
# NOTE(review): if it is re-enabled, the final row has no next-day close;
# shift(-1) yields NaN there and the `>` comparison evaluates to False, so
# that row would be labelled 0 (Bearish) instead of being dropped by a later
# dropna() — confirm that this is the intended handling.
# # If Close(t+1) > Close(t) -> 1 (Bullish), else 0 (Bearish)
# df['Target'] = (df['close'].shift(-1) > df['close']).astype(int)

# print("\n--- Target Variable Created ---")
# display(df[['date', 'close', 'Target']].head())

### Feature Engineering

- Daily Return (day-over-day change in close, stored as a fraction: 0.04 = +4%)

In [16]:
# Day-over-day relative change of the closing price, stored as a fraction
# (0.05 means +5%). The first row has no previous close, so it is NaN.
df['Daily_Return'] = df['close'].pct_change(periods=1)

- Volatility Features

In [17]:
# High-Low Spread (intraday volatility): distance between the day's high and low.
df['High_Low_Spread'] = df['high'].sub(df['low'])
# Close-Open Spread (day's momentum): close minus open, negative when the
# day closed below its open.
df['Close_Open_Spread'] = df['close'].sub(df['open'])

- Moving Averages (Trend Indicators)

In [18]:
# Trailing simple moving averages of the closing price over 7 and 30 rows.
for ma_name, ma_window in (('MA_7', 7), ('MA_30', 30)):
    df[ma_name] = df['close'].rolling(window=ma_window).mean()

# Remove every row that has a NaN in any column. The 30-row rolling mean is
# undefined for the first 29 rows, which also covers the single NaN that
# pct_change() leaves in the first row of 'Daily_Return'.
# NOTE: df is reassigned here, so re-running this cell recomputes the rolling
# means on the already-trimmed frame and trims a further 29 rows; re-run the
# notebook from the load cell instead.
df = df.dropna()

### Save cleaned dataset

In [19]:
# Persist the cleaned frame as a CSV file; index=False keeps the row index
# out of the file.
output_path = "BTC_Cleaned_Data.csv"
df.to_csv(path_or_buf=output_path, index=False)

# Report where the file went, then show the frame that was written.
print("Cleaned dataset saved to:", output_path)
display(df)

Cleaned dataset saved to: BTC_Cleaned_Data.csv


Unnamed: 0,date,open,high,low,close,volume,rsi_7,rsi_14,cci_7,cci_14,...,macd,bollinger,TrueRange,atr_7,atr_14,Daily_Return,High_Low_Spread,Close_Open_Spread,MA_7,MA_30
29,2015-01-31,226.440994,233.503998,216.309006,217.464005,2.334820e+07,35.643356,39.545291,-98.071536,-51.796943,...,-3.462697,228.424399,17.194992,22.345309,22.040724,-0.039576,17.194992,-8.976989,243.140429,246.600333
30,2015-02-01,216.867004,231.574005,212.014999,226.972000,2.912850e+07,43.755389,43.393165,-72.721346,-54.788070,...,-3.808759,226.383199,19.559006,21.944376,21.845208,0.043722,19.559006,10.104996,239.319571,243.664999
31,2015-02-02,226.490997,242.175003,222.658997,238.229004,3.061210e+07,52.095851,47.642203,-7.891879,-12.669779,...,-3.201652,227.001600,19.516006,21.595310,21.663047,0.049596,19.516006,11.738007,234.284716,242.236566
32,2015-02-03,237.453995,245.957001,224.483002,227.268005,4.078370e+07,44.585253,44.165876,20.037760,-32.194104,...,-3.506224,229.459850,21.473999,21.577888,21.648362,-0.046010,21.473999,-10.185990,229.112287,241.005666
33,2015-02-04,227.511002,230.057999,221.113007,226.852997,2.659430e+07,44.303116,44.034862,-45.057541,-70.859064,...,-3.735064,230.310300,8.944992,19.764961,20.667685,-0.001826,8.944992,-0.658005,228.103431,239.418299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3401,2024-04-25,64275.019531,65275.207031,62783.632813,64481.707031,3.215579e+10,44.648873,46.387439,-56.342372,-19.542900,...,-635.705401,66211.466602,2491.574218,3047.726139,3260.535559,0.003186,2491.574218,206.687500,65109.744978,67092.594271
3402,2024-04-26,64485.371094,64789.656250,63322.398438,63755.320313,2.413937e+10,40.732452,44.690989,-97.744291,-20.566277,...,-705.770149,65954.427149,1467.257812,2821.944950,3132.444292,-0.011265,1467.257812,-730.050781,65097.137835,66902.593490
3403,2024-04-27,63750.988281,63898.363281,62424.718750,63419.140625,1.953078e+10,38.890523,43.890963,-117.618281,-62.695061,...,-779.438971,65657.256445,1473.644531,2629.330604,3013.958595,-0.005273,1473.644531,-331.847656,64872.094866,66658.399740
3404,2024-04-28,63423.515625,64321.484375,62793.597656,63113.230469,1.733483e+10,37.109041,43.134281,-76.244670,-44.797724,...,-852.677302,65231.350000,1527.886719,2471.981478,2907.810604,-0.004824,1527.886719,-310.285156,64613.035715,66432.413151


### Normalization/Scaling

In [20]:
# NOTE: this cell is currently disabled — every statement below is commented
# out, so no scaling is applied to df.
# NOTE(review): feature_cols lists 'Volume BTC' and 'Volume USD', but the
# frame displayed earlier in this notebook has a single 'volume' column;
# update the list before re-enabling, or the column selection will raise
# KeyError.
# NOTE(review): fit_transform on the full frame would compute min/max over
# all rows, including any later test period — consider fitting on the
# training split only; confirm against the modelling notebook.
# # Initialize Min-Max Scaler to scale features between 0 and 1

# # Final list of features we will use for the model
# feature_cols = [
#     'open', 'high', 'low', 'close', 'Volume BTC', 'Volume USD',
#     'Daily_Return', 'High_Low_Spread', 'Close_Open_Spread', 'MA_7', 'MA_30'
# ]

# print(f"\nFeatures Created: {len(feature_cols)}")
# print(f"Total rows ready for scaling: {len(df)}")


# scaler = MinMaxScaler(feature_range=(0, 1))

# df[feature_cols] = scaler.fit_transform(df[feature_cols])

# print("\n--- Data after Normalization (First 5 rows) ---")
# print(df[feature_cols].head())