# Feature Engineering 

In [None]:
from pathlib import Path
import pandas as pd


# --- Project-relative locations (portable across GitHub / Streamlit checkouts) ---
# The current working directory is resolved to an absolute path and its parent
# is used as the project root; add or drop `.parent` if the notebook lives at a
# different depth in the repository.
BASE_DIR = Path().resolve().parent
DATA_PATH = BASE_DIR.joinpath("data")
MODEL_PATH = BASE_DIR.joinpath("models")
DATA_PATH.mkdir(exist_ok=True)  # no error if the data folder is already there

# Step 1: Load the cleaned BTC data.
# `timestamp` is parsed as datetimes and becomes the DataFrame index.
df = pd.read_csv(
    DATA_PATH / "btc_hourly_yf.csv",
    parse_dates=["timestamp"],
    index_col="timestamp",
)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0_level_0,price
timestamp,Unnamed: 1_level_1
2025-06-07 17:00:00,105521.320312
2025-06-07 18:00:00,105753.3125
2025-06-07 19:00:00,105540.976562
2025-06-07 20:00:00,105543.210938
2025-06-07 21:00:00,105651.523438


In [10]:
# Step 2: Create return and rolling-window features from the price column
price = df['price']
roll_3h = price.rolling(window=3)  # shared by the 3-row mean and std below

df['return_1h'] = price.pct_change()                     # change relative to the previous row
df['rolling_mean_3h'] = roll_3h.mean()                   # mean over the last 3 rows
df['rolling_mean_6h'] = price.rolling(window=6).mean()   # mean over the last 6 rows
df['rolling_std_3h'] = roll_3h.std()                     # standard deviation over the last 3 rows

In [11]:
# Step 3: Create the next row's price as a new column.
# Shifting by -1 pulls each following row's price up onto the current row,
# so the last row has no value (NaN).
# NOTE(review): this is "next hour" only if rows are contiguous hourly
# observations — confirm there are no gaps in the index.
df['future_price'] = df['price'].shift(periods=-1)

In [12]:
# Step 4: Drop every row that has a NaN in any column.
# This removes the warm-up rows at the start (the rolling windows need up to
# 6 rows, and `return_1h` is undefined on the first row) as well as the final
# row, whose `future_price` is NaN because no later row exists.
# The result is rebound to `df` rather than using `inplace=True`, so the
# previously loaded frame object is not mutated behind earlier cell outputs.
df = df.dropna()

In [13]:
# Step 5: Write the feature dataset (index included) to the data folder as CSV
feature_csv = DATA_PATH / "btc_feature.csv"
df.to_csv(feature_csv)
print("✅ Feature set saved to btc_feature.csv")

✅ Feature set saved to btc_feature.csv


In [14]:
# Show the last five rows of the feature frame as the cell output.
df.tail(n=5)

Unnamed: 0_level_0,price,return_1h,rolling_mean_3h,rolling_mean_6h,rolling_std_3h,future_price
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-06-14 10:00:00,104657.304688,0.001559,104672.854167,104853.514323,186.701119,104915.703125
2025-06-14 11:00:00,104915.703125,0.002469,104689.140625,104828.365885,212.441201,104736.453125
2025-06-14 12:00:00,104736.453125,-0.001709,104769.820312,104772.953125,132.391333,104908.320312
2025-06-14 13:00:00,104908.320312,0.001641,104853.492188,104763.173177,101.425998,105445.15625
2025-06-14 14:00:00,105445.15625,0.005117,105029.976562,104859.558594,369.682583,105513.789062
