In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the raw price history. header=1 tells pandas to take the column names
# from the second line of the file and discard the line above it.
file_path = 'btc.csv'
df = pd.read_csv(file_path, header=1)

df['date'] = pd.to_datetime(df['date'])
# The symbol column is not needed for any of the analysis below.
df = df.drop('symbol', axis=1)

# Every rolling / ewm / shift feature built below is positional, so the rows
# must be in chronological order *before* those features are computed.
# Sorting only right before the train/val/test split is too late: on a file
# that is not oldest-first, shift(-7) would not point 7 rows into the future
# and the "lagged" columns would not look into the past.
df = df.sort_values('date', kind='stable').reset_index(drop=True)

# 'Volume BTC' and 'Volume USD' hold each other's values on part of the
# history, so the two columns are exchanged on the selected rows.
# NOTE(review): the rows selected are those dated on or before 2018-02-27 --
# confirm this is the affected side of the cutoff.
swap_date = pd.Timestamp('2018-02-27')
df_fixed = df.copy()
mask = df_fixed['date'].le(swap_date)
# Read both columns in swapped order, then write them back in natural order.
swapped_volumes = df_fixed.loc[mask, ['Volume USD', 'Volume BTC']].to_numpy()
df_fixed.loc[mask, ['Volume BTC', 'Volume USD']] = swapped_volumes

# Keep only rows with a strictly positive BTC volume (the original note
# counts 8 zero-volume rows in the data).
has_btc_volume = df_fixed['Volume BTC'].gt(0)
df_cleaned = df_fixed.loc[has_btc_volume].copy()

# Moving averages of the close over 7-row and 30-row horizons.
close_prices = df_cleaned['close']

# Simple moving averages - for long-term trends.
for window in (7, 30):
    df_cleaned[f'SMA_{window}'] = close_prices.rolling(window=window).mean()

# Exponential moving averages - for short-term trends (recursive form,
# adjust=False).
for span in (7, 30):
    df_cleaned[f'EMA_{span}'] = close_prices.ewm(span=span, adjust=False).mean()

# RSI: readings above 70 are read as overbought Bitcoin, below 30 as oversold.
def calculate_rsi(data, period=14):
    """Return the rolling-mean Relative Strength Index of a price series.

    Args:
        data: pandas Series of prices.
        period: number of observations in each averaging window.

    Returns:
        A Series aligned with ``data``. Entries are NaN until the rolling
        window holds ``period`` observations.
    """
    change = data.diff()
    # Split the changes into up-moves and down-moves; everything else
    # (including the leading NaN produced by diff) counts as 0.
    up_moves = change.where(change > 0, 0)
    down_moves = -change.where(change < 0, 0)
    avg_gain = up_moves.rolling(window=period).mean()
    avg_loss = down_moves.rolling(window=period).mean()
    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

df_cleaned['RSI'] = calculate_rsi(df_cleaned['close'])

# MACD: 12-span EMA minus 26-span EMA of the close, plus a 9-span EMA of the
# MACD line itself as the signal line.
for span in (12, 26):
    df_cleaned[f'EMA_{span}'] = df_cleaned['close'].ewm(span=span, adjust=False).mean()
df_cleaned['MACD'] = df_cleaned['EMA_12'].sub(df_cleaned['EMA_26'])
macd_line = df_cleaned['MACD']
df_cleaned['MACD_signal'] = macd_line.ewm(span=9, adjust=False).mean()

# Rows that still contain NaN in any column are discarded.
df_final = df_cleaned.dropna().copy()
print(f"Total rows after removing NaN: {len(df_final)}")

# Prediction target: the close 7 rows ahead. (Per the original note, next-day
# prediction proved too easy for Huber regression.)
close = df_final['close']
df_final['target'] = close.shift(-7)

# Lagged inputs: values from PREVIOUS rows, never the same row.
prev_close = close.shift(1)
df_final['close_lag_1'] = prev_close
df_final['close_lag_7'] = close.shift(7)
for source_col, lag_col in (
    ('high', 'high_lag_1'),
    ('low', 'low_lag_1'),
    ('Volume BTC', 'volume_lag_1'),
):
    df_final[lag_col] = df_final[source_col].shift(1)

# Price momentum: previous close minus the close 7 / 30 rows before it.
df_final['momentum_7'] = prev_close - close.shift(8)
df_final['momentum_30'] = prev_close - close.shift(31)

# Volatility: rolling standard deviation of the close, lagged by one row.
for window in (7, 30):
    df_final[f'volatility_{window}'] = close.rolling(window).std().shift(1)

# The shifts and rolling windows leave NaN at both ends of the frame.
df_final.dropna(inplace=True)

# Calendar components of the date, used as trend features.
date_parts = df_final['date'].dt
df_final['day'] = date_parts.day
df_final['month'] = date_parts.month
df_final['year'] = date_parts.year

# Model inputs (x); the label (y) is the 'target' column.
feature_cols = (
    # Lagged prices (previous rows only)
    ['close_lag_1', 'close_lag_7', 'high_lag_1', 'low_lag_1', 'volume_lag_1']
    # Technical indicators
    + ['SMA_7', 'SMA_30', 'EMA_7', 'EMA_30', 'RSI', 'MACD', 'MACD_signal']
    # Momentum and volatility
    + ['momentum_7', 'momentum_30', 'volatility_7', 'volatility_30']
    # Volume
    + ['Volume BTC', 'Volume USD']
    # Date components
    + ['day', 'month', 'year']
)

# Flag roughly 5% of the rows as outliers (label -1; inliers are labelled 1).
# NOTE(review): the detector is fit on every row of df_final, i.e. before the
# train/val/test split -- confirm that this is intended.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outlier_inputs = df_final[feature_cols]
outlier_labels = iso_forest.fit_predict(outlier_inputs)

# Show which rows were flagged before they are removed.
is_inlier = outlier_labels == 1
is_outlier = outlier_labels == -1

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(df_final['date'][is_inlier], df_final['close'][is_inlier],
           c='blue', label='Normal', alpha=0.5, s=10)
ax.scatter(df_final['date'][is_outlier], df_final['close'][is_outlier],
           c='red', label='Outlier', s=30, marker='x')
ax.set_title('Outlier Detection in Bitcoin Prices')
ax.set_xlabel('Date')
ax.set_ylabel('Close Price')
ax.legend()
plt.show()

# Keep only the rows labelled as inliers and report how many were dropped.
n_outliers = is_outlier.sum()
df_final = df_final[is_inlier].copy()
print(f"Removed {n_outliers} outliers ({n_outliers/len(outlier_labels)*100:.2f}%)")
print(f"Remaining rows: {len(df_final)}")

# Order the rows chronologically so that the positional split below is a
# split in time.
df_final = df_final.sort_values('date').reset_index(drop=True)
print(f"\n✅ Data sorted by date")
print(f"Date range: {df_final['date'].min()} to {df_final['date'].max()}")

# Price target as a standalone Series.
y = df_final['target']

# Percentage-change version of the target, relative to the same row's close.
df_final['target_pct'] = (
    df_final['target'].sub(df_final['close']).div(df_final['close']).mul(100)
)

# 70% train / 20% validation / remainder test, taken in row order.
n_rows = len(df_final)
train_size = int(n_rows * 0.7)
val_size = int(n_rows * 0.2)
val_stop = train_size + val_size

train = df_final.iloc[:train_size].copy()
val = df_final.iloc[train_size:val_stop].copy()
test = df_final.iloc[val_stop:].copy()

# Print the date span and mean close of each split as a sanity check.
print("\n" + "="*60)
print("SPLIT VERIFICATION")
print("="*60)
for split_name, split_df in (("Train", train), ("Val", val), ("Test", test)):
    print(f"{split_name}: {split_df['date'].min()} to {split_df['date'].max()}")
    print(f"  Mean price: ${split_df['close'].mean():.2f}")


# Feature matrices and price targets for each split.
x_train, y_train = train[feature_cols], train['target']
x_val, y_val = val[feature_cols], val['target']
x_test, y_test = test[feature_cols], test['target']

# Standardisation statistics come from the training split only; a zero
# standard deviation is replaced by 1 so the division cannot produce NaN/inf.
x_mean = x_train.mean()
x_std = x_train.std().replace(0, 1)

# Apply the same training statistics to all three splits
# (used by the Linear/Huber regressions).
x_train_scaled, x_val_scaled, x_test_scaled = (
    (split_x - x_mean) / x_std for split_x in (x_train, x_val, x_test)
)
# Preview of the standardized training features.
print(x_train_scaled.head())

# Report the row count and share of df_final held by each split.
print(f"\nTraining set: {len(train)} rows ({len(train)/len(df_final)*100:.1f}%)")
print(f"Validation set: {len(val)} rows ({len(val)/len(df_final)*100:.1f}%)")
print(f"Testing set: {len(test)} rows ({len(test)/len(df_final)*100:.1f}%)")

# Date ranges. These must be read from the 'date' column: the index was reset
# after sorting, so index.min()/max() would only print row positions.
print(f"\nTrain dates: {train['date'].min()} to {train['date'].max()}")
print(f"Val dates: {val['date'].min()} to {val['date'].max()}")
print(f"Test dates: {test['date'].min()} to {test['date'].max()}")

## EDA


In [None]:
# Monthly average close over time.
df_final['date'] = pd.to_datetime(df_final['date'])
monthly_df = (
    df_final.resample('M', on='date')
    .agg({'close': 'mean'})
    .reset_index()
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(monthly_df['date'], monthly_df['close'], label='Monthly Average Close', color='gold')
ax.set_title('Bitcoin Monthly Growth Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('BTC Price (USD)')
ax.grid(True, alpha=0.3)
ax.legend()
plt.show()

In [None]:
# Month-over-month percentage change of the monthly average close.
monthly_df['monthly_return_%'] = monthly_df['close'].pct_change().mul(100)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(monthly_df['date'], monthly_df['monthly_return_%'], color='teal', alpha=0.7)
# Zero line separates gaining months from losing months.
ax.axhline(0, color='black', linewidth=1)
ax.set_title('Monthly BTC % Change')
ax.set_xlabel('Date')
ax.set_ylabel('Monthly Change (%)')
ax.grid(True, alpha=0.3)
plt.show()


In [None]:
# Average close per calendar year, drawn as a bar chart.
yearly_df = df_final.resample('Y', on='date')['close'].mean()
ax = yearly_df.plot(kind='bar', color='orange', figsize=(12,6), title='Average BTC Price by Year')
ax.set_ylabel('BTC Price (USD)')
plt.show()

In [None]:
# Histogram (50 bins) with a KDE overlay of the closing prices.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df_final['close'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of BTC Closing Prices')
ax.set_xlabel('Price (USD)')
plt.show()


In [None]:
# Rolling 30-row standard deviation of the close, for plotting only.
# It is kept in a local Series rather than written to
# df_final['volatility_30']: that column is a model feature listed in
# feature_cols (built with a one-row lag), and overwriting it here would
# replace it with an unlagged series whose first 29 values are NaN.
rolling_vol_30 = df_final['close'].rolling(window=30).std()

plt.figure(figsize=(12,6))
plt.plot(df_final['date'], rolling_vol_30, color='red', label='30-Day Volatility')
plt.title('BTC 30-Day Rolling Volatility')
plt.xlabel('Date')
plt.ylabel('Volatility (USD)')
plt.legend()
plt.show()


In [None]:
# Close and 30-row SMA, with a band of +/- one volatility_30 around the SMA.
band_center = df_final['SMA_30']
band_halfwidth = df_final['volatility_30']

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_final['date'], df_final['close'], label='Close', color='gold')
ax.plot(df_final['date'], band_center, label='SMA 30', color='red')
ax.fill_between(
    df_final['date'],
    band_center - band_halfwidth,
    band_center + band_halfwidth,
    color='lightblue', alpha=0.9, label='Volatility Band',
)
ax.set_title('BTC Close with 30-Day SMA and Volatility Bands')
ax.legend()
plt.show()

In [None]:
# Close price (line) and BTC volume (bars) drawn on one shared axis.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_final['date'], df_final['close'], color='gold', label='Close Price')
ax.bar(df_final['date'], df_final['Volume BTC'],
       color='gray', alpha=0.5, width=3, label='Volume (BTC)')
ax.set_title('BTC Price vs Volume')
ax.set_xlabel('Date')
ax.set_ylabel('Price / Volume')
ax.legend()
plt.show()

In [None]:
# Fit a Huber regressor on the standardized training features and predict
# the validation targets.
huber_params = {'epsilon': 1.35, 'max_iter': 10000, 'alpha': 0.0001}
huber = HuberRegressor(**huber_params)
huber.fit(x_train_scaled, y_train)
y_pred_huber = huber.predict(x_val_scaled)

# Actual vs predicted price over the validation dates.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(val['date'], y_val, label="Actual", linewidth=1)
ax.plot(val['date'], y_pred_huber, label="Predicted", linewidth=1, alpha=0.8)
ax.legend()
ax.set_title("Huber: Actual vs Predicted")
plt.show()

# Absolute prediction error over the validation dates.
abs_error = (y_val - y_pred_huber).abs()
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(val['date'], abs_error)
ax.set_title("Huber: Absolute Error")
plt.show()

In [None]:
# Shared metric report used for every model.
def evaluate_model(name, y_true, y_pred_values):
    """Print MAE, MSE, RMSE and R^2 for one model's predictions.

    If the two inputs differ in length, both are cut to the shorter length
    (from the start) before the metrics are computed.

    Args:
        name: label used in the printed header.
        y_true: true values; objects with ``reset_index`` are re-indexed from 0
            before slicing.
        y_pred_values: predicted values, sliceable with ``[:n]``.
    """
    n_common = min(len(y_true), len(y_pred_values))
    if hasattr(y_true, 'reset_index'):
        truth = y_true.reset_index(drop=True)[:n_common]
    else:
        truth = y_true[:n_common]
    predicted = y_pred_values[:n_common]

    mse = mean_squared_error(truth, predicted)
    report = (
        ("MAE", mean_absolute_error(truth, predicted)),
        ("MSE", mse),
        ("RMSE", np.sqrt(mse)),
        ("R^2", r2_score(truth, predicted)),
    )

    print(f"--- {name} Results ---")
    for label, value in report:
        print(f"{label}: {value:.2f}")
    print("-" * (len(name) + 11))

# Validation metrics for the Huber model fitted above.
evaluate_model("Huber Regression", y_val, y_pred_huber)

# Ordinary least squares on the same standardized features, for comparison.
lin = LinearRegression()
lin.fit(x_train_scaled, y_train)
y_pred_lin = lin.predict(x_val_scaled)
evaluate_model("Linear Regression", y_val, y_pred_lin)

# Random Forest trained on the percentage-change target, tuned by grid search.
# Local import: the time-ordered splitter is not among the notebook's initial
# imports.
from sklearn.model_selection import TimeSeriesSplit

print("\nStarting Random Forest hyperparameter tuning (on percentage change)...")
param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [20, 30, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 0.5]
}

# The training rows are in chronological order. An integer cv (cv=3) would use
# plain, unshuffled K-fold, where some folds fit on later rows and score on
# earlier ones. TimeSeriesSplit always scores on rows that come after the rows
# used for fitting, which matches how the model is evaluated afterwards.
rf_grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=TimeSeriesSplit(n_splits=3),
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

# Train on percentage change (unscaled features).
y_train_pct = train['target_pct']
y_val_pct = val['target_pct']

rf_grid.fit(x_train, y_train_pct)
print(f"\nBest parameters: {rf_grid.best_params_}")

# Use the best model found by the search.
rf = rf_grid.best_estimator_
y_pred_rf_pct = rf.predict(x_val)

# Convert percentage predictions back to prices: close * (1 + pct / 100).
# The index is reset so the Series lines up positionally with the prediction
# array.
val_close = val['close'].reset_index(drop=True)
y_pred_rf = val_close * (1 + y_pred_rf_pct / 100)
# The true prices are rebuilt from the percentage target in the same way.
y_val_price = val_close * (1 + y_val_pct.reset_index(drop=True) / 100)

# Evaluate on converted prices so the metrics are comparable across models.
evaluate_model("Random Forest (Tuned - Pct Change)", y_val_price, y_pred_rf)

# Rank the features by the tuned forest's importance scores.
feature_importance = (
    pd.DataFrame({'feature': feature_cols, 'importance': rf.feature_importances_})
    .sort_values('importance', ascending=False)
)

print("\n--- Feature Importance ---")
print(feature_importance)

# Horizontal bars, most important feature at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance['feature'], feature_importance['importance'])
ax.set_xlabel('Importance')
ax.set_title('Random Forest Feature Importance')
ax.invert_yaxis()
fig.tight_layout()
plt.show()

# Comparison plots: cut every series to a common length so they share one
# date axis.
min_len_plot = min(len(y_val), len(y_pred_huber), len(y_pred_lin), len(y_pred_rf))
val_date_aligned = val['date'].reset_index(drop=True)[:min_len_plot]
y_val_plot_aligned = y_val.reset_index(drop=True)[:min_len_plot]
y_pred_huber_plot_aligned = y_pred_huber[:min_len_plot]
y_pred_lin_plot_aligned = y_pred_lin[:min_len_plot]
y_pred_rf_plot_aligned = y_pred_rf[:min_len_plot]

# Per-model line: (values, extra plot keyword arguments).
_huber_line = (y_pred_huber_plot_aligned,
               {'label': "Huber Predicted", 'color': 'green'})
_linear_line = (y_pred_lin_plot_aligned,
                {'label': "Linear Predicted", 'color': 'red', 'linestyle': '--'})
_rf_line = (y_pred_rf_plot_aligned,
            {'label': "Random Forest Predicted", 'color': 'purple', 'linestyle': ':'})


def _plot_validation_comparison(title, predicted_lines):
    """Draw the actual validation prices plus the given prediction lines."""
    plt.figure(figsize=(14,7))
    plt.plot(val_date_aligned, y_val_plot_aligned, label="Actual", linewidth=2, color='blue')
    for values, line_kwargs in predicted_lines:
        plt.plot(val_date_aligned, values, linewidth=1.5, alpha=0.8, **line_kwargs)
    plt.title(title)
    plt.xlabel("Date")
    plt.ylabel("Price")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()


_plot_validation_comparison("Actual vs Linear Predicted Prices", [_linear_line])
_plot_validation_comparison("Actual vs Random Forest Predicted Prices", [_rf_line])
_plot_validation_comparison(
    "Actual vs Predicted Prices (Huber vs Linear vs Random Forest)",
    [_huber_line, _linear_line, _rf_line],
)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from sklearn.preprocessing import MinMaxScaler

# LSTM Model Data Preparation
lstm_feature_cols = ['open', 'high', 'low', 'close', 'Volume BTC', 'Volume USD', 'SMA_7', 'SMA_30', 'EMA_7', 'EMA_30', 'RSI', 'MACD', 'MACD_signal', 'day', 'month', 'year']

# Number of past rows fed to the LSTM per sample. Defined before scaling
# because it determines which rows belong to the training portion.
look_back_window = 30

# The scalers are fit on the training rows only. Fitting them on all of
# df_final would let the min/max of the validation and test periods leak into
# the training inputs and targets.
# The training sequences are the first 70% of all sequences (the same share
# the sequence split uses); together they touch the first
# look_back_window + <number of training sequences> rows of df_final.
lstm_train_fraction = 0.7
n_lstm_sequences = len(df_final) - look_back_window
lstm_scaler_fit_rows = look_back_window + int(n_lstm_sequences * lstm_train_fraction)

lstm_feature_values = df_final[lstm_feature_cols]
lstm_target_values = df_final['target'].values.reshape(-1, 1)

# Scale features and target separately for LSTM: fit on the training rows,
# then transform every row. Values from later rows may fall outside [0, 1].
scaler_lstm_features = MinMaxScaler(feature_range=(0, 1))
scaler_lstm_features.fit(lstm_feature_values.iloc[:lstm_scaler_fit_rows])
scaled_lstm_features = scaler_lstm_features.transform(lstm_feature_values)

scaler_lstm_target = MinMaxScaler(feature_range=(0, 1))
scaler_lstm_target.fit(lstm_target_values[:lstm_scaler_fit_rows])
scaled_lstm_target = scaler_lstm_target.transform(lstm_target_values)

def create_lstm_sequences(features_data, target_data, look_back):
    """Build sliding windows of features and the target that follows each.

    Args:
        features_data: 2-D array indexed as [row, feature].
        target_data: 2-D array; column 0 holds the target for each row.
        look_back: number of consecutive rows in each window.

    Returns:
        Tuple ``(X, y)``: ``X[i]`` is rows ``i .. i + look_back - 1`` of
        ``features_data`` and ``y[i]`` is ``target_data[i + look_back, 0]``.
        Both arrays are empty when there are no more rows than ``look_back``.
    """
    window_starts = range(len(features_data) - look_back)
    X_seq = [features_data[start:start + look_back, :] for start in window_starts]
    y_seq = [target_data[start + look_back, 0] for start in window_starts]
    return np.array(X_seq), np.array(y_seq)

# Build the (samples, look_back_window, features) windows and their targets;
# targets are reshaped to a column vector.
X_lstm_sequences, y_lstm_single_targets = create_lstm_sequences(
    scaled_lstm_features, scaled_lstm_target, look_back_window
)
y_lstm_single_targets = y_lstm_single_targets.reshape(-1, 1)

# Split the sequences in order: 70% train, 20% validation, remainder test.
train_ratio = 0.7
val_ratio = 0.2

n_sequences = len(X_lstm_sequences)
train_size_lstm_split = int(n_sequences * train_ratio)
val_size_lstm_split = int(n_sequences * val_ratio)

train_rows = slice(None, train_size_lstm_split)
val_rows = slice(train_size_lstm_split, train_size_lstm_split + val_size_lstm_split)
test_rows = slice(train_size_lstm_split + val_size_lstm_split, None)

x_train_lstm, y_train_lstm = X_lstm_sequences[train_rows], y_lstm_single_targets[train_rows]
x_val_lstm, y_val_lstm = X_lstm_sequences[val_rows], y_lstm_single_targets[val_rows]
x_test_lstm, y_test_lstm = X_lstm_sequences[test_rows], y_lstm_single_targets[test_rows]

# LSTM model: two stacked LSTM layers (64 -> 32 units), each followed by 20%
# dropout, then a 32-unit ReLU layer and a single linear output.
num_features_lstm = scaled_lstm_features.shape[1]

model = Sequential()
model.add(Input(shape=(look_back_window, num_features_lstm)))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(32, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

# Train for 20 epochs, tracking the loss on the validation windows.
history = model.fit(
    x_train_lstm,
    y_train_lstm,
    validation_data=(x_val_lstm, y_val_lstm),
    epochs=20,
    batch_size=15,
    verbose=1,
)

# Predict the validation windows (still in scaled units).
y_pred_lstm_scaled = model.predict(x_val_lstm)

# Map predictions and true values back to the original target scale.
pred_val_lstm, true_val_lstm = (
    scaler_lstm_target.inverse_transform(scaled_values)
    for scaled_values in (y_pred_lstm_scaled, y_val_lstm)
)

# evaluate_model is given 1-D arrays.
evaluate_model("LSTM", true_val_lstm.flatten(), pred_val_lstm.flatten())

# Dates for the plot: sequence i takes its target from row i + look_back_window
# of df_final, so the validation rows start look_back_window rows after the
# training sequences end.
val_dates_start_idx_df = look_back_window + train_size_lstm_split
val_dates_end_idx_df = val_dates_start_idx_df + val_size_lstm_split
val_dates_for_lstm_plot = df_final['date'].iloc[val_dates_start_idx_df:val_dates_end_idx_df]

# Cut dates and values to a common length before plotting.
min_len_plot_lstm_final = min(len(val_dates_for_lstm_plot), len(true_val_lstm.flatten()))
val_dates_for_lstm_plot_aligned = val_dates_for_lstm_plot.head(min_len_plot_lstm_final)
true_val_lstm_aligned = true_val_lstm.flatten()[:min_len_plot_lstm_final]
pred_val_lstm_aligned = pred_val_lstm.flatten()[:min_len_plot_lstm_final]

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(val_dates_for_lstm_plot_aligned, true_val_lstm_aligned,
        label="Actual (LSTM Val)", linewidth=1, color='blue')
ax.plot(val_dates_for_lstm_plot_aligned, pred_val_lstm_aligned,
        label="Predicted (LSTM Val)", linewidth=1, alpha=0.8, color='orange')
ax.set_title("LSTM Actual vs Predicted Bitcoin Prices (Validation Set)")
ax.set_xlabel("Date")
ax.set_ylabel("Price (USD)")
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()


In [None]:
# Test-set predictions of the two linear models (standardized features).
y_pred_test_huber = huber.predict(x_test_scaled)
y_pred_test_lin = lin.predict(x_test_scaled)

evaluate_model("Huber Regression - TEST SET", y_test, y_pred_test_huber)
evaluate_model("Linear Regression - TEST SET", y_test, y_pred_test_lin)

# Random Forest predicts a percentage change; convert it to a price with
# close * (1 + pct / 100), using a 0-based index so positions line up.
y_test_pct = test['target_pct']
y_pred_rf_test_pct = rf.predict(x_test)

test_close = test['close'].reset_index(drop=True)


def _test_pct_to_price(pct_change):
    """Turn a percentage change into a price relative to the test close."""
    return test_close * (1 + pct_change / 100)


y_pred_test_rf = _test_pct_to_price(y_pred_rf_test_pct)
y_test_price = _test_pct_to_price(y_test_pct.reset_index(drop=True))

evaluate_model("Random Forest - TEST SET (Pct Change)", y_test_price, y_pred_test_rf)

In [None]:
# Huber residuals on the validation set (actual minus predicted).
actual_val_prices = y_val.reset_index(drop=True)
residuals = actual_val_prices - y_pred_huber

# Residual against actual price; the dashed line marks zero error.
fig, ax = plt.subplots(figsize=(14, 5))
ax.scatter(actual_val_prices, residuals, alpha=0.5)
ax.axhline(0, color='red', linestyle='--')
ax.set_xlabel('Actual Price')
ax.set_ylabel('Residual (Actual - Predicted)')
ax.set_title('Huber Regression: Residual Plot')
plt.show()

# Histogram (50 bins) with KDE overlay of the residuals.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(residuals, bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Prediction Errors')
ax.set_xlabel('Residual')
plt.show()

In [None]:
# Summarise the date span and close-price statistics of each split.
print("=" * 60)
print("DATA SPLIT ANALYSIS")
print("=" * 60)
for lead, split_name, split_df in (("", "Train", train), ("\n", "Val", val), ("\n", "Test", test)):
    split_close = split_df['close']
    print(f"{lead}{split_name}: {split_df['date'].min()} to {split_df['date'].max()}")
    print(f"  Price range: ${split_close.min():.2f} - ${split_close.max():.2f}")
    print(f"  Mean price: ${split_close.mean():.2f}")

# Range of the price target in each split.
print(f"\nTarget ranges:")
for split_name, split_target in (("Train", y_train), ("Val", y_val), ("Test", y_test)):
    print(f"{split_name} target: ${split_target.min():.2f} - ${split_target.max():.2f}")

# Close price coloured by split; dashed lines mark the last train and last
# validation dates.
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(train['date'], train['close'], label='Train', alpha=0.7)
ax.plot(val['date'], val['close'], label='Val', alpha=0.7)
ax.plot(test['date'], test['close'], label='Test', alpha=0.7, linewidth=2)
for boundary in (train['date'].max(), val['date'].max()):
    ax.axvline(boundary, color='red', linestyle='--', alpha=0.5)
ax.set_title('Train/Val/Test Split Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('BTC Price')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

# Test set: actual target against the Huber and Linear predictions.
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(test['date'], y_test.reset_index(drop=True), label='Actual', linewidth=2, color='blue')
ax.plot(test['date'], y_pred_test_huber, label='Huber Predicted', alpha=0.7, color='green')
ax.plot(test['date'], y_pred_test_lin, label='Linear Predicted', alpha=0.7, color='red')
ax.set_title('Test Set: Actual vs Predicted')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Side-by-side bar charts of MAE and R² per model.
# NOTE(review): the metric values below are literals, not read from the
# fitted models -- presumably copied from an earlier run; re-check after any
# retraining.
models = ['Huber\nRegression', 'Linear\nRegression', 'Random Forest\n(Pct Change)', 'LSTM']

# Validation set metrics
val_mae = [1639.24, 1646.39, 1631.65, 3193.65]
val_r2 = [0.92, 0.92, 0.92, 0.81]

# Test set metrics (no LSTM entry)
test_mae = [4028.37, 4168.63, 4961.29, None]
test_r2 = [0.91, 0.90, 0.86, None]

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
mae_ax, r2_ax = axes

x = np.arange(len(models))
width = 0.35
# Positions of the three models that have both validation and test values.
x_with_test = x[:3]
label_font = {'fontsize': 12, 'fontweight': 'bold'}
title_font = {'fontsize': 14, 'fontweight': 'bold'}

# MAE panel: validation bars for all four models, test bars for the first three.
mae_ax.bar(x - width/2, val_mae, width, label='Validation', color='skyblue', edgecolor='black')
mae_ax.bar(x_with_test + width/2, test_mae[:3], width, label='Test', color='coral', edgecolor='black')
mae_ax.set_xlabel('Model', **label_font)
mae_ax.set_ylabel('MAE (USD)', **label_font)
mae_ax.set_title('Mean Absolute Error Comparison', **title_font)
mae_ax.set_xticks(x)
mae_ax.set_xticklabels(models)
mae_ax.legend()
mae_ax.grid(axis='y', alpha=0.3)

# R² panel: paired bars for the first three models, one hatched bar for LSTM.
r2_ax.bar(x_with_test - width/2, val_r2[:3], width, label='Validation', color='lightgreen', edgecolor='black')
r2_ax.bar(x_with_test + width/2, test_r2[:3], width, label='Test', color='gold', edgecolor='black')
r2_ax.bar(x[3], val_r2[3], width, label='LSTM (Val only)', color='lightgreen', edgecolor='black', hatch='//')
r2_ax.set_xlabel('Model', **label_font)
r2_ax.set_ylabel('R² Score', **label_font)
r2_ax.set_title('R² Score Comparison', **title_font)
r2_ax.set_xticks(x)
r2_ax.set_xticklabels(models)
r2_ax.legend()
r2_ax.set_ylim([0, 1])
r2_ax.grid(axis='y', alpha=0.3)
plt.tight_layout()

# Caption placed below the panels, then the figure is saved and shown.
summary_text = """Test Set Performance (Feb 2024 - Jun 2025):

Price Range: $42,570 - $108,980  |  • Best Model: Huber Regression (R²=0.91, MAE=$4,028)  |  • All models achieve <7% prediction error"""

caption_box = {'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.5, 'pad': 10}
fig.text(0.5, -0.02, summary_text, ha='center', fontsize=10, bbox=caption_box)
plt.savefig('model_comparison_with_summary.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Re-sort by date (oldest first), renumber the rows, and preview the result.
df_final = df_final.sort_values('date').reset_index(drop=True)
print("df_final sorted by date in ascending order.")
print(df_final.loc[:, ['date', 'close']].head())

In [None]:
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# --- LSTM Model Definition ---
# Same architecture as before: LSTM(64) -> LSTM(32) with 20% dropout after
# each, a 32-unit ReLU layer, and one linear output for the scaled target.
num_features_lstm = scaled_lstm_features.shape[1]  # feature count of the scaled data

model = Sequential()
model.add(Input(shape=(look_back_window, num_features_lstm)))  # (timesteps, num_features)
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(32, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

# Stop after 8 epochs without val_loss improvement (restoring the best
# weights) and halve the learning rate after 4 such epochs.
es = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4)

# --- Model Training ---
history = model.fit(
    x_train_lstm,
    y_train_lstm,
    validation_data=(x_val_lstm, y_val_lstm),
    epochs=20,
    batch_size=14,
    callbacks=[es, rlr],
    verbose=1,
)

# --- Make Predictions ---
y_pred_lstm_scaled = model.predict(x_val_lstm)

# Sanity check in scaled units: mean of the validation targets against the
# mean of the validation predictions. The true-value side uses y_val_lstm so
# that both means cover the same validation windows (the full target array
# also contains the train and test windows).
print("y_val mean:", y_val_lstm.mean())
print("y_pred mean:", y_pred_lstm_scaled.mean())

# Inverse transform scaled predictions and true values back to original scale
pred_val_lstm = scaler_lstm_target.inverse_transform(y_pred_lstm_scaled)
true_val_lstm = scaler_lstm_target.inverse_transform(y_val_lstm)


# --- Evaluate Metrics & Plot Results ---
# inverse_transform returns (samples, 1) arrays; evaluate_model is given the
# flattened 1-D versions.
evaluate_model("LSTM (New Features)", true_val_lstm.flatten(), pred_val_lstm.flatten())

# Dates for the validation plot. Sequence i takes its target from row
# i + look_back_window of df_final, so the validation rows start at
# look_back_window + train_size_lstm_split.
val_dates_start_idx_df = look_back_window + train_size_lstm_split
val_dates_end_idx_df = val_dates_start_idx_df + val_size_lstm_split
val_dates_for_lstm_plot = df_final['date'].iloc[val_dates_start_idx_df:val_dates_end_idx_df]

# Cut dates and values to a common length in case they differ.
min_len_plot_lstm_final = min(len(val_dates_for_lstm_plot), len(true_val_lstm.flatten()))
val_dates_for_lstm_plot_aligned = val_dates_for_lstm_plot.head(min_len_plot_lstm_final)
true_val_lstm_aligned = true_val_lstm.flatten()[:min_len_plot_lstm_final]
pred_val_lstm_aligned = pred_val_lstm.flatten()[:min_len_plot_lstm_final]

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(val_dates_for_lstm_plot_aligned, true_val_lstm_aligned,
        label="Actual (LSTM Val)", linewidth=1, color='blue')
ax.plot(val_dates_for_lstm_plot_aligned, pred_val_lstm_aligned,
        label="Predicted (LSTM Val)", linewidth=1, alpha=0.8, color='orange')
ax.set_title("LSTM Actual vs Predicted Bitcoin Prices (Validation Set) with New Features")
ax.set_xlabel("Date")
ax.set_ylabel("Price (USD)")
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Longer history per sample: 60 rows instead of 30.
look_back_window = 60

# Rebuild the windows from the already-scaled arrays with the new length;
# targets are reshaped to a column vector.
X_lstm_sequences, y_lstm_single_targets = create_lstm_sequences(
    scaled_lstm_features, scaled_lstm_target, look_back_window
)
y_lstm_single_targets = y_lstm_single_targets.reshape(-1, 1)

# Re-split in order: 70% train, 20% validation, remainder test.
train_ratio = 0.7
val_ratio = 0.2

n_sequences = len(X_lstm_sequences)
train_size_lstm_split = int(n_sequences * train_ratio)
val_size_lstm_split = int(n_sequences * val_ratio)

train_rows = slice(None, train_size_lstm_split)
val_rows = slice(train_size_lstm_split, train_size_lstm_split + val_size_lstm_split)
test_rows = slice(train_size_lstm_split + val_size_lstm_split, None)

x_train_lstm, y_train_lstm = X_lstm_sequences[train_rows], y_lstm_single_targets[train_rows]
x_val_lstm, y_val_lstm = X_lstm_sequences[val_rows], y_lstm_single_targets[val_rows]
x_test_lstm, y_test_lstm = X_lstm_sequences[test_rows], y_lstm_single_targets[test_rows]

# --- LSTM Model Definition with new hyperparameters ---
# Wider network than before: LSTM 128 -> 64 (was 64 -> 32), dropout 0.3
# (was 0.2), and a 64-unit ReLU layer (was 32).
num_features_lstm = scaled_lstm_features.shape[1]

model_tuned = Sequential()
model_tuned.add(Input(shape=(look_back_window, num_features_lstm)))
model_tuned.add(LSTM(128, return_sequences=True))
model_tuned.add(Dropout(0.3))
model_tuned.add(LSTM(64, return_sequences=False))
model_tuned.add(Dropout(0.3))
model_tuned.add(Dense(64, activation='relu'))
model_tuned.add(Dense(1))

model_tuned.compile(optimizer='adam', loss='mean_squared_error')
model_tuned.summary()

# Early stopping as before; the learning rate is now cut to 20% after 5
# epochs without val_loss improvement.
es = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5)

# --- Model Training with new batch size (32 instead of 14) ---
history_tuned = model_tuned.fit(
    x_train_lstm,
    y_train_lstm,
    validation_data=(x_val_lstm, y_val_lstm),
    epochs=20,
    batch_size=32,
    callbacks=[es, rlr],
    verbose=1,
)

# --- Make Predictions ---
y_pred_lstm_scaled_tuned = model_tuned.predict(x_val_lstm)

# Map scaled predictions and true values back to the original target scale.
pred_val_lstm_tuned, true_val_lstm_tuned = (
    scaler_lstm_target.inverse_transform(scaled_values)
    for scaled_values in (y_pred_lstm_scaled_tuned, y_val_lstm)
)

# --- Evaluate Metrics & Plot Results ---
evaluate_model("LSTM (Tuned Hyperparameters)", true_val_lstm_tuned.flatten(), pred_val_lstm_tuned.flatten())

# Validation dates: offset by the window length plus the training sequences.
val_dates_start_idx_df = look_back_window + train_size_lstm_split
val_dates_end_idx_df = val_dates_start_idx_df + val_size_lstm_split
val_dates_for_lstm_plot = df_final['date'].iloc[val_dates_start_idx_df:val_dates_end_idx_df]

# Cut dates and values to a common length before plotting.
min_len_plot_lstm_final = min(len(val_dates_for_lstm_plot), len(true_val_lstm_tuned.flatten()))
val_dates_for_lstm_plot_aligned = val_dates_for_lstm_plot.head(min_len_plot_lstm_final)
true_val_lstm_aligned = true_val_lstm_tuned.flatten()[:min_len_plot_lstm_final]
pred_val_lstm_aligned = pred_val_lstm_tuned.flatten()[:min_len_plot_lstm_final]

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(val_dates_for_lstm_plot_aligned, true_val_lstm_aligned,
        label="Actual (LSTM Val)", linewidth=1, color='blue')
ax.plot(val_dates_for_lstm_plot_aligned, pred_val_lstm_aligned,
        label="Predicted (LSTM Val)", linewidth=1, alpha=0.8, color='green')
ax.set_title("LSTM Actual vs Predicted Bitcoin Prices (Validation Set) with Tuned Features")
ax.set_xlabel("Date")
ax.set_ylabel("Price (USD)")
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
banner = "="*60
print("\n" + banner)
print("FINAL TRUTH: TUNED LSTM ON TEST SET")
print(banner)

# 1. Predict the held-out test windows with the tuned model. x_test_lstm was
#    built with the same look_back_window the tuned model was defined with.
y_pred_lstm_test_scaled_tuned = model_tuned.predict(x_test_lstm)

# 2. Map predictions and true values back to the original target scale.
pred_test_lstm_tuned, true_test_lstm_tuned = (
    scaler_lstm_target.inverse_transform(scaled_values)
    for scaled_values in (y_pred_lstm_test_scaled_tuned, y_test_lstm)
)

# 3. Metrics on the test windows.
evaluate_model("LSTM (Tuned) - TEST SET", true_test_lstm_tuned.flatten(), pred_test_lstm_tuned.flatten())

# 4. Plot against the matching dates: the test rows start after the window
#    offset plus the train and validation sequences.
test_dates_start = look_back_window + train_size_lstm_split + val_size_lstm_split
test_dates_stop = test_dates_start + len(true_test_lstm_tuned)
test_dates_slice = df_final['date'].iloc[test_dates_start:test_dates_stop]

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(test_dates_slice, true_test_lstm_tuned.flatten(),
        label="Actual Price", color='blue', linewidth=2)
ax.plot(test_dates_slice, pred_test_lstm_tuned.flatten(),
        label="LSTM Tuned Prediction", color='gold', linewidth=1.5, alpha=0.9)
ax.set_title("FINAL EXAM: Tuned LSTM on Unseen Test Data (2024-2025)")
ax.set_xlabel("Date")
ax.set_ylabel("Price (USD)")
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

## Implement Time Series Cross-Validation


Implement time series cross-validation using `TimeSeriesSplit` with the optimal `look_back_window` (60) to evaluate the LSTM model. Collect MAE, MSE, RMSE, and R^2 for each fold.


In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Window length (in rows) used for every LSTM input sequence.
optimal_look_back_window = 60
print(f"Optimal look_back_window set to: {optimal_look_back_window}")

# Independent [0, 1] scalers for the inputs and for the target. Both are fitted
# on the training split only, so no statistics from later data enter the scaling.
scaler_tscv_features = MinMaxScaler(feature_range=(0, 1))
scaler_tscv_target = MinMaxScaler(feature_range=(0, 1))

x_train_scaled = scaler_tscv_features.fit_transform(train[feature_cols])

# The target scaler expects a 2-D column, hence the added axis.
_train_target_column = train['target'].values[:, np.newaxis]
scaled_tscv_target = scaler_tscv_target.fit_transform(_train_target_column)

# One LSTM input feature per scaled column.
num_features_lstm = x_train_scaled.shape[-1]
print(f"Number of features for LSTM: {num_features_lstm}")
print("MinMaxScalers initialized and fitted for features and target.")

In [None]:
# =============================================================================
# FIX: Remove ANY remaining NaN values before LSTM
# =============================================================================

_rule = "=" * 60
print(_rule)
print("PREPARING DATA FOR LSTM")
print(_rule)

# Columns the LSTM actually consumes: every feature plus the target.
_model_columns = feature_cols + ['target']

print(f"Rows before NaN removal: {len(df_final)}")
print("NaN values per column:")
print(df_final[_model_columns].isna().sum())

# Drop every row that is missing a feature or the target, then renumber the
# index so positional and label indexing agree again.
df_final = df_final.dropna(subset=_model_columns).reset_index(drop=True)

_remaining_nan = df_final[_model_columns].isna().sum().sum()
print(f"\nRows after NaN removal: {len(df_final)}")
print(f"Remaining NaN: {_remaining_nan}")

# Hard stop if anything slipped through the cleanup.
assert _remaining_nan == 0, "Still have NaN values!"

print("✅ All NaN values removed\n")

# =============================================================================
# LSTM TIME SERIES CROSS-VALIDATION
# =============================================================================

import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import BatchNormalization

# Length (in rows) of each input window fed to the LSTM.
optimal_look_back_window = 60
# Number of train/validation splits requested from TimeSeriesSplit.
n_splits = 2

# Width of one time step; used as the last dimension of the model's Input shape.
num_features_lstm = len(feature_cols)  # Should be 21

tscv = TimeSeriesSplit(n_splits=n_splits)
# NOTE(review): this rebinds the notebook-level name `df` (the raw CSV frame
# loaded at the top of the file) to a copy of the cleaned `df_final`.
df = df_final.copy()

# Reuse the create_lstm_sequences function
def create_lstm_sequences(features_data, target_data, look_back):
    """Slice aligned 2-D arrays into overlapping LSTM windows.

    Window ``k`` holds rows ``k .. k + look_back - 1`` of ``features_data`` and
    is paired with the scalar ``target_data[k + look_back, 0]``.

    Args:
        features_data: 2-D array, one row per time step.
        target_data: 2-D array whose first column holds the target.
        look_back: Number of consecutive rows in each window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. When there are more rows than
        ``look_back``, ``X`` has shape ``(n_windows, look_back, n_features)``
        and ``y`` has shape ``(n_windows,)``; otherwise both are empty arrays
        of shape ``(0,)``.
    """
    # Each window is identified by the row index of its target, i.e. the row
    # immediately after the window's last feature row.
    target_rows = range(look_back, len(features_data))
    windows = [features_data[row - look_back:row, :] for row in target_rows]
    labels = [target_data[row, 0] for row in target_rows]
    return np.array(windows), np.array(labels)

# One dict per completed fold (keys: 'fold', 'MAE', 'MSE', 'RMSE', 'R^2').
# Folds that hit one of the `continue` statements below add no entry.
fold_metrics = []

# Loop through each split. Every fold gets its own scalers and a freshly built
# model; the names `model`, `scaler_X` and `scaler_y` stay bound to the last
# fold's objects after the loop ends.
for fold, (train_index, val_index) in enumerate(tscv.split(df)):
    print(f"\n===== FOLD {fold + 1} / {n_splits} ====")

    # Split raw data (positional row selection)
    train_fold = df.iloc[train_index]
    val_fold = df.iloc[val_index]

    # DEBUG: Check for NaN in raw data
    print(f"NaN in train_fold features: {train_fold[feature_cols].isna().sum().sum()}")
    print(f"NaN in train_fold target: {train_fold['target'].isna().sum()}")
    print(f"NaN in val_fold features: {val_fold[feature_cols].isna().sum().sum()}")
    print(f"NaN in val_fold target: {val_fold['target'].isna().sum()}")

    # Initialize fresh scalers
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    # Fit on the training fold only, then apply the fitted scaling to the
    # validation fold so no validation statistics enter the scaling.
    X_train_scaled = scaler_X.fit_transform(train_fold[feature_cols])
    y_train_scaled = scaler_y.fit_transform(train_fold[['target']])

    X_val_scaled = scaler_X.transform(val_fold[feature_cols])
    y_val_scaled = scaler_y.transform(val_fold[['target']])

    # DEBUG: Check for NaN after scaling
    print(f"NaN after scaling X_train: {np.isnan(X_train_scaled).sum()}")
    print(f"NaN after scaling y_train: {np.isnan(y_train_scaled).sum()}")

    # Create LSTM sequences. Windows are built within each fold separately, so
    # the first `optimal_look_back_window` rows of the validation fold only
    # serve as input history and never appear as prediction targets.
    X_train_seq, y_train_seq = create_lstm_sequences(X_train_scaled, y_train_scaled, optimal_look_back_window)
    X_val_seq, y_val_seq = create_lstm_sequences(X_val_scaled, y_val_scaled, optimal_look_back_window)

    print(f"Train sequences: {X_train_seq.shape}, Val sequences: {X_val_seq.shape}")

    # A fold with no more rows than the look-back window yields no sequences.
    if X_train_seq.shape[0] == 0 or X_val_seq.shape[0] == 0:
        print(f"Skipping fold {fold + 1} due to insufficient data.")
        continue

    # Build LSTM model: two stacked 64-unit LSTM layers (each followed by
    # dropout), batch normalisation, a 32-unit ReLU layer and a linear output.
    model = Sequential([
        Input(shape=(optimal_look_back_window, num_features_lstm)),
        LSTM(64, return_sequences=True),
        Dropout(0.2),
        LSTM(64, return_sequences=False),
        Dropout(0.2),
        BatchNormalization(),
        Dense(32, activation='relu'),
        Dense(1, activation='linear')
    ])

    # Use LOWER learning rate; gradients are clipped to norm 1.0.
    model.compile(optimizer=Adam(learning_rate=0.0001, clipnorm=1.0), loss='huber')

    # Callbacks: both watch the validation loss.
    es = EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True)
    rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=9)

    # Train
    # NOTE(review): the validation sequences drive early stopping / LR
    # reduction here and are also the data the fold metrics are computed on
    # below, so the reported metrics may be optimistic.
    history = model.fit(
        X_train_seq, y_train_seq,
        epochs=20,
        batch_size=32,
        validation_data=(X_val_seq, y_val_seq),
        callbacks=[es, rlr],
        verbose=1
    )

    # Make predictions
    y_pred_scaled = model.predict(X_val_seq, verbose=0)

    # DEBUG: Check predictions
    print(f"NaN in predictions (scaled): {np.isnan(y_pred_scaled).sum()}")
    print(f"Inf in predictions (scaled): {np.isinf(y_pred_scaled).sum()}")

    # Inverse transform back to the target's original units. y_val_seq is 1-D
    # (one scalar per window), so it is reshaped to a column first.
    y_pred = scaler_y.inverse_transform(y_pred_scaled)
    y_true = scaler_y.inverse_transform(y_val_seq.reshape(-1, 1))

    # DEBUG: Check after inverse transform
    print(f"NaN in y_pred: {np.isnan(y_pred).sum()}")
    print(f"NaN in y_true: {np.isnan(y_true).sum()}")
    print(f"Inf in y_pred: {np.isinf(y_pred).sum()}")

    # If NaN or Inf exists, skip this fold (y_true is checked for NaN only)
    if np.isnan(y_pred).any() or np.isnan(y_true).any() or np.isinf(y_pred).any():
        print(f"⚠️ Skipping fold {fold + 1} due to NaN/Inf in predictions")
        continue

    # Calculate metrics on the inverse-transformed values
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    print(f"Fold {fold + 1} Metrics - MAE: {mae:.2f}, MSE: {mse:.2f}, RMSE: {rmse:.2f}, R^2: {r2:.2f}")

    fold_metrics.append({
        'fold': fold + 1,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R^2': r2
    })

print("\n" + "="*60)
print("Time Series Cross-Validation completed.")
print("="*60)

# Summarise the folds that produced metrics; skipped folds left no entry.
if fold_metrics:
    metrics_df_tscv = pd.DataFrame(fold_metrics)
    print("\n--- Time Series Cross-Validation Results ---")
    print(metrics_df_tscv)
    print("\n--- Average Metrics Across Folds ---")
    # 'fold' is an identifier, not a metric: leave it out of the average so the
    # summary row does not report a meaningless "mean fold number".
    average_metrics = metrics_df_tscv.drop(columns='fold').mean()
    print(average_metrics.to_frame('Average Metrics').transpose())
else:
    print("⚠️ No folds completed successfully")

print("\n✅ LSTM Cross-Validation Complete!")

In [None]:
# =============================================================================
# LSTM TEST SET EVALUATION
# =============================================================================

print("\n" + "="*60)
print("LSTM TEST SET EVALUATION")
print("="*60)

# `model` below is the network left over from the cross-validation loop, and it
# was trained on inputs/targets scaled by that loop's `scaler_X` / `scaler_y`.
# The test data must go through those same fitted scalers: fitting new
# MinMaxScalers on `train` yields different min/max parameters, which would
# feed the network inputs on a scale it never saw and inverse-transform its
# outputs with the wrong range.
# NOTE(review): assumes the last CV fold was not skipped, so `model`,
# `scaler_X` and `scaler_y` all belong to the same fold — confirm in the CV log.
# NOTE(review): the CV folds are cut from `df_final`; if `test` covers rows that
# were in a CV validation fold, early stopping has already seen them — confirm.
scaler_X_test = scaler_X
scaler_y_test = scaler_y

# Kept under their original names in case later cells read them. The scalers
# are only applied here, never re-fitted.
X_train_for_test = scaler_X_test.transform(train[feature_cols])
y_train_for_test = scaler_y_test.transform(train[['target']])

# Transform test data with the scaling the model was trained under
X_test_scaled = scaler_X_test.transform(test[feature_cols])
y_test_scaled = scaler_y_test.transform(test[['target']])

# Create LSTM sequences for test set
X_test_seq, y_test_seq = create_lstm_sequences(
    X_test_scaled,
    y_test_scaled,
    optimal_look_back_window
)

print(f"Test sequences shape: {X_test_seq.shape}")

if X_test_seq.shape[0] > 0:
    # Make predictions using the last trained model from CV
    y_pred_test_scaled = model.predict(X_test_seq, verbose=0)

    # Check for NaN
    if np.isnan(y_pred_test_scaled).any():
        print("⚠️ NaN found in test predictions!")
    else:
        # Inverse transform to get values in the target's original units.
        # y_test_seq is 1-D, so it is reshaped to a column first.
        y_pred_test = scaler_y_test.inverse_transform(y_pred_test_scaled)
        y_test_true = scaler_y_test.inverse_transform(y_test_seq.reshape(-1, 1))

        # Evaluate
        evaluate_model("LSTM - TEST SET", y_test_true.flatten(), y_pred_test.flatten())

        # Visualize test set predictions
        # Get corresponding dates (skip first look_back_window rows, which only
        # serve as input history for the first prediction)
        test_dates_lstm = test['date'].iloc[optimal_look_back_window:optimal_look_back_window+len(y_test_true)].reset_index(drop=True)

        plt.figure(figsize=(14,6))
        plt.plot(test_dates_lstm, y_test_true.flatten(), label='Actual', linewidth=2, color='blue')
        plt.plot(test_dates_lstm, y_pred_test.flatten(), label='LSTM Predicted', linewidth=1.5, alpha=0.8, color='orange')
        plt.title('LSTM: Test Set Predictions (2024-2025)')
        plt.xlabel('Date')
        plt.ylabel('Price (USD)')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()
else:
    print("⚠️ Not enough test data for LSTM sequences")

print("\n✅ LSTM Test Evaluation Complete!")

In [None]:
print("\n--- Descriptive Statistics for 'close' price ---")
for split_name, split_frame in (('Train', train), ('Validation', val), ('Test', test)):
    close_prices = split_frame['close']
    print(f"\n{split_name} Set:")
    # Same four summary statistics for every split, printed in a fixed order.
    for stat_label, stat_value in (
        ("Mean", close_prices.mean()),
        ("Standard Deviation", close_prices.std()),
        ("Min", close_prices.min()),
        ("Max", close_prices.max()),
    ):
        print(f"  {stat_label} 'close' price: ${stat_value:.2f}")

print("\n--- Temporal Ranges ---")
for range_label, split_frame in (('Train', train), ('Val', val), ('Test', test)):
    split_dates = split_frame['date']
    print(f"{range_label} dates: {split_dates.min()} to {split_dates.max()}")

# Features whose distributions are compared across the three splits
selected_features = ['close_lag_1', 'Volume BTC', 'RSI']

# (frame, colour, legend label) for each overlaid histogram, in draw order
_hist_layers = (
    (train, 'blue', 'Train'),
    (val, 'green', 'Validation'),
    (test, 'red', 'Test'),
)

print("\n--- Histograms for Selected Features Across Splits ---")
for feature in selected_features:
    plt.figure(figsize=(10, 5))
    for split_frame, layer_color, layer_label in _hist_layers:
        sns.histplot(split_frame[feature], color=layer_color, label=layer_label, kde=True, stat='density', alpha=0.5)
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Density')
    plt.legend()
    plt.show()

In [None]:
# Close price over time, one line per split, drawn in chronological split order.
plt.figure(figsize=(14, 6))
for split_frame, line_options in (
    (train, {'label': 'Train', 'color': 'blue'}),
    (val, {'label': 'Validation', 'color': 'green'}),
    # The test line is drawn thicker than the other two.
    (test, {'label': 'Test', 'color': 'red', 'linewidth': 2}),
):
    plt.plot(split_frame['date'], split_frame['close'], alpha=0.7, **line_options)

plt.title('BTC Close Price Over Time: Train, Validation, and Test Splits')
plt.xlabel('Date')
plt.ylabel('BTC Price')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Create log returns
# Number of rows the target looks ahead.
forecast_horizon = 7

# Create log returns: target[t] = log(close[t + horizon]) - log(close[t])
df_final['log_price'] = np.log(df_final['close'])
df_final['target'] = df_final['log_price'].shift(-forecast_horizon) - df_final['log_price']

# The last `forecast_horizon` rows have no future price, so their target is NaN.
# Drop only rows that are missing a model input or the target (the same rule as
# the earlier NaN cleanup); a bare dropna() would also discard rows whose NaN
# sits in a column the model never reads, leaving gaps inside the windows.
df_final = df_final.dropna(subset=feature_cols + ['target']).reset_index(drop=True)

# `feature_cols` must already be defined by an earlier cell.
X_all = df_final[feature_cols].values
y_all = df_final['target'].values.reshape(-1, 1)  # 2-D column for the scaler

from sklearn.preprocessing import StandardScaler

n_splits = 2   # or 5
tscv = TimeSeriesSplit(n_splits=n_splits)

# Building LSTM sequences
def create_sequences(features, target, look_back):
    """Build overlapping look-back windows and their matching targets.

    Window ``k`` is ``features[k : k + look_back]`` and its label is
    ``target[k + look_back]``, i.e. the target entry of the row that
    immediately follows the window.

    Args:
        features: Sequence/array indexed by time step along its first axis.
        target: Sequence/array aligned with ``features`` along the first axis.
        look_back: Number of consecutive time steps per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with one entry per window. Both are
        empty arrays of shape ``(0,)`` when ``features`` has no more than
        ``look_back`` rows.
    """
    window_starts = range(len(features) - look_back)
    windows = [features[start:start + look_back] for start in window_starts]
    labels = [target[start + look_back] for start in window_starts]
    return np.array(windows), np.array(labels)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from tensorflow.keras.layers import Input # Import Input layer for explicit shape definition
from tensorflow.keras.layers import BatchNormalization # Import BatchNormalization

# Rows per input window, the per-fold result tuples
# (fold, r2, mse, rmse, mae), and a 1-based fold counter for the log output.
look_back = 60
fold_results = []
fold_num = 1

# One iteration per TimeSeriesSplit fold: scale, window, train, score.
for train_index, val_index in tscv.split(X_all):
    print(f"\n-------------------")
    print(f"Processing Fold {fold_num}/{n_splits}")
    print(f"Train: {len(train_index)}, Val: {len(val_index)}")
    print(f"-------------------")

    # -----------------------------
    # 5A: Split data for this fold
    # -----------------------------
    X_train_raw = X_all[train_index]
    y_train_raw = y_all[train_index]

    X_val_raw = X_all[val_index]
    y_val_raw = y_all[val_index]

    # -----------------------------
    # 5B: Scale using ONLY training data
    # -----------------------------
    # Fresh scalers per fold; the validation rows are only transformed.
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()

    X_train_scaled = scaler_X.fit_transform(X_train_raw)
    y_train_scaled = scaler_y.fit_transform(y_train_raw)

    X_val_scaled = scaler_X.transform(X_val_raw)
    y_val_scaled = scaler_y.transform(y_val_raw)

    # -----------------------------
    # 5C: Create sequences
    # -----------------------------
    # Windows are built within each fold separately, so the first `look_back`
    # validation rows act only as input history and are never scored.
    X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, look_back)
    X_val_seq, y_val_seq     = create_sequences(X_val_scaled, y_val_scaled, look_back)

    # Handle cases where sequence creation might result in empty arrays
    if X_train_seq.shape[0] == 0 or X_val_seq.shape[0] == 0:
        print(f"Skipping fold {fold_num} due to insufficient data for sequence creation after look_back window.")
        # Keep the counter in step with the fold order even when skipping.
        fold_num += 1
        continue

    # -----------------------------
    # 5D: Build model (stable setup)
    # -----------------------------
    # Feature count is read from the sequence array itself.
    model = Sequential([
        Input(shape=(look_back, X_train_seq.shape[2])),
        LSTM(64, return_sequences=True), # First LSTM returns sequences
        BatchNormalization(), # Add BatchNormalization
        #Dropout(0.2),
        LSTM(64, return_sequences=False), # Last LSTM does not return sequences
        BatchNormalization(), # Add BatchNormalization
        #Dropout(0.2),
        Dense(32, activation='relu'), # Using relu activation
        Dense(1)
    ])

    # Adam with gradient-norm clipping and a Huber loss on the scaled target.
    model.compile(
        optimizer=Adam(learning_rate=0.0005, clipnorm=1.0),
        loss=Huber(delta=1.0)
    )

    # Both callbacks watch the validation loss.
    # NOTE(review): the same validation sequences are used for the metrics
    # below, so the reported scores may be optimistic.
    es = EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True)
    rlr = ReduceLROnPlateau(monitor='val_loss', patience=6, factor=0.5)

    # -----------------------------
    # 5E: Train
    # -----------------------------
    model.fit(
        X_train_seq, y_train_seq,
        validation_data=(X_val_seq, y_val_seq),
        epochs=95,
        batch_size=70,
        callbacks=[es, rlr],
        verbose=0
    )

    # -----------------------------
    # 5F: Predict + inverse transform
    # -----------------------------
    # Undo the target standardisation for both predictions and ground truth.
    y_pred_scaled = model.predict(X_val_seq, verbose=0)
    y_pred = scaler_y.inverse_transform(y_pred_scaled)
    y_true = scaler_y.inverse_transform(y_val_seq)

    # -----------------------------
    # 5G: Compute metrics
    # -----------------------------
    # Metrics are in the units of the un-scaled target (presumably the
    # log-return target built in the preceding cell — confirm).
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)

    fold_results.append((fold_num, r2, mse, rmse, mae))

    print(f"Fold {fold_num} — R²: {r2:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}")

    fold_num += 1



# Results summary: one line per fold that completed (skipped folds are absent).
print("\n====== FINAL RESULTS ======")
for fold, r2, mse, rmse, mae in fold_results:
    print(f"Fold {fold}: R²={r2:.4f}, RMSE={rmse:.4f}, MAE={mae:.4f}")