In [39]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

In [40]:
df = pd.read_csv('data/btcusd_1-min_data.csv')
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7162237 entries, 0 to 7162236
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Timestamp  float64
 1   Open       float64
 2   High       float64
 3   Low        float64
 4   Close      float64
 5   Volume     float64
dtypes: float64(6)
memory usage: 327.9 MB
      Timestamp  Open  High   Low  Close  Volume
0  1.325412e+09  4.58  4.58  4.58   4.58     0.0
1  1.325412e+09  4.58  4.58  4.58   4.58     0.0
2  1.325412e+09  4.58  4.58  4.58   4.58     0.0
3  1.325412e+09  4.58  4.58  4.58   4.58     0.0
4  1.325412e+09  4.58  4.58  4.58   4.58     0.0


In [41]:
# The 'Timestamp' is in Unix format, let's convert it to a readable datetime format.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
# Set the timestamp as the index, which is standard practice for time-series data.
df.set_index('Timestamp', inplace=True)

In [42]:
print("\n--- Analyzing Missing Values (NaNs) ---")
# Check for null values in each column
nan_counts = df.isnull().sum()
nan_percentage = (nan_counts / len(df)) * 100

print("Count of missing values per column:")
print(nan_counts[nan_counts > 0])
print("\nPercentage of missing values per column:")
print(nan_percentage[nan_percentage > 0].round(2))

# df.fillna(method='ffill', inplace=True)
print(df.isnull().sum().sum())


--- Analyzing Missing Values (NaNs) ---
Count of missing values per column:
Series([], dtype: int64)

Percentage of missing values per column:
Series([], dtype: float64)
0


In [43]:
print("\n--- Resampling Data to Daily Frequency ---")
aggregation_rules = {
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum'
}

df_daily = df.resample('D').agg(aggregation_rules)
# Resampling can create days with no data (e.g., weekends in early years).
df_daily.fillna(method='ffill', inplace=True)

print(f"Resampling complete. New daily data shape: {df_daily.shape}")
print("\nFirst 5 rows of daily data:")
print(df_daily.head())


--- Resampling Data to Daily Frequency ---
Resampling complete. New daily data shape: (4975, 5)

First 5 rows of daily data:
            Open  High   Low  Close      Volume
Timestamp                                      
2012-01-01  4.58  4.84  4.58   4.84   10.000000
2012-01-02  4.84  5.00  4.84   5.00   10.100000
2012-01-03  5.00  5.32  5.00   5.29  107.085281
2012-01-04  5.29  5.57  4.93   5.57  107.233260
2012-01-05  5.57  6.46  5.57   6.42   70.328742



DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



In [44]:
print("\n--- Creating Exploratory Visualizations ---")

# Plot 1: Bitcoin Price (OHLC Candlestick Chart)
fig_price = go.Figure(data=[go.Candlestick(x=df_daily.index,
                open=df_daily['Open'],
                high=df_daily['High'],
                low=df_daily['Low'],
                close=df_daily['Close'])])

fig_price.update_layout(
    title='Bitcoin Price History (Daily)',
    xaxis_title='Date',
    yaxis_title='Price (USD)',
    xaxis_rangeslider_visible=False, # Hides the range slider at the bottom
    template='plotly_dark'
)
fig_price.show()

# Plot 2: Trading Volume Over Time
fig_volume = px.bar(df_daily, x=df_daily.index, y='Volume',
                    title='Bitcoin Trading Volume (Daily)',
                    labels={'Volume': 'Volume', 'index': 'Date'})

fig_volume.update_layout(
    template='plotly_dark'
)
fig_volume.show()


# Plot 3: A combined chart for Weighted Price and Volume
# Create a figure with a secondary y-axis
fig_combined = make_subplots(specs=[[{"secondary_y": True}]])

# Add Close Price trace instead of Weighted Price
fig_combined.add_trace(
    # Use the 'Close' column which exists in df_daily
    go.Scatter(x=df_daily.index, y=df_daily['Close'], name="Close Price", line=dict(color='cyan')), # <-- CORRECTED
    secondary_y=False,
)

# Add Volume trace
fig_combined.add_trace(
    # Use the 'Volume' column which exists in df_daily
    go.Bar(x=df_daily.index, y=df_daily['Volume'], name="Volume", marker_color='orange', opacity=0.5), # <-- CORRECTED
    secondary_y=True,
)

# Add figure title and axis labels
fig_combined.update_layout(
    title_text="Bitcoin Daily Close Price and Trading Volume", # <-- Title updated
    template='plotly_dark'
)
fig_combined.update_xaxes(title_text="Date")
fig_combined.update_yaxes(title_text="<b>Close Price (USD)</b>", secondary_y=False)
fig_combined.update_yaxes(title_text="<b>Volume</b>", secondary_y=True)

fig_combined.show()

print("\n--- EDA and Initial Preprocessing Complete! ---")
# The 'df_daily' DataFrame is now ready for the next phase (Feature Engineering).


--- Creating Exploratory Visualizations ---



--- EDA and Initial Preprocessing Complete! ---


# Feature Engineering

Momentum: How fast and in what direction is the price moving? (e.g., Returns, RSI, MACD)
Volatility: How much is the price fluctuating? (e.g., Bollinger Bands, ATR, Rolling Std Dev)
Trend: What is the underlying trend, ignoring short-term noise? (e.g., Rolling Averages)
Past Behavior: What was the price yesterday or last week? (e.g., Lag Features)

In [45]:
import pandas_ta as ta

# Create a copy to work on, which is good practice.
df_feat = df_daily.copy()

In [46]:
# --- 1. Price and Log Returns ---
# These features measure the percentage change from one period to the next.
# It helps the model understand momentum.
print("--- Creating Price and Log Returns ---")
# Percentage change in the close price from the previous day.
df_feat['price_return'] = df_feat['Close'].pct_change()

# Log returns are often preferred in financial modeling.
df_feat['log_return'] = np.log(df_feat['Close'] / df_feat['Close'].shift(1))

--- Creating Price and Log Returns ---


In [47]:
# --- 2. Technical Indicators ---
# This is where the pandas_ta library is being used.
print("--- Creating Technical Indicators ---")

# RSI (Relative Strength Index): A momentum oscillator that measures the speed and change of price movements.
# Typically, RSI > 70 is considered "overbought" and RSI < 30 is "oversold".
df_feat.ta.rsi(close='Close', length=14, append=True) # This adds a column named 'RSI_14'

# MACD (Moving Average Convergence Divergence): A trend-following momentum indicator.
# It creates three columns: the MACD line, the signal line, and the histogram.
df_feat.ta.macd(close='Close', fast=12, slow=26, signal=9, append=True)

# Bollinger Bands: Measures volatility. It consists of a middle band (moving average)
# and an upper and lower band (standard deviations away from the middle).
df_feat.ta.bbands(close='Close', length=20, std=2, append=True)

# ATR (Average True Range): A pure measure of volatility.
df_feat.ta.atr(high='High', low='Low', close='Close', length=14, append=True)


--- Creating Technical Indicators ---


Timestamp
2012-01-01            NaN
2012-01-02            NaN
2012-01-03            NaN
2012-01-04            NaN
2012-01-05            NaN
                 ...     
2025-08-10    2583.624564
2025-08-11    2704.008524
2025-08-12    2660.150772
2025-08-13    2811.211431
2025-08-14    3132.982043
Freq: D, Name: ATRr_14, Length: 4975, dtype: float64

In [48]:
# --- 3. Volume-Based Indicators ---
print("--- Creating Volume-Based Indicators ---")

# OBV (On-Balance Volume): Relates price and volume to show momentum.
# If today's close is higher, add today's volume. If lower, subtract it.
df_feat.ta.obv(close='Close', volume='Volume', append=True)

# VWAP (Volume-Weighted Average Price): The average price weighted by volume.
# For daily data, a rolling VWAP is common. pandas_ta can calculate this.
# Note: For daily data, VWAP is often calculated over a rolling window.
# The default ta.vwap() calculation is more suited for intraday, but we can use it here as a rolling feature.
df_feat.ta.vwap(high='High', low='Low', close='Close', volume='Volume', length=14, append=True)

--- Creating Volume-Based Indicators ---


Timestamp
2012-01-01         4.753333
2012-01-02         4.946667
2012-01-03         5.203333
2012-01-04         5.356667
2012-01-05         6.150000
                  ...      
2025-08-10    118370.000000
2025-08-11    119689.000000
2025-08-12    119541.666667
2025-08-13    122005.000000
2025-08-14    120021.333333
Freq: D, Name: VWAP_D, Length: 4975, dtype: float64

In [49]:
# --- 4. Lag Features ---
# These give the model direct information about past prices.
# Original plan was for hourly (t-24 = 1 day). Since we have daily data:
# t-1 = 1 day ago
# t-7 = 1 week ago
# t-30 = 1 month ago
print("--- Creating Lag Features ---")
lags = [1, 7, 30]
for lag in lags:
    df_feat[f'close_lag_{lag}'] = df_feat['Close'].shift(lag)


--- Creating Lag Features ---


In [50]:
# --- 5. Rolling Statistics ---
# These smooth out the data to identify trends and volatility over a window.
# Original plan was 24h, 7d, 30d. For daily data, we'll use 7d and 30d.
print("--- Creating Rolling Statistics ---")
windows = [7, 30]
for window in windows:
    # Rolling Mean (Moving Average)
    df_feat[f'close_roll_mean_{window}'] = df_feat['Close'].rolling(window=window).mean()
    # Rolling Standard Deviation (Volatility)
    df_feat[f'close_roll_std_{window}'] = df_feat['Close'].rolling(window=window).std()

--- Creating Rolling Statistics ---


In [51]:
print("\n--- Creating Target Variable (y) and Features (X) ---")
df_feat['target'] = df_feat['Close'].shift(-1)


--- Creating Target Variable (y) and Features (X) ---


In [52]:
# --- 6. Clean Up ---
# All these shift() and rolling() operations create NaN (Not a Number) values at the start of the DataFrame.
# We must remove these rows before we can train a model.
print(f"\nShape before dropping NaNs: {df_feat.shape}")
df_feat.dropna(inplace=True)
print(f"Shape after dropping NaNs: {df_feat.shape}")


Shape before dropping NaNs: (4975, 27)
Shape after dropping NaNs: (4937, 27)


In [53]:
# Define our features (X) and target (y)
y = df_feat['target']
X = df_feat.drop(columns='target')

In [54]:
# Display the final DataFrame with all the new features
print("\n--- Final DataFrame with Features (first 5 rows) ---")
pd.set_option('display.max_columns', None) # Show all columns
print(df_feat.head())


--- Final DataFrame with Features (first 5 rows) ---
            Open  High   Low  Close      Volume  price_return  log_return  \
Timestamp                                                                   
2012-02-03  6.26  6.35  5.93   6.29  283.382106      0.004792    0.004781   
2012-02-04  6.29  6.50  5.94   6.50   67.694994      0.033386    0.032841   
2012-02-05  6.50  6.50  5.70   5.70   49.866684     -0.123077   -0.131336   
2012-02-06  5.70  6.15  5.20   5.90   26.362078      0.035088    0.034486   
2012-02-07  5.90  5.90  5.50   5.51  151.424746     -0.066102   -0.068388   

               RSI_14  MACD_12_26_9  MACDh_12_26_9  MACDs_12_26_9  BBL_20_2.0  \
Timestamp                                                                       
2012-02-03  52.119945     -0.113606      -0.000737      -0.112869    5.134692   
2012-02-04  54.264430     -0.070310       0.034047      -0.104357    5.158416   
2012-02-05  45.841184     -0.099405       0.003962      -0.103367    5.121458   
2

In [55]:
# --- 7. Train/Test Split (Time-Based) ---

print("\n--- Performing Time-Based Train/Test Split ---")
# Define the split point (80% of the data for training)
split_percentage = 0.8
split_index = int(len(df_feat) * split_percentage)

# SPLIT 1: The full DataFrames (useful for Prophet and for reference)
train_df = df_feat.iloc[:split_index]
test_df = df_feat.iloc[split_index:]

# SPLIT 2: The X and y arrays (needed for most ML/DL models)
X_train = X.iloc[:split_index]
y_train = y.iloc[:split_index]

X_test = X.iloc[split_index:]
y_test = y.iloc[split_index:]


# --- 7. SUMMARY OF FINAL DATA ---
print("\n--- DATA PREPARATION COMPLETE ---")
print(f"Total rows in final dataset: {len(df_feat)}")
print("\nData for Standard ML/DL Models (X_train, y_train, etc.):")
print(f"  X_train shape: {X_train.shape} | y_train shape: {y_train.shape}")
print(f"  X_test shape:  {X_test.shape} | y_test shape:  {y_test.shape}")

print("\nData for Prophet Model (train_df):")
print(f"  train_df shape: {train_df.shape}")

print("\nData for ARIMA Model (y_train):")
print(f"  y_train can be used directly for training ARIMA.")


--- Performing Time-Based Train/Test Split ---

--- DATA PREPARATION COMPLETE ---
Total rows in final dataset: 4937

Data for Standard ML/DL Models (X_train, y_train, etc.):
  X_train shape: (3949, 26) | y_train shape: (3949,)
  X_test shape:  (988, 26) | y_test shape:  (988,)

Data for Prophet Model (train_df):
  train_df shape: (3949, 27)

Data for ARIMA Model (y_train):
  y_train can be used directly for training ARIMA.


In [56]:
train_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,price_return,log_return,RSI_14,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,BBL_20_2.0,BBM_20_2.0,BBU_20_2.0,BBB_20_2.0,BBP_20_2.0,ATRr_14,OBV,VWAP_D,close_lag_1,close_lag_7,close_lag_30,close_roll_mean_7,close_roll_std_7,close_roll_mean_30,close_roll_std_30,target
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
2012-02-03,6.26,6.35,5.93,6.29,283.382106,0.004792,0.004781,52.119945,-0.113606,-0.000737,-0.112869,5.134692,6.3345,7.534308,37.881692,0.481455,0.909106,243.558728,6.19,6.26,5.88,5.57,5.782857,0.482829,6.453333,0.548574,6.5
2012-02-04,6.29,6.5,5.94,6.5,67.694994,0.033386,0.032841,54.26443,-0.07031,0.034047,-0.104357,5.158416,6.302,7.445584,36.292727,0.58657,0.881987,311.253722,6.313333,6.29,4.91,6.42,6.01,0.362859,6.456,0.548601,5.7
2012-02-05,6.5,6.5,5.7,5.7,49.866684,-0.123077,-0.131336,45.841184,-0.099405,0.003962,-0.103367,5.121458,6.232,7.342542,35.639994,0.260477,0.875658,261.387038,5.966667,6.5,5.9,6.4,5.981429,0.380413,6.432667,0.565685,5.9
2012-02-06,5.7,6.15,5.2,5.9,26.362078,0.035088,0.034486,48.013771,-0.105113,-0.001397,-0.103716,5.106777,6.182,7.257223,34.785596,0.368864,0.881364,287.749116,5.75,5.7,5.58,6.8,6.027143,0.341356,6.402667,0.569385,5.51
2012-02-07,5.9,5.9,5.5,5.51,151.424746,-0.066102,-0.068388,44.283271,-0.139498,-0.028626,-0.110873,5.076539,6.1025,7.128461,33.62427,0.211246,0.844612,136.32437,5.636667,5.9,5.55,6.9,6.021429,0.350876,6.356333,0.58389,5.66


In [None]:
import os

# Create a directory to store the processed data if it doesn't exist
output_dir = 'processed_data'
os.makedirs(output_dir, exist_ok=True)

print(f"--- Saving processed data to '{output_dir}' directory ---")

# Save each DataFrame/Series to a separate pickle file
X_train.to_pickle(os.path.join(output_dir, 'X_train.pkl'))
y_train.to_pickle(os.path.join(output_dir, 'y_train.pkl'))
X_test.to_pickle(os.path.join(output_dir, 'X_test.pkl'))
y_test.to_pickle(os.path.join(output_dir, 'y_test.pkl'))
train_df.to_pickle(os.path.join(output_dir, 'train_df.pkl'))
test_df.to_pickle(os.path.join(output_dir, 'test_df.pkl'))

print("All data successfully saved.")


--- Saving processed data to 'processed_data' directory ---
All data successfully saved.
