In [22]:
import yfinance as yf
import pandas as pd

# Download data
btc_data = yf.download("BTC-USD", start="2020-01-01", end=pd.to_datetime('today'))
eth_data = yf.download("ETH-USD", start="2020-01-01", end=pd.to_datetime('today'))

  btc_data = yf.download("BTC-USD", start="2020-01-01", end=pd.to_datetime('today'))
[*********************100%***********************]  1 of 1 completed
  eth_data = yf.download("ETH-USD", start="2020-01-01", end=pd.to_datetime('today'))
[*********************100%***********************]  1 of 1 completed


In [23]:
import numpy as np
from sklearn.model_selection import train_test_split

def create_and_split_data(df, lag_days=[1, 7, 30]):
    """Creates features (lags, MAs) and splits the data."""
    df_copy = df[['Close']].copy()

    # 1. Target Column: Price tomorrow
    # We want to predict the 'Close' price of the next day (shift(-1))
    df_copy['Target'] = df_copy['Close'].shift(-1)

    # 2. Lagged Features (Previous Prices)
    for day in lag_days:
        # Lag_1d is yesterday's price, Lag_7d is the price 7 days ago, etc.
        df_copy[f'Lag_{day}d'] = df_copy['Close'].shift(day)

    # 3. Simple Moving Averages (Trend Indicators)
    df_copy['SMA_7'] = df_copy['Close'].rolling(window=7).mean()
    df_copy['SMA_30'] = df_copy['Close'].rolling(window=30).mean()

    # 4. Drop initial rows with missing features (NaNs) and the last row (Target is NaN)
    df_copy.dropna(inplace=True)

    # Define X (features) and y (target)
    X = df_copy.drop(['Close', 'Target'], axis=1)
    y = df_copy['Target']

    # Split the data chronologically (Time Series split)
    split_point = int(len(X) * 0.8)  # Use the first 80% for training
    X_train, X_test = X[:split_point], X[split_point:]
    y_train, y_test = y[:split_point], y[split_point:]

    return X_train, X_test, y_train, y_test, X # Return full X for later prediction

# Apply to both data sets
X_train_btc, X_test_btc, y_train_btc, y_test_btc, X_full_btc = create_and_split_data(btc_data)
X_train_eth, X_test_eth, y_train_eth, y_test_eth, X_full_eth = create_and_split_data(eth_data)

print(f"BTC Training set size: {len(X_train_btc)} rows")

BTC Training set size: 1688 rows


In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib # For saving the model

# 1. Train BTC Model
print("Training BTC Random Forest Regressor...")
rf_btc = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=10)
rf_btc.fit(X_train_btc, y_train_btc)

# 2. Evaluate BTC Model
btc_preds = rf_btc.predict(X_test_btc)
print(f"BTC Test R-squared: {r2_score(y_test_btc, btc_preds):.4f}")

# 3. Train ETH Model (repeat for ETH)
print("Training ETH Random Forest Regressor...")
rf_eth = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=10)
rf_eth.fit(X_train_eth, y_train_eth)

# 4. Save the trained models (important for the UI)
joblib.dump(rf_btc, "rf_btc_predictor.pkl")
joblib.dump(rf_eth, "rf_eth_predictor.pkl")
print("Models saved successfully.")

Training BTC Random Forest Regressor...
BTC Test R-squared: -3.8597
Training ETH Random Forest Regressor...
Models saved successfully.


In [29]:
import streamlit as st
import joblib
import pandas as pd
import yfinance as yf
import numpy as np

# --- Helper Functions (Must replicate feature engineering) ---
@st.cache_data
def get_latest_data(ticker):
    # Fetch the last 60 days of data to ensure enough history for 30-day MA
    data = yf.download(ticker, period="60d", interval="1d")
    return data

def prepare_features_for_prediction(data):
    """
    Calculates the exact features the model was trained on,
    using the last row of the historical data.
    """
    df_copy = data[['Close']].copy()

    # Replicate Lag and SMA features exactly as in training
    lag_days = [1, 7, 30]
    for day in lag_days:
        df_copy[f'Lag_{day}d'] = df_copy['Close'].shift(day)

    df_copy['SMA_7'] = df_copy['Close'].rolling(window=7).mean()
    df_copy['SMA_30'] = df_copy['Close'].rolling(window=30).mean()

    # Get the features from the very last (current) day
    # This row has all the features, but a NaN for 'Target' (which is tomorrow's price)
    latest_features = df_copy.iloc[-1]

    # Convert to a DataFrame row format that the model expects
    features_df = latest_features.drop('Close').to_frame().T
    return features_df

# --- Streamlit Application ---
st.title("â‚¿ Crypto Price Predictor (Random Forest)")
st.markdown("Predicting tomorrow's closing price for BTC and ETH.")
st.markdown("---")

# Load Models
try:
    rf_btc = joblib.load("rf_btc_predictor.pkl")
    rf_eth = joblib.load("rf_eth_predictor.pkl")
except FileNotFoundError:
    st.error("Model files not found. Please run Step 3 (Training) first.")
    st.stop()

# --- Prediction Logic for BTC ---
st.header("Bitcoin (BTC) Prediction")
btc_data_latest = get_latest_data("BTC-USD")
btc_input_features = prepare_features_for_prediction(btc_data_latest)

btc_prediction = rf_btc.predict(btc_input_features)[0]
current_btc_price = btc_data_latest['Close'].iloc[-1]
delta_btc = btc_prediction - current_btc_price

st.metric(
    label=f"Predicted Close Price for Tomorrow (BTC)",
    value=f"${btc_prediction:,.2f}",
    delta=f"{delta_btc.iloc[0]:,.2f} ({delta_btc.iloc[0]/current_btc_price*100:.2f}%)"
)

st.caption(f"Current BTC Price: ${current_btc_price:,.2f}")

st.markdown("---")

# --- Prediction Logic for ETH ---
st.header("Ethereum (ETH) Prediction")
eth_data_latest = get_latest_data("ETH-USD")
eth_input_features = prepare_features_for_prediction(eth_data_latest)

eth_prediction = rf_eth.predict(eth_input_features)[0]
current_eth_price = eth_data_latest['Close'].iloc[-1]
delta_eth = eth_prediction - current_eth_price

st.metric(
    label=f"Predicted Close Price for Tomorrow (ETH)",
    value=f"${eth_prediction:,.2f}",
    delta=f"{delta_eth.iloc[0]:,.2f} ({delta_eth.iloc[0]/current_eth_price*100:.2f}%)"
)

st.caption(f"Current ETH Price: ${current_eth_price:,.2f}")

2025-11-10 21:12:59.903 No runtime found, using MemoryCacheStorageManager


TypeError: unsupported format string passed to Series.__format__

After installing `streamlit`, you can run the cell again to import the library.

In [None]:
from google.colab import drive
drive.mount('/content/drive')