In [2]:
# ============================================================================
# GLOBAL CONFIG - Change ticker only here
# ============================================================================
TICKER = "MSFT"   # <<-- Change ticker symbol here only one time

# ============================================================================
# CELL 1: Training Function
# ============================================================================
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    VotingRegressor,
    StackingRegressor
)
from sklearn.metrics import mean_absolute_error, r2_score
import yfinance as yf
from datetime import datetime, timedelta

def train_hourly_model(ticker=TICKER, days_back=90):
    """Train an hourly prediction model for a given ticker."""
    
    # 1. Download hourly stock data
    end = datetime.today()
    start = end - timedelta(days=days_back)
    print(f"Downloading {ticker} hourly data from {start:%Y-%m-%d} to {end:%Y-%m-%d}")
    
    df = yf.download(ticker, start=start.strftime('%Y-%m-%d'),
                     end=end.strftime('%Y-%m-%d'), interval="60m")

    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    df = df.reset_index()

    required_cols = ["Datetime", "Open", "High", "Low", "Close"]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Required columns {missing_cols} not found in data")

    df = df[required_cols]
    df["Next_Close"] = df["Close"].shift(-1)
    df = df.dropna().reset_index(drop=True)

    features = ["Open", "High", "Low", "Close"]
    X = df[features]
    y = df["Next_Close"]

    preprocessor = ColumnTransformer([("num", StandardScaler(), features)], remainder="drop")

    # Models
    lr    = LinearRegression()
    ridge = Ridge(alpha=1.0)
    lasso = Lasso(alpha=0.001, max_iter=30000)
    rf    = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
    gb    = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)
    voting_reg = VotingRegressor([("lr", lr), ("rf", rf), ("gb", gb)])
    weighted_voting_reg = VotingRegressor([("lr", lr), ("rf", rf), ("gb", gb)], weights=[1,2,2])
    stacking_reg = StackingRegressor([("lr", lr), ("rf", rf), ("gb", gb)], final_estimator=Ridge(alpha=1.0))

    models = {
        "Linear Regression": lr,
        "Ridge Regression": ridge,
        "Lasso Regression": lasso,
        "Random Forest": rf,
        "Gradient Boosting": gb,
        "Voting": voting_reg,
        "Weighted Voting": weighted_voting_reg,
        "Stacking": stacking_reg
    }

    # Time series CV
    tscv = TimeSeriesSplit(n_splits=5)
    results = {}
    for name, model in models.items():
        mae_scores, r2_scores = [], []
        pipeline = Pipeline([("scale", preprocessor), ("model", model)])
        for train_idx, test_idx in tscv.split(X):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            mae_scores.append(mean_absolute_error(y_test, y_pred))
            r2_scores.append(r2_score(y_test, y_pred))
        results[name] = {"MAE": np.mean(mae_scores), "R²": np.mean(r2_scores)}

    results_df = pd.DataFrame(results).T.sort_values(by="MAE")
    print("\nModel CV performance:")
    print(results_df)

    best_model_name = results_df.index[0]
    print(f"\n✅ Best model: {best_model_name}")
    best_model = models[best_model_name]
    final_pipeline = Pipeline([("scale", preprocessor), ("model", best_model)])
    final_pipeline.fit(X, y)

    model_filename = f"{ticker}_hour.joblib"
    joblib.dump(final_pipeline, model_filename)
    print(f"💾 Saved model as {model_filename}")

    pred_next_hour = final_pipeline.predict(df[features].iloc[-1:])[0]
    print(f"📈 Predicted next close for {ticker}: {pred_next_hour:.2f}")
    
    return model_filename, best_model_name, pred_next_hour

# Train model (uses global TICKER)
model_file, best_model, prediction = train_hourly_model(days_back=90)

# ============================================================================
# CELL 2: Prediction Function
# ============================================================================
def predict_next_hour(ticker=TICKER, model_filename=None):
    """Predict the next hour's closing price using a saved model."""
    if model_filename is None:
        model_filename = f"{ticker}_hour.joblib"
    try:
        pipeline = joblib.load(model_filename)
    except FileNotFoundError:
        print(f"❌ Model file {model_filename} not found. Train the model first.")
        return None
    
    end = datetime.today()
    start = end - timedelta(days=7)
    df = yf.download(ticker, start=start.strftime('%Y-%m-%d'),
                     end=end.strftime('%Y-%m-%d'), interval="60m")
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    df = df.reset_index()
    df = df[["Datetime", "Open", "High", "Low", "Close"]]

    latest_data = df.iloc[-1]
    prediction = pipeline.predict(df[["Open", "High", "Low", "Close"]].iloc[-1:])[0]
    next_hour = latest_data["Datetime"] + timedelta(hours=1)
    
    return {
        "ticker": ticker,
        "current_datetime": latest_data["Datetime"],
        "next_hour_datetime": next_hour,
        "current_close": latest_data["Close"],
        "predicted_next_close": prediction,
        "predicted_change": prediction - latest_data["Close"],
        "predicted_change_percent": ((prediction - latest_data["Close"]) / latest_data["Close"]) * 100,
        "model_file": model_filename
    }

def print_prediction_results(results):
    if results is None: return
    print("\n" + "="*50)
    print(f"📈 HOURLY PREDICTION FOR {results['ticker']}")
    print("="*50)
    print(f"Current Time: {results['current_datetime']}")
    print(f"Next Hour:    {results['next_hour_datetime']}")
    print(f"Current Close: ${results['current_close']:.2f}")
    print(f"Predicted Close: ${results['predicted_next_close']:.2f}")
    change, change_pct = results['predicted_change'], results['predicted_change_percent']
    arrow = "📈" if change > 0 else "📉"
    print(f"Predicted Change: {change:+.2f} ({change_pct:+.2f}%) {arrow}")
    print(f"Model: {results['model_file']}")
    print("="*50)

results = predict_next_hour()
print_prediction_results(results)

# ============================================================================
# CELL 3: Visualization
# ============================================================================
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_hourly_predictions(ticker=TICKER, days_back=7):
    end = datetime.today()
    start = end - timedelta(days=days_back)
    df = yf.download(ticker, start=start.strftime('%Y-%m-%d'),
                     end=end.strftime('%Y-%m-%d'), interval="60m")
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    df = df.reset_index()
    
    fig = make_subplots(rows=2, cols=1, 
                        subplot_titles=(f'{ticker} Hourly OHLC', 'Volume'),
                        vertical_spacing=0.1)
    fig.add_trace(go.Candlestick(x=df['Datetime'],
                                  open=df['Open'], high=df['High'],
                                  low=df['Low'], close=df['Close'],
                                  name='OHLC'), row=1, col=1)
    if 'Volume' in df.columns:
        fig.add_trace(go.Bar(x=df['Datetime'], y=df['Volume'], name='Volume'), row=2, col=1)
    fig.update_layout(title=f'{ticker} Hourly Data (Last {days_back} Days)', height=600)
    fig.show()

plot_hourly_predictions(renderer="browser")


Downloading MSFT hourly data from 2025-05-24 to 2025-08-22



YF.download() has changed argument auto_adjust default to True

[*********************100%***********************]  1 of 1 completed



Model CV performance:
                        MAE        R²
Lasso Regression   1.220132  0.866165
Linear Regression  1.221946  0.866137
Ridge Regression   1.309094  0.855270
Stacking           1.537097  0.814934
Voting             4.303039 -0.104024
Weighted Voting    5.077195 -0.539357
Random Forest      6.217069 -1.373773
Gradient Boosting  6.283333 -1.317594

✅ Best model: Lasso Regression
💾 Saved model as MSFT_hour.joblib
📈 Predicted next close for MSFT: 506.29



YF.download() has changed argument auto_adjust default to True

[*********************100%***********************]  1 of 1 completed


📈 HOURLY PREDICTION FOR MSFT
Current Time: 2025-08-21 19:30:00+00:00
Next Hour:    2025-08-21 20:30:00+00:00
Current Close: $506.02
Predicted Close: $506.05
Predicted Change: +0.03 (+0.01%) 📈
Model: MSFT_hour.joblib





TypeError: plot_hourly_predictions() got an unexpected keyword argument 'renderer'