In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    VotingRegressor,
    StackingRegressor
)
from sklearn.metrics import mean_absolute_error, r2_score
import yfinance as yf
from datetime import datetime
from pmdarima import auto_arima
from prophet import Prophet

# -------------------------------
# 1. Download Stock Data (AAPL)
# -------------------------------
ticker = "AAPL"
start = "2015-01-01"   # use longer history for Prophet
end = datetime.today().strftime('%Y-%m-%d')

df = yf.download(ticker, start=start, end=end, auto_adjust=False)
df.columns = df.columns.get_level_values(0)

# Keep OHLC
df = df.loc[:, ["Open", "High", "Low", "Close", "Volume"]].reset_index()

# Create daily returns
df["Return"] = df["Close"].pct_change()
df["Next_Return"] = df["Return"].shift(-1)
df["Next_Close"] = df["Close"].shift(-1)
df = df.dropna().reset_index(drop=True)

# -------------------------------
# 2. Features & Preprocessor (ML)
# -------------------------------
features = ["Open", "High", "Low", "Close"]
X = df[features]
y = df["Next_Close"]

preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), features)],
    remainder="passthrough"
)

# -------------------------------
# 3. Candidate ML Models
# -------------------------------
lr = LinearRegression()
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.001, max_iter=10000)
rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
gb = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)

voting_reg = VotingRegressor(estimators=[("lr", lr), ("rf", rf), ("gb", gb)])
weighted_voting_reg = VotingRegressor(
    estimators=[("lr", lr), ("rf", rf), ("gb", gb)],
    weights=[1, 2, 2]
)
stacking_reg = StackingRegressor(
    estimators=[("lr", lr), ("rf", rf), ("gb", gb)],
    final_estimator=Ridge(alpha=1.0)
)

models = {
    "Linear Regression": lr,
    "Ridge Regression": ridge,
    "Lasso Regression": lasso,
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "Voting": voting_reg,
    "Weighted Voting": weighted_voting_reg,
    "Stacking": stacking_reg
}

# -------------------------------
# 4. Evaluate ML Models
# -------------------------------
tscv = TimeSeriesSplit(n_splits=5)
results = {}

for name, model in models.items():
    mae_scores, r2_scores = [], []
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))
    
    results[name] = {"MAE": np.mean(mae_scores), "R²": np.mean(r2_scores)}

# -------------------------------
# 5. Auto ARIMA (on Returns)
# -------------------------------
try:
    arima_model = auto_arima(df["Return"].dropna(), seasonal=False, trace=False, suppress_warnings=True)
    arima_forecast = arima_model.predict(n_periods=1)[0]
    # Convert predicted return back to price
    arima_predicted_close = df["Close"].iloc[-1] * (1 + arima_forecast)
    results["ARIMA"] = {"MAE": np.nan, "R²": np.nan}
except Exception as e:
    print("ARIMA failed:", e)
    arima_predicted_close = None

# -------------------------------
# 6. Prophet (on Returns)
# -------------------------------
try:
    prophet_df = df[["Date", "Return"]].rename(columns={"Date": "ds", "Return": "y"}).dropna()
    prophet = Prophet(daily_seasonality=True)
    prophet.fit(prophet_df)
    future = prophet.make_future_dataframe(periods=1)
    forecast = prophet.predict(future)
    prophet_forecast_return = forecast.iloc[-1]["yhat"]
    prophet_predicted_close = df["Close"].iloc[-1] * (1 + prophet_forecast_return)
    results["Prophet"] = {"MAE": np.nan, "R²": np.nan}
except Exception as e:
    print("Prophet failed:", e)
    prophet_predicted_close = None

# -------------------------------
# 7. Results
# -------------------------------
results_df = pd.DataFrame(results).T.sort_values(by="MAE", na_position="last")
print("\nModel Performance (Cross-Validation):")
print(results_df)

# -------------------------------
# 8. AutoML: Pick Best ML Model
# -------------------------------
best_model_name = results_df.dropna().index[0]
print(f"\n✅ Best ML Model Selected: {best_model_name}")

best_model = models[best_model_name]
final_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", best_model)])
final_pipeline.fit(X, y)

# -------------------------------
# 9. Predict Tomorrow’s Price
# -------------------------------
latest_features = df[features].iloc[-1:]
ml_predicted_close = final_pipeline.predict(latest_features)[0]

print(f"\n📈 Predicted next close (ML AutoML) for {ticker} (tomorrow): {ml_predicted_close:.2f}")

if arima_predicted_close is not None:
    print(f"📉 ARIMA Forecast (returns → price): {arima_predicted_close:.2f}")

if prophet_predicted_close is not None:
    print(f"🔮 Prophet Forecast (returns → price): {prophet_predicted_close:.2f}")


  from .autonotebook import tqdm as notebook_tqdm
[*********************100%***********************]  1 of 1 completed


ARIMA failed: 0


20:25:04 - cmdstanpy - INFO - Chain [1] start processing
20:25:05 - cmdstanpy - INFO - Chain [1] done processing



Model Performance (Cross-Validation):
                         MAE        R²
Linear Regression   1.554560  0.981721
Lasso Regression    1.562035  0.981659
Ridge Regression    1.595152  0.981252
Stacking            1.734317  0.972200
Voting             11.746111  0.116262
Weighted Voting    13.961998 -0.262640
Gradient Boosting  17.274948 -0.951409
Random Forest      17.328091 -0.968408
Prophet                  NaN       NaN

✅ Best ML Model Selected: Linear Regression

📈 Predicted next close (ML AutoML) for AAPL (tomorrow): 232.76
🔮 Prophet Forecast (returns → price): 233.21


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    VotingRegressor,
    StackingRegressor
)
from sklearn.metrics import mean_absolute_error, r2_score
import yfinance as yf
from datetime import datetime

# -------------------------------
# 1. Download Stock Data (AAPL)
# -------------------------------
ticker = "NVDA"
start = "2015-01-01"
end = datetime.today().strftime('%Y-%m-%d')

df = yf.download(ticker, start=start, end=end, auto_adjust=False)
df.columns = df.columns.get_level_values(0)

# Keep OHLC
df = df.loc[:, ["Open", "High", "Low", "Close"]].reset_index()

# Create target: Next day's Close
df["Next_Close"] = df["Close"].shift(-1)
df = df.dropna().reset_index(drop=True)

# -------------------------------
# 2. Features & Preprocessor
# -------------------------------
features = ["Open", "High", "Low", "Close"]
X = df[features]
y = df["Next_Close"]

preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), features)],
    remainder="passthrough"
)

# -------------------------------
# 3. Candidate ML Models
# -------------------------------
lr = LinearRegression()
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.001, max_iter=10000)
rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
gb = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)

voting_reg = VotingRegressor(estimators=[("lr", lr), ("rf", rf), ("gb", gb)])
stacking_reg = StackingRegressor(
    estimators=[("lr", lr), ("rf", rf), ("gb", gb)],
    final_estimator=Ridge(alpha=1.0)
)

models = {
    "Linear Regression": lr,
    "Ridge Regression": ridge,
    "Lasso Regression": lasso,
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "Voting": voting_reg,
    "Stacking": stacking_reg
}

# -------------------------------
# 4. Evaluate Models
# -------------------------------
tscv = TimeSeriesSplit(n_splits=5)
results = {}

for name, model in models.items():
    mae_scores, r2_scores = [], []
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))
    
    results[name] = {"MAE": np.mean(mae_scores), "R²": np.mean(r2_scores)}

results_df = pd.DataFrame(results).T.sort_values(by="MAE")
print("\nModel Performance (Cross-Validation):")
print(results_df)

# -------------------------------
# 5. AutoML: Pick Best Model
# -------------------------------
best_model_name = results_df.index[0]
print(f"\n✅ Best Model Selected: {best_model_name}")

best_model = models[best_model_name]
final_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", best_model)])
final_pipeline.fit(X, y)

# -------------------------------
# 6. Predict Tomorrow’s Price
# -------------------------------
latest_features = df[features].iloc[-1:]
ml_predicted_close = final_pipeline.predict(latest_features)[0]

print(f"\n📈 Predicted next close (ML AutoML) for {ticker} (tomorrow): {ml_predicted_close:.2f}")


[*********************100%***********************]  1 of 1 completed



Model Performance (Cross-Validation):
                         MAE        R²
Lasso Regression    0.770945  0.989001
Ridge Regression    0.772363  0.988832
Linear Regression   0.772694  0.988899
Stacking            0.868337  0.969772
Voting             10.629615 -0.051710
Random Forest      15.834915 -1.345361
Gradient Boosting  15.903886 -1.368800

✅ Best Model Selected: Lasso Regression

📈 Predicted next close (ML AutoML) for NVDA (tomorrow): 182.19


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    VotingRegressor,
    StackingRegressor
)
from sklearn.metrics import mean_absolute_error, r2_score
import yfinance as yf
from datetime import datetime, timedelta
import os

# -------------------------------
# 1. Candidate ML Models
# -------------------------------
lr = LinearRegression()
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.001, max_iter=10000)
rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
gb = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)

voting_reg = VotingRegressor(estimators=[("lr", lr), ("rf", rf), ("gb", gb)])
stacking_reg = StackingRegressor(
    estimators=[("lr", lr), ("rf", rf), ("gb", gb)],
    final_estimator=Ridge(alpha=1.0)
)

models = {
    "Linear Regression": lr,
    "Ridge Regression": ridge,
    "Lasso Regression": lasso,
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "Voting": voting_reg,
    "Stacking": stacking_reg
}

# -------------------------------
# 2. Function to Train & Predict
# -------------------------------
def predict_stock(ticker, start="2015-01-01"):
    end = datetime.today().strftime('%Y-%m-%d')
    
    # Download stock data
    df = yf.download(ticker, start=start, end=end, auto_adjust=False)
    df.columns = df.columns.get_level_values(0)
    
    # Keep OHLC
    df = df.loc[:, ["Open", "High", "Low", "Close"]].reset_index()
    
    # Create target: Next day's Close
    df["Next_Close"] = df["Close"].shift(-1)
    df = df.dropna().reset_index(drop=True)
    
    features = ["Open", "High", "Low", "Close"]
    X = df[features]
    y = df["Next_Close"]
    
    # Preprocessor
    preprocessor = ColumnTransformer(
        transformers=[("num", StandardScaler(), features)],
        remainder="passthrough"
    )
    
    # Evaluate Models with TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)
    results = {}
    
    for name, model in models.items():
        mae_scores, r2_scores = [], []
        pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
        
        for train_idx, test_idx in tscv.split(X):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
            
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            
            mae_scores.append(mean_absolute_error(y_test, y_pred))
            r2_scores.append(r2_score(y_test, y_pred))
        
        results[name] = {"MAE": np.mean(mae_scores), "R²": np.mean(r2_scores)}
    
    results_df = pd.DataFrame(results).T.sort_values(by="MAE")
    best_model_name = results_df.index[0]
    best_model = models[best_model_name]
    
    # Final Train on All Data
    final_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", best_model)])
    final_pipeline.fit(X, y)
    
    # Predict Tomorrow
    latest_features = df[features].iloc[-1:]
    predicted_close = final_pipeline.predict(latest_features)[0]
    
    # Tomorrow’s actual date
    next_date = (df["Date"].iloc[-1] + timedelta(days=1)).strftime("%Y-%m-%d")
    
    return {
        "ticker": ticker,
        "best_model": best_model_name,
        "predicted_close": predicted_close,
        "prediction_date": next_date
    }

# -------------------------------
# 3. Run for Multiple Tech Stocks
# -------------------------------
tech_tickers = ["AAPL", "MSFT", "AMZN", "GOOGL", "TSLA"]

all_predictions = []
for t in tech_tickers:
    output = predict_stock(t)
    all_predictions.append({
        "Prediction Date": output["prediction_date"],
        "Ticker": output["ticker"],
        "Best Model": output["best_model"],
        "Predicted Close": round(output["predicted_close"], 2)
    })

predictions_df = pd.DataFrame(all_predictions)
print("\n📊 Predictions for Tech Stocks:")
print(predictions_df)

# -------------------------------
# 4. Save to CSV/Excel (Append Daily)
# -------------------------------
csv_file = "tech_stock_predictions.csv"
excel_file = "tech_stock_predictions.xlsx"

if os.path.exists(csv_file):
    old_df = pd.read_csv(csv_file)
    predictions_df = pd.concat([old_df, predictions_df], ignore_index=True)

predictions_df.to_csv(csv_file, index=False)
predictions_df.to_excel(excel_file, index=False)

print(f"\n✅ Predictions saved to '{csv_file}' and '{excel_file}'")


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



📊 Predictions for Tech Stocks:
  Prediction Date Ticker         Best Model  Predicted Close
0      2025-08-15   AAPL  Linear Regression           232.76
1      2025-08-15   MSFT           Stacking           523.06
2      2025-08-15   AMZN  Linear Regression           231.32
3      2025-08-15  GOOGL  Linear Regression           202.95
4      2025-08-15   TSLA   Lasso Regression           335.44


ModuleNotFoundError: No module named 'openpyxl'

In [5]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    VotingRegressor,
    StackingRegressor
)
from sklearn.metrics import mean_absolute_error, r2_score
import yfinance as yf
from datetime import datetime

# -------------------------------
# 1. Download Stock Data (e.g., GOOG)
# -------------------------------
ticker = "NVDA"
start = "2020-01-01"
end = datetime.today().strftime('%Y-%m-%d')

df = yf.download(ticker, start=start, end=end)

# Flatten possible MultiIndex columns
df.columns = df.columns.get_level_values(0)

# Keep OHLC
df = df.loc[:, ["Open", "High", "Low", "Close"]].reset_index()

# Create target: Next day's Close
df["Next_Close"] = df["Close"].shift(-1)
df = df.dropna().reset_index(drop=True)

# -------------------------------
# 2. Features & Preprocessor
# -------------------------------
features = ["Open", "High", "Low", "Close"]
X = df[features]
y = df["Next_Close"]

preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), features)],
    remainder="drop"
)

# -------------------------------
# 3. Candidate ML Models
# -------------------------------
lr = LinearRegression()
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.001, max_iter=30000)
rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
gb = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)

voting_reg = VotingRegressor(estimators=[("lr", lr), ("rf", rf), ("gb", gb)])
weighted_voting_reg = VotingRegressor(
    estimators=[("lr", lr), ("rf", rf), ("gb", gb)],
    weights=[1, 2, 2]
)
stacking_reg = StackingRegressor(
    estimators=[("lr", lr), ("rf", rf), ("gb", gb)],
    final_estimator=Ridge(alpha=1.0)
)

models = {
    "Linear Regression": lr,
    "Ridge Regression": ridge,
    "Lasso Regression": lasso,
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "Voting": voting_reg,
    "Weighted Voting": weighted_voting_reg,
    "Stacking": stacking_reg
}

# -------------------------------
# 4. Evaluate ML Models
# -------------------------------
tscv = TimeSeriesSplit(n_splits=5)
results = {}

for name, model in models.items():
    mae_scores, r2_scores = [], []
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))
    
    results[name] = {"MAE": np.mean(mae_scores), "R²": np.mean(r2_scores)}

# -------------------------------
# 5. Results
# -------------------------------
results_df = pd.DataFrame(results).T.sort_values(by="MAE", na_position="last")
print("\nModel Performance (Cross-Validation):")
print(results_df)

# -------------------------------
# 6. AutoML: Pick Best ML Model
# -------------------------------
best_model_name = results_df.dropna().index[0]
print(f"\n✅ Best ML Model Selected: {best_model_name}")

best_model = models[best_model_name]
final_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", best_model)])
final_pipeline.fit(X, y)

# -------------------------------
# 7. Save Model as {ticker}_{today}.joblib
# -------------------------------
today_str = datetime.today().strftime("%Y%m%d")
model_filename = f"{ticker}_{today_str}.joblib"
joblib.dump(final_pipeline, model_filename)
print(f"💾 Model saved as {model_filename}")

# -------------------------------
# 8. Predict Tomorrow’s Price
# -------------------------------
latest_features = df[features].iloc[-1:]
next_day_prediction = final_pipeline.predict(latest_features)[0]

print(f"\n📈 Predicted next close (ML AutoML) for {ticker} (tomorrow): {next_day_prediction:.2f}")


  df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed



Model Performance (Cross-Validation):
                         MAE        R²
Lasso Regression    1.350765  0.981343
Linear Regression   1.351984  0.981239
Ridge Regression    1.360759  0.980714
Stacking            1.453536  0.969377
Voting              7.837694  0.499541
Weighted Voting     9.286432  0.289522
Random Forest      11.365554 -0.082063
Gradient Boosting  11.583588 -0.111793

✅ Best ML Model Selected: Lasso Regression
💾 Model saved as NVDA_20250817.joblib

📈 Predicted next close (ML AutoML) for NVDA (tomorrow): 182.27
