In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    VotingRegressor,
    StackingRegressor
)
from sklearn.metrics import mean_absolute_error, r2_score
import yfinance as yf
from datetime import datetime

# -------------------------------
# 1. Download Stock Data (AAPL)
# -------------------------------
ticker = "AAPL"
start = "2022-01-01"
end = datetime.today().strftime('%Y-%m-%d')

df = yf.download(ticker, start=start, end=end)

# Keep OHLC
df = df[["Open", "High", "Low", "Close"]].reset_index()

# Create target: Next day's Close
df["Next_Close"] = df["Close"].shift(-1)
df = df.dropna().reset_index(drop=True)

# -------------------------------
# 2. Features & Preprocessor
# -------------------------------
features = ["Open", "High", "Low", "Close"]
X = df[features]
y = df["Next_Close"]

preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), features)],
    remainder="passthrough"
)


  df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed


In [2]:
# -------------------------------
# 3. Candidate Models
# -------------------------------
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
gb = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)

voting_reg = VotingRegressor(estimators=[("lr", lr), ("rf", rf), ("gb", gb)])
weighted_voting_reg = VotingRegressor(
    estimators=[("lr", lr), ("rf", rf), ("gb", gb)],
    weights=[1, 2, 2]
)
stacking_reg = StackingRegressor(
    estimators=[("lr", lr), ("rf", rf), ("gb", gb)],
    final_estimator=Ridge(alpha=1.0)
)

models = {
    "Linear Regression": lr,
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "Voting": voting_reg,
    "Weighted Voting": weighted_voting_reg,
    "Stacking": stacking_reg
}


In [3]:
# -------------------------------
# 4. Evaluate with TimeSeriesSplit
# -------------------------------
tscv = TimeSeriesSplit(n_splits=5)
results = {}

for name, model in models.items():
    mae_scores, r2_scores = [], []
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))
    
    results[name] = {"MAE": np.mean(mae_scores), "R²": np.mean(r2_scores)}

results_df = pd.DataFrame(results).T.sort_values(by="MAE")
print("\nModel Performance (Cross-Validation):")
print(results_df)

ValueError: Selected columns, ['Open', 'High', 'Low', 'Close'], are not unique in dataframe

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    VotingRegressor,
    StackingRegressor
)
from sklearn.metrics import mean_absolute_error, r2_score
import yfinance as yf
from datetime import datetime

# -------------------------------
# 1. Download Stock Data (AAPL)
# -------------------------------
ticker = "AAPL"
start = "2022-01-01"
end = datetime.today().strftime('%Y-%m-%d')

df = yf.download(ticker, start=start, end=end)

# Flatten multi-index (sometimes yfinance returns multi-level headers)
df.columns = df.columns.get_level_values(0)

# Keep OHLC only
df = df.loc[:, ["Open", "High", "Low", "Close"]].reset_index()

# Create target: Next day's Close
df["Next_Close"] = df["Close"].shift(-1)
df = df.dropna().reset_index(drop=True)

# -------------------------------
# 2. Features & Preprocessor
# -------------------------------
features = ["Open", "High", "Low", "Close"]
X = df[features]
y = df["Next_Close"]

preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), features)],
    remainder="passthrough"
)

# -------------------------------
# 3. Candidate Models
# -------------------------------
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
gb = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)

voting_reg = VotingRegressor(estimators=[("lr", lr), ("rf", rf), ("gb", gb)])
weighted_voting_reg = VotingRegressor(
    estimators=[("lr", lr), ("rf", rf), ("gb", gb)],
    weights=[1, 2, 2]
)
stacking_reg = StackingRegressor(
    estimators=[("lr", lr), ("rf", rf), ("gb", gb)],
    final_estimator=Ridge(alpha=1.0)
)

models = {
    "Linear Regression": lr,
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "Voting": voting_reg,
    "Weighted Voting": weighted_voting_reg,
    "Stacking": stacking_reg
}

# -------------------------------
# 4. Evaluate with TimeSeriesSplit
# -------------------------------
tscv = TimeSeriesSplit(n_splits=5)
results = {}

for name, model in models.items():
    mae_scores, r2_scores = [], []
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))
    
    results[name] = {"MAE": np.mean(mae_scores), "R²": np.mean(r2_scores)}

results_df = pd.DataFrame(results).T.sort_values(by="MAE")
print("\nModel Performance (Cross-Validation):")
print(results_df)

# -------------------------------
# 5. AutoML: Pick Best Model
# -------------------------------
best_model_name = results_df.index[0]
print(f"\n✅ Best Model Selected: {best_model_name}")

best_model = models[best_model_name]
final_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", best_model)])

# Train best model on all data
final_pipeline.fit(X, y)

# -------------------------------
# 6. Predict Tomorrow’s Price
# -------------------------------
latest_features = df[features].iloc[-1:]
next_day_prediction = final_pipeline.predict(latest_features)[0]

print(f"\n📈 Predicted next close price for {ticker} (tomorrow): {next_day_prediction:.2f}")


  df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed



Model Performance (Cross-Validation):
                        MAE        R²
Linear Regression  2.391450  0.921143
Stacking           2.976327  0.888551
Voting             6.702505  0.269279
Weighted Voting    7.672900 -0.001707
Random Forest      9.109742 -0.493245
Gradient Boosting  9.232680 -0.502380

✅ Best Model Selected: Linear Regression

📈 Predicted next close price for AAPL (tomorrow): 232.66
