In [2]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import requests
from bs4 import BeautifulSoup
import nltk
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\INDIA\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
ticker = "AAPL"
start = "2021-01-01"
end = datetime.today().strftime('%Y-%m-%d')

stock_df = yf.download(ticker, start=start, end=end)
stock_df = stock_df[['Close']].reset_index()
stock_df.head()


  stock_df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed


Price,Date,Close
Ticker,Unnamed: 1_level_1,AAPL
0,2021-01-04,126.096573
1,2021-01-05,127.655617
2,2021-01-06,123.358528
3,2021-01-07,127.567947
4,2021-01-08,128.668991


In [4]:
stock_df.tail()

Price,Date,Close
Ticker,Unnamed: 1_level_1,AAPL
1155,2025-08-11,227.179993
1156,2025-08-12,229.649994
1157,2025-08-13,233.330002
1158,2025-08-14,232.779999
1159,2025-08-15,231.589996


In [5]:
def get_yahoo_finance_news(ticker="AAPL"):
    url = f"https://finance.yahoo.com/quote/{ticker}?p={ticker}"
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(r.text, "html.parser")
    
    headlines = []
    for item in soup.find_all('h3'):  # Yahoo Finance news headlines
        text = item.get_text(strip=True)
        if text:
            headlines.append(text)
    return headlines

news = get_yahoo_finance_news("AAPL")
print(news[:5])


['News', 'Life', 'Entertainment', 'Finance', 'Sports']


In [6]:
sia = SentimentIntensityAnalyzer()

def daily_sentiment(ticker="AAPL"):
    headlines = get_yahoo_finance_news(ticker)
    scores = {"pos":0, "neg":0, "neu":0, "compound":0}
    
    for h in headlines:
        s = sia.polarity_scores(h)
        for k in scores:
            scores[k] += s[k]
    
    n = len(headlines) if headlines else 1
    for k in scores:
        scores[k] /= n
    return scores

sentiment_today = daily_sentiment("AAPL")
print(sentiment_today)


{'pos': 0.051701754385964915, 'neg': 0.023017543859649124, 'neu': 0.9252982456140352, 'compound': 0.019108771929824558}


In [7]:
dates = pd.date_range(end=datetime.today(), periods=180).strftime('%Y-%m-%d')

sentiment_df = pd.DataFrame([
    {"Date": d, **daily_sentiment("AAPL")} for d in dates
])

sentiment_df["Date"] = pd.to_datetime(sentiment_df["Date"])
sentiment_df.head()


Unnamed: 0,Date,pos,neg,neu,compound
0,2025-02-19,0.051702,0.023018,0.925298,0.019109
1,2025-02-20,0.051702,0.023018,0.925298,0.019109
2,2025-02-21,0.051702,0.023018,0.925298,0.019109
3,2025-02-22,0.051702,0.023018,0.925298,0.019109
4,2025-02-23,0.051702,0.023018,0.925298,0.019109


In [8]:
stock_df = yf.download(ticker, start=start, end=end)

# Drop multi-level index if exists
stock_df.columns = stock_df.columns.get_level_values(0)

# Keep only Close price and reset index
stock_df = stock_df[['Close']].reset_index()


  stock_df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed


In [9]:
# Ensure Date is datetime in both DataFrames
stock_df["Date"] = pd.to_datetime(stock_df["Date"])
sentiment_df["Date"] = pd.to_datetime(sentiment_df["Date"])

# Merge on Date
merged_df = pd.merge(stock_df, sentiment_df, on="Date", how="inner")

# Create target (next day's close)
merged_df["Next_Close"] = merged_df["Close"].shift(-1)
merged_df = merged_df.dropna()

print(merged_df.head())


        Date       Close       pos       neg       neu  compound  Next_Close
0 2025-02-19  244.272079  0.051702  0.023018  0.925298  0.019109  245.229736
1 2025-02-20  245.229736  0.051702  0.023018  0.925298  0.019109  244.950424
2 2025-02-21  244.950424  0.051702  0.023018  0.925298  0.019109  246.496643
3 2025-02-24  246.496643  0.051702  0.023018  0.925298  0.019109  246.436783
4 2025-02-25  246.436783  0.051702  0.023018  0.925298  0.019109  239.773087


In [10]:
merged_df.corr(numeric_only=True)['Next_Close']

Close         0.93583
pos               NaN
neg               NaN
neu               NaN
compound          NaN
Next_Close    1.00000
Name: Next_Close, dtype: float64

In [11]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 123 entries, 0 to 122
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        123 non-null    datetime64[ns]
 1   Close       123 non-null    float64       
 2   pos         123 non-null    float64       
 3   neg         123 non-null    float64       
 4   neu         123 non-null    float64       
 5   compound    123 non-null    float64       
 6   Next_Close  123 non-null    float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 7.7 KB


In [12]:
features = ["Close", "pos", "neg", "neu", "compound"]
X = merged_df[features]
y = merged_df["Next_Close"]


In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False  # keep time order
)


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score


preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), features)
    ],
    remainder="passthrough"  # if we had extra cols, keep them
)


In [16]:
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        random_state=42
    ))
])


In [17]:
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))


MAE: 3.5027149019197283
R²: 0.6432629728181469


In [18]:
X_train

Unnamed: 0,Close,pos,neg,neu,compound
0,244.272079,0.051702,0.023018,0.925298,0.019109
1,245.229736,0.051702,0.023018,0.925298,0.019109
2,244.950424,0.051702,0.023018,0.925298,0.019109
3,246.496643,0.051702,0.023018,0.925298,0.019109
4,246.436783,0.051702,0.023018,0.925298,0.019109
...,...,...,...,...,...
81,198.195068,0.051702,0.023018,0.925298,0.019109
82,195.418213,0.051702,0.023018,0.925298,0.019109
83,196.357147,0.051702,0.023018,0.925298,0.019109
84,200.772141,0.051702,0.023018,0.925298,0.019109


In [19]:
latest_features = merged_df[features].iloc[-1:]
next_day_prediction = pipeline.predict(latest_features)[0]

print(f"Predicted next close price: {next_day_prediction:.2f}")


Predicted next close price: 237.41


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, r2_score


models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.001),
    "SVR (RBF)": SVR(kernel="rbf", C=100, gamma=0.1),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "Random Forest": RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)
}


In [25]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.01),
    "SVR (RBF)": SVR(kernel="rbf", C=100, gamma=0.1),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "Random Forest": RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)
}


In [26]:
tscv = TimeSeriesSplit(n_splits=5)

results = {}

for name, model in models.items():
    mae_scores, r2_scores = [], []
    
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))
    
    results[name] = {
        "MAE": np.mean(mae_scores),
        "R²": np.mean(r2_scores)
    }

results_df = pd.DataFrame(results).T.sort_values(by="MAE")
print(results_df)


                        MAE        R²
Linear Regression  3.273149  0.499089
Lasso              3.273829  0.498686
Ridge              3.282630  0.487332
SVR (RBF)          5.503136  0.171976
KNN                5.876165  0.029319
Random Forest      6.229315 -0.497933
Gradient Boosting  6.919075 -1.679095


In [27]:
best_model_name = results_df.index[0]
print(f"Best Model: {best_model_name}")

# Refit best model on all data
best_model = models[best_model_name]
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", best_model)])
pipeline.fit(X, y)

latest_features = merged_df[features].iloc[-1:]
next_day_prediction = pipeline.predict(latest_features)[0]

print(f"Predicted next close price for tomorrow: {next_day_prediction:.2f}")


Best Model: Linear Regression
Predicted next close price for tomorrow: 230.99


In [29]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    VotingRegressor,
    StackingRegressor
)
from sklearn.metrics import mean_absolute_error, r2_score
import yfinance as yf
from datetime import datetime

# -------------------------------
# 1. Download Stock Data (e.g., GOOG)
# -------------------------------
ticker = "AAPL"
start = "2020-01-01"
end = datetime.today().strftime('%Y-%m-%d')

df = yf.download(ticker, start=start, end=end)

# Flatten possible MultiIndex columns
df.columns = df.columns.get_level_values(0)

# Keep OHLC
df = df.loc[:, ["Open", "High", "Low", "Close"]].reset_index()

# Create target: Next day's Close
df["Next_Close"] = df["Close"].shift(-1)
df = df.dropna().reset_index(drop=True)

# -------------------------------
# 2. Features & Preprocessor
# -------------------------------
features = ["Open", "High", "Low", "Close"]
X = df[features]
y = df["Next_Close"]

preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), features)],
    remainder="drop"
)

# -------------------------------
# 3. Candidate ML Models
# -------------------------------
lr = LinearRegression()
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.001, max_iter=30000)
rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
gb = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)

voting_reg = VotingRegressor(estimators=[("lr", lr), ("rf", rf), ("gb", gb)])
weighted_voting_reg = VotingRegressor(
    estimators=[("lr", lr), ("rf", rf), ("gb", gb)],
    weights=[1, 2, 2]
)
stacking_reg = StackingRegressor(
    estimators=[("lr", lr), ("rf", rf), ("gb", gb)],
    final_estimator=Ridge(alpha=1.0)
)

models = {
    "Linear Regression": lr,
    "Ridge Regression": ridge,
    "Lasso Regression": lasso,
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "Voting": voting_reg,
    "Weighted Voting": weighted_voting_reg,
    "Stacking": stacking_reg
}

# -------------------------------
# 4. Evaluate ML Models
# -------------------------------
tscv = TimeSeriesSplit(n_splits=5)
results = {}

for name, model in models.items():
    mae_scores, r2_scores = [], []
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))
    
    results[name] = {"MAE": np.mean(mae_scores), "R²": np.mean(r2_scores)}

# -------------------------------
# 5. Results
# -------------------------------
results_df = pd.DataFrame(results).T.sort_values(by="MAE", na_position="last")
print("\nModel Performance (Cross-Validation):")
print(results_df)

# -------------------------------
# 6. AutoML: Pick Best ML Model
# -------------------------------
best_model_name = results_df.dropna().index[0]
print(f"\n✅ Best ML Model Selected: {best_model_name}")

best_model = models[best_model_name]
final_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", best_model)])
final_pipeline.fit(X, y)

# -------------------------------
# 7. Save Model as {ticker}_{today}.joblib
# -------------------------------
today_str = datetime.today().strftime("%Y%m%d")
model_filename = f"{ticker}_{today_str}.joblib"
joblib.dump(final_pipeline, model_filename)
print(f"💾 Model saved as {model_filename}")

# -------------------------------
# 8. Predict Tomorrow’s Price
# -------------------------------
latest_features = df[features].iloc[-1:]
next_day_prediction = final_pipeline.predict(latest_features)[0]

print(f"\n📈 Predicted next close (ML AutoML) for {ticker} (tomorrow): {next_day_prediction:.2f}")


  df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed



Model Performance (Cross-Validation):
                        MAE        R²
Linear Regression  2.215202  0.955663
Lasso Regression   2.220408  0.955497
Ridge Regression   2.229898  0.955404
Stacking           2.282525  0.953148
Voting             5.738832  0.592633
Weighted Voting    6.629220  0.427836
Random Forest      7.738581  0.222017
Gradient Boosting  8.260718  0.010347

✅ Best ML Model Selected: Linear Regression
💾 Model saved as AAPL_20250817.joblib

📈 Predicted next close (ML AutoML) for AAPL (tomorrow): 232.60
