In [2]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import requests
from bs4 import BeautifulSoup
import nltk
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\INDIA\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
ticker = "MSFT"
start = "2021-01-01"
end = datetime.today().strftime('%Y-%m-%d')

stock_df = yf.download(ticker, start=start, end=end)
stock_df = stock_df[['Close']].reset_index()
stock_df.head()


  stock_df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed


Price,Date,Close
Ticker,Unnamed: 1_level_1,MSFT
0,2021-01-04,209.617188
1,2021-01-05,209.819397
2,2021-01-06,204.378906
3,2021-01-07,210.194946
4,2021-01-08,211.475632


In [5]:
def get_yahoo_finance_news(ticker="AAPL"):
    url = f"https://finance.yahoo.com/quote/{ticker}?p={ticker}"
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(r.text, "html.parser")
    
    headlines = []
    for item in soup.find_all('h3'):  # Yahoo Finance news headlines
        text = item.get_text(strip=True)
        if text:
            headlines.append(text)
    return headlines

news = get_yahoo_finance_news("MSFT")
print(news[:5])


['News', 'Life', 'Entertainment', 'Finance', 'Sports']


In [7]:
sia = SentimentIntensityAnalyzer()

def daily_sentiment(ticker="AAPL"):
    headlines = get_yahoo_finance_news(ticker)
    scores = {"pos":0, "neg":0, "neu":0, "compound":0}
    
    for h in headlines:
        s = sia.polarity_scores(h)
        for k in scores:
            scores[k] += s[k]
    
    n = len(headlines) if headlines else 1
    for k in scores:
        scores[k] /= n
    return scores

sentiment_today = daily_sentiment("MSFT")
print(sentiment_today)


{'pos': 0.06476785714285714, 'neg': 0.015339285714285713, 'neu': 0.9198750000000001, 'compound': 0.04192142857142857}


In [8]:
dates = pd.date_range(end=datetime.today(), periods=90).strftime('%Y-%m-%d')

sentiment_df = pd.DataFrame([
    {"Date": d, **daily_sentiment("AAPL")} for d in dates
])

sentiment_df["Date"] = pd.to_datetime(sentiment_df["Date"])
sentiment_df.head()


Unnamed: 0,Date,pos,neg,neu,compound
0,2025-05-20,0.051702,0.023018,0.925298,0.019109
1,2025-05-21,0.051702,0.023018,0.925298,0.019109
2,2025-05-22,0.051702,0.023018,0.925298,0.019109
3,2025-05-23,0.051702,0.023018,0.925298,0.019109
4,2025-05-24,0.051702,0.023018,0.925298,0.019109


In [9]:
stock_df = yf.download(ticker, start=start, end=end)

# Drop multi-level index if exists
stock_df.columns = stock_df.columns.get_level_values(0)

# Keep only Close price and reset index
stock_df = stock_df[['Close']].reset_index()


  stock_df = yf.download(ticker, start=start, end=end)
[*********************100%***********************]  1 of 1 completed


In [10]:
# Ensure Date is datetime in both DataFrames
stock_df["Date"] = pd.to_datetime(stock_df["Date"])
sentiment_df["Date"] = pd.to_datetime(sentiment_df["Date"])

# Merge on Date
merged_df = pd.merge(stock_df, sentiment_df, on="Date", how="inner")

# Create target (next day's close)
merged_df["Next_Close"] = merged_df["Close"].shift(-1)
merged_df = merged_df.dropna()

print(merged_df.head())


        Date       Close       pos       neg       neu  compound  Next_Close
0 2025-05-20  458.170013  0.051702  0.023018  0.925298  0.019109  452.570007
1 2025-05-21  452.570007  0.051702  0.023018  0.925298  0.019109  454.859985
2 2025-05-22  454.859985  0.051702  0.023018  0.925298  0.019109  450.179993
3 2025-05-23  450.179993  0.051702  0.023018  0.925298  0.019109  460.690002
4 2025-05-27  460.690002  0.051702  0.023018  0.925298  0.019109  457.359985


In [11]:
features = ["Close", "pos", "neg", "neu", "compound"]
X = merged_df[features]
y = merged_df["Next_Close"]


In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, shuffle=False  # keep time order
)


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score


preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), features)
    ],
    remainder="passthrough"  # if we had extra cols, keep them
)


In [23]:
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        random_state=42
    ))
])


In [24]:
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))


MAE: 19.18649708557131
R²: -3.4422183474417567


In [25]:
latest_features = merged_df[features].iloc[-1:]
next_day_prediction = pipeline.predict(latest_features)[0]

print(f"Predicted next close price: {next_day_prediction:.2f}")


Predicted next close price: 494.98


In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, r2_score


models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.001),
    "SVR (RBF)": SVR(kernel="rbf", C=100, gamma=0.1),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "Random Forest": RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42)
}


In [27]:
tscv = TimeSeriesSplit(n_splits=5)

results = {}

for name, model in models.items():
    mae_scores, r2_scores = [], []
    
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    
    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))
    
    results[name] = {
        "MAE": np.mean(mae_scores),
        "R²": np.mean(r2_scores)
    }

results_df = pd.DataFrame(results).T.sort_values(by="MAE")
print(results_df)


                         MAE        R²
Linear Regression   4.908962 -0.641520
Lasso               4.909459 -0.642310
Ridge               4.977400 -0.810408
SVR (RBF)           8.946023 -3.590208
Gradient Boosting   9.265345 -3.524004
Random Forest       9.461580 -3.759956
KNN                10.500401 -4.704523


In [28]:
best_model_name = results_df.index[0]
print(f"Best Model: {best_model_name}")

# Refit best model on all data
best_model = models[best_model_name]
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", best_model)])
pipeline.fit(X, y)

latest_features = merged_df[features].iloc[-1:]
next_day_prediction = pipeline.predict(latest_features)[0]

print(f"Predicted next close price for tomorrow: {next_day_prediction:.2f}")


Best Model: Linear Regression
Predicted next close price for tomorrow: 522.61
