In [None]:
! pip install yfinance pandas

In [None]:
import yfinance as yf
import pandas as pd

# Download daily price history for each ticker and persist one CSV per ticker.
stocks = ['AAPL', 'GOOGL', 'MSFT']
data = {}  # ticker -> price frame that was actually downloaded and saved

for stock in stocks:
    df = yf.download(stock, start='2022-01-01', end='2024-12-31', interval='1d')
    if df.empty:
        # Same guard the later price-preview cell applies: without it an empty
        # download would still be written out as a header-only CSV.
        print(f"No data for {stock}, skipping...")
        continue

    # Turn the index into a regular column so it is written to the CSV
    # (index=False below would otherwise drop it).
    df = df.reset_index()
    df['Stock'] = stock
    data[stock] = df
    df.to_csv(f'{stock}_data.csv', index=False)

print("Data saved as CSV files for: ", list(data.keys()))


In [None]:
! pip install requests vaderSentiment pandas

In [None]:
import requests
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime

# Fetch recent headlines per ticker from NewsAPI, score each headline with
# VADER, and save the combined result to 'news_sentiment_data.csv'.
import os
from getpass import getpass

# The API key is a secret and must not live in the notebook: read it from the
# NEWSAPI_KEY environment variable, or prompt for it when the variable is unset.
# NOTE(review): the key that used to be hardcoded here should be rotated.
API_KEY = os.environ.get('NEWSAPI_KEY') or getpass('NewsAPI key: ')
NEWSAPI_URL = 'https://newsapi.org/v2/everything'

stocks = {
    "AAPL": "Apple stock",
    "GOOGL": "Google stock",
    "MSFT": "Microsoft stock"
}

analyzer = SentimentIntensityAnalyzer()
all_results = []

for ticker, query in stocks.items():
    print(f"Fetching news for {ticker}...")
    # Let requests build and encode the query string (the search terms contain
    # spaces) and bound the wait so a stalled connection cannot hang the cell.
    response = requests.get(
        NEWSAPI_URL,
        params={
            'q': query,
            'language': 'en',
            'sortBy': 'publishedAt',
            'pageSize': 100,
            'apiKey': API_KEY,
        },
        timeout=30,
    )
    data = response.json()

    # Report a failed request explicitly instead of treating it as "0 articles".
    if not response.ok:
        print(f"  ! Request failed for {ticker}: HTTP {response.status_code} {data.get('message', '')}")
        continue

    articles = data.get('articles', [])
    print(f"  → Found {len(articles)} articles for {ticker}")

    for article in articles:
        title = article.get('title')
        if not title:
            # Nothing to score when the headline is missing or empty.
            continue
        published_at = (article.get('publishedAt') or '')[:10]  # keep YYYY-MM-DD only
        source_name = (article.get('source') or {}).get('name')
        sentiment = analyzer.polarity_scores(title)
        compound = sentiment['compound']
        # Compound-score cut-offs: >= 0.05 positive, <= -0.05 negative, else neutral.
        if compound >= 0.05:
            label = 'positive'
        elif compound <= -0.05:
            label = 'negative'
        else:
            label = 'neutral'
        all_results.append([ticker, published_at, source_name, title, compound, label])

df = pd.DataFrame(all_results, columns=['ticker', 'date', 'source', 'headline', 'vader_score', 'vader_label'])
df.to_csv('news_sentiment_data.csv', index=False)
print("Combined sentiment data saved as 'news_sentiment_data.csv'")


In [None]:
# Preview the first rows of the NewsAPI sentiment frame built in the previous cell.
df.head()

In [None]:

# Sanity-check the price download for a one-month window, printing the first
# rows for every ticker that returned data.
tickers = ['AAPL', 'GOOGL', 'MSFT']

for ticker in tickers:
    df = yf.download(ticker, start="2024-03-01", end="2024-04-01", auto_adjust=True)
    if df.empty:
        print(f" No data for {ticker}, skipping...")
        continue

    # Expose the index as a column and tag the rows with their ticker.
    df = df.reset_index()
    df['ticker'] = ticker
    # print() is required here: a bare df.head() inside a loop displays nothing.
    print(df.head())

In [None]:
import requests
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime, timedelta

import os
from getpass import getpass

# The GNews token is a secret and must not live in the notebook: read it from
# the GNEWS_API_KEY environment variable, or prompt for it when unset.
# NOTE(review): the token that used to be hardcoded here should be rotated.
API_KEY = os.environ.get('GNEWS_API_KEY') or getpass('GNews API token: ')

# Ticker -> free-text search query sent to the news API.
stocks = {
    "AAPL": "Apple stock",
    "GOOGL": "Google stock",
    "MSFT": "Microsoft stock"
}

analyzer = SentimentIntensityAnalyzer()
all_results = []  # one [ticker, date, source, headline, score, label] row per headline

def generate_monthly_ranges(start_date, end_date):
    """Split [start_date, end_date) into consecutive calendar-month windows.

    Args:
        start_date: datetime at which the first window starts.
        end_date: datetime at which the last window ends.

    Returns:
        A list of (from, to) tuples formatted as "YYYY-MM-DD". Each window ends
        on the first day of the following month, except the last one, which is
        clamped to end_date. The list is empty when start_date >= end_date.
    """
    ranges = []
    current = start_date
    while current < end_date:
        # Day 1 plus 32 days always lands in the next month; snapping back to
        # day 1 gives the first day of that month regardless of month length.
        next_month = (current.replace(day=1) + timedelta(days=32)).replace(day=1)
        # Never let the final window run past the requested end date.
        window_end = min(next_month, end_date)
        ranges.append((current.strftime("%Y-%m-%d"), window_end.strftime("%Y-%m-%d")))
        current = window_end
    return ranges

# Query GNews month by month for every ticker, score each headline with VADER,
# and save the combined result to 'news_sentiment_gnews_full.csv'.
start_date = datetime(2023, 1, 1)
end_date = datetime(2025, 1, 1)
monthly_ranges = generate_monthly_ranges(start_date, end_date)

GNEWS_SEARCH_URL = "https://gnews.io/api/v4/search"

for ticker, query in stocks.items():
    print(f"\n Fetching news for {ticker}...")
    for from_date, to_date in monthly_ranges:
        print(f"  → {from_date} to {to_date}")
        # Pass parameters separately so requests URL-encodes the query, and send
        # the window bounds as full ISO-8601 UTC timestamps rather than bare dates.
        # NOTE(review): confirm the timestamp format and the "max" limit against
        # the GNews plan in use.
        params = {
            "q": query,
            "from": f"{from_date}T00:00:00Z",
            "to": f"{to_date}T00:00:00Z",
            "lang": "en",
            "max": 100,
            "token": API_KEY,
        }
        try:
            response = requests.get(GNEWS_SEARCH_URL, params=params, timeout=30)
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # Network failure or a non-JSON body: report it and move to the next window.
            print(f" Error during {from_date} → {to_date}: {e}")
            continue

        if not response.ok or not isinstance(data, dict):
            # Surface API-level failures instead of silently recording zero
            # articles for the month.
            print(f" Error during {from_date} → {to_date}: HTTP {response.status_code} {data}")
            continue

        for article in data.get('articles', []):
            title = article.get('title')
            if not title:
                # Nothing to score when the headline is missing or empty.
                continue
            published_at = (article.get('publishedAt') or '')[:10]  # keep YYYY-MM-DD only
            source_name = (article.get('source') or {}).get('name')
            sentiment = analyzer.polarity_scores(title)
            compound = sentiment['compound']
            # Compound-score cut-offs: >= 0.05 positive, <= -0.05 negative, else neutral.
            if compound >= 0.05:
                label = 'positive'
            elif compound <= -0.05:
                label = 'negative'
            else:
                label = 'neutral'
            all_results.append([ticker, published_at, source_name, title, compound, label])

df = pd.DataFrame(all_results, columns=['ticker', 'date', 'source', 'headline', 'vader_score', 'vader_label'])
df.to_csv('news_sentiment_gnews_full.csv', index=False)
print("\n Monthly sentiment data saved as 'news_sentiment_gnews_full.csv'")


In [None]:
# Preview the first rows of the GNews sentiment frame built in the previous cell.
df.head()

In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Date window from which replacement headline dates are drawn.
start_date, end_date = datetime(2023, 1, 1), datetime(2025, 1, 1)

# Reload the GNews sentiment export from disk.
df = pd.read_csv('news_sentiment_gnews_full.csv')

def random_date(start, end, rng=None):
    """Return `start` shifted forward by a random whole number of days, before `end`.

    Args:
        start: datetime marking the beginning of the window (inclusive).
        end: datetime marking the end of the window (exclusive).
        rng: optional `random.Random`-like object providing `randint`; pass a
            seeded instance for reproducible draws. Defaults to the module-level
            `random` generator.

    Returns:
        A datetime `start + k days` with `0 <= k < max((end - start).days, 1)`.

    Raises:
        ValueError: if `end` is not strictly after `start`.
    """
    if end <= start:
        raise ValueError(f"end ({end}) must be after start ({start})")
    picker = random if rng is None else rng
    # A window shorter than one full day still has exactly one valid offset (0).
    span_days = max((end - start).days, 1)
    return start + timedelta(days=picker.randint(0, span_days - 1))

# NOTE(review): this overwrites every headline's real publication date with a
# random one, so any later join of sentiment to prices by date no longer
# reflects when the news was actually published. Treat results built on this
# file as synthetic.
# Seed the generator so re-running the notebook reproduces the same CSV.
random.seed(42)
df['date'] = [
    random_date(start_date, end_date).strftime("%Y-%m-%d")
    for _ in range(len(df))
]

df.to_csv('news_sentiment_data_randomized.csv', index=False)
print(" Randomized dates assigned and saved to 'news_sentiment_data_randomized.csv'")


In [None]:
# Preview the frame after its dates were replaced with randomly drawn ones.
df.head()

In [None]:
from pymongo import MongoClient
import pandas as pd
import numpy as np

# Load the merged price/news documents from MongoDB and derive per-ticker
# price features plus the next-step return used as the regression target.
client = MongoClient("mongodb://localhost:27017")
db = client["stock_prediction"]
collection = db["merged_stock_gnews"]

data = list(collection.find())
df = pd.DataFrame(data)
if df.empty:
    # Fail with a clear message instead of a KeyError on the first column access.
    raise ValueError("Collection 'merged_stock_gnews' returned no documents")

df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['ticker', 'date'])

features = []

# Features are computed one ticker at a time so shifts and rolling windows
# never mix rows from different tickers.
for ticker, group in df.groupby("ticker"):
    group = group.sort_values("date").copy()

    group['prev_close'] = group['close'].shift(1)
    group['return'] = group['close'].pct_change()

    # Short and medium moving averages of the close.
    group['ma_3'] = group['close'].rolling(window=3).mean()
    group['ma_7'] = group['close'].rolling(window=7).mean()

    # Distance of the current close from each moving average.
    group['diff_ma_3'] = group['close'] - group['ma_3']
    group['diff_ma_7'] = group['close'] - group['ma_7']

    # Rolling standard deviation of returns.
    group['volatility_3'] = group['return'].rolling(window=3).std()
    group['volatility_7'] = group['return'].rolling(window=7).std()

    # Target: relative change from this row's close to the next row's close.
    group['future_close'] = group['close'].shift(-1)
    group['future_return'] = (group['future_close'] - group['close']) / group['close']

    features.append(group)


features_df = pd.concat(features)

model_columns = [
    'ticker', 'date', 'close', 'avg_vader_score', 'news_count', 'return',
    'ma_3', 'ma_7', 'diff_ma_3', 'diff_ma_7', 'volatility_3', 'volatility_7',
    'future_return'
]

# Drop only rows missing a value the model actually uses; a blanket dropna()
# would also discard rows because of gaps in unrelated document fields.
features_df = features_df.dropna(subset=model_columns)

# .copy() gives later cells an independent frame rather than a view of features_df.
model_df = features_df[model_columns].copy()

print(model_df.head())

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Column the regressor learns to predict, and the predictors it is given.
target_col = 'future_return'
feature_cols = [
    'close', 'avg_vader_score', 'news_count', 'return',
    'ma_3', 'ma_7', 'diff_ma_3', 'diff_ma_7', 'volatility_3', 'volatility_7'
]

# Keep complete rows only, ordered by date.
model_df = model_df.dropna(subset=[*feature_cols, target_col])
model_df = model_df.sort_values(by="date")

# Chronological hold-out: the first 80% of rows train, the remainder tests.
split_index = int(0.8 * len(model_df))
train_df, test_df = model_df.iloc[:split_index], model_df.iloc[split_index:]

X_train, y_train = train_df[feature_cols], train_df[target_col]
X_test, y_test = test_df[feature_cols], test_df[target_col]

xgb_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    objective='reg:squarederror',
    random_state=42,
)
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f" MSE: {mse:.5f}")
print(f"R² Score: {r2:.5f}")


In [None]:
# Correlation of every feature column (and the target itself) with 'future_return'.
print(model_df[feature_cols + ['future_return']].corr()['future_return'])


In [None]:
from pymongo import MongoClient
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Reload the merged price/news documents and build per-ticker features, a
# 3-step-ahead return, and a binary up/down label.
client = MongoClient("mongodb://localhost:27017")
db = client["stock_prediction"]
collection = db["merged_stock_gnews"]
data = list(collection.find())
df = pd.DataFrame(data)

df = df.drop(columns=["_id"])
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values(by=["ticker", "date"]).reset_index(drop=True)

close_by_ticker = df.groupby("ticker")["close"]

df["return"] = close_by_ticker.pct_change()
# Return from this row's close to the close 3 rows later, shifted within each
# ticker. A plain .shift(-3) on the whole column would hand the last three rows
# of one ticker target values taken from the next ticker.
df["future_return"] = close_by_ticker.shift(-3) / df["close"] - 1

df["ma_3"] = close_by_ticker.transform(lambda x: x.rolling(3).mean())
df["ma_7"] = close_by_ticker.transform(lambda x: x.rolling(7).mean())

df["diff_ma_3"] = df["close"] - df["ma_3"]
df["diff_ma_7"] = df["close"] - df["ma_7"]

# Volatility is the rolling std of returns, matching the regression feature
# cell earlier in the notebook (it was previously taken on the price level).
return_by_ticker = df.groupby("ticker")["return"]
df["volatility_3"] = return_by_ticker.transform(lambda x: x.rolling(3).std())
df["volatility_7"] = return_by_ticker.transform(lambda x: x.rolling(7).std())

features = ["close", "avg_vader_score", "news_count", "return", "ma_3", "ma_7",
            "diff_ma_3", "diff_ma_7", "volatility_3", "volatility_7"]

# Drop only rows missing a model input or the target; a blanket dropna() would
# also discard rows because of gaps in unrelated document fields.
df = df.dropna(subset=features + ["future_return"])

# 1 when the 3-step-ahead return is strictly positive, else 0.
df["label"] = (df["future_return"] > 0).astype(int)

# NOTE(review): the scaler is fitted on all rows, before any train/test split,
# so test-set statistics leak into the scaled features; it also overwrites the
# raw feature values in df. Consider fitting on the training rows only.
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

print(df[features + ["label"]].head())




In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train and evaluate a random-forest classifier for the up/down label.
# Rows are ordered by date and split without shuffling, so the test set is the
# most recent 20% of rows. A shuffled split would mix past and future rows and,
# because the features are rolling windows and the target looks 3 steps ahead,
# would let information about the test period leak into training.
ordered = df.sort_values("date")
X = ordered[features]
y = ordered["label"]

# shuffle=False keeps the chronological order (stratify cannot be combined with it).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(" Accuracy:", accuracy_score(y_test, y_pred))
print("\n Classification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


import matplotlib.pyplot as plt

# Horizontal bar chart of the fitted model's feature importances, smallest first.
importances = pd.Series(model.feature_importances_, index=features)

fig, ax = plt.subplots(figsize=(8, 6))
importances.sort_values().plot(kind="barh", ax=ax, title="Feature Importances")
fig.tight_layout()
plt.show()


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import pandas as pd

# Train and evaluate a random-forest regressor for the 3-step-ahead return.
# Rows are ordered by date and split without shuffling, so the test set is the
# most recent 20% of rows. A shuffled split would mix past and future rows and,
# because the features are rolling windows and the target looks 3 steps ahead,
# would let information about the test period leak into training.
ordered = df.sort_values("date")
X = ordered[features]
y = ordered['future_return']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

# Scatter of predicted against actual returns; the dashed red diagonal marks
# where prediction equals the actual value.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test, y_pred, alpha=0.6)

lowest, highest = min(y_test), max(y_test)
ax.plot([lowest, highest], [lowest, highest], 'r--')

ax.set_xlabel("Actual Future Return")
ax.set_ylabel("Predicted Future Return")
ax.set_title("Actual vs Predicted Return")
ax.grid(True)
fig.tight_layout()
plt.show()
