In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import yfinance as yf
import requests
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from pymongo import MongoClient

# ---------------- Configuration ----------------
API_KEY = "your_gnews_api_key"  # Replace this
stocks = {"AAPL": "Apple", "GOOGL": "Google", "AMZN": "Amazon"}
start_date = "2023-01-01"
end_date = "2025-01-01"
analyzer = SentimentIntensityAnalyzer()

# MongoDB Setup
client = MongoClient("mongodb://localhost:27017/")
db = client["stock_prediction"]

# ---------------- Step 1: Fetch Stock Data ----------------
stock_data = []
for ticker in stocks:
    data = yf.download(ticker, start=start_date, end=end_date)
    data = data.reset_index()
    data["ticker"] = ticker
    for index, row in data.iterrows():
        stock_data.append({
                "ticker": ticker,
                "timestamp": row["Date"],
                "open": row["Open"],
                "close": row["Close"],
                "high": row["High"],
                "low": row["Low"],
                "volume": row["Volume"]
            })

if stock_data:
         db.raw_stock_data.insert_many.insert_many(stock_data)
        print(f"Inserted {len(stock_data)} records for {len(stocks)} stocks.")
# # Save to MongoDB
# db.raw_stock_data.delete_many({})
#(stock_df.to_dict("records"))

# # ---------------- Step 2: Fetch News Data ----------------
# all_news = []
# for ticker, query in stocks.items():
#     print(f"Fetching news for {ticker}")
#     current_date = datetime.strptime(start_date, "%Y-%m-%d")
#     while current_date < datetime.strptime(end_date, "%Y-%m-%d"):
#         next_date = current_date + timedelta(days=30)
#         url = f"https://gnews.io/api/v4/search?q={query}&from={current_date.date()}&to={next_date.date()}&lang=en&token={API_KEY}"
#         try:
#             response = requests.get(url)
#             articles = response.json().get("articles", [])
#             for article in articles:
#                 pub_date = article["publishedAt"][:10]
#                 sentiment = analyzer.polarity_scores(article["title"])
#                 all_news.append({
#                     "ticker": ticker,
#                     "date": pub_date,
#                     "title": article["title"],
#                     "vader_score": sentiment["compound"]
#                 })
#         except Exception as e:
#             print("News fetch failed:", e)
#         current_date = next_date

# news_df = pd.DataFrame(all_news)
# news_df["date"] = pd.to_datetime(news_df["date"])

# # Save to MongoDB
# db.raw_news_data.delete_many({})
# db.raw_news_data.insert_many(news_df.to_dict("records"))

# # ---------------- Step 3: Merge and Aggregate ----------------
# agg_news = news_df.groupby(["ticker", "date"])["vader_score"].agg(["mean", "count"]).reset_index()
# agg_news.columns = ["ticker", "date", "avg_vader_score", "news_count"]

# df = pd.merge(stock_df, agg_news, on=["ticker", "date"], how="left")
# df["avg_vader_score"].fillna(0, inplace=True)
# df["news_count"].fillna(0, inplace=True)

# # ---------------- Step 4: Feature Engineering ----------------
# df = df.sort_values(by=["ticker", "date"]).reset_index(drop=True)
# df["return"] = df.groupby("ticker")["close"].pct_change()
# df["future_return"] = df.groupby("ticker")["close"].pct_change(periods=3).shift(-3)

# df["ma_3"] = df.groupby("ticker")["close"].transform(lambda x: x.rolling(3).mean())
# df["ma_7"] = df.groupby("ticker")["close"].transform(lambda x: x.rolling(7).mean())

# df["diff_ma_3"] = df["close"] - df["ma_3"]
# df["diff_ma_7"] = df["close"] - df["ma_7"]

# df["volatility_3"] = df.groupby("ticker")["close"].transform(lambda x: x.rolling(3).std())
# df["volatility_7"] = df.groupby("ticker")["close"].transform(lambda x: x.rolling(7).std())

# df = df.dropna()

# # Save merged and processed data
# db.merged_data.delete_many({})
# db.merged_data.insert_many(df.to_dict("records"))

# # ---------------- Step 5: Modeling ----------------
# features = ["close", "avg_vader_score", "news_count", "return", "ma_3", "ma_7",
#             "diff_ma_3", "diff_ma_7", "volatility_3", "volatility_7"]

# X = df[features]
# y = df["future_return"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# model = RandomForestRegressor(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)

# # ---------------- Step 6: Evaluation ----------------
# print("\U0001F4C9 MSE:", mean_squared_error(y_test, y_pred))
# print("\U0001F50D R² Score:", r2_score(y_test, y_pred))

# # Save predictions
# results_df = X_test.copy()
# results_df["actual_future_return"] = y_test.values
# results_df["predicted_future_return"] = y_pred

# db.predictions.delete_many({})
# db.predictions.insert_many(results_df.reset_index(drop=True).to_dict("records"))

# # ---------------- Optional: Feature Importance ----------------
# importances = pd.Series(model.feature_importances_, index=features)
# importances.sort_values().plot(kind="barh", figsize=(8, 6), title="Feature Importances")
# plt.tight_layout()
# plt.show()


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


[{'ticker': 'AAPL', 'timestamp': Ticker
   2023-01-03
Name: 0, dtype: datetime64[ns], 'open': Ticker
AAPL    128.782633
Name: 0, dtype: object, 'close': Ticker
AAPL    123.632515
Name: 0, dtype: object, 'high': Ticker
AAPL    129.395502
Name: 0, dtype: object, 'low': Ticker
AAPL    122.742858
Name: 0, dtype: object, 'volume': Ticker
AAPL    112117500
Name: 0, dtype: object}, {'ticker': 'AAPL', 'timestamp': Ticker
   2023-01-04
Name: 1, dtype: datetime64[ns], 'open': Ticker
AAPL    125.431615
Name: 1, dtype: object, 'close': Ticker
AAPL    124.907707
Name: 1, dtype: object, 'high': Ticker
AAPL    127.181276
Name: 1, dtype: object, 'low': Ticker
AAPL    123.64242
Name: 1, dtype: object, 'volume': Ticker
AAPL    89113600
Name: 1, dtype: object}, {'ticker': 'AAPL', 'timestamp': Ticker
   2023-01-05
Name: 2, dtype: datetime64[ns], 'open': Ticker
AAPL    125.668865
Name: 2, dtype: object, 'close': Ticker
AAPL    123.583115
Name: 2, dtype: object, 'high': Ticker
AAPL    126.301508
Name: 2, dt

TypeError: cannot concatenate object of type '<class 'dict'>'; only Series and DataFrame objs are valid