In [2]:
import yfinance as yf
import numpy as np
from MongoDB_Connection import start_db
import pandas as pd
import time
import talib as ta
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [11]:
# Connect to MongoDB. start_db() is provided by the project-local MongoDB_Connection module.
# NOTE(review): the recorded output suggests it returns a pymongo Database handle
# for 'C964_Database' on localhost:27017 — confirm against MongoDB_Connection.
db = start_db()
# Echo the handle so the target host and database name are visible in the notebook output.
print(db)

['C964_Database', 'admin', 'config', 'local', 'newDatabase']
Entry exists
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'C964_Database')


In [None]:
# Ticker symbols used for training
print("Reading list of tickers to be loaded.")

# The symbols are kept in a single whitespace-separated string (ten rows, original
# load order preserved) and split into a plain list of str.
tickers = """
NVDA QBTS TSLA LCID F PLTR SOUN RGTI INFA AAL
MARA BTG INTC CLF PDD JWN DJT AAPL OKLO SOFI
X NIO WRD ITUB SMCI NU RIOT RIVN SMR AMD
GOOGL ACHR GME UEC BBD ABEV AMZN AMCR RIG BAC
IONQ HOOD AGNC WBD PFE CLSK CNH CCL IAG GRAB
T KGC HBAN HIMS ERIC AUR MRVL PSLV GOOG VALE
NGD SNAP CX RKLB BB UNH HOLX HL CSX NOK VOD
PONY LYG MSFT HPE TEM KMI UAA ADT UBER GAP
HAL WMB LYFT HLN CMCSA CDE TEVA GGB LUMN VTRS
NCLH NXE PCG WMT MU LUV SLB PTEN CPNG
""".split()

In [None]:
# Entering documents into database based on ticker symbols above. More fields will be
# added once the database is loaded with the initial ticker data.
# NOTE: this cell only appends (insert_many); running it again inserts the same rows again.

print("Inserting documents into MongoDB.")

for ticker in tickers:
    try:
        # Gather 10 year historical data
        data = yf.download(ticker, period="10y", interval="1d", progress=False)

        if not data.empty:
            # Move 'Date' from index to column
            data.reset_index(inplace=True)
            data["Ticker"] = ticker

            # Store missing values as None (null in MongoDB) instead of NaN
            data.replace({np.nan: None}, inplace=True)

            # Flatten MultiIndex column labels into plain strings such as "Close_<ticker>";
            # empty levels are dropped, so "Date" and "Ticker" keep their plain names.
            if isinstance(data.columns, pd.MultiIndex):
                data.columns = ['_'.join(filter(None, col)).strip() for col in data.columns]

            # One dict per row, ready to be inserted as individual documents
            newDocs = data.to_dict("records")

            # Insert into MongoDB
            db.Finance_Data.insert_many(newDocs)
            print(f"Inserted {len(newDocs)} attributes for {ticker}.")
    except Exception as e:
        print(f"{ticker} had an error being inserted: {e}")
    finally:
        # Timer to prevent rate limit on yfinance. This limiter will cause a multi-day lockout.
        # Kept in `finally` so the pause also happens after a failed download; otherwise an
        # error would send the next request immediately.
        time.sleep(3)


In [None]:
# Normalise the price/volume field names stored in the database. The documents were
# inserted with per-symbol names of the form "<Field>_<ticker>" (e.g. "Close_<ticker>");
# stripping the suffix gives every ticker the same field names, which keeps the later
# steps uniform across all tickers.
print("Transforming field names.")

rename_fields = ["Open", "High", "Low", "Close", "Volume"]
tickers = db.Finance_Data.distinct("Ticker")

for ticker in tickers:
    for base_name in rename_fields:
        suffixed_name = f"{base_name}_{ticker}"
        # Only documents that still carry the suffixed field are touched
        db.Finance_Data.update_many(
            {suffixed_name: {"$exists": True}},
            {"$rename": {suffixed_name: base_name}},
        )
    print(f"Renamed fields for {ticker}")


In [None]:
"""
The next few sections will be for adding the additional columns to the database documents.
The first one will add a "Success" column. This will be based on whether the stock's "Close" value is greater than the "Open" value.
This is the only one that will require a manual calculation to be performed.
Next, the "Sentiment Score" column will be added using the nltk package's VADER sentiment analyzer. It is applied to the summaries of the related news articles that yfinance returns for the ticker symbol in question. A value will be produced from -1 to +1.
The columns for SMA, EMA, RSI and MACD will use a group of values from the ticker symbol in order to make the calculations.
This action will be performed using the TA-lib package.
"""


In [6]:
# Adding the 'Success' column
print("Adding Success column to database.")

# Success is 1 when a document's Close is strictly greater than its Open, otherwise 0.
# Documents missing either price are left untouched.
for doc in db.Finance_Data.find():
    open_price, close_price = doc.get("Open"), doc.get("Close")
    if open_price is None or close_price is None:
        continue

    db.Finance_Data.update_one(
        {"_id": doc["_id"]},
        {"$set": {"Success": int(close_price > open_price)}},
    )


In [None]:
# Adding the 'Sentiment Score' column
print("Adding sentiment score for each ticker.")

nltk.download('vader_lexicon')

# Initialize Sentiment Analyzer for using model against related news articles
sia = SentimentIntensityAnalyzer()

# Value stored when a ticker has no usable news or its news lookup fails.
NEUTRAL_SENTIMENT = 0.0


def average_news_sentiment(news_items, analyzer, default=NEUTRAL_SENTIMENT):
    """Return the mean VADER compound score over the news summaries.

    Parameters:
        news_items: iterable of dict-like news entries; the summary text is read
            from item["content"]["summary"]. May be None or empty.
        analyzer: object exposing polarity_scores(text) -> mapping with a "compound" key.
        default: value returned when no entry has any summary text.

    Returns:
        The arithmetic mean of the compound scores, or `default` when nothing was scored.
    """
    compound_scores = []
    for item in news_items or []:
        # `content` / `summary` may be missing or explicitly None; treat both as "no text".
        content = item.get("content") or {}
        summary = content.get("summary") or ""
        if not summary.strip():
            # An empty summary carries no opinion; scoring it would pull the average toward 0.
            continue
        compound_scores.append(analyzer.polarity_scores(summary)["compound"])

    if not compound_scores:
        return default
    return sum(compound_scores) / len(compound_scores)


# Get list of tickers
tickers = db.Finance_Data.distinct("Ticker")

for ticker_symbol in tickers:
    try:
        news_items = yf.Ticker(ticker_symbol).news

        # Average compound score across the ticker's articles. For the initial concept
        # the same value is written to every document of that ticker. A ticker without
        # usable news gets the neutral default (0.0).
        average_sentiment = average_news_sentiment(news_items, sia)

        db.Finance_Data.update_many(
            {"Ticker": ticker_symbol},
            {"$set": {"Sentiment": average_sentiment}}
        )
    except Exception as e:
        # Best effort: if the lookup/scoring/update fails, fall back to neutral sentiment.
        print(f"Error processing {ticker_symbol}: {e}. Default sentiment value is 0.0.")
        db.Finance_Data.update_many(
            {"Ticker": ticker_symbol},
            {"$set": {"Sentiment": NEUTRAL_SENTIMENT}}
        )


In [22]:
# Adding the columns for SMA (Simple Moving Average), EMA (Exponential Moving Average),
# RSI (Relative Strength Index) and MACD (Moving Average Convergence/Divergence)
print("Adding technical indicators")

# Look-back window shared by SMA, EMA and RSI; also the minimum history a ticker needs.
INDICATOR_PERIOD = 30

tickers = db.Finance_Data.distinct("Ticker")
for symbol in tickers:

    # Sort by date so that each indicator aligns with the right trading day. Only
    # documents with a usable Close are kept, so position i in the price array and
    # position i in `current` always refer to the same document.
    current = [
        doc
        for doc in db.Finance_Data.find({"Ticker": symbol}).sort("Date", 1)
        if doc.get("Close") is not None
    ]

    # Skip tickers that do not have enough history for a single full window
    if len(current) < INDICATOR_PERIOD:
        continue

    # Explicit float64 dtype so the indicator functions receive a numeric array
    npArray = np.array([doc["Close"] for doc in current], dtype=np.float64)

    # Only the MACD line (first return value) is stored; signal and histogram are not used.
    MACD_values, _, _ = ta.MACD(npArray, fastperiod=10, slowperiod=28, signalperiod=7)
    indicators = {
        "SMA": ta.SMA(npArray, timeperiod=INDICATOR_PERIOD),
        "EMA": ta.EMA(npArray, timeperiod=INDICATOR_PERIOD),
        "RSI": ta.RSI(npArray, timeperiod=INDICATOR_PERIOD),
        "MACD": MACD_values,
    }

    # Iterating through the documents to add the indicator values.
    for i, doc in enumerate(current):
        # Collect only real numbers. np.isnan is required here: an identity test against
        # np.nan never matches array elements, so NaN warm-up values would be written.
        insert = {
            name: float(values[i])
            for name, values in indicators.items()
            if not np.isnan(values[i])
        }

        # Nothing to store for this day (still inside the warm-up window): skip the
        # write instead of sending an empty $set. NaN values written by an earlier
        # run are not removed here.
        if insert:
            db.Finance_Data.update_one({"_id": doc["_id"]}, {"$set": insert})

print("Finish")

Finish
