In [7]:
import yfinance as yf
import numpy as np
from MongoDB_Connection import start_db
import pandas as pd
import time
import talib as ta
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [9]:
# Connect to MongoDB
# start_db() comes from the project-local MongoDB_Connection module. Judging by
# the repr printed below, it returns a pymongo Database handle for
# 'C964_Database' on localhost:27017. Every later cell reads and writes
# through this `db` object.
db = start_db()
print(db)

['C964_Database', 'admin', 'config', 'local']
Entry exists
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'C964_Database')


In [None]:
# Ticker symbols used for training.
# The symbols are written as one whitespace-separated block and split into a
# list, so adding or removing a symbol is a one-word edit.
tickers = """
    NVDA QBTS TSLA LCID F PLTR SOUN RGTI INFA AAL
    MARA BTG INTC CLF PDD JWN DJT AAPL OKLO SOFI
    X NIO WRD ITUB SMCI NU RIOT RIVN SMR AMD
    GOOGL ACHR GME UEC BBD ABEV AMZN AMCR RIG BAC
    IONQ HOOD AGNC WBD PFE CLSK CNH CCL IAG GRAB
    T KGC HBAN HIMS ERIC AUR MRVL PSLV GOOG VALE
    NGD SNAP CX RKLB BB UNH HOLX HL CSX NOK VOD
    PONY LYG MSFT HPE TEM KMI UAA ADT UBER GAP
    HAL WMB LYFT HLN CMCSA CDE TEVA GGB LUMN VTRS
    NCLH NXE PCG WMT MU LUV SLB PTEN CPNG
""".split()

In [None]:
# Entering documents into database based on ticker symbols above. More attributes will
# be added once the database is loaded with the initial data.
#
# The cell is safe to re-run: a ticker that already has documents in
# Finance_Data is skipped instead of being inserted a second time.
REQUEST_PAUSE_SECONDS = 3  # pause after each Yahoo Finance request to prevent rate limiting

for ticker in tickers:
    requested = False  # only throttle when Yahoo Finance was actually contacted
    try:
        # Skip tickers that are already stored so a re-run does not duplicate
        # their history (duplicates would distort later per-ticker calculations).
        if db.Finance_Data.find_one({"Ticker": ticker}, {"_id": 1}) is not None:
            print(f"{ticker} is already in the database; skipping download.")
            continue

        # Gather 10 year historical data
        requested = True
        data = yf.download(ticker, period="10y", interval="1d", progress=False)

        if data.empty:
            # Make an empty download visible instead of passing over it silently.
            print(f"No data returned for {ticker}; nothing inserted.")
            continue

        # Move 'Date' from index to column
        data = data.reset_index()
        data["Ticker"] = ticker

        # Store missing values as None (null in MongoDB) rather than NaN
        data = data.replace({np.nan: None})

        # Flatten MultiIndex columns if they exist so every field name is a
        # plain string. Empty levels are dropped, so ("Close", "NVDA") becomes
        # "Close_NVDA" while ("Date", "") stays "Date".
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = ['_'.join(filter(None, col)).strip() for col in data.columns]

        attributes = data.to_dict("records")

        # Insert into MongoDB
        db.Finance_Data.insert_many(attributes)
        print(f"Inserted {len(attributes)} attributes for {ticker}.")
    except Exception as e:
        # Best effort: report the failure and carry on with the next ticker.
        print(f"{ticker} had an error being inserted: {e}")
    finally:
        # Throttle after failed requests too; a rate-limit error is exactly
        # when the pause is needed.
        if requested:
            time.sleep(REQUEST_PAUSE_SECONDS)


In [None]:
# Normalise the price field names stored in the database. After ingestion the
# yfinance-derived fields are stored as "<Field>_<ticker>" (for example
# "Close_NVDA"). They are renamed to the bare field name ("Close") so that the
# subsequent cells can address the same field uniformly across all tickers.

rename_fields = ["Open", "High", "Low", "Close", "Volume"]
tickers = db.Finance_Data.distinct("Ticker")

for ticker in tickers:
    # Map each ticker-suffixed name to its plain replacement.
    renames = {f"{plain_name}_{ticker}": plain_name for plain_name in rename_fields}

    for suffixed_name, plain_name in renames.items():
        # Only documents that still carry the suffixed field are touched.
        has_suffixed_field = {suffixed_name: {"$exists": True}}
        rename_operation = {"$rename": {suffixed_name: plain_name}}
        db.Finance_Data.update_many(has_suffixed_field, rename_operation)

    print(f"Renamed fields for {ticker}")


In [None]:
"""
The next few sections will be for adding the additional columns to the database documents.
The first one will add a "Success" column. This will be based on whether the stock's "Close" value is greater than the "Open" value.
This is the only one that will require a manual calculation to be performed.
Next, the "Sentiment" column will be added. Recent news items for each ticker are fetched through yfinance, and each article summary is scored with the VADER sentiment analyzer from the nltk package. The average compound score, a value from -1 to +1, is stored for the ticker.
The columns for SMA, EMA, RSI and MACD will use a group of values from the ticker symbol in order to make the calculations.
This action will be performed using the TA-lib package.
"""


In [6]:
# Adding the 'Success' column
#
# Success is 1 when a document's "Close" is greater than its "Open", else 0.
# Documents where either field is missing or null are left untouched.
#
# The comparison runs inside MongoDB as a single pipeline-style update, so
# there is no per-document round trip and no document is pulled into Python.
# NOTE: pipeline updates require MongoDB 4.2+ (and PyMongo 3.9+).

has_open_and_close = {"Open": {"$ne": None}, "Close": {"$ne": None}}
set_success_flag = [
    {"$set": {"Success": {"$cond": [{"$gt": ["$Close", "$Open"]}, 1, 0]}}}
]

result = db.Finance_Data.update_many(has_open_and_close, set_success_flag)
print(f"Success flag evaluated for {result.matched_count} documents.")


In [None]:
# Adding the 'Sentiment Score' column (stored in the "Sentiment" field)

# Download VADER lexicon
nltk.download('vader_lexicon')

# Initialize VADER Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

# Get unique tickers
tickers = db.Finance_Data.distinct("Ticker")


def _average_compound_score(news_items, analyzer):
    """Return the mean VADER compound score of the news summaries.

    Args:
        news_items: iterable of news dicts as returned by ``yf.Ticker(...).news``;
            each summary is read from ``item["content"]["summary"]``. May be
            ``None`` or empty.
        analyzer: object with a ``polarity_scores(text)`` method returning a
            dict that contains a ``"compound"`` entry.

    Returns:
        The average compound score, or ``None`` when no item has a non-blank
        summary.
    """
    compound_scores = []
    for item in news_items or []:
        # Guard against a missing or null "content" / "summary" entry.
        content = item.get("content") or {}
        summary = content.get("summary") or ""
        if not summary.strip():
            # A blank summary would score 0.0 and pull the average toward
            # neutral without reflecting any real article text.
            continue
        compound_scores.append(analyzer.polarity_scores(summary)["compound"])

    if not compound_scores:
        return None
    return sum(compound_scores) / len(compound_scores)


for ticker_symbol in tickers:
    try:
        news_items = yf.Ticker(ticker_symbol).news

        # Calculate average compound sentiment score. This will be added as the data point for all ticker entries for initial concept.
        # Ticker "WRD" does not have any sentiment data at this point
        average_sentiment = _average_compound_score(news_items, sia)

        # Update all documents for this ticker with sentiment (or just latest if preferred)
        db.Finance_Data.update_many(
            {"Ticker": ticker_symbol},
            {"$set": {"Sentiment": average_sentiment}}
        )

        print(f"Added sentiment score for {ticker_symbol}")

    except Exception as e:
        print(f"Error processing {ticker_symbol}: {e}")
        # Record "no sentiment available" as None, the same marker used when a
        # ticker has no news, so a failed lookup cannot be mistaken for a
        # genuinely neutral score of 0.0.
        db.Finance_Data.update_many(
            {"Ticker": ticker_symbol},
            {"$set": {"Sentiment": None}}
        )


In [51]:
# Adding the columns for SMA (Simple Moving Average), EMA (Exponential Moving Average), RSI (Relative Strength Index) and MACD ( Moving Average Convergence/Divergence)
#
# Exploratory check on a single ticker: compute the 30-period SMA of NVDA's
# closing prices and print it.

# A moving average is only meaningful over chronologically ordered prices, and
# MongoDB does not guarantee document order without an explicit sort.
current_ticker = db.Finance_Data.find({"Ticker": "NVDA"}).sort("Date", 1)

# Skip documents whose Close is missing or null; a None entry would produce an
# object array that cannot be cast to float.
close = [doc["Close"] for doc in current_ticker if doc.get("Close") is not None]
print(len(close))

# TA-Lib operates on float64 arrays, so the dtype is set explicitly.
npArray = np.array(close, dtype=float)
values = ta.SMA(npArray, timeperiod=30)
print(len(values))

# The first timeperiod - 1 entries are nan because the window is not yet full.
for x in values:
    print(x)

2515
2515
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
0.5133512288331985
0.5114230692386628
0.5096078515052795
0.5075990120569865
0.5058322042226792
0.504613995552063
0.5029843330383301
0.5009190122286479
0.49933775663375857
0.49736118217309316
0.4958928734064102
0.49405345221360525
0.4926012843847275
0.491520224014918
0.49042302966117857
0.4891644805669785
0.4875590254863103
0.486284339427948
0.48514679570992786
0.4846304655075073
0.48417867521444957
0.48563891351222993
0.48809147576491035
0.4909474104642868
0.4938598225514094
0.4963769197463989
0.49889401098092395
0.5014917840560277
0.5042243808507919
0.5069970101118088
0.5092908322811127
0.5107513437668483
0.5114912619193395
0.511874740322431
0.5136107126871745
0.5156560232241948
0.5178791711727778
0.5201982845862706
0.5216994315385819
0.5240337590376536
0.5262230674425761
0.5281766533851624
0.5309724271297455
0.5330647855997086
0.5352470805247624
0.537445755799