In [2]:
import yfinance as yf
import numpy as np
from Database.MongoDB_Connection import start_db
import pandas as pd
import time
from datetime import datetime, timedelta, date
import talib as ta
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [3]:
# Connect to MongoDB
db = start_db()
db
print(db)

['C964_Database', 'admin', 'config', 'local']
Test entry exists
Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'C964_Database')


In [None]:
#Tickers for training
print("Reading list of tickers to be loaded.")

tickers = [
    "AAL", "AAPL", "ABEV", "ACHR", "ADBE", "ADT", "ADSK", "ALGN", "ALNY", "AMAT",
    "AMCR", "AMD", "AMZN", "ANET", "APH", "ASML", "AUR", "AVGO", "AXP", "BA",
    "BAC", "BAX", "BB", "BBD", "BDX", "BIIB", "BK", "BKNG", "BLK", "BMY",
    "BTG", "C", "CAT", "CCL", "CDE", "CHTR", "CI", "CINF", "CMCSA", "CMI",
    "CNH", "COF", "COP", "COST", "CPNG", "CRM", "CSCO", "CSX", "CTAS", "CVS",
    "CVX", "CX", "D", "DD", "DE", "DG", "DHR", "DIS", "DJT", "DOV",
    "DTE", "DUK", "EMR", "ENPH", "EOG", "EQIX", "ERIC", "ETN", "EXC", "F",
    "FANG", "FAST", "FDX", "FE", "FIS", "FISV", "GAP", "GE", "GGB", "GILD",
    "GIS", "GLW", "GM", "GME", "GOOG", "GOOGL", "GRAB", "GS", "HAL", "HBAN",
    "HD", "HIMS", "HL", "HLN", "HOLX", "HON", "HOOD", "HPQ", "HPE", "IBM",
    "IAG", "ICE", "ILMN", "INFA", "INTC", "INTU", "IONQ", "ISRG", "ITUB", "ITW",
    "JNPR", "JWN", "K", "KGC", "KEY", "KLAC", "KMB", "KMI", "KO", "KR",
    "LCID", "LIN", "LLY", "LMT", "LOW", "LRCX", "LUMN", "LUV", "LYFT", "LYG",
    "MA", "MARA", "MCD", "MDLZ", "MDT", "MET", "MMM", "MO", "MRK", "MRVL",
    "MS", "MSFT", "MSI", "MTB", "MU", "NEE", "NEM", "NFLX", "NIO", "NKE",
    "NKLA", "NOK", "NOV", "NRG", "NSC", "NTAP", "NVDA", "NU", "NXE", "OKLO",
    "PDD", "PCG", "PFE", "PLTR", "PONY", "PSLV", "PTEN", "QBTS", "RGTI", "RIOT",
    "RIG", "RIVN", "RKLB", "SOUN", "SMCI", "SMR", "SNAP", "SOFI", "T", "TEM",
    "TEVA", "TSLA", "UBER", "UAA", "UEC", "UNH", "VALE", "VOD", "VTRS", "WBD",
    "WMB", "WMT", "WRD", "X"
]



Reading list of tickers to be loaded.


In [None]:
# Entering documents into database based on ticker symbols above. More fields will be added once the database is loaded with the initial ticker data.

print("Inserting documents into MongoDB.")

for ticker in tickers:
    try:
        # Gather 10 year historical data in 1-year intervals. progress=False is to stop the display of the progress bar while downloading
        data = yf.download(ticker, period="10y", interval="1d", progress=False, auto_adjust=False)

        if not data.empty:
            # Move 'Date' from index to column
            data.reset_index(inplace=True)
            data["Ticker"] = ticker

            # Convert NaNs to None (MongoDB does not accept NaN)
            data.replace({np.nan: None}, inplace=True)

            # This prevents the data from being formatted incorrectly. Each value needs to be a string.
            if isinstance(data.columns, pd.MultiIndex):
                data.columns = ['_'.join(filter(None, col)).strip() for col in data.columns]

            # Create dictionary of documents to enter into database
            newDocs = data.to_dict("records")

            # Insert into MongoDB
            db.Finance_Data.insert_many(newDocs)
            print(f"Inserted {len(newDocs)} attributes for {ticker}.")

        # Timer to prevent rate limit on yfinance. This limiter will cause a multi-day lockout.
        time.sleep(1.5)
    except Exception as e:
        print(f"{ticker} had an error being inserted: {e}")


In [None]:
# Restructuring the document fields in the database. From the yfinance API the fields are labeled as "close_{ticker}". This needed to be changed since the subsequent actions need to be easily manipulated across all tickers. Having them be uniform will make that process easier.
print("Transforming field names.")

rename_fields = ["Open", "High", "Low", "Close", "Volume"]
tickers = db.Finance_Data.distinct("Ticker")

for ticker in tickers:

    for field in rename_fields:
        old_field = f"{field}_{ticker}"
        new_field = field
        db.Finance_Data.update_many(
            {old_field: {"$exists": True}},
            {"$rename": {old_field: new_field}}
        )
    print(f"Renamed fields for {ticker}")


In [None]:
"""
The next few sections will be for adding the additional columns to the database documents.
The first one will add a "Success" column. This will be based on whether the stock's "Close" value is greater than the "Open" value.
This is the only one that will require a manual calculation to be performed.
Next, the "Sentiment Score" column will be added using the nltk package. This library aggregates a list of related news articles pertaining to the ticker symbol in question. A value will be produced from -1 to +1.
The columns for SMA, EMA, RSI and MACD will use a group of values from the ticker symbol in order to make the calculations.
This action will be performed using the TA-lib package.
"""


In [None]:
# Adding the 'Success' column
print("Adding Success column to database.")

# Gather each documents Open and Close value. If Open < Close, then Success = 1. Otherwise, Success = 0.
for doc in db.Finance_Data.find():
    doc_close = doc.get("Close")
    doc_open = doc.get("Open")

    if doc_close is not None and doc_open is not None:
        success_flag = 1 if doc_close > doc_open else 0
        db.Finance_Data.update_one(
            {"_id": doc["_id"]},
            {"$set": {"Success": success_flag}}
        )


In [8]:
# Adding the 'Sentiment Score' column
print("Adding sentiment score for each ticker.")

nltk.download('vader_lexicon')

# Initialize Sentiment Analyzer for using model against related news articles
sia = SentimentIntensityAnalyzer()

# Get list of tickers
#tickers = db.Finance_Data.distinct("Ticker")

for ticker_symbol in tickers:
    print(f"Getting sentiment score for {ticker_symbol}")
    try:

        # Gets all the news items for the ticker symbol
        news_items = yf.Ticker(ticker_symbol).get_news(count=25)
        print(f"First check to see how many news items there are Ticker: {ticker_symbol}, News : {len(news_items)}")

        # Will be used to store a dictionary of dates:news summaries
        news_articles = []

        for item in news_items:
            content = item.get("content", {})
            if "summary" in content and "pubDate" in content:
                print("adding articles")
                article_date = datetime.fromisoformat(content["pubDate"].replace("Z", "")).date()
                news_articles.append({"Article_Date": article_date, "Summary": content["summary"]})

            else:
                print("There's the issue getting the summary or pubDate")


        # Calculate sentiment from the last 60 days. The time variable may be a bit difficult to gather with the date limitations in MongoDB along with the limitations from yfinance.
        docs = db.Finance_Data.find({"Ticker": ticker_symbol})
        for doc in docs:

            doc_date = doc.get("Date")

            if isinstance(doc_date, datetime):
                doc_date = doc_date.date()

            elif isinstance(doc_date, str):
                try:
                    doc_date = datetime.strptime(doc_date, "%Y-%m-%d").date()
                except:
                    print("meh")

            start_range = doc_date - timedelta(days=60)

            # Will be used to store the compound score of the sentiment for all documents for the specific ticker
            compound_scores = []
            for item in news_articles:

                # Skip if date is wrong
                if not isinstance(item["Article_Date"], date):
                    print("Invalid article date format. Skipping.")
                    continue
                # Check to see if the news articles fall in the above date range. If so, add to compound scores list.
                if start_range <= item["Article_Date"] < doc_date:

                    sentiment = sia.polarity_scores(item["Summary"])
                    compound_scores.append(sentiment["compound"])

		    # Add the sentiment score for specific date if compound scores is populated.
            if compound_scores:
                average_sentiment = sum(compound_scores) / len(compound_scores)

            else:
                # average of all articles for this ticker will be added if it cannot be applied to specific date
                if news_articles:
                    overall_scores = [sia.polarity_scores(item["Summary"])["compound"] for item in news_articles]
                    average_sentiment = sum(overall_scores) / len(overall_scores)

                #  Average sentiment set to 0.0, "neutral", if no articles found at all.
                else:
                    average_sentiment = 0.0


            # Update all documents for this ticker with sentiment score
            db.Finance_Data.update_one(
                {"_id": doc["_id"]},
                {"$set": {"Sentiment": average_sentiment}}
            )
        print(f"Added sentiment score for {ticker_symbol}")

    # If there is no news available for given ticker due to exception, default sentiment is set to 0.1 for identification.
    except Exception as e:
        print(f"Error processing {ticker_symbol}: {e}. Default sentiment value is 0.1.")
        db.Finance_Data.update_many(
            {"Ticker": ticker_symbol},
            {"$set": {"Sentiment": 0.1}}
        )


Adding sentiment score for each ticker.
Getting sentiment score for CTAS


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dhous\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


First check to see how many news items there are Ticker: CTAS, News : 25
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
Added sentiment score for CTAS
Getting sentiment score for CVS
First check to see how many news items there are Ticker: CVS, News : 25
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding articles
adding a

In [9]:
# Adding the columns for SMA (Simple Moving Average), EMA (Exponential Moving Average), RSI (Relative Strength Index) and MACD ( Moving Average Convergence/Divergence)
# After initial model training results, added Williams %R(WILLR), Commodity Channel Index(CCI) and Average True Range(ATR)
print("Adding technical indicators")

tickers = db.Finance_Data.distinct("Ticker")
for symbol in tickers:

    # Converting to list and sorting by date so that each indicator aligns properly
    current = list(db.Finance_Data.find({"Ticker": symbol}).sort("Date",1))
    close = [doc["Close"] for doc in current if "Close" in doc]
    high = [doc["High"] for doc in current if "High" in doc]
    low = [doc["Low"] for doc in current if "Low" in doc]

    npArrayClose = np.array(close)
    npArrayHigh = np.array(high)
    npArrayLow = np.array(low)

    SMA_values = ta.SMA(npArrayClose, timeperiod=30)
    EMA_values = ta.EMA(npArrayClose, timeperiod=30)
    RSI_values = ta.RSI(npArrayClose, timeperiod=30)
    MACD_values, MACD_signal, _ = ta.MACD(npArrayClose, fastperiod=10, slowperiod=28, signalperiod=7)
    WILLR_values = ta.WILLR(npArrayHigh, npArrayLow, npArrayClose, timeperiod=14)
    CCI_values = ta.CCI(npArrayHigh, npArrayLow, npArrayClose, timeperiod=14)
    ATR_values = ta.ATR(npArrayHigh, npArrayLow, npArrayClose, timeperiod=14)


    # Iterating through document to add indicator value.
    for i, doc in enumerate(current):
        # Will be used to store all the key:value pairs for inserting document fields
        insert = {}

        # There may be a situation where the value is NaN after the 30-day point.
        #SMA
        if i < len(SMA_values) and SMA_values[i] is not np.nan:
            insert["SMA"] = float(SMA_values[i])
        #EMA
        if i < len(EMA_values) and EMA_values[i] is not np.nan:
            insert["EMA"] = float(EMA_values[i])
        #RSI
        if i < len(RSI_values) and RSI_values[i] is not np.nan:
            insert["RSI"] = float(RSI_values[i])
        #MACD
        if i < len(MACD_values) and MACD_values[i] is not np.nan:
            insert["MACD"] = float(MACD_values[i])
        # WILLR
        if i < len(WILLR_values) and WILLR_values[i] is not np.nan:
            insert["WILLR"] = float(WILLR_values[i])
        # CCI
        if i < len(CCI_values) and CCI_values[i] is not np.nan:
            insert["CCI"] = float(CCI_values[i])
        # ATR
        if i < len(ATR_values) and ATR_values[i] is not np.nan:
            insert["ATR"] = float(ATR_values[i])

        # Insert values for document
        db.Finance_Data.update_one({"_id": doc["_id"]}, {"$set": insert})

print("Finish")

Adding technical indicators
Finish
