##### Authors: Alexander Mo & Tommaso Lucarelli

# Feature pipeline for Yahoo Finance data
This pipeline takes a set of stock market ticker symbols and retrieves historical data and statistics from Yahoo Finance. The data is then processed to fit the training pipeline's LSTM network. Upon processing the data, the features will be stored using Google Drive where they can be retrieved on demand.

Code is written to run on Google Colab.
##### NOTE: Ensure the feature_engineering folder is in the mounted drive and current directory

In [None]:
!pip install yfinance --upgrade --no-cache-dir
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [3]:
import yfinance as yf
import pandas as pd
from random import randint

In [4]:
# Not needed with yfinance on Python 3.9

def parseMonth(month):
    """
    Function to parse the string of a month from the earnings date
    :param month: string with the given month
    :return: the number corresponding to the month in datetime format
    """
    if month == 'Jan':
        return 1
    elif month == 'Feb':
        return 2
    elif month == 'Mar':
        return 3
    elif month == 'Apr':
        return 4
    elif month == 'May':
        return 5
    elif month == 'Jun':
        return 6
    elif month == 'Jul':
        return 7
    elif month == 'Aug':
        return 8
    elif month == 'Sep':
        return 9
    elif month == 'Oct':
        return 10
    elif month == 'Nov':
        return 11
    else:
        return 12

# Not needed with yfinance on Python 3.9
def convertEarningsDate(df):
    """
    Function to convert the earnings date format to datetime
    :param df: dataframe containing the earnings history
    :return: dataframe updated with the parsed datetime as date
    """
    test = [None]*len(df)
    for i in range(len(df)):
        temp = df.loc[i, "Earnings Date"].split(', ')
        temp = [x.strip() for x in temp]
        temp = (temp[0] + ' ' + temp[1]).split(' ')
        test[i] = [temp[2], parseMonth(temp[0]), temp[1]]
    dt = pd.DataFrame(test, columns=['year', 'month', 'day'])
    df['Earnings Date'] = pd.to_datetime(dt)
    return df

### Functions to process the data of a stock into the right format
Final dataframe contains all necessary columns for the model in the right format and dtype.

In [5]:
from feature_engineering.prod_feature_functions import getEarnings, getHistory, getRelEarnings, getHistWithEarnings, dropIrrelevant

### Functions for selecting stocks and formatting

In [6]:
def stockToCSV(symbol, period='1mo', interval='15m'):
    """
    Functions to write stock information to CSV
    :param symbol: ticker symbol of the company
    :param period: over what time period the history data should be taken
    :param interval: how often a sample is taken over the period
    :return:
    """
    stock = yf.Ticker(symbol)
    earnings = getEarnings(stock)
    # earnings = convertEarningsDate(earnings)
    hist = getHistory(symbol, stock, period=period, interval=interval)
    rel_earnings = getRelEarnings(earnings, hist)
    hist = getHistWithEarnings(rel_earnings, hist)
    hist = dropIrrelevant(hist)
    hist.to_csv('./data/' + symbol)

def stockToDf(symbol, earnings, period='1mo', interval='15m'):
    """
    Functions to write stock information to CSV
    :param symbol: ticker symbol of the company
    :param earnings: dataframe containing the respective earnings
    :param period: over what time period the history data should be taken
    :param interval: how often a sample is taken over the period
    :return:
    """
    stock = yf.Ticker(symbol)
    # earnings = getEarnings(stock)
    hist = getHistory(symbol, stock, period=period, interval=interval)
    rel_earnings = getRelEarnings(earnings, hist)
    hist = getHistWithEarnings(rel_earnings, hist)
    hist = dropIrrelevant(hist)
    return hist

def getStocks():
    """
    Get all stock symbols listed on Nasdaq
    :return: list with symbols
    """
    df = pd.read_table('http://www.nasdaqtrader.com/dynamic/symdir/nasdaqlisted.txt')
    # local run
    # df = pd.read_table('tickers.txt')
    symbols = [None]*len(df)
    for idx, line in df.iterrows():
        symbols[idx] = line[0].split('|')[0]
    return symbols

def selectStocks(stocks, n):
    """
    Select n random stocks from the list
    :param stocks: list of company symbols
    :param n: number of companies to select
    :return: selected symbols
    """
    idxs = []
    selected = [None]*n
    earnings = [None]*n
    counter = 0
    while len(idxs) < n:
        temp_int = randint(0, len(stocks))
        if len(stocks) == n:
            temp_int = counter
        if temp_int not in idxs:
            earn = getEarnings(yf.Ticker(stocks[temp_int]))
            if earn is not None:
                idxs.append(temp_int)
                selected[counter] = stocks[temp_int]
                earnings[counter] = earn
                counter += 1
    return selected, earnings

def selectAll(stocks):
    """
    Select all stocks which have enough data on yfinance listed on the Nasdaq list
    :param stocks: list of company symbols fom Nasdaq
    :return: current symbols
    """
    selected = []
    earnings = []
    for stock in stocks:
        try:
            temp = getEarnings(yf.Ticker(stock))
        except:
            print('error loading the stock')
        if temp is not None:
            selected.append(stock)
            earnings.append(temp)
    return selected, earnings


### Run and store features on Google Drive

In [None]:
from google.colab import drive
drive.mount('/gdrive')

%cd /gdrive/MyDrive/Scalable/Project/feature_store

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/MyDrive/Scalable/Project/feature_store


In [14]:
# PARAMETERS FOR RUN
SET = False
N = 5
ONE_CSV = True
PATH = './small_test.csv'

def runAll(n):
    stocks = getStocks()
    if SET:
        selected, earnings = selectStocks(stocks, n)
    else:
        stocks = ['AAPL', 'TSLA', 'AMZN']
        selected, earnings = selectStocks(stocks, len(stocks))
    if ONE_CSV:
        df = pd.DataFrame()
        for idx, stock in enumerate(selected):
            df = df.append(stockToDf(selected[idx], earnings[idx]))
        df.reset_index(drop=True, inplace=True)
        with open(PATH, 'w', encoding = 'utf-8-sig') as f:
            df.to_csv(f, index=False)
            f.close()
        print('File successfully saved.')
    else:
        for stock in selected:
            stockToCSV(stock)


In [15]:
runAll(N)

  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))


File successfully saved.


  df = df.append(stockToDf(selected[idx], earnings[idx]))
