# Data exploration of Yahoo Finance data
RAD for feature engineering pipeline.

In [12]:
import time

import yfinance as yf
import pandas as pd
from random import randint

In [13]:
# If yf.Ticker.info is not working running the command below solves it
# pip install yfinance --upgrade --no-cache-dir
# How to integrate this in the pipeline? Requirements?


test_symbol = 'AAPL'
stock = yf.Ticker(test_symbol)

In [14]:
def getEarnings(stock):
    earn = stock.get_earnings_dates()
    if earn is not None:
        earn['Earnings Date'] = earn.index.date
        earn['Earnings Date'] = pd.to_datetime(earn['Earnings Date'])
    return earn

In [15]:
# Not needed with yfinance on Python 3.9

def parseMonth(month):
    """
    Function to parse the string of a month from the earnings date
    :param month: string with the given month
    :return: the number corresponding to the month in datetime format
    """
    if month == 'Jan':
        return 1
    elif month == 'Feb':
        return 2
    elif month == 'Mar':
        return 3
    elif month == 'Apr':
        return 4
    elif month == 'May':
        return 5
    elif month == 'Jun':
        return 6
    elif month == 'Jul':
        return 7
    elif month == 'Aug':
        return 8
    elif month == 'Sep':
        return 9
    elif month == 'Oct':
        return 10
    elif month == 'Nov':
        return 11
    else:
        return 12

In [16]:
# Not needed with yfinance on Python 3.9

def convertEarningsDate(df):
    """
    Function to convert the earnings date format to datetime
    :param df: dataframe containing the earnings history
    :return: dataframe updated with the parsed datetime as date
    """
    test = [None]*len(df)
    for i in range(len(df)):
        temp = df.loc[i, "Earnings Date"].split(', ')
        temp = [x.strip() for x in temp]
        temp = (temp[0] + ' ' + temp[1]).split(' ')
        test[i] = [temp[2], parseMonth(temp[0]), temp[1]]
    dt = pd.DataFrame(test, columns=['year', 'month', 'day'])
    df['Earnings Date'] = pd.to_datetime(dt)
    return df

In [17]:
def getHistory(symbol, stock, period='1mo', interval='15m'):
    """
    Function to retrieve the price history of the stock and parse its date
    :param stock: yfinance ticker object
    :param period: the period over which data should be collected
    :param interval: the interval for data points
    :return: history dataframe with additional columns
    """
    hist = stock.history(period = period, interval = interval)
    hist['company'] = symbol
    hist['date'] = hist.index.date
    hist['date'] = pd.to_datetime(hist['date'])
    hist['Diff'] = hist['Close'] - hist['Open']
    return hist

In [18]:
def getRelEarnings(e_df, hist_df):
    """
    Finds the earnings data which is relevant for the given history time frame
    :param e_df: earnings dataframe
    :param hist_df: history dataframe
    :return: relevant dates dataframe
    """
    e_df.reset_index(inplace=True, drop=True)
    minmax = hist_df['date'].agg(['min', 'max'])
    last_er_idx = e_df[e_df['Earnings Date'] <= minmax['min']].index[0]
    first_er_idx = e_df[e_df['Earnings Date'] <= minmax['max']].index[0]
    relevant_earnings = e_df[first_er_idx:last_er_idx+1].reset_index(drop=True)
    return relevant_earnings

def fillEarnings(current, hist_df, idx_in):
    """
    Function to fill the earnings columns into the history
    :param current: dataframe holding the earnings data for the selected indices
    :param hist_df: history dataframe
    :param idx_in: relevant indices on the history dataframe to fill the earnings data for
    :return: history dataframe with the earnings added for the given indices
    """
    hist_df.loc[idx_in, 'EPS Estimate'] = current['EPS Estimate']
    hist_df.loc[idx_in, 'Reported EPS'] = current['Reported EPS']
    hist_df.loc[idx_in, 'Offset'] = current['Surprise(%)']
    hist_df.loc[idx_in, 'Earnings'] = current['Earnings Date']
    return hist_df


def getHistWithEarnings(relevant_earnings, hist_df):
    """
    Function to add the corresponding earnings data to the days for which the data was known.
    :param relevant_earnings: The earnings columns which are relevant for the given history time frame
    :param hist_df: The history dataframe
    :return: History with added columns for each of the relevant earnings
    """
    for idx in reversed(relevant_earnings.index):
        if idx>0:
            current = relevant_earnings.iloc[idx]
            next = relevant_earnings.iloc[idx-1]
            idx_in = hist_df[(hist_df['date'] >= current['Earnings Date']) &
                             (hist_df['date'] < next['Earnings Date'])].index
            hist_df = fillEarnings(current, hist_df, idx_in)
        else:
            current = relevant_earnings.iloc[idx]
            idx_in = hist_df[(hist_df['date'] >= current['Earnings Date'])].index
            hist_df = fillEarnings(current, hist_df, idx_in)
    return hist_df

def dropIrrelevant(hist_df: pd.DataFrame):
    labels = ['']
    return hist_df.drop(labels, axis=1)



In [19]:
def stockToCSV(symbol, period='1mo', interval='15m'):
    """
    Functions to write stock information to CSV
    :param symbol: ticker symbol of the company
    :param period: over what time period the history data should be taken
    :param interval: how often a sample is taken over the period
    :return:
    """
    stock = yf.Ticker(symbol)
    earnings = getEarnings(stock)
    # earnings = convertEarningsDate(earnings)
    hist = getHistory(symbol, stock, period=period, interval=interval)
    rel_earnings = getRelEarnings(earnings, hist)
    hist = getHistWithEarnings(rel_earnings, hist)
    hist.to_csv('./data/' + symbol)

def stockToDf(symbol, earnings, period='1mo', interval='15m'):
    """
    Functions to write stock information to CSV
    :param symbol: ticker symbol of the company
    :param earnings: dataframe containing the respective earnings
    :param period: over what time period the history data should be taken
    :param interval: how often a sample is taken over the period
    :return:
    """
    stock = yf.Ticker(symbol)
    # earnings = getEarnings(stock)
    # earnings = convertEarningsDate(earnings)
    hist = getHistory(symbol, stock, period=period, interval=interval)
    rel_earnings = getRelEarnings(earnings, hist)
    hist = getHistWithEarnings(rel_earnings, hist)
    return hist

def getStocks():
    """
    Get all stock symbols listed on Nasdaq
    :return: list with symbols
    """
    df = pd.read_table('http://www.nasdaqtrader.com/dynamic/symdir/nasdaqlisted.txt')
    # local run
    # df = pd.read_table('tickers.txt')
    symbols = [None]*len(df)
    for idx, line in df.iterrows():
        symbols[idx] = line[0].split('|')[0]
    return symbols

def selectStocks(stocks, n):
    """
    Select n random stocks from the list
    :param stocks: list of company symbols
    :param n: number of companies to select
    :return: selected symbols
    """
    idxs = []
    selected = [None]*n
    earnings = [None]*n
    counter = 0
    while len(idxs) < n:
        temp_int = randint(0, len(stocks))
        if temp_int not in idxs:
            earn = getEarnings(yf.Ticker(stocks[temp_int]))
            if earn is not None:
                idxs.append(temp_int)
                selected[counter] = stocks[temp_int]
                earnings[counter] = earn
                counter += 1
    return selected, earnings

def selectAll(stocks):
    """
    Select all stocks which have enough data on yfinance listed on the Nasdaq list
    :param stocks: list of company symbols fom Nasdaq
    :return: current symbols
    """
    selected = []
    earnings = []
    for stock in stocks:
        try:
            temp = getEarnings(yf.Ticker(stock))
        except:
            print('error loading the stock')
        if temp is not None:
            selected.append(stock)
            earnings.append(temp)
    return selected, earnings


In [20]:
# PARAMETERS FOR RUN
ALL_STOCKS = False
N = 50
ONE_CSV = True

def runAll(n):
    stocks = getStocks()
    if ALL_STOCKS:
        selected, earnings = selectAll(stocks)
    else:
        selected, earnings = selectStocks(stocks, n)
    if ONE_CSV:
        df = pd.DataFrame()
        for idx, stock in enumerate(selected):
            df = df.append(stockToDf(selected[idx], earnings[idx]))
        df.to_csv('data/fullset.csv')
    else:
        for stock in selected:
            stockToCSV(stock)

# while not succeeded, try except

In [21]:
runAll(N)

- PONOW: No earnings dates found, symbol may be delisted
- WALDW: No earnings dates found, symbol may be delisted
- ASCB: No earnings dates found, symbol may be delisted
- SOCL: No earnings dates found, symbol may be delisted
- ADXN: No earnings dates found, symbol may be delisted
- PETWW: No earnings dates found, symbol may be delisted
- PEGR: No earnings dates found, symbol may be delisted
- BCAN: No earnings dates found, symbol may be delisted
- IJT: No earnings dates found, symbol may be delisted
- EDTK: No earnings dates found, symbol may be delisted
- VONG: No earnings dates found, symbol may be delisted
- RFAC: No earnings dates found, symbol may be delisted
- ALTUU: No earnings dates found, symbol may be delisted
- BRKHU: No earnings dates found, symbol may be delisted
- PEGR: No earnings dates found, symbol may be delisted
- FTXO: No earnings dates found, symbol may be delisted
- HCVIU: No earnings dates found, symbol may be delisted
- UTAAW: No earnings dates found, symbol ma

  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.append(stockToDf(selected[idx], earnings[idx]))
  df = df.appe

In [22]:
import time
st = time.time()
print(yf.Ticker('KRKR').get_earnings_dates())
print(yf.Ticker('AAPL').get_earnings_dates())
ed = time.time()
print(ed - st)

                           EPS Estimate  Reported EPS  Surprise(%)
Earnings Date                                                     
2022-12-06 23:00:00-05:00           NaN           NaN          NaN
2022-08-24 01:00:00-04:00         -0.21          0.28       2.3333
2022-06-01 02:00:00-04:00           NaN           NaN          NaN
2022-03-29 01:00:00-04:00           NaN          0.48          NaN
2021-12-08 23:00:00-05:00         -0.75         -0.64       0.1523
2021-08-24 01:00:00-04:00         -0.78         -0.76       0.0256
2021-06-01 01:00:00-04:00         -0.71         -0.92      -0.2958
2021-04-15 03:00:00-04:00         -0.59         -1.96      -2.3333
2020-11-30 01:00:00-05:00         -0.39         -0.20       0.4822
2020-08-31 02:00:00-04:00         -1.10         -1.62      -0.4800
2020-05-26 01:00:00-04:00         -1.93         -2.05      -0.0654
2020-03-11 02:00:00-04:00          2.23          1.67      -0.2493
                           EPS Estimate  Reported EPS  Surpris

In [23]:
stockToDf('ANAB', getEarnings(yf.Ticker('ANAB')))

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,company,date,Diff,EPS Estimate,Reported EPS,Offset,Earnings
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-11-30 09:30:00-05:00,28.299999,28.500000,27.900000,28.049999,305982,0.0,0.0,ANAB,2022-11-30,-0.250000,-0.72,-1.18,-0.6321,2022-11-08
2022-11-30 09:45:00-05:00,28.215000,28.215000,28.215000,28.215000,759,0.0,0.0,ANAB,2022-11-30,0.000000,-0.72,-1.18,-0.6321,2022-11-08
2022-11-30 10:00:00-05:00,28.100000,28.129999,28.010000,28.010000,1343,0.0,0.0,ANAB,2022-11-30,-0.090000,-0.72,-1.18,-0.6321,2022-11-08
2022-11-30 10:15:00-05:00,27.830000,27.980000,27.129999,27.459999,9895,0.0,0.0,ANAB,2022-11-30,-0.370001,-0.72,-1.18,-0.6321,2022-11-08
2022-11-30 10:30:00-05:00,27.465000,27.760000,27.465000,27.700001,6754,0.0,0.0,ANAB,2022-11-30,0.235001,-0.72,-1.18,-0.6321,2022-11-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-30 15:00:00-05:00,30.885000,31.000000,30.885000,30.980000,3431,0.0,0.0,ANAB,2022-12-30,0.094999,-0.72,-1.18,-0.6321,2022-11-08
2022-12-30 15:15:00-05:00,31.000000,31.010000,30.990000,31.000000,3079,0.0,0.0,ANAB,2022-12-30,0.000000,-0.72,-1.18,-0.6321,2022-11-08
2022-12-30 15:30:00-05:00,31.000000,31.004999,30.959999,30.969999,9266,0.0,0.0,ANAB,2022-12-30,-0.030001,-0.72,-1.18,-0.6321,2022-11-08
2022-12-30 15:45:00-05:00,31.000000,31.040001,30.889999,30.969999,41125,0.0,0.0,ANAB,2022-12-30,-0.030001,-0.72,-1.18,-0.6321,2022-11-08


In [24]:
getHistory('CBAT', yf.Ticker('CBAT'), period='1mo', interval='15m')

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,company,date,Diff
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-11-30 09:30:00-05:00,1.1500,1.1500,1.1373,1.1373,10329,0.0,0.0,CBAT,2022-11-30,-0.0127
2022-11-30 09:45:00-05:00,1.1342,1.1400,1.1300,1.1400,5574,0.0,0.0,CBAT,2022-11-30,0.0058
2022-11-30 10:00:00-05:00,1.1400,1.1500,1.1400,1.1500,19388,0.0,0.0,CBAT,2022-11-30,0.0100
2022-11-30 10:15:00-05:00,1.1500,1.1600,1.1500,1.1527,21236,0.0,0.0,CBAT,2022-11-30,0.0027
2022-11-30 10:30:00-05:00,1.1500,1.1600,1.1500,1.1600,7701,0.0,0.0,CBAT,2022-11-30,0.0100
...,...,...,...,...,...,...,...,...,...,...
2022-12-30 15:00:00-05:00,0.9810,0.9811,0.9810,0.9811,18178,0.0,0.0,CBAT,2022-12-30,0.0001
2022-12-30 15:15:00-05:00,0.9810,0.9811,0.9810,0.9811,6626,0.0,0.0,CBAT,2022-12-30,0.0001
2022-12-30 15:30:00-05:00,0.9811,0.9875,0.9810,0.9875,10075,0.0,0.0,CBAT,2022-12-30,0.0064
2022-12-30 15:45:00-05:00,0.9851,0.9900,0.9851,0.9851,2606,0.0,0.0,CBAT,2022-12-30,0.0000
