# Introduction to Using AlgoSeek for Algorithmic Trading

## 1 - Library Import

In [1]:
%pip install awswrangler --user

Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
# import awswrangler as wr
import pandas as pd 
import numpy as np  
import seaborn as sns
from tqdm import tqdm 
import os

pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',1000)

s3 = boto3.resource('s3')


## 2 - Datasets Intro

### 2.1 - Overview
We will be looking at the following AlgoSeek Datasets:

- US Equity 1 Minute Trade and Quote (TAQ) 
- US Equity Options 1 Minute TAQ
- US Equity Standard Daily Adjusted OHLC 

Additionally, we will be using fundamentals data straight from EDGAR, which we will cover in a later notebook.

These are stored in S3 and are divided between several different buckets. They are stored in a standard format, allowing us to access data from specified days and equities. For each bucket in all of these datasets, there are index files that list metadata about the contents. We will first download all of these index files to use as reference when interacting with AlgoSeek's data.

We will fetch index files for the last 6 years and store them for future use, then demonstrate two different methodologies for downloading and fetching data:

- Downloading a month's data for all equities
- Downloading the entire 6 years of data for a subset of equities

In later notebooks, we will expand on these examples to show how to download data for stocks with the most volatility, liquidity, etc.

### 2.2 - Dataset Data Dictionaries

In [None]:
US Equity Trade and Quote Minute Bar Buckets


### 2.2 - Fetching the index files

US Equity 1min TAQ 

In [None]:
"""
US Equity Trade and Quote Minute Bar Buckets
---------------------------------------------------------
Bucket: s3://us-equity-1min-taq-yyyy where yyyy is a year (2016 - 2022)

Path Format: us-equity-1min-taq-yyyy/yyyymmdd/s/sss.csv.gz

Description: One csv.gz file per symbol per trading date where yyyymmdd is year, month and day, 
s - a single letter in A-Z range, sss - symbol

Example: s3://us-equity-1min-taq-2022/20220104/I/IBM.csv.gz
"""


In [None]:
us_equity_1min_taq = ['s3://us-equity-1min-taq-2017/.index/',
                      's3://us-equity-1min-taq-2018/.index/',
                      's3://us-equity-1min-taq-2019/.index/',
                      's3://us-equity-1min-taq-2020/.index/',
                      's3://us-equity-1min-taq-2021/.index/',
                      's3://us-equity-1min-taq-2022/.index/'
                      ]

US Equity Daily Standard Adjusted OHLC 

In [None]:
"""US Equity Standard Adjusted Daily OHLC Buckets
---------------------------------------------------------
Bucket: s3://us-equity-daily-ohlc-standard-adjusted-tradedate-yyyy where yyyy is a year 
(2016 - 2022)

Path Format: us-equity-daily-ohlc-standard-adjusted-tradedate-yyyy/yyyymmdd.csv

Description: One csv file per trading day where yyyymmdd is year, month and day

Example: s3://us-equity-daily-ohlc-standard-adjusted-tradedate-2022/20220104.csv
"""


In [None]:
us_equity_daily_ohlc = ['us-equity-daily-ohlc-standard-adjusted-tradedate-2017/.index/',
                      'us-equity-daily-ohlc-standard-adjusted-tradedate-2018/.index/',
                      'us-equity-daily-ohlc-standard-adjusted-tradedate-2019/.index/',
                      'us-equity-daily-ohlc-standard-adjusted-tradedate-2020/.index/',
                      'us-equity-daily-ohlc-standard-adjusted-tradedate-2021/.index/',
                      'us-equity-daily-ohlc-standard-adjusted-tradedate-2022/.index/'
                      ]

US Equity Options 1min TAQ 

In [None]:
"""
US Options Trade and Quote Minute Bar Buckets
---------------------------------------------------------
Bucket: s3://us-options-1min-taq-yyyy where yyyy is a year (2016 - 2022)

Path Format: us-options-1min-taq-yyyy/yyyymmdd/s/sss/sss.expdate.csv.gz

Description: One csv.gz file per ticker, trading day and contract expiration date where yyyymmdd is year, month and day, s - a single letter in A-Z range, sss - symbol, expdate is the contract expiration date in yyyymmdd format

Example: s3://us-options-1min-taq-2022/20220104/S/SPY/SPY.20220107.csv.gz
"""

In [1]:
us_equity_options_1min_taq_index = [
                        's3://us-options-1min-taq-2017/.index/',
                      's3://us-options-1min-taq-2018/.index/',
                      's3://us-options-1min-taq-2019/.index/',
                      's3://us-options-1min-taq-2020/.index/',
                      's3://us-options-1min-taq-2021/.index/',
                      's3://us-options-1min-taq-2022/.index/'
                      ]

We first much fetch the index files for each bucket we want to download from.

In [None]:
years = ['2017','2018','2019','2020','2021','2022']

In [None]:
index2017 = []
index2018 = []
index2019 = []
index2020 = []
index2021 = []
index2022 = []
tradeindex = []

indexfiles = [index2017,index2018,index2019,index2020,index2021,index2022]

files2017 = []
files2018 = []
files2019 = []
files2020 = []
files2021 = []
files2022 = []

files = [files2017,files2018,files2019,files2020,files2021,files2022]

counter=0

for i in years:
    print(i)
    
    bucket_name="us-equity-1min-trades-"+i
    
    bucket = s3.Bucket(bucket_name)
    
    for obj in tqdm(bucket.objects.filter(Prefix=".index/",RequestPayer='requester')):
        print(obj.key)
        indexfiles[counter].append(obj.key)
        files[counter].append('s3://'+bucket_name+'/'+obj.key)

    counter = counter+1


In [None]:
    
counter = 0
for i in tqdm(range(5)):
    
    bucket_name="us-equity-1min-trades-"+years[i]
    bucket = s3.Bucket(bucket_name)
    
    for t in tqdm(range(len(indexfiles[counter]))):
        local = 'trades/'+indexfiles[counter][t]
        
        with open(local, 'wb') as f:
            bucket.download_fileobj(indexfiles[counter][t], f,ExtraArgs={'RequestPayer':'requester'})
        
        # s3.Bucket(bucket_name18).download_file(files2018[253:][i],local,ExtraArgs={'RequestPayer':'requester'})
        
    counter = counter+1


# 3 - Download Daily Data Files

Since the daily dataset is significantly smaller and is used to filter/select stocks for fetching 1 minute Trade and Quote data, we will just go ahead and download the whole thing.

In [None]:
for i in us_equity_daily_ohlc:
    print(i)
    
    bucket_name=i[5:-7]
    
    bucket = s3.Bucket(bucket_name)
    
    for obj in tqdm(bucket.objects.filter(Prefix=".index/",RequestPayer='requester')):
        print(obj.key)
        
        local='daily/'+obj.key
        path, filename = os.path.split(obj.key)

        with open(local, 'wb') as f:
            bucket.download_fileobj(bucket_name+filename, f,ExtraArgs={'RequestPayer':'requester'})


# 4 - Downloading indices for 1 min TAQ Dataset

There are several ways that we will be interacting with the 1 min TAQ Equity and Options Datasets. Because they are too big to simply copy, we will first be downloading the index files that tell us where we can find specific datafiles. That way, we can fetch only the data that we are interested in as we need it.

In [2]:
from pathlib import Path

# setting the local paths for the files that we're downloading

index_path =Path('options_index')
if not index_path.exists():
    index_path.mkdir()

equity_index = Path('equity_index')
if not equit_index.exists():
    equity_index.mkdir()


SyntaxError: invalid syntax (<ipython-input-2-118b9b838aca>, line 4)

In [None]:
# Options
for bucket in us_equity_options_1min_taq_index:

    s3_bucket = s3.Bucket(bucket[5:-8])


    for obj in tqdm(s3_bucket.objects.filter(Prefix=".index/",RequestPayer='requester')):
        try:
            local = 'options_index/'+ obj.key[7:]

            with open(local,'wb') as f:
                s3_bucket.download_fileobj(obj.key, f, ExtraArgs={'RequestPayer':'requester'})

            print('Success ', obj.key)
        except:
            print('fail ', obj.key)

In [None]:
# Equity
for bucket in us_equity_1min_taq:

    s3_bucket = s3.Bucket(bucket[5:-8])

    for obj in tqdm(s3_bucket.objects.filter(Prefix=".index/",RequestPayer='requester')):
        try:
            local = 'equity_index/'+ obj.key[7:]

            with open(local,'wb') as f:
                s3_bucket.download_fileobj(obj.key, f, ExtraArgs={'RequestPayer':'requester'})

            print('Success ', obj.key)
        except:
            print('fail ', obj.key)

After downloading the index files, we use a Jupyter Magic command to run the following line as a shell command. This code pushes the files to an s3 storage locations

In [None]:
%sx aws s3 cp options_index/ s3://s3datalakeinventories/algoseek/us_equity_options_1min_taq1_index/ --recursive

In [None]:
# %sx aws s3 cp equity_index/ s3://s3datalakeinventories/algoseek/us_equity_1min_taq_index/ --recursive

## 5 - Dataset Exploration

Before we play around with the data, we must first download some samples to play with. Here, we download intraday data for January 2022 for all stocks (equity and options). Additionally, we download the full history for 10 stocks to demonstrate fetching a single stock.

In [None]:
start_file = '20220103.csv.gz'
end_file = '20220131.csv.gz'

In [3]:
daily = pd.read_parquet('C:/Users/julia/Documents/us_equity_daily_ohlc.parquet')

In [4]:
daily.head()

Unnamed: 0,TradeDate,SecId,Ticker,Open,High,Low,Close,MarketHoursVolume,CumulativePriceFactor,CumulativeVolumeFactor,AdjustmentFactor,AdjustmentReason
0,20160104,2234,SLF,30.44,30.58,30.0,30.57,454226,1.498942,1.0,,
1,20160104,18665,TNP,7.89,7.85,7.53,7.66,398554,3.226711,2.0,,
2,20160104,18679,ACE,115.02,115.41,112.52,113.73,2818882,1.230675,1.0,,
3,20160104,19690,LOGI,15.02,15.09,14.61,14.83,355490,1.187021,1.0,,
4,20160104,20946,DB,23.2,23.48,23.02,23.49,2554117,1.473137,1.158348,,


In [6]:
daily['TradeDate'] = pd.to_datetime(daily['TradeDate'],format="%Y%m%d")

### 5.2 - Single Stock Full History

In [8]:
aapl = pd.read_csv('C:/Users/julia/Downloads/aapl1min.csv')

In [10]:
aapl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1206563 entries, 0 to 1206562
Data columns (total 62 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   timebarstart                 1206563 non-null  object 
 1   openbartime                  1206563 non-null  object 
 2   openbidprice                 1205302 non-null  float64
 3   openbidsize                  1205302 non-null  float64
 4   openaskprice                 1205302 non-null  float64
 5   openasksize                  1205302 non-null  float64
 6   firsttradetime               1060098 non-null  object 
 7   firsttradeprice              1060098 non-null  float64
 8   firsttradesize               1060098 non-null  float64
 9   highbidtime                  1206561 non-null  object 
 10  highbidprice                 1206561 non-null  float64
 11  highbidsize                  1206561 non-null  float64
 12  highasktime                  1206561 non-n

In [13]:
aapl_options = pd.read_csv('datasets/options_trade_and_quotes_1min/AAPL.20200131.csv')

In [14]:
aapl_options.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55035 entries, 0 to 55034
Data columns (total 60 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Date                  55035 non-null  int64  
 1   TimeBarStart          55035 non-null  object 
 2   Ticker                55035 non-null  object 
 3   CallPut               55035 non-null  object 
 4   Strike                55035 non-null  float64
 5   ExpirationDate        55035 non-null  int64  
 6   OpenBidTime           49713 non-null  object 
 7   OpenBidPrice          49713 non-null  float64
 8   OpenBidSize           49713 non-null  float64
 9   OpenAskTime           54940 non-null  object 
 10  OpenAskPrice          54940 non-null  float64
 11  OpenAskSize           54940 non-null  float64
 12  OpenTradeTime         17034 non-null  object 
 13  OpenTradePrice        17034 non-null  float64
 14  OpenTradeSize         17034 non-null  float64
 15  HighBidTime        

In [None]:
import panel as pn
import pandas as pd
import plotly.graph_objects as go
import param

from bokeh.sampledata import stocks

pn.extension('plotly', template='fast-list')

tickers=['AAPL']
title = '## Stock Explorer Plotly'

ticker = pn.widgets.Select(name='Ticker', options=tickers)
window = pn.widgets.IntSlider(name='Window Size', value=6, start=1, end=21)

def get_df(ticker, window_size):
    df = aapl
    df['datetime'] = pd.to_datetime(df.datetime)
    return aapl.set_index('datetime').rolling(window=window_size).mean().reset_index()

def get_plot(ticker, window_size):
    df = get_df(ticker, window_size)
    return go.Scatter(x=df.datetime, y=df.lasttradeprice)

pn.Row(
    pn.Column(title, ticker, window),
    pn.bind(get_plot, ticker, window),
    sizing_mode='stretch_width'
)

class StockExplorer(param.Parameterized):

    ticker = param.Selector(default='AAPL', objects=tickers)

    window_size = param.Integer(default=6, bounds=(1, 21))

    @param.depends('ticker', 'window_size')
    def plot(self):
        return get_plot(self.ticker, self.window_size)

explorer = StockExplorer()

pn.Row(
    pn.Column(explorer.param),
    explorer.plot
)
ticker = pn.widgets.Select(
    name='Ticker', options=['AAPL', 'FB', 'GOOG', 'IBM', 'MSFT']
)
window = pn.widgets.IntSlider(
    name='Window', value=6, start=1, end=21
)

row = pn.Row(
    pn.Column(title, ticker, window, sizing_mode="fixed", width=300),
    get_plot(ticker.options[0], window.value)
)

def update(event):
    row[1].object = get_plot(ticker.value, window.value)

ticker.param.watch(update, 'value')
window.param.watch(update, 'value')

row

In [None]:
ticker.servable(area='sidebar')
window.servable(area='sidebar')

pn.panel("""This example compares **four different implementations of the same app** using

- the quick and easy ``interact`` function,
- more flexible *reactive* functions,
- declarative *Param-based* code, and
- explicit *callbacks*.""").servable()

pn.panel(pn.bind(get_plot, ticker, window)).servable(title='Plotly Stock Explorer');

In [8]:
import plotly.graph_objects as go
df= daily[daily['Ticker']=='AAPL']
df = df.set_index('TradeDate')
fig = go.Figure(go.Candlestick(x=df.index,
  open=df['Open'],
  high=df['High'],
  low=df['Low'],
  close=df['Close']))
fig.show()

### 5.3 - Fundamental data from SEC - Financial Statements, Cash Flows, Balance Sheet, etc;

In [15]:
aapl_company_facts = pd.read_json('datasets/CIK0000320193.json')

In [28]:
aapl_company_facts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, dei to us-gaap
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   cik         2 non-null      int64 
 1   entityName  2 non-null      object
 2   facts       2 non-null      object
dtypes: int64(1), object(2)
memory usage: 64.0+ bytes


In [None]:
aapl_company_facts.iloc['facts']['us-gaap'].keys()

In [26]:
aapl_submissions = pd.read_json(r'datasets/sec_aapl.json')

ValueError: All arrays must be of the same length

In [25]:
aapl_submissions.head()

Unnamed: 0,0
cik,320193
entityType,operating
sic,3571
sicDescription,Electronic Computers
insiderTransactionForOwnerExists,0
