# Introduction to Using AlgoSeek for Algorithmic Trading

## 1 - Library Import

In [None]:
import boto3
import awswrangler as wr
import pandas as pd 
import numpy as np  
import seaborn as sns
from tqdm import tqdm 
import os

def get_trade_bucket_path(stock: str, year: int, month: int, day: int) -> str:
    """Return the S3 path of the 1-minute trades file for a stock and date.

    The path has the form
    ``s3://us-equity-1min-trades-<year>/<yyyymmdd>/<first letter>/<stock>.csv.gz``.

    Args:
        stock: Ticker symbol, e.g. ``"IBM"``. Used verbatim (no case change).
        year: Calendar year; also selects the per-year bucket.
        month: Calendar month; zero-padded to two digits in the path.
        day: Day of the month; zero-padded to two digits in the path.

    Returns:
        The full ``s3://`` path as a string.

    Raises:
        ValueError: If ``stock`` is empty, or if ``year``/``month``/``day``
            cannot be converted to ``int``.
    """
    if not stock:
        raise ValueError("stock must be a non-empty ticker symbol")

    # Accept ints or numeric strings so that "01" and 1 give the same path.
    year, month, day = int(year), int(month), int(day)

    bucket = "s3://us-equity-1min-trades-" + str(year)
    # Named fields must be filled by keyword; month/day are padded so the
    # date component is always eight digits (yyyymmdd).
    return "{bucket}/{year}{month:02d}{day:02d}/{first_letter}/{stock}.csv.gz".format(
        bucket=bucket,
        year=year,
        month=month,
        day=day,
        first_letter=stock[0],
        stock=stock,
    )

# Raise the pandas display limits to 1000 columns and 1000 rows.
pd.set_option('display.max_columns', 1000, 'display.max_rows', 1000)

# Shared boto3 S3 resource handle used by the cells below.
s3 = boto3.resource('s3')


## 2 - Dataset Intro

We will be looking at the following AlgoSeek Datasets:

- US Equity 1 Minute Trade and Quote (TAQ) 
- US Equity Options 1 Minute TAQ
- US Equity Standard Daily Adjusted OHLC 

These are stored in S3 and are divided between several different buckets. They are stored in a standard format, allowing us to access data from specified days and equities. For each bucket in all of these datasets, there are index files that list metadata about the contents. We will first download all of these index files to use as reference when interacting with AlgoSeek's data.

We will fetch index files for the last 6 years and store them for future use, then demonstrate two different methodologies for downloading and fetching data:

- Downloading a month's data for all equities
- Downloading the entire 6 years of data for a subset of equities

In later notebooks, we will expand on these examples to show how to download data for stocks with the most volatility, liquidity, etc.

### 2.1 - Fetching the index files

US Equity 1min TAQ 

In [None]:
"""
US Equity Trade and Quote Minute Bar Buckets
---------------------------------------------------------
Bucket: s3://us-equity-1min-taq-yyyy where yyyy is a year (2016 - 2022)

Path Format: us-equity-1min-taq-yyyy/yyyymmdd/s/sss.csv.gz

Description: One csv.gz file per symbol per trading date where yyyymmdd is year, month and day, 
s - a single letter in A-Z range, sss - symbol

Example: s3://us-equity-1min-taq-2022/20220104/I/IBM.csv.gz
"""


In [None]:
# Index prefixes of the per-year US Equity 1-minute TAQ buckets, 2017-2022.
us_equity_taq_1min = [
    's3://us-equity-1min-taq-{}/.index/'.format(taq_year)
    for taq_year in range(2017, 2023)
]

US Equity Daily Standard Adjusted OHLC 

In [None]:
"""US Equity Standard Adjusted Daily OHLC Buckets
---------------------------------------------------------
Bucket: s3://us-equity-daily-ohlc-standard-adjusted-tradedate-yyyy where yyyy is a year 
(2016 - 2022)

Path Format: us-equity-daily-ohlc-standard-adjusted-tradedate-yyyy/yyyymmdd.csv

Description: One csv file per trading day where yyyymmdd is year, month and day

Example: s3://us-equity-daily-ohlc-standard-adjusted-tradedate-2022/20220104.csv
"""


In [None]:
# Index prefixes of the per-year daily adjusted OHLC buckets, 2017-2022.
# Entries are "<bucket>/.index/" with no "s3://" scheme in front.
us_equity_daily_ohlc = [
    'us-equity-daily-ohlc-standard-adjusted-tradedate-{}/.index/'.format(ohlc_year)
    for ohlc_year in range(2017, 2023)
]

US Equity Options 1min TAQ 

In [None]:
"""
US Options Trade and Quote Minute Bar Buckets
---------------------------------------------------------
Bucket: s3://us-options-1min-taq-yyyy where yyyy is a year (2016 - 2022)

Path Format: us-options-1min-taq-yyyy/yyyymmdd/s/sss/sss.expdate.csv.gz

Description: One csv.gz file per ticker, trading day and contract expiration date where yyyymmdd is year, month and day, s - a single letter in A-Z range, sss - symbol, expdate is the contract expiration date in yyyymmdd format

Example: s3://us-options-1min-taq-2022/20220104/S/SPY/SPY.20220107.csv.gz
"""

In [None]:
# Index prefixes of the per-year US Options 1-minute TAQ buckets, 2017-2022.
us_options_1min_taq = [
    's3://us-options-1min-taq-{}/.index/'.format(options_year)
    for options_year in range(2017, 2023)
]

We must first fetch the index files for each bucket we want to download from.

In [None]:
# Years covered by this notebook, as strings so they can be appended to bucket names.
years = [str(year_number) for year_number in range(2017, 2023)]

In [None]:
# Per-year lists of index object keys. The individual names (index2017, ...)
# are bound to the very same list objects that `indexfiles` holds, in year order.
indexfiles = [[] for _ in range(6)]
index2017, index2018, index2019, index2020, index2021, index2022 = indexfiles
tradeindex = []

# Per-year lists of full "s3://<bucket>/<key>" URIs, parallel to `indexfiles`.
files = [[] for _ in range(6)]
files2017, files2018, files2019, files2020, files2021, files2022 = files

# List every object under ".index/" in each year's trades bucket and record
# both its key and its full S3 URI in that year's slot.
for counter, i in enumerate(years):
    print(i)

    bucket_name = "us-equity-1min-trades-" + i
    bucket = s3.Bucket(bucket_name)

    # Requester-pays listing of the bucket's index prefix.
    index_listing = bucket.objects.filter(Prefix=".index/", RequestPayer='requester')

    for obj in tqdm(index_listing):
        print(obj.key)
        indexfiles[counter].append(obj.key)
        files[counter].append(f's3://{bucket_name}/{obj.key}')

# The counter ends up equal to the number of years processed.
counter = len(years)


In [None]:
    
# Download every index object listed in `indexfiles` into ./trades/,
# mirroring the S3 key layout locally (e.g. trades/.index/<name>).
# All entries of `years` are processed, so the last year is not skipped.
for year, year_keys in tqdm(zip(years, indexfiles), total=len(years)):

    bucket_name = "us-equity-1min-trades-" + year
    bucket = s3.Bucket(bucket_name)

    for key in tqdm(year_keys):
        # A key ending in "/" names a directory-like placeholder and cannot
        # be opened as a local file; there is nothing to download for it.
        if key.endswith('/'):
            continue

        local = 'trades/' + key

        # open() does not create missing parent directories.
        os.makedirs(os.path.dirname(local), exist_ok=True)

        with open(local, 'wb') as f:
            # The buckets are accessed as requester-pays.
            bucket.download_fileobj(key, f, ExtraArgs={'RequestPayer': 'requester'})


## 3 - Download Daily OHLC Data

Since the daily dataset is significantly smaller and is used to filter/select stocks for fetching 1 minute Trade and Quote data, we will just go ahead and download the whole thing.

In [None]:
# Download the objects under ".index/" of every daily OHLC bucket into
# ./daily/, mirroring the S3 key layout locally.
# NOTE(review): the section text says the whole daily dataset is downloaded,
# but only keys under ".index/" are listed here -- confirm the intended prefix.
for i in us_equity_daily_ohlc:
    print(i)

    # Entries look like "[s3://]<bucket>/.index/"; keep only the bucket name,
    # whether or not the scheme prefix is present.
    bucket_name = i.split('://', 1)[-1].split('/', 1)[0]

    bucket = s3.Bucket(bucket_name)

    for obj in tqdm(bucket.objects.filter(Prefix=".index/", RequestPayer='requester')):
        print(obj.key)

        # A key ending in "/" names a directory-like placeholder and cannot
        # be opened as a local file; there is nothing to download for it.
        if obj.key.endswith('/'):
            continue

        local = 'daily/' + obj.key

        # open() does not create missing parent directories.
        os.makedirs(os.path.dirname(local), exist_ok=True)

        with open(local, 'wb') as f:
            # The object is addressed by its key within the bucket.
            bucket.download_fileobj(obj.key, f, ExtraArgs={'RequestPayer': 'requester'})


## 4 - Downloading 1 Minute TAQ Data

There are multiple different ways to download the