# Introduction to Using AlgoSeek data for Quantitative Finance

## 1 - Library Import

In [1]:
# import boto3
# import awswrangler as wr
import pandas as pd 
import numpy as np  
import seaborn as sns
from tqdm import tqdm 
import os

pd.set_option('display.max_columns',1000)
pd.set_option('display.max_rows',1000)

s3 = boto3.resource('s3')


NameError: name 'boto3' is not defined

## 2 - Datasets Intro

### 2.1 - Overview
We will be looking at the following AlgoSeek Datasets:

- US Equity 1 Minute Trade and Quote (TAQ) 
- US Equity Options 1 Minute TAQ
- US Equity Standard Daily Adjusted OHLC 

Additionally, we will be using fundamentals data straight from EDGAR, which we will cover in a later notebook.

These are stored in S3 and are divided between several different buckets. They are stored in a standard format, allowing us to access data from specified days and equities. For each bucket in all of these datasets, there are index files that list metadata about the contents. We will first download all of these index files to use as reference when interacting with AlgoSeek's data.

We will fetch index files for the last 6 years and store them for future use, then demonstrate two different methodologies for downloading and fetching data:

- Downloading a month's data for all equities
- Downloading the entire 6 years of data for a subset of equities

In later notebooks, we will expand on these examples to show how to download data for stocks with the most volatility, liquidity, etc.

### 2.2 - Dataset Data Dictionaries

In [None]:
US Equity Trade and Quote Minute Bar Buckets


### 2.2 - Fetching the index files

US Equity 1min TAQ 

In [None]:
"""
US Equity Trade and Quote Minute Bar Buckets
---------------------------------------------------------
Bucket: s3://us-equity-1min-taq-yyyy where yyyy is a year (2016 - 2022)

Path Format: us-equity-1min-taq-yyyy/yyyymmdd/s/sss.csv.gz

Description: One csv.gz file per symbol per trading date where yyyymmdd is year, month and day, 
s - a single letter in A-Z range, sss - symbol

Example: s3://us-equity-1min-taq-2022/20220104/I/IBM.csv.gz
"""


In [None]:
us_equity_1min_taq = ['s3://us-equity-1min-taq-2017/.index/',
                      's3://us-equity-1min-taq-2018/.index/',
                      's3://us-equity-1min-taq-2019/.index/',
                      's3://us-equity-1min-taq-2020/.index/',
                      's3://us-equity-1min-taq-2021/.index/',
                      's3://us-equity-1min-taq-2022/.index/'
                      ]

US Equity Daily Standard Adjusted OHLC 

In [None]:
"""US Equity Standard Adjusted Daily OHLC Buckets
---------------------------------------------------------
Bucket: s3://us-equity-daily-ohlc-standard-adjusted-tradedate-yyyy where yyyy is a year 
(2016 - 2022)

Path Format: us-equity-daily-ohlc-standard-adjusted-tradedate-yyyy/yyyymmdd.csv

Description: One csv file per trading day where yyyymmdd is year, month and day

Example: s3://us-equity-daily-ohlc-standard-adjusted-tradedate-2022/20220104.csv
"""


In [None]:
us_equity_daily_ohlc = ['us-equity-daily-ohlc-standard-adjusted-tradedate-2017/.index/',
                      'us-equity-daily-ohlc-standard-adjusted-tradedate-2018/.index/',
                      'us-equity-daily-ohlc-standard-adjusted-tradedate-2019/.index/',
                      'us-equity-daily-ohlc-standard-adjusted-tradedate-2020/.index/',
                      'us-equity-daily-ohlc-standard-adjusted-tradedate-2021/.index/',
                      'us-equity-daily-ohlc-standard-adjusted-tradedate-2022/.index/'
                      ]

US Equity Options 1min TAQ 

In [None]:
"""
US Options Trade and Quote Minute Bar Buckets
---------------------------------------------------------
Bucket: s3://us-options-1min-taq-yyyy where yyyy is a year (2016 - 2022)

Path Format: us-options-1min-taq-yyyy/yyyymmdd/s/sss/sss.expdate.csv.gz

Description: One csv.gz file per ticker, trading day and contract expiration date where yyyymmdd is year, month and day, s - a single letter in A-Z range, sss - symbol, expdate is the contract expiration date in yyyymmdd format

Example: s3://us-options-1min-taq-2022/20220104/S/SPY/SPY.20220107.csv.gz
"""

In [1]:
us_equity_options_1min_taq_index = [
                        's3://us-options-1min-taq-2017/.index/',
                      's3://us-options-1min-taq-2018/.index/',
                      's3://us-options-1min-taq-2019/.index/',
                      's3://us-options-1min-taq-2020/.index/',
                      's3://us-options-1min-taq-2021/.index/',
                      's3://us-options-1min-taq-2022/.index/'
                      ]

We first much fetch the index files for each bucket we want to download from.

In [None]:
years = ['2017','2018','2019','2020','2021','2022']

In [None]:
index2017 = []
index2018 = []
index2019 = []
index2020 = []
index2021 = []
index2022 = []
tradeindex = []

indexfiles = [index2017,index2018,index2019,index2020,index2021,index2022]

files2017 = []
files2018 = []
files2019 = []
files2020 = []
files2021 = []
files2022 = []

files = [files2017,files2018,files2019,files2020,files2021,files2022]

counter=0

for i in years:
    print(i)
    
    bucket_name="us-equity-1min-trades-"+i
    
    bucket = s3.Bucket(bucket_name)
    
    for obj in tqdm(bucket.objects.filter(Prefix=".index/",RequestPayer='requester')):
        print(obj.key)
        indexfiles[counter].append(obj.key)
        files[counter].append('s3://'+bucket_name+'/'+obj.key)

    counter = counter+1


In [None]:
    
counter = 0
for i in tqdm(range(5)):
    
    bucket_name="us-equity-1min-trades-"+years[i]
    bucket = s3.Bucket(bucket_name)
    
    for t in tqdm(range(len(indexfiles[counter]))):
        local = 'trades/'+indexfiles[counter][t]
        
        with open(local, 'wb') as f:
            bucket.download_fileobj(indexfiles[counter][t], f,ExtraArgs={'RequestPayer':'requester'})
        
        # s3.Bucket(bucket_name18).download_file(files2018[253:][i],local,ExtraArgs={'RequestPayer':'requester'})
        
    counter = counter+1


# 3 - Download Daily Data Files

Since the daily dataset is significantly smaller and is used to filter/select stocks for fetching 1 minute Trade and Quote data, we will just go ahead and download the whole thing.

In [None]:
for i in us_equity_daily_ohlc:
    print(i)
    
    bucket_name=i[5:-7]
    
    bucket = s3.Bucket(bucket_name)
    
    for obj in tqdm(bucket.objects.filter(Prefix=".index/",RequestPayer='requester')):
        print(obj.key)
        
        local='daily/'+obj.key
        path, filename = os.path.split(obj.key)

        with open(local, 'wb') as f:
            bucket.download_fileobj(bucket_name+filename, f,ExtraArgs={'RequestPayer':'requester'})


# 4 - Downloading indices for 1 min TAQ Dataset

There are several ways that we will be interacting with the 1 min TAQ Equity and Options Datasets. Because they are too big to simply copy, we will first be downloading the index files that tell us where we can find specific datafiles. That way, we can fetch only the data that we are interested in as we need it.

In [2]:
from pathlib import Path

# setting the local paths for the files that we're downloading

index_path =Path('options_index')
if not index_path.exists():
    index_path.mkdir()

equity_index = Path('equity_index')
if not equity_index.exists():
    equity_index.mkdir()


SyntaxError: invalid syntax (<ipython-input-2-118b9b838aca>, line 4)

In [None]:
# Options
for bucket in us_equity_options_1min_taq_index:

    s3_bucket = s3.Bucket(bucket[5:-8])


    for obj in tqdm(s3_bucket.objects.filter(Prefix=".index/",RequestPayer='requester')):
        try:
            local = 'options_index/'+ obj.key[7:]

            with open(local,'wb') as f:
                s3_bucket.download_fileobj(obj.key, f, ExtraArgs={'RequestPayer':'requester'})

            print('Success ', obj.key)
        except:
            print('fail ', obj.key)

In [None]:
# Equity
for bucket in us_equity_1min_taq:

    s3_bucket = s3.Bucket(bucket[5:-8])

    for obj in tqdm(s3_bucket.objects.filter(Prefix=".index/",RequestPayer='requester')):
        try:
            local = 'equity_index/'+ obj.key[7:]

            with open(local,'wb') as f:
                s3_bucket.download_fileobj(obj.key, f, ExtraArgs={'RequestPayer':'requester'})

            print('Success ', obj.key)
        except:
            print('fail ', obj.key)

After downloading the index files, we use a Jupyter Magic command to run the following line as a shell command. This code pushes the files to an s3 storage locations

In [None]:
%sx aws s3 cp options_index/ s3://s3datalakeinventories/algoseek/us_equity_options_1min_taq1_index/ --recursive

In [None]:
# %sx aws s3 cp equity_index/ s3://s3datalakeinventories/algoseek/us_equity_1min_taq_index/ --recursive

## 5 - Dataset Exploration

Before we play around with the data, we must first download some samples to play with. Here, we download intraday data for January 2022 for all stocks (equity and options). Additionally, we download the full history for 10 stocks to demonstrate fetching a single stock.

In [None]:
start_file = '20220103.csv.gz'
end_file = '20220131.csv.gz'

In [3]:
daily = pd.read_parquet('C:/Users/julia/Documents/us_equity_daily_ohlc.parquet')

In [4]:
daily.head()

Unnamed: 0,TradeDate,SecId,Ticker,Open,High,Low,Close,MarketHoursVolume,CumulativePriceFactor,CumulativeVolumeFactor,AdjustmentFactor,AdjustmentReason
0,20160104,2234,SLF,30.44,30.58,30.0,30.57,454226,1.498942,1.0,,
1,20160104,18665,TNP,7.89,7.85,7.53,7.66,398554,3.226711,2.0,,
2,20160104,18679,ACE,115.02,115.41,112.52,113.73,2818882,1.230675,1.0,,
3,20160104,19690,LOGI,15.02,15.09,14.61,14.83,355490,1.187021,1.0,,
4,20160104,20946,DB,23.2,23.48,23.02,23.49,2554117,1.473137,1.158348,,


In [6]:
daily['TradeDate'] = pd.to_datetime(daily['TradeDate'],format="%Y%m%d")
daily.set_index(['Ticker','TradeDate'],inplace=True)

In [67]:
daily['year'] = (daily.index.get_level_values('TradeDate').year)
daily['month'] = (daily.index.get_level_values('TradeDate').month)
daily['quarter'] = (daily.index.get_level_values('TradeDate').quarter)

Calculate Returns

In [58]:
for i in tqdm(range(21)):
    daily[f'ret{i}day'] = (daily.sort_index().groupby(level='Ticker',group_keys=False).Close.pct_change(i))

100%|██████████| 21/21 [41:40<00:00, 119.06s/it]


In [None]:
# Alternative Implementation

by_tick = daily.sort_index().groupby(level='Ticker')
T = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,12,13,14,15, 21, 42, 63, 126, 252]
for t in T:
    data[f'ret_{t:02}'] = by_ticker.close.pct_change(t)

In [62]:
daily[daily.index.get_level_values('Ticker')=='AAPL'].head(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,SecId,Open,High,Low,Close,MarketHoursVolume,CumulativePriceFactor,CumulativeVolumeFactor,AdjustmentFactor,AdjustmentReason,ret0day,ret1day,ret2day,ret3day,ret4day,ret5day,ret6day,ret7day,ret8day,ret9day,ret10day,ret11day,ret12day,ret13day,ret14day,ret15day,ret16day,ret17day,ret18day,ret19day,ret20day
Ticker,TradeDate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
AAPL,2016-01-04,33449,102.5,105.36,102.0,105.35,65080262,7.517875,7.0,,,0.0,,,,,,,,,,,,,,,,,,,,
AAPL,2016-01-05,33449,105.7,105.85,102.41,102.71,53287854,7.517875,7.0,,,0.0,-0.025059,,,,,,,,,,,,,,,,,,,
AAPL,2016-01-06,33449,100.5,102.37,99.87,100.7,65111040,7.517875,7.0,,,0.0,-0.01957,-0.044139,,,,,,,,,,,,,,,,,,
AAPL,2016-01-07,33449,98.71,100.13,96.43,96.45,75288373,7.517875,7.0,,,0.0,-0.042205,-0.060948,-0.08448,,,,,,,,,,,,,,,,,
AAPL,2016-01-08,33449,98.53,99.1,96.76,96.96,64952530,7.517875,7.0,,,0.0,0.005288,-0.03714,-0.055983,-0.079639,,,,,,,,,,,,,,,,
AAPL,2016-01-11,33449,98.94,99.06,97.34,98.53,46199875,7.517875,7.0,,,0.0,0.016192,0.021566,-0.021549,-0.040697,-0.064737,,,,,,,,,,,,,,,
AAPL,2016-01-12,33449,100.49,100.69,98.84,99.96,45333435,7.517875,7.0,,,0.0,0.014513,0.030941,0.036392,-0.007349,-0.026774,-0.051163,,,,,,,,,,,,,,
AAPL,2016-01-13,33449,100.57,101.19,97.3,97.39,58176855,7.517875,7.0,,,0.0,-0.02571,-0.01157,0.004435,0.009746,-0.03287,-0.051796,-0.075558,,,,,,,,,,,,,
AAPL,2016-01-14,33449,97.9,100.48,95.74,99.52,56303032,7.517875,7.0,,,0.0,0.021871,-0.004402,0.010048,0.026403,0.03183,-0.011718,-0.031058,-0.055339,,,,,,,,,,,,
AAPL,2016-01-15,33449,95.91,97.71,95.37,97.13,73265669,7.517875,7.0,,,0.0,-0.024015,-0.00267,-0.028311,-0.014209,0.001753,0.00705,-0.035452,-0.054328,-0.078026,,,,,,,,,,,


In [63]:
by_tick = daily.sort_index().groupby(level='Ticker')

Forward Returns

In [64]:
daily['ret_fwd'] = by_tick.ret1day.shift(-1)
daily = daily.dropna(subset=['ret_fwd'])

In [68]:
daily.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SecId,Open,High,Low,Close,MarketHoursVolume,CumulativePriceFactor,CumulativeVolumeFactor,AdjustmentFactor,AdjustmentReason,ret0day,ret1day,ret2day,ret3day,ret4day,ret5day,ret6day,ret7day,ret8day,ret9day,ret10day,ret11day,ret12day,ret13day,ret14day,ret15day,ret16day,ret17day,ret18day,ret19day,ret20day,ret_fwd,year,month,quarter
Ticker,TradeDate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
SLF,2016-01-04,2234,30.44,30.58,30.0,30.57,454226,1.498942,1.0,,,0.0,,,,,,,,,,,,,,,,,,,,,-0.011122,2016,1,1
TNP,2016-01-04,18665,7.89,7.85,7.53,7.66,398554,3.226711,2.0,,,0.0,,,,,,,,,,,,,,,,,,,,,-0.05483,2016,1,1
ACE,2016-01-04,18679,115.02,115.41,112.52,113.73,2818882,1.230675,1.0,,,0.0,,,,,,,,,,,,,,,,,,,,,0.010112,2016,1,1
LOGI,2016-01-04,19690,15.02,15.09,14.61,14.83,355490,1.187021,1.0,,,0.0,,,,,,,,,,,,,,,,,,,,,0.002023,2016,1,1
DB,2016-01-04,20946,23.2,23.48,23.02,23.49,2554117,1.473137,1.158348,,,0.0,,,,,,,,,,,,,,,,,,,,,-0.000426,2016,1,1


### 5.2 - Single Stock Full History

In [8]:
aapl = pd.read_csv('C:/Users/julia/Downloads/aapl1min.csv')

In [10]:
aapl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1206563 entries, 0 to 1206562
Data columns (total 62 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   timebarstart                 1206563 non-null  object 
 1   openbartime                  1206563 non-null  object 
 2   openbidprice                 1205302 non-null  float64
 3   openbidsize                  1205302 non-null  float64
 4   openaskprice                 1205302 non-null  float64
 5   openasksize                  1205302 non-null  float64
 6   firsttradetime               1060098 non-null  object 
 7   firsttradeprice              1060098 non-null  float64
 8   firsttradesize               1060098 non-null  float64
 9   highbidtime                  1206561 non-null  object 
 10  highbidprice                 1206561 non-null  float64
 11  highbidsize                  1206561 non-null  float64
 12  highasktime                  1206561 non-n

In [13]:
aapl_options = pd.read_csv('datasets/options_trade_and_quotes_1min/AAPL.20200131.csv')

In [14]:
aapl_options.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55035 entries, 0 to 55034
Data columns (total 60 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Date                  55035 non-null  int64  
 1   TimeBarStart          55035 non-null  object 
 2   Ticker                55035 non-null  object 
 3   CallPut               55035 non-null  object 
 4   Strike                55035 non-null  float64
 5   ExpirationDate        55035 non-null  int64  
 6   OpenBidTime           49713 non-null  object 
 7   OpenBidPrice          49713 non-null  float64
 8   OpenBidSize           49713 non-null  float64
 9   OpenAskTime           54940 non-null  object 
 10  OpenAskPrice          54940 non-null  float64
 11  OpenAskSize           54940 non-null  float64
 12  OpenTradeTime         17034 non-null  object 
 13  OpenTradePrice        17034 non-null  float64
 14  OpenTradeSize         17034 non-null  float64
 15  HighBidTime        

In [36]:
import pandas_ta as ta

In [50]:
import plotly.graph_objects as go
import plotly
from plotly.subplots import make_subplots

df= daily[daily['Ticker']=='AAPL']
df = df.set_index('TradeDate')

df['ma5'] = df['Close'].rolling(window=5).mean()
df['ma20'] = df['Close'].rolling(window=20).mean()

macd = ta.macd(df['Close'],fast=26, slow=12, signal=9)
print(macd.info())
stoch = ta.stoch(df['High'],df['Low'],df['Close'],k=13,smooth_k=3)
print(stoch)

fig = go.Figure()
fig = make_subplots(rows=4, cols=1, shared_xaxes=True,
                vertical_spacing=0.01,
                row_heights=[0.5,0.1,0.2,0.2])

fig.add_trace(go.Candlestick(x=df.index,
  open=df['Open'],
  high=df['High'],
  low=df['Low'],
  close=df['Close']))

fig.add_trace(go.Scatter(x=df.index,
                         y=df['ma5'],
                         opacity=0.7,
                         line=dict(color='blue', width=2),
                         name='MA 5'))

fig.add_trace(go.Scatter(x=df.index,
                         y=df['ma20'],
                         opacity=0.7,
                         line=dict(color='orange', width=2),
                         name='MA 20'))

# Plot volume trace on 2nd row

colors = ['green' if row['Open'] - row['Close'] >= 0
          else 'red' for index, row in df.iterrows()]
fig.add_trace(go.Bar(x=df.index,
                     y=df['MarketHoursVolume'],
                     marker_color=colors
                    ), row=2, col=1)

# Plot MACD trace on 3rd row
colorsM = ['green' if val >= 0
          else 'red' for val in macd['MACDh_12_26_9']]

fig.add_trace(go.Bar(x=df.index,
                     y=macd['MACDh_12_26_9'],
                     marker_color=colorsM
                    ), row=3, col=1)
fig.add_trace(go.Scatter(x=df.index,
                         y=macd['MACD_12_26_9'],
                         line=dict(color='black', width=2)
                        ), row=3, col=1)
fig.add_trace(go.Scatter(x=df.index,
                         y=macd['MACDs_12_26_9'],
                         line=dict(color='blue', width=1)
                        ), row=3, col=1)

# Plot stochastics trace on 4th row

fig.add_trace(go.Scatter(x=df.index,
                         y=stoch['STOCHd_13_3_3'],
                         line=dict(color='black', width=2)
                        ), row=4, col=1)
fig.add_trace(go.Scatter(x=df.index,
                         y=stoch['STOCHk_13_3_3'],
                         line=dict(color='blue', width=1)
                        ), row=4, col=1)
# update layout by changing the plot size, hiding legends & rangeslider, and removing gaps between dates
fig.update_layout(height=900, width=1200,
                  showlegend=False,
                  xaxis_rangeslider_visible=False)
#  Make the title dynamic to reflect whichever stock we are analyzing
fig.update_layout(
    title= str('AAPL')+' Live Share Price:',
    yaxis_title='Stock Price (USD per Shares)')

# update y-axis label
fig.update_yaxes(title_text="Price", row=1, col=1)
fig.update_yaxes(title_text="Volume", row=2, col=1)
fig.update_yaxes(title_text="MACD", showgrid=False, row=3, col=1)
fig.update_yaxes(title_text="Stoch", row=4, col=1)

fig.update_xaxes(
    rangeslider_visible=False,
    rangeselector_visible=False,
    rangeselector=dict(
        buttons=list([
            dict(count=15, label="15m", step="minute", stepmode="backward"),
            dict(count=45, label="45m", step="minute", stepmode="backward"),
            dict(count=1, label="HTD", step="hour", stepmode="todate"),
            dict(count=3, label="3h", step="hour", stepmode="backward"),
            dict(step="all")
        ])
    )
)
fig.show()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1577 entries, 2016-01-04 to 2022-04-06
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MACD_12_26_9   1544 non-null   float64
 1   MACDh_12_26_9  1544 non-null   float64
 2   MACDs_12_26_9  1544 non-null   float64
dtypes: float64(3)
memory usage: 49.3 KB
None
            STOCHk_13_3_3  STOCHd_13_3_3
TradeDate                               
2016-01-21            NaN            NaN
2016-01-22            NaN            NaN
2016-01-25      51.597580            NaN
2016-01-26      70.878029            NaN
2016-01-27      49.750156      57.408588
...                   ...            ...
2022-03-31      92.162054      96.296296
2022-04-01      85.195185      91.684041
2022-04-04      85.499236      87.618825
2022-04-05      83.560502      84.751641
2022-04-06      74.953115      81.337618

[1565 rows x 2 columns]


In [49]:
stoch

Unnamed: 0_level_0,STOCHk_13_3_3,STOCHd_13_3_3
TradeDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-21,,
2016-01-22,,
2016-01-25,51.597580,
2016-01-26,70.878029,
2016-01-27,49.750156,57.408588
...,...,...
2022-03-31,92.162054,96.296296
2022-04-01,85.195185,91.684041
2022-04-04,85.499236,87.618825
2022-04-05,83.560502,84.751641


### 5.3 - Fundamental data from SEC - Financial Statements, Cash Flows, Balance Sheet, etc;

In [10]:
aapl_company_facts = pd.read_json('datasets/CIK0000320193.json')

In [11]:
aapl_company_facts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, dei to us-gaap
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   cik         2 non-null      int64 
 1   entityName  2 non-null      object
 2   facts       2 non-null      object
dtypes: int64(1), object(2)
memory usage: 64.0+ bytes


In [17]:
aapl_company_facts.loc['us-gaap']

cik                                                      320193
entityName                                           Apple Inc.
facts         {'AccountsPayable': {'label': 'Accounts Payabl...
Name: us-gaap, dtype: object

In [25]:
aapl_company_facts.loc['us-gaap']['facts'].keys()

dict_keys(['AccountsPayable', 'AccountsPayableCurrent', 'AccountsReceivableNetCurrent', 'AccruedIncomeTaxesCurrent', 'AccruedIncomeTaxesNoncurrent', 'AccruedLiabilities', 'AccruedLiabilitiesCurrent', 'AccruedMarketingCostsCurrent', 'AccumulatedDepreciationDepletionAndAmortizationPropertyPlantAndEquipment', 'AccumulatedOtherComprehensiveIncomeLossAvailableForSaleSecuritiesAdjustmentNetOfTax', 'AccumulatedOtherComprehensiveIncomeLossCumulativeChangesInNetGainLossFromCashFlowHedgesEffectNetOfTax', 'AccumulatedOtherComprehensiveIncomeLossForeignCurrencyTranslationAdjustmentNetOfTax', 'AccumulatedOtherComprehensiveIncomeLossNetOfTax', 'AdjustmentsToAdditionalPaidInCapitalSharebasedCompensationRequisiteServicePeriodRecognitionValue', 'AdjustmentsToAdditionalPaidInCapitalTaxEffectFromShareBasedCompensation', 'AdvertisingExpense', 'AllocatedShareBasedCompensationExpense', 'AllowanceForDoubtfulAccountsReceivableCurrent', 'AmortizationOfIntangibleAssets', 'AntidilutiveSecuritiesExcludedFromCompu

In [23]:
aapl_company_facts.loc['us-gaap']['facts']['EarningsPerShareDiluted']['units']

{'USD/shares': [{'start': '2006-10-01',
   'end': '2007-09-29',
   'val': 3.93,
   'accn': '0001193125-09-214859',
   'fy': 2009,
   'fp': 'FY',
   'form': '10-K',
   'filed': '2009-10-27'},
  {'start': '2006-10-01',
   'end': '2007-09-29',
   'val': 3.93,
   'accn': '0001193125-10-012091',
   'fy': 2009,
   'fp': 'FY',
   'form': '10-K/A',
   'filed': '2010-01-25',
   'frame': 'CY2007'},
  {'start': '2007-09-30',
   'end': '2008-06-28',
   'val': 4.1,
   'accn': '0001193125-09-153165',
   'fy': 2009,
   'fp': 'Q3',
   'form': '10-Q',
   'filed': '2009-07-22'},
  {'start': '2008-03-30',
   'end': '2008-06-28',
   'val': 1.19,
   'accn': '0001193125-09-153165',
   'fy': 2009,
   'fp': 'Q3',
   'form': '10-Q',
   'filed': '2009-07-22',
   'frame': 'CY2008Q2'},
  {'start': '2007-09-30',
   'end': '2008-09-27',
   'val': 5.36,
   'accn': '0001193125-09-214859',
   'fy': 2009,
   'fp': 'FY',
   'form': '10-K',
   'filed': '2009-10-27'},
  {'start': '2007-09-30',
   'end': '2008-09-27',
   '

In [24]:
aapl_eps = pd.json_normalize(aapl_company_facts.loc['us-gaap']['facts']['EarningsPerShareDiluted']['units']['USD/shares'])

Unnamed: 0,start,end,val,accn,fy,fp,form,filed,frame
0,2006-10-01,2007-09-29,3.93,0001193125-09-214859,2009,FY,10-K,2009-10-27,
1,2006-10-01,2007-09-29,3.93,0001193125-10-012091,2009,FY,10-K/A,2010-01-25,CY2007
2,2007-09-30,2008-06-28,4.1,0001193125-09-153165,2009,Q3,10-Q,2009-07-22,
3,2008-03-30,2008-06-28,1.19,0001193125-09-153165,2009,Q3,10-Q,2009-07-22,CY2008Q2
4,2007-09-30,2008-09-27,5.36,0001193125-09-214859,2009,FY,10-K,2009-10-27,
5,2007-09-30,2008-09-27,6.78,0001193125-10-012091,2009,FY,10-K/A,2010-01-25,
6,2007-09-30,2008-09-27,6.78,0001193125-10-238044,2010,FY,10-K,2010-10-27,CY2008
7,2008-09-28,2008-12-27,2.5,0001193125-10-012085,2010,Q1,10-Q,2010-01-25,
8,2008-09-28,2008-12-27,2.5,0001193125-10-238044,2010,FY,10-K,2010-10-27,CY2008Q4
9,2008-09-28,2009-03-28,4.29,0001193125-10-088957,2010,Q2,10-Q,2010-04-21,


In [29]:
aapl_assets = pd.json_normalize(aapl_company_facts.loc['us-gaap']['facts']['Assets']['units']['USD'])
aapl_profit = pd.json_normalize(aapl_company_facts.loc['us-gaap']['facts']['GrossProfit']['units']['USD'])
aapl_op_exp = pd.json_normalize(aapl_company_facts.loc['us-gaap']['facts']['OperatingExpenses']['units']['USD'])
aapl_long_debt = pd.json_normalize(aapl_company_facts.loc['us-gaap']['facts']['LongTermDebt']['units']['USD'])
aapl_rev = pd.json_normalize(aapl_company_facts.loc['us-gaap']['facts']['Revenues']['units']['USD'])


In [31]:
aapl_assets.sort_values('filed')

Unnamed: 0,end,val,accn,fy,fp,form,filed,frame
0,2008-09-27,39572000000,0001193125-09-153165,2009,Q3,10-Q,2009-07-22,
4,2009-06-27,48140000000,0001193125-09-153165,2009,Q3,10-Q,2009-07-22,CY2009Q2I
1,2008-09-27,39572000000,0001193125-09-214859,2009,FY,10-K,2009-10-27,
5,2009-09-26,53851000000,0001193125-09-214859,2009,FY,10-K,2009-10-27,
2,2008-09-27,36171000000,0001193125-10-012091,2009,FY,10-K/A,2010-01-25,
6,2009-09-26,47501000000,0001193125-10-012085,2010,Q1,10-Q,2010-01-25,
7,2009-09-26,47501000000,0001193125-10-012091,2009,FY,10-K/A,2010-01-25,
12,2009-12-26,53926000000,0001193125-10-012085,2010,Q1,10-Q,2010-01-25,CY2009Q4I
8,2009-09-26,47501000000,0001193125-10-088957,2010,Q2,10-Q,2010-04-21,
13,2010-03-27,57057000000,0001193125-10-088957,2010,Q2,10-Q,2010-04-21,CY2010Q1I


In [69]:
aapl_profit.sort_values('filed').head(50)

Unnamed: 0,start,end,val,accn,fy,fp,form,filed,frame
2,2007-09-30,2008-06-28,8406000000,0001193125-09-153165,2009,Q3,10-Q,2009-07-22,
3,2008-03-30,2008-06-28,2600000000,0001193125-09-153165,2009,Q3,10-Q,2009-07-22,CY2008Q2
14,2009-03-29,2009-06-27,3023000000,0001193125-09-153165,2009,Q3,10-Q,2009-07-22,
12,2008-09-28,2009-06-27,9526000000,0001193125-09-153165,2009,Q3,10-Q,2009-07-22,
0,2006-10-01,2007-09-29,8154000000,0001193125-09-214859,2009,FY,10-K,2009-10-27,
4,2007-09-30,2008-09-27,11145000000,0001193125-09-214859,2009,FY,10-K,2009-10-27,
17,2008-09-28,2009-09-26,13140000000,0001193125-09-214859,2009,FY,10-K,2009-10-27,
18,2008-09-28,2009-09-26,17222000000,0001193125-10-012091,2009,FY,10-K/A,2010-01-25,
22,2009-09-27,2009-12-26,6411000000,0001193125-10-012085,2010,Q1,10-Q,2010-01-25,
7,2008-09-28,2008-12-27,4507000000,0001193125-10-012085,2010,Q1,10-Q,2010-01-25,


In [33]:
aapl_op_exp.sort_values('filed')

Unnamed: 0,start,end,val,accn,fy,fp,form,filed,frame
2,2007-09-30,2008-06-28,3573000000,0001193125-09-153165,2009,Q3,10-Q,2009-07-22,
3,2008-03-30,2008-06-28,1208000000,0001193125-09-153165,2009,Q3,10-Q,2009-07-22,CY2008Q2
12,2009-03-29,2009-06-27,1351000000,0001193125-09-153165,2009,Q3,10-Q,2009-07-22,
10,2008-09-28,2009-06-27,4061000000,0001193125-09-153165,2009,Q3,10-Q,2009-07-22,
0,2006-10-01,2007-09-29,3745000000,0001193125-09-214859,2009,FY,10-K,2009-10-27,
14,2008-09-28,2009-09-26,5482000000,0001193125-09-214859,2009,FY,10-K,2009-10-27,
4,2007-09-30,2008-09-27,4870000000,0001193125-09-214859,2009,FY,10-K,2009-10-27,
15,2008-09-28,2009-09-26,5482000000,0001193125-10-012091,2009,FY,10-K/A,2010-01-25,
18,2009-09-27,2009-12-26,1686000000,0001193125-10-012085,2010,Q1,10-Q,2010-01-25,
5,2007-09-30,2008-09-27,4870000000,0001193125-10-012091,2009,FY,10-K/A,2010-01-25,


In [34]:
aapl_long_debt.sort_values('filed')

Unnamed: 0,end,val,accn,fy,fp,form,filed,frame
0,2012-09-29,0,0001193125-13-300670,2013,Q3,10-Q,2013-07-24,
2,2013-06-29,16958000000,0001193125-13-300670,2013,Q3,10-Q,2013-07-24,CY2013Q2I
1,2012-09-29,0,0001193125-13-416534,2013,FY,10-K,2013-10-30,CY2012Q3I
3,2013-09-28,16960000000,0001193125-13-416534,2013,FY,10-K,2013-10-30,
4,2013-09-28,16960000000,0001193125-14-024487,2014,Q1,10-Q,2014-01-28,
9,2013-12-28,16961000000,0001193125-14-024487,2014,Q1,10-Q,2014-01-28,CY2013Q4I
10,2014-03-29,16962000000,0001193125-14-157311,2014,Q2,10-Q,2014-04-24,CY2014Q1I
5,2013-09-28,16960000000,0001193125-14-157311,2014,Q2,10-Q,2014-04-24,
11,2014-06-28,29030000000,0001193125-14-277160,2014,Q3,10-Q,2014-07-23,CY2014Q2I
6,2013-09-28,16960000000,0001193125-14-277160,2014,Q3,10-Q,2014-07-23,


In [35]:
aapl_rev.sort_values('filed')

Unnamed: 0,start,end,val,accn,fy,fp,form,filed,frame
0,2015-09-27,2016-09-24,215639000000,0000320193-18-000145,2018,FY,10-K,2018-11-05,CY2016
1,2016-09-25,2016-12-31,78351000000,0000320193-18-000145,2018,FY,10-K,2018-11-05,CY2016Q4
2,2017-01-01,2017-04-01,52896000000,0000320193-18-000145,2018,FY,10-K,2018-11-05,CY2017Q1
3,2017-04-02,2017-07-01,45408000000,0000320193-18-000145,2018,FY,10-K,2018-11-05,CY2017Q2
4,2016-09-25,2017-09-30,229234000000,0000320193-18-000145,2018,FY,10-K,2018-11-05,CY2017
5,2017-07-02,2017-09-30,52579000000,0000320193-18-000145,2018,FY,10-K,2018-11-05,CY2017Q3
6,2017-10-01,2017-12-30,88293000000,0000320193-18-000145,2018,FY,10-K,2018-11-05,CY2017Q4
7,2017-12-31,2018-03-31,61137000000,0000320193-18-000145,2018,FY,10-K,2018-11-05,CY2018Q1
8,2018-04-01,2018-06-30,53265000000,0000320193-18-000145,2018,FY,10-K,2018-11-05,CY2018Q2
9,2017-10-01,2018-09-29,265595000000,0000320193-18-000145,2018,FY,10-K,2018-11-05,CY2018


In [26]:
aapl_submissions = pd.read_json(r'datasets/sec_aapl.json')

ValueError: All arrays must be of the same length

In [20]:
import json
with open('datasets/sec_aapl.json') as json_data:
    data = json.load(json_data)
aapl_submissions = pd.DataFrame(data)

ValueError: All arrays must be of the same length

In [21]:
data

{'cik': '320193',
 'entityType': 'operating',
 'sic': '3571',
 'sicDescription': 'Electronic Computers',
 'insiderTransactionForOwnerExists': 0,
 'insiderTransactionForIssuerExists': 1,
 'name': 'Apple Inc.',
 'tickers': ['AAPL'],
 'exchanges': ['Nasdaq'],
 'ein': '942404110',
 'description': '',
 'website': '',
 'investorWebsite': '',
 'category': 'Large accelerated filer',
 'fiscalYearEnd': '0924',
 'stateOfIncorporation': 'CA',
 'stateOfIncorporationDescription': 'CA',
 'addresses': {'mailing': {'street1': 'ONE APPLE PARK WAY',
   'street2': None,
   'city': 'CUPERTINO',
   'stateOrCountry': 'CA',
   'zipCode': '95014',
   'stateOrCountryDescription': 'CA'},
  'business': {'street1': 'ONE APPLE PARK WAY',
   'street2': None,
   'city': 'CUPERTINO',
   'stateOrCountry': 'CA',
   'zipCode': '95014',
   'stateOrCountryDescription': 'CA'}},
 'phone': '(408) 996-1010',
 'flags': '',
 'formerNames': [{'name': 'APPLE INC',
   'from': '2007-01-10T00:00:00.000Z',
   'to': '2019-08-05T00:00:0

In [25]:
aapl_submissions.head()

Unnamed: 0,0
cik,320193
entityType,operating
sic,3571
sicDescription,Electronic Computers
insiderTransactionForOwnerExists,0
