In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import boto3
from io import StringIO, BytesIO
import os
from dotenv import load_dotenv, find_dotenv
from etl.polygon.source_polygon import SourcePolygonConnector
from etl.etl_transformations.etl import ETL

{'AAPL': <generator object BaseClient._paginate_iter at 0x123a3f820>,
 'TSLA': <generator object BaseClient._paginate_iter at 0x123a3fdd0>}

In [2]:
Polygon = SourcePolygonConnector('polygon_api_key')

In [3]:
stocks = Polygon.get_stocks('2024-06-05', ['AAPL', 'TSLA'])

In [4]:
stocks

Unnamed: 0,open,high,low,close,volume,vwap,timestamp,transactions,otc,ticker
0,194.99,195.2,194.58,194.83,118043.0,194.8244,1717574400000,2422,,AAPL
1,194.79,196.9,194.6678,196.02,21936159.0,195.676,1717588800000,376029,,AAPL
2,196.01,196.7,195.59,195.9,22511239.0,196.0724,1717603200000,261315,,AAPL
3,195.87,195.87,195.34,195.53,2219173.0,195.8006,1717617600000,5918,,AAPL
4,195.65,196.04,195.63,195.67,124816.0,195.8682,1717660800000,2708,,AAPL
5,195.9,196.5,195.18,195.53,15596150.0,195.8139,1717675200000,316837,,AAPL
6,195.535,196.07,194.17,194.48,17949307.0,195.2266,1717689600000,221527,,AAPL
7,194.48,194.76,193.94,194.59,970136.0,194.425,1717704000000,5776,,AAPL
8,195.02,195.12,194.7,195.12,79842.0,194.9218,1717747200000,1617,,AAPL
9,195.0,195.97,193.45,195.85,20941938.0,195.1895,1717761600000,218534,,AAPL


In [20]:
stocks
#  dict[str:iter[tuple]]

{'AAPL': <generator object BaseClient._paginate_iter at 0x123a3f820>,
 'TSLA': <generator object BaseClient._paginate_iter at 0x123a3fdd0>}

In [21]:
next(aapl_object)

Agg(open=195.87, high=195.87, low=195.34, close=195.53, volume=2219173.0, vwap=195.8006, timestamp=1717617600000, transactions=5918, otc=None)

## Getting/Updating/Exporting Meta File

In [4]:
def return_date_list(bucket, arg_date, src_format, meta_key):
    min_date = datetime.strptime(arg_date, src_format).date() - timedelta(days=1)
    today = datetime.today().date()
    try:
        df_meta = read_csv_to_df(bucket, meta_key)
        dates = [(min_date + timedelta(days=x)) for x in range(0, (today-min_date).days + 1)]
        src_dates = set(pd.to_datetime(df_meta['source_date']).dt.date)
        dates_missing = set(dates[1:]) - src_dates
        if dates_missing:
            min_date = min(set(dates[1:]) - src_dates) - timedelta(days=1)
            return_dates = [date.strftime(src_format) for date in dates if date >= min_date]
            return_min_date = arg_date
        else:
            return_dates = []
            return_min_date = datetime(2200, 1, 1).date()
    except bucket.session.client('s3').execptions.NoSuchKey:
        return_dates = [(min_date + timedelta(days=x)).strftime(src_format) for x in range(0, (today-min_date).days + 1)]
        return_min_date = (min_date + timedelta(days=1)).strftime(src_format)
    return return_min_date, return_dates

In [5]:
def read_csv_to_df(bucket, key, decoding = 'utf-8', sep = ','):
    csv_obj = bucket.Object(key=key).get().get('Body').read().decode(decoding)
    data = StringIO(csv_obj)
    df = pd.read_csv(data, delimiter=sep)
    return df

In [6]:
def df_to_s3_csv(df, bucket, key):
    out_buffer = StringIO()
    df.to_csv(out_buffer, index=False)
    bucket.put_object(Body=out_buffer.getvalue(), Key=key)
    return True

In [7]:
def update_meta_file(bucket, meta_key, date_list):
    df_new = pd.DataFrame(columns=['source_date', 'datetime_of_processing'])
    df_new['source_date'] = date_list
    df_new['datetime_of_processing'] = datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    df_old = read_csv_to_df(bucket, meta_key)
    df_all = pd.concat([df_old, df_new])
    df_to_s3_csv(df_all, bucket, meta_key)
    return True

## Extract

In [8]:
def get_stocks(tickers, start_date, end_date, client, timespan = 'hour'):
    stock_objects = {}
    for stock in tickers:
        stock_objects[stock] = client.list_aggs(ticker=stock, multiplier=4, timespan=timespan,
                                                        from_=start_date, to=end_date, limit=50000)
    return stock_objects

In [9]:
stock = get_stocks('*', '2024-05-01', '2024-05-03', )

TypeError: get_stocks() missing 1 required positional argument: 'client'

In [7]:
def dict_to_df(dict):
    dataframes = []
    for ticker, data in dict.items():
        df = pd.DataFrame(data).assign(ticker=ticker)
        dataframes.append(df)
    return pd.concat(dataframes, ignore_index=True)

In [8]:
def extract(tickers, client, date_list):
    if date_list:
        dictStocks = get_stocks(tickers, date_list[0], date_list[-1], client)
        df = dict_to_df(dictStocks)
        return df

## Transform

In [9]:
def timestamp_to_datetime(df):
    if 'timestamp' in df.columns:
        df['date_time'] = pd.to_datetime(df['timestamp'], unit='ms')
        return df
    else: 
        print('No timestamp column')

In [10]:
def drop_columns(df, columns):
    col = columns
    return df[col]

In [11]:
def clean_df(df):
    df = df.dropna()
    df = df.drop_duplicates()
    df['date_time'] = pd.to_datetime(df['date_time'])
    df = df.astype({'open': 'float', 'close': 'float', 'high': 'float', 'low': 'float', 'transactions': 'int'})
    return df

In [12]:
def add_col_returns(df):
    df['periodic_return'] = df.groupby('ticker')['close'].pct_change()*100
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(subset=['periodic_return'], inplace=True)
    return df

In [13]:
def make_weekly_aggregated(df):
    df.set_index('date_time', inplace=True)
    weekly_aggregated = df.groupby('ticker').resample('W-MON').agg({
        'open': 'first',
        'close': 'last',
        'high': 'max',
        'low': 'min',
        'transactions': 'sum',
        'periodic_return': 'std'  # This calculates the standard deviation of the 4-hourly returns for weekly volatility
    }).rename(columns={'periodic_return': 'pct_volatility'}).reset_index()
    return weekly_aggregated

In [14]:
def add_col_change(weekly_aggregated):
    weekly_aggregated['weekly_pct_change'] = weekly_aggregated.groupby('ticker')['close'].pct_change()*100
    weekly_aggregated['weekly_pct_change'].fillna(0, inplace=True)
    return weekly_aggregated

In [15]:
def transform(df, columns):
    dt_cng_df = timestamp_to_datetime(df)
    dp_col_df = drop_columns(dt_cng_df, columns)
    cleaned_df = clean_df(dp_col_df)
    rtn_add_df = add_col_returns(cleaned_df)
    weekly_aggregated = make_weekly_aggregated(rtn_add_df)
    return weekly_aggregated
    

## Load

In [16]:
def df_to_s3(weekly_aggregated, bucket_target, key):
    out_buffer = BytesIO()
    weekly_aggregated.to_parquet(out_buffer, index=False)
    bucket_target.put_object(Body=out_buffer.getvalue(), Key=key)
    return True

In [17]:
def load(df, bucket, trg_key, trg_format, meta_key, date_list, src_format):
    key = trg_key + datetime.today().strftime(src_format) + trg_format
    df_to_s3(df, bucket, key)
    update_meta_file(bucket, meta_key, date_list)

In [18]:
def run_etl(tickers, client, trg_bucket, date_list, columns, arg_date, trg_key, src_format, trg_format, meta_key):
    raw_df = extract(tickers, client, date_list)
    if raw_df:
        cleaned_df = transform(raw_df, columns)
        extract_date_list = [date for date in date_list if date >= arg_date]
        load(cleaned_df, trg_bucket, trg_key, trg_format, meta_key, extract_date_list, src_format)

In [19]:
def main():
    tickers = ['AAPL', 'TSLA']
    client = RESTClient(POLYGON_API_KEY)
    arg_date = '2024-04-14'
    src_format = "%Y-%m-%d"
    trg_format = '.parquet'
    trg_key = 'polygon_weekly_report_'
    columns = ['ticker', 'open', 'close', 'low', 'high', 'transactions', 'date_time']
    
    
    # Init
    s3 = boto3.resource('s3')
    meta_key = 'meta_file.csv'
    bucket_target = s3.Bucket('adaptivesharks-test-etl-target')
    
    # Run Application
    extract_date, date_list = return_date_list(bucket_target, arg_date, src_format, meta_key)
    run_etl(tickers, client, bucket_target, date_list, columns, extract_date, trg_key, src_format, trg_format, meta_key)
    

In [20]:
# Run
main()

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
from etl.polygon.source_polygon import 

In [15]:
from collections import namedtuple
import random

# Define the named tuple for stock data
StockData = namedtuple('StockData', ['open', 'high', 'low', 'close', 'volume', 'vwap', 'timestamp', 'transactions', 'otc'])

def generate_stock_data():
    """
    Generate a single instance of stock data with random values.
    """
    open_price = round(random.uniform(100, 200), 2)
    high_price = round(open_price + random.uniform(0, 10), 2)
    low_price = round(open_price - random.uniform(0, 10), 2)
    close_price = round(random.uniform(low_price, high_price), 2)
    volume = random.randint(1000000, 50000000)
    vwap = round((high_price + low_price + close_price) / 3, 4)
    timestamp = random.randint(1500000000, 1600000000) * 1000  # Random UNIX timestamp in milliseconds
    transactions = random.randint(100000, 500000)
    otc = None  # Assuming OTC is None for simplicity

    return StockData(open=open_price, high=high_price, low=low_price, close=close_price, 
                     volume=volume, vwap=vwap, timestamp=timestamp, 
                     transactions=transactions, otc=otc)

def stock_data_generator():
    """
    Generator function to create a sequence of stock data.
    """
    while True:
        yield generate_stock_data()

def get_ticker_stock_data(tickers):
    """
    Given a list of tickers, return a dictionary where the keys are the tickers and the values
    are generators of named tuples representing stock data.
    """
    return {ticker: stock_data_generator() for ticker in tickers}

# Example usage
tickers = ['AAPL', 'GOOG', 'MSFT']
stock_data = get_ticker_stock_data(tickers)

In [17]:
data = stock_data['AAPL']

In [19]:
next(data)

StockData(open=189.58, high=196.49, low=179.97, close=194.72, volume=1418250, vwap=190.3933, timestamp=1591120850000, transactions=324346, otc=None)