In [28]:
import os
import json
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm
from pprint import pprint
import logging
import boto3

# if a log file already exists, delete it
if os.path.exists('dataset_build.log'):
    os.remove('dataset_build.log')
# set up logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# create a file handler
handler = logging.FileHandler('dataset_build.log')
handler.setLevel(logging.INFO)
# create a logging format
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(handler)

In [69]:
# count the number of files in the prices folder
! ls prices | wc -l

1137


Convert jsons into an easy to use dict

In [29]:
prices = {}
# iterate over all json files in the prices folder
for file in tqdm(os.listdir('prices')):
    # open the json file
    with open(os.path.join('prices', file)) as f:
        daily_stock = json.load(f)
        if 'Time Series (Daily)' not in daily_stock:
            logger.error('No time series data for {}'.format(file))
            continue
        prices[daily_stock['Meta Data']['2. Symbol']] = daily_stock['Time Series (Daily)']

100%|██████████| 846/846 [00:12<00:00, 65.52it/s] 


Create a function that will convert stock data into dataframe with moving averages

In [60]:
def add_indicators(price_df) -> pd.DataFrame:
    """
    Add indicators to the price dataframe
    :param price_df: the price dataframe
    :return: the price dataframe with indicators
    """
    # convert the date column to datetime from format YYYY-MM-DD
    price_df['date'] = pd.to_datetime(price_df['date'])
    # set the date column as the index
    price_df = price_df.set_index('date').sort_index(ascending=True)
    # take a moving average of the adjusted close price
    price_df['30_day_MA'] = price_df['adj_close'].rolling(30).mean()
    price_df['50_day_MA'] = price_df['adj_close'].rolling(30).mean()
    price_df['100_day_MA'] = price_df['adj_close'].rolling(100).mean()
    price_df['200_day_MA'] = price_df['adj_close'].rolling(200).mean()
    # the highs and lows
    price_df['4_week_high'] = price_df['adj_close'].rolling(4*7).max()
    price_df['4_week_low'] = price_df['adj_close'].rolling(4*7).min()
    price_df['10_week_high'] = price_df['adj_close'].rolling(10*7).max()
    price_df['10_week_low'] = price_df['adj_close'].rolling(10*7).min()
    price_df['52_week_high'] = price_df['adj_close'].rolling(52*7).max()
    price_df['52_week_low'] = price_df['adj_close'].rolling(52*7).min()
    # take only the data up to 2 years ago and convert to numeric
    # price_df = price_df[price_df.index > datetime.now() - timedelta(days=365*2)].apply(pd.to_numeric).dropna().sort_index(ascending=False)
    price_df = price_df[price_df.index > datetime.now() - timedelta(days=365*2)].dropna().sort_index(ascending=False)
    return price_df

In [61]:
def convert_dict_to_df(stock: str, daily_prices: dict) -> pd.DataFrame:
    """Convert a dictionary of stock data to a pandas dataframe.
    
    Args:
        stock_dict (dict): A dictionary of stock data.
    
    Returns:
        pd.DataFrame: A dataframe of stock data.
    """
    # create a row for each stock and date
    rows = []
    for date, price in daily_prices.items():
        rows.append([stock, date, price['1. open'], price['2. high'], price['3. low'], price['4. close'], price['5. adjusted close'], price['6. volume'], price['7. dividend amount'], price['8. split coefficient']])
    # create a dataframe from the rows
    price_df = pd.DataFrame(rows, columns=['stock', 'date', 'open', 'high', 'low', 'close', 'adj_close', 'volume', 'dividend', 'split'])
    # convert all columns to numeric except for the date and stock
    price_df[price_df.columns[2:]] = price_df[price_df.columns[2:]].apply(pd.to_numeric, errors='coerce')
    price_df = add_indicators(price_df)
    return price_df

convert_dict_to_df('AAPL', prices['AAPL'])

Unnamed: 0_level_0,stock,open,high,low,close,adj_close,volume,dividend,split,30_day_MA,50_day_MA,100_day_MA,200_day_MA,4_week_high,4_week_low,10_week_high,10_week_low,52_week_high,52_week_low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2023-03-07,AAPL,153.700,154.0299,151.13,151.600,151.600000,56182028,0.0,1.0,149.157596,149.157596,143.038994,146.738979,155.330000,142.782531,155.330000,124.829874,180.456347,124.829874
2023-03-06,AAPL,153.785,156.3000,153.46,153.830,153.830000,87558028,0.0,1.0,148.800776,148.800776,142.906629,146.680982,155.330000,141.644265,155.330000,124.829874,180.456347,124.829874
2023-03-03,AAPL,148.045,151.1100,147.33,151.030,151.030000,70732297,0.0,1.0,148.261787,148.261787,142.766300,146.653689,155.330000,141.644265,155.330000,124.829874,180.456347,124.829874
2023-03-02,AAPL,144.380,146.7100,143.90,145.910,145.910000,52279761,0.0,1.0,147.729597,147.729597,142.650685,146.622005,155.330000,140.895405,155.330000,124.829874,180.456347,124.829874
2023-03-01,AAPL,146.830,147.2285,145.01,145.310,145.310000,55478991,0.0,1.0,147.366076,147.366076,142.639434,146.623725,155.330000,137.660333,155.330000,124.829874,180.456347,124.829874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-03-15,AAPL,121.410,124.0000,120.42,123.990,122.361841,92590555,0.0,1.0,126.293396,126.293396,123.560363,113.351992,135.382944,114.832033,141.068654,114.832033,141.068654,53.772436
2021-03-12,AAPL,120.400,121.1700,119.16,121.030,119.440710,88105050,0.0,1.0,126.549077,126.549077,123.477631,113.130787,135.382944,114.832033,141.068654,114.832033,141.068654,53.319503
2021-03-11,AAPL,122.540,123.2100,121.26,121.960,120.358497,103026514,0.0,1.0,127.070631,127.070631,123.454015,112.924016,135.382944,114.832033,141.068654,114.832033,141.068654,53.319503
2021-03-10,AAPL,121.690,122.1700,119.45,119.980,118.404498,111943326,0.0,1.0,127.724839,127.724839,123.437845,112.710963,135.382944,114.832033,141.068654,114.832033,141.068654,53.319503


Create a combined dataframe

In [62]:
# iterate all the stocks and convert the data to a dataframe
stock_dfs = []
for stock, daily_prices in tqdm(prices.items()):
    stock_dfs.append(convert_dict_to_df(stock, daily_prices))
# concatenate all the dataframes into one
stocks_df = pd.concat(stock_dfs)

100%|██████████| 841/841 [00:29<00:00, 28.89it/s]


In [63]:
stocks_df

Unnamed: 0_level_0,stock,open,high,low,close,adj_close,volume,dividend,split,30_day_MA,50_day_MA,100_day_MA,200_day_MA,4_week_high,4_week_low,10_week_high,10_week_low,52_week_high,52_week_low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2023-03-07,AB,38.48,38.6500,37.6500,37.66,37.660000,173429,0.000,1.0,38.732735,38.732735,37.175538,38.518286,41.216558,36.822615,41.216558,32.566291,51.158164,31.214417
2023-03-06,AB,39.17,39.2500,38.3300,38.61,38.610000,244517,0.000,1.0,38.725137,38.725137,37.120260,38.515428,41.216558,36.822615,41.216558,32.566291,51.158164,31.214417
2023-03-03,AB,38.25,39.1500,38.2200,39.11,39.110000,309329,0.000,1.0,38.662609,38.662609,37.064080,38.510387,41.216558,36.822615,41.216558,32.566291,51.158164,31.214417
2023-03-02,AB,38.52,38.6900,37.8800,38.01,38.010000,232515,0.000,1.0,38.554252,38.554252,37.013237,38.494718,41.216558,36.822615,41.216558,32.566291,51.158164,31.214417
2023-03-01,AB,38.70,39.2500,38.3424,38.94,38.940000,221900,0.000,1.0,38.535970,38.535970,36.976873,38.486260,41.216558,36.734147,41.216558,32.566291,51.158164,31.214417
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-03-15,ROST,122.62,126.1200,122.2800,125.27,122.670140,2647075,0.285,1.0,116.445526,116.445526,109.878172,99.680297,122.670140,108.498420,122.670140,105.411026,122.670140,58.592078
2021-03-12,ROST,121.26,122.9000,120.2500,122.62,119.802577,2971697,0.000,1.0,115.980951,115.980951,109.540171,99.547007,122.108352,108.498420,122.108352,105.411026,122.108352,58.592078
2021-03-11,ROST,121.93,123.6900,120.6950,120.87,118.092787,2424957,0.000,1.0,115.693381,115.693381,109.271489,99.419359,122.108352,108.498420,122.108352,105.049528,122.108352,58.592078
2021-03-10,ROST,125.34,126.2135,121.3800,121.57,118.776703,3805940,0.000,1.0,115.270656,115.270656,109.030360,99.291027,122.108352,108.498420,122.108352,105.049528,122.108352,58.592078


In [66]:
# use boto3 to write the dataframe to dynamodb
dynamodb = boto3.resource('dynamodb', region_name='us-east-2')
table = dynamodb.Table('StockPrices')
# iterate over all the rows in the dataframe
for index, row in tqdm(stocks_df.iterrows()):
    # convert the index to a string
    date = str(index)
    # create a dictionary of the row data and convert all the values to strings
    row_dict = {key: str(value) for key, value in row.to_dict().items()}
    # add the date to the dictionary
    row_dict['Date'] = date
    # change the stock name to ticker
    row_dict['Stock'] = row_dict.pop('stock')
    # write the data to dynamodb
    table.put_item(Item=row_dict)

171it [00:31,  5.46it/s]
 32%|███▏      | 267/841 [35:32<1:16:24,  7.99s/it]


KeyboardInterrupt: 