In [1]:
# Uncomment to install the packages this notebook depends on into the kernel's environment:
# %pip install pandas numpy dask tomli pyarrow fastparquet

In [2]:
import numpy as np
import pandas as pd
import dask.dataframe as dd

import os
import tomli

# Load Config

In [3]:
# Path to the project configuration file, relative to the notebook's working directory.
CONFIG_FILE_PATH = "../config.tomli"

# Parse the configuration with tomli; the file is opened in binary mode.
with open(CONFIG_FILE_PATH, 'rb') as config_file:
    config = tomli.load(config_file)

# Data locations are read from the `data` section of the config; each value is
# prefixed with "../" so it resolves one level above the working directory.
ETF_DATA_DRIVE_PATH = f"../{config['data']['etfs']}"
STOCK_DATA_DRIVE_PATH = f"../{config['data']['stocks']}"
PROCESSED_DATA_DRIVE_PATH = f"../{config['data']['processed']}"
SYMBOLS_FILE_PATH = f"../{config['data']['symbols']}"

# dtype mappings handed to pd.read_csv(dtype=...) in the cells below:
# data_dtypes for the ETF price file, symbols_dtype for the symbols file.
data_dtypes = config['etf_stock_data_type']
symbols_dtype = config['symbols_data_types']

# strftime format applied to the parsed 'Date' column.
date_format = config['format']['date_format']

# Number of rows shown by each .head() preview.
SAMPLE_COUNT = 3

# ETFs

In [4]:
# List the ETF files. os.listdir returns entries in arbitrary order, so the
# list is sorted to keep the index -> file mapping stable between runs.
etf_files = sorted(os.listdir(ETF_DATA_DRIVE_PATH))

# Draw one ETF file at random.
# np.random.randint excludes its upper bound, so the bound must be num_etf:
# with num_etf-1 the last file could never be drawn, and a directory holding
# a single file raised ValueError (low >= high).
num_etf = len(etf_files)
random_etf_file_index = np.random.randint(0, num_etf)
etf_file = etf_files[random_etf_file_index]
etf_file_path = f"{ETF_DATA_DRIVE_PATH}/{etf_file}"
random_etf_df = pd.read_csv(etf_file_path, dtype=data_dtypes, parse_dates=['Date'])
# Re-render the parsed dates as strings in the configured format
# (the column is object dtype afterwards).
random_etf_df['Date'] = random_etf_df['Date'].dt.strftime(date_format)

# Report which file was drawn (1-based position) and preview its first rows.
print(f"ETF file name: {etf_file} ({random_etf_file_index+1}/{num_etf})")
random_etf_df.head(SAMPLE_COUNT)

ETF file name: SCHB.csv (1653/2165)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2009-11-03,25.0,25.01,24.24,24.57,20.626394,504300
1,2009-11-04,27.0,27.0,24.5,24.6,20.651585,162400
2,2009-11-05,24.92,25.09,24.73,25.08,21.054543,112400


In [5]:
# Inspect the column dtypes of the sampled ETF frame; 'Date' is object because
# it was converted to formatted strings with strftime after parsing.
random_etf_df.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

# Stocks

In [6]:
# List the stock files. os.listdir returns entries in arbitrary order, so the
# list is sorted to keep the index -> file mapping stable between runs.
stock_files = sorted(os.listdir(STOCK_DATA_DRIVE_PATH))

# Draw one stock file at random.
# np.random.randint excludes its upper bound, so the bound must be num_stocks:
# with num_stocks-1 the last file could never be drawn, and a directory holding
# a single file raised ValueError (low >= high).
num_stocks = len(stock_files)
random_stock_file_index = np.random.randint(0, num_stocks)
stock_file = stock_files[random_stock_file_index]
stock_file_path = f"{STOCK_DATA_DRIVE_PATH}/{stock_file}"
# NOTE(review): unlike the ETF read, no dtype=data_dtypes is passed here, so
# pandas infers the column dtypes - confirm whether that is intentional.
random_stock_df = pd.read_csv(stock_file_path, parse_dates=['Date'])
# Re-render the parsed dates as strings in the configured format
# (the column is object dtype afterwards).
random_stock_df['Date'] = random_stock_df['Date'].dt.strftime(date_format)

# Report which file was drawn (1-based position) and preview its first rows.
print(f"Stock file name: {stock_file} ({random_stock_file_index+1}/{num_stocks})")
random_stock_df.head(SAMPLE_COUNT)

Stock file name: KLXE.csv (2986/5884)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-08-29,25.0,27.09,25.0,27.09,27.09,675700
1,2018-08-30,27.5,28.950001,26.25,28.440001,28.440001,1069800
2,2018-08-31,28.85,28.950001,27.799999,28.559999,28.559999,181700


In [7]:
# Inspect the column dtypes of the sampled stock frame; 'Date' is object because
# it was converted to formatted strings with strftime after parsing.
random_stock_df.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

# Symbols

In [8]:
# Load the symbols table, applying the per-column dtypes taken from the config.
symbols = pd.read_csv(SYMBOLS_FILE_PATH, dtype=symbols_dtype)
# Preview the first SAMPLE_COUNT rows.
symbols.head(SAMPLE_COUNT)

Unnamed: 0,Nasdaq Traded,Symbol,Security Name,Listing Exchange,Market Category,ETF,Round Lot Size,Test Issue,Financial Status,CQS Symbol,NASDAQ Symbol,NextShares
0,Y,A,"Agilent Technologies, Inc. Common Stock",N,,N,100.0,N,,A,A,N
1,Y,AA,Alcoa Corporation Common Stock,N,,N,100.0,N,,AA,AA,N
2,Y,AAAU,Perth Mint Physical Gold ETF,P,,Y,100.0,N,,AAAU,AAAU,N


In [9]:
# Inspect the column dtypes of the symbols table as loaded with the configured dtype mapping.
symbols.dtypes

Nasdaq Traded        object
Symbol               object
Security Name        object
Listing Exchange     object
Market Category      object
ETF                  object
Round Lot Size      float64
Test Issue           object
Financial Status     object
CQS Symbol           object
NASDAQ Symbol        object
NextShares           object
dtype: object

In [10]:
# Try out the Parquet write path using the sampled ETF frame.
# Create the output directory first so the write does not fail when the
# processed-data directory does not exist yet; exist_ok makes re-runs harmless.
os.makedirs(PROCESSED_DATA_DRIVE_PATH, exist_ok=True)
random_etf_df.to_parquet(f"{PROCESSED_DATA_DRIVE_PATH}/test.parquet")

In [11]:
! pip freeze > ../requirements.txt