In [1]:
# ! pip install pandas numpy dask tomli

In [2]:
import numpy as np
import pandas as pd
import dask.dataframe as dd

import os
import tomli

# Load Config

In [12]:
# Location of the project configuration file, relative to this notebook.
CONFIG_FILE_PATH = "../config.tomli"

# tomli.load expects a binary file object, hence mode 'rb'.
with open(CONFIG_FILE_PATH, mode='rb') as config_file:
    config = tomli.load(config_file)

# The [data] table holds the dataset locations; each one is resolved
# relative to the parent directory of this notebook.
data_section = config['data']
ETF_DATA_DRIVE_PATH = "../{}".format(data_section['etfs'])
STOCK_DATA_DRIVE_PATH = "../{}".format(data_section['stocks'])
SYMBOLS_FILE_PATH = "../{}".format(data_section['symbols'])

# ETFs

In [22]:
# Get the list of ETF files.
# Sort it so that a given index always maps to the same file, independent of
# the arbitrary order in which os.listdir returns directory entries.
etf_files = sorted(os.listdir(ETF_DATA_DRIVE_PATH))
num_etf = len(etf_files)
if num_etf == 0:
    # Fail with a clear message instead of an opaque randint "low >= high" error.
    raise FileNotFoundError(f"No ETF files found in {ETF_DATA_DRIVE_PATH}")

# Load a random ETF file.
# np.random.randint excludes its upper bound, so the bound must be num_etf
# (not num_etf-1); otherwise the last file could never be drawn and a
# directory containing a single file would raise.
random_etf_file_index = np.random.randint(0, num_etf)
etf_file = etf_files[random_etf_file_index]
etf_file_path = f"{ETF_DATA_DRIVE_PATH}/{etf_file}"
random_etf_df = pd.read_csv(etf_file_path, parse_dates=['Date'])

# Report which file was picked (1-based position), then display a sample.
print(f"ETF file name: {etf_file} ({random_etf_file_index+1}/{num_etf})")
random_etf_df.head(3)

ETF file name: IVLU.csv (1052/2165)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2015-07-15,24.799999,24.799999,24.77,24.77,21.80105,200
1,2015-07-16,24.77,24.77,24.77,24.77,21.80105,0
2,2015-07-17,24.77,24.77,24.77,24.77,21.80105,0


In [20]:
# Inspect the column dtypes of the sampled ETF frame
# (Date was requested as a parsed datetime column via parse_dates when loading).
random_etf_df.dtypes

Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Adj Close           float64
Volume                int64
dtype: object

# Stocks

In [21]:
# Get the list of stock files.
# Sort it so that a given index always maps to the same file, independent of
# the arbitrary order in which os.listdir returns directory entries.
stock_files = sorted(os.listdir(STOCK_DATA_DRIVE_PATH))
num_stocks = len(stock_files)
if num_stocks == 0:
    # Fail with a clear message instead of an opaque randint "low >= high" error.
    raise FileNotFoundError(f"No stock files found in {STOCK_DATA_DRIVE_PATH}")

# Load a random stock file.
# np.random.randint excludes its upper bound, so the bound must be num_stocks
# (not num_stocks-1); otherwise the last file could never be drawn and a
# directory containing a single file would raise.
random_stock_file_index = np.random.randint(0, num_stocks)
stock_file = stock_files[random_stock_file_index]
stock_file_path = f"{STOCK_DATA_DRIVE_PATH}/{stock_file}"
random_stock_df = pd.read_csv(stock_file_path, parse_dates=['Date'])

# Report which file was picked (1-based position), then display a sample.
print(f"Stock file name: {stock_file} ({random_stock_file_index+1}/{num_stocks})")
random_stock_df.head(3)

Stock file name: YUMA.csv (5841/5884)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1980-03-17,1380.0,1460.0,1380.0,1380.0,1379.698486,0
1,1980-03-18,1380.0,1420.0,1340.0,1340.0,1339.707153,0
2,1980-03-19,1540.0,1620.0,1540.0,1540.0,1539.663452,0


In [19]:
# Inspect the column dtypes of the sampled stock frame
# (Date was requested as a parsed datetime column via parse_dates when loading).
random_stock_df.dtypes

Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Adj Close           float64
Volume                int64
dtype: object

# Symbols

In [14]:
# Read the symbols metadata file into a DataFrame.
symbols = pd.read_csv(filepath_or_buffer=SYMBOLS_FILE_PATH)

# Display the first three rows as a quick sanity check.
symbols.head(n=3)

Unnamed: 0,Nasdaq Traded,Symbol,Security Name,Listing Exchange,Market Category,ETF,Round Lot Size,Test Issue,Financial Status,CQS Symbol,NASDAQ Symbol,NextShares
0,Y,A,"Agilent Technologies, Inc. Common Stock",N,,N,100.0,N,,A,A,N
1,Y,AA,Alcoa Corporation Common Stock,N,,N,100.0,N,,AA,AA,N
2,Y,AAAU,Perth Mint Physical Gold ETF,P,,Y,100.0,N,,AAAU,AAAU,N


In [15]:
# Inspect the column dtypes of the symbols frame as inferred by read_csv.
symbols.dtypes

Nasdaq Traded        object
Symbol               object
Security Name        object
Listing Exchange     object
Market Category      object
ETF                  object
Round Lot Size      float64
Test Issue           object
Financial Status     object
CQS Symbol           object
NASDAQ Symbol        object
NextShares           object
dtype: object

In [7]:
# random_etf_df.to_parquet("t.parquet")

In [None]:
# ! pip freeze > ../requirements.txt