In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import json
from alpaca_trade_api.rest import REST, TimeFrame
import tables
import os
from datetime import datetime

In [17]:

url = "https://paper-api.alpaca.markets/v2/calendar?start=2024-06-01T00%3A00%3A00Z&end=2024-08-31T23%3A59%3A59Z" ## get trading days from 2024-06-01 to 2024-08-31

key = json.loads(open("key.txt","r").read())
api_key_id = key["APCA-API-KEY-ID"]
api_secret_key = key["APCA-API-SECRET-KEY"]

headers = {
    "accept": "application/json",
    "APCA-API-KEY-ID": api_key_id,
    "APCA-API-SECRET-KEY": api_secret_key
}

api = REST(api_key_id, api_secret_key, base_url='https://data.alpaca.markets/v2')

In [18]:
response = requests.get(url, headers=headers)
df_market_hours = pd.DataFrame(response.json())
df_market_hours["market open"] = df_market_hours["date"] + "T" + df_market_hours["open"] + ":00Z"
df_market_hours["market close"] = df_market_hours["date"] + "T" + df_market_hours["close"] + ":00Z"
df_market_hours = df_market_hours[["date", "market open", "market close"]]
df_market_hours.set_index("date", inplace= True)

In [20]:
df_market_hours.head()

Unnamed: 0_level_0,market open,market close
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-06-03,2024-06-03T09:30:00Z,2024-06-03T16:00:00Z
2024-06-04,2024-06-04T09:30:00Z,2024-06-04T16:00:00Z
2024-06-05,2024-06-05T09:30:00Z,2024-06-05T16:00:00Z
2024-06-06,2024-06-06T09:30:00Z,2024-06-06T16:00:00Z
2024-06-07,2024-06-07T09:30:00Z,2024-06-07T16:00:00Z


In [21]:
symbol_list = ["SMH", "SOXX","NVDA", "TSM", "AVGO", "TXN", "AMD", "ASML", "ADI", "AMAT", "QCOM", "KLAC","INTC"]
folder_name = "data"

# Ensure the folder exists
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

for symbol in symbol_list:
    for i in range(len(df_market_hours)):
        date = df_market_hours.index[i]
               
        market_open = df_market_hours.iloc[i]["market open"]
        market_close = df_market_hours.iloc[i]["market close"]

        # Convert the date string to a datetime object if necessary
        if isinstance(date, str):
            date = datetime.strptime(date, '%Y-%m-%d')  # Adjust format if your date string is different

        # Create a file name for the CSV file using the symbol and date
        csv_filename = os.path.join(folder_name, f"{symbol}_{date.strftime('%Y%m%d')}_quote.csv")

        # Check if the file already exists
        if os.path.exists(csv_filename):
            print(f"Data for {symbol} on {date.strftime('%Y-%m-%d')} already exists. Skipping download.")
            continue  # Skip to the next iteration if the file exists

        print(f"Start downloading quote data for {symbol} on {date.strftime('%Y-%m-%d')}...")
        quotes = api.get_quotes(symbol=symbol, start=market_open, end=market_close, limit=3000000).df  
        print(f"Finished downloading quote data for {symbol} on {date.strftime('%Y-%m-%d')}")

        # Save the quotes DataFrame to the CSV file
        quotes.to_csv(csv_filename, index=True)

        print(f"Saved data to {csv_filename}")
        print("--------------------")

Start downloading quote data for SMH on 2024-06-03...
Finished downloading quote data for SMH on 2024-06-03
Saved data to data\SMH_20240603_quote.csv
--------------------
Start downloading quote data for SMH on 2024-06-04...
Finished downloading quote data for SMH on 2024-06-04
Saved data to data\SMH_20240604_quote.csv
--------------------
Start downloading quote data for SMH on 2024-06-05...
Finished downloading quote data for SMH on 2024-06-05
Saved data to data\SMH_20240605_quote.csv
--------------------
Start downloading quote data for SMH on 2024-06-06...
Finished downloading quote data for SMH on 2024-06-06
Saved data to data\SMH_20240606_quote.csv
--------------------
Start downloading quote data for SMH on 2024-06-07...
Finished downloading quote data for SMH on 2024-06-07
Saved data to data\SMH_20240607_quote.csv
--------------------
Start downloading quote data for SMH on 2024-06-10...
Finished downloading quote data for SMH on 2024-06-10
Saved data to data\SMH_20240610_quote

In [23]:
# Specify the folder where the CSV files are located
folder_name = "data"

# Get the list of all files in the folder
all_files = os.listdir(folder_name)

# Filter the list to include only .csv files
csv_files = [f for f in all_files if f.endswith('.csv')]

# Print the list of .csv files
print(csv_files)

['ADI_20240603_quote.csv', 'ADI_20240604_quote.csv', 'ADI_20240605_quote.csv', 'ADI_20240606_quote.csv', 'ADI_20240607_quote.csv', 'ADI_20240610_quote.csv', 'ADI_20240611_quote.csv', 'ADI_20240612_quote.csv', 'ADI_20240613_quote.csv', 'ADI_20240614_quote.csv', 'ADI_20240617_quote.csv', 'ADI_20240618_quote.csv', 'ADI_20240620_quote.csv', 'ADI_20240621_quote.csv', 'ADI_20240624_quote.csv', 'ADI_20240625_quote.csv', 'ADI_20240626_quote.csv', 'ADI_20240627_quote.csv', 'ADI_20240628_quote.csv', 'ADI_20240701_quote.csv', 'ADI_20240702_quote.csv', 'ADI_20240703_quote.csv', 'ADI_20240705_quote.csv', 'ADI_20240708_quote.csv', 'ADI_20240709_quote.csv', 'ADI_20240710_quote.csv', 'ADI_20240711_quote.csv', 'ADI_20240712_quote.csv', 'ADI_20240715_quote.csv', 'ADI_20240716_quote.csv', 'ADI_20240717_quote.csv', 'ADI_20240718_quote.csv', 'ADI_20240719_quote.csv', 'ADI_20240722_quote.csv', 'ADI_20240723_quote.csv', 'ADI_20240724_quote.csv', 'ADI_20240725_quote.csv', 'ADI_20240726_quote.csv', 'ADI_202407

In [24]:
# Check if there is missing data(number of quote > download limit)
for csv in csv_files:
    df = pd.read_csv("data/"   + csv)
    num_rows = len(df)
    # Check if the number of rows is exactly 2,000,001
    if num_rows == 3000000:
        print(f"{csv} has exactly 3,000,000 rows.")

In [22]:
# Re-download quote data for 2024-08-02
# quotes_20240802 = api.get_quotes(symbol="NVDA", start="2024-08-02T09:30:00Z", end="2024-08-02T16:00:00Z", limit=3000000).df  
# quotes_20240802.to_csv("data/NVDA_20240802_quote.csv", index=True)

# Re-download quote data for 2024-08-05
# quotes_20240805 = api.get_quotes(symbol="NVDA", start="2024-08-05T09:30:00Z", end="2024-08-05T16:00:00Z", limit=3000000).df  
# quotes_20240805.to_csv("data/NVDA_20240805_quote.csv", index=True)