In [1]:
import random
import time
from datetime import date, datetime, timedelta
from io import BytesIO
from pathlib import Path

import numpy as np
import pandas as pd
import requests

In [None]:
# Import packages
import yfinance as yf

# Build the list of symbols to download: a per-sector random sample of the S&P 500
# constituents table on Wikipedia, plus a few hand-picked large-cap tech names.
SAMPLE_SEED = 1993
SAMPLES_PER_SECTOR = 20
MANUAL_TICKERS = ['META', 'AAPL', 'NFLX', 'GOOGL', 'AMZN', 'TSLA']

# Read the constituents table (first table on the page) and expose the symbol as `ticker`
df_tickers = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
df_tickers = df_tickers.rename(columns={'Symbol': 'ticker'})
# Make sure GOOG and GOOGL are not both included
df_tickers = df_tickers[df_tickers['ticker'] != 'GOOG']

# Kept for any later code that relies on the stdlib RNG. It does NOT influence pandas:
# DataFrame.sample draws from NumPy's RNG, so the seed is passed via `random_state` below.
random.seed(SAMPLE_SEED)

# Stratified sample: up to SAMPLES_PER_SECTOR rows per GICS sector.
# - `random_state` makes the draw reproducible across kernel restarts.
# - `min(...)` keeps a sector with fewer rows than the target from raising in `.sample`.
# - Iterating the groups (instead of groupby.apply) keeps the 'GICS Sector' column in the
#   result regardless of how the installed pandas treats grouping columns in `apply`.
df_tickers_sample = pd.concat(
    sector_rows.sample(n=min(len(sector_rows), SAMPLES_PER_SECTOR), random_state=SAMPLE_SEED)
    for _, sector_rows in df_tickers.groupby('GICS Sector')
)

# Manually add a few tech tickers; drop_duplicates covers the ones already sampled
df_tickers_sample = pd.concat(
    [df_tickers_sample, df_tickers[df_tickers['ticker'].isin(MANUAL_TICKERS)]]
)
df_tickers_sample = df_tickers_sample.drop_duplicates(subset='ticker')
tickers = list(df_tickers_sample['ticker'])

In [36]:
# Display the sampled constituents (the cell's last expression is rendered as its output)
df_tickers_sample

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
167,EA,Electronic Arts,Communication Services,Interactive Home Entertainment,"Redwood City, California",2002-07-22,712515,1982
483,WBD,Warner Bros. Discovery,Communication Services,Broadcasting,"New York City, New York",2022-04-11,1437107,2022 (Warner Bros. 1923)
291,LYV,Live Nation Entertainment,Communication Services,Movies & Entertainment,"Beverly Hills, California",2019-12-23,1335258,2010
472,VZ,Verizon,Communication Services,Integrated Telecommunication Services,"New York City, New York",1983-11-30,732712,1983 (1877)
363,PARA,Paramount Global,Communication Services,Movies & Entertainment,"New York City, New York",1994-09-30,813828,2019 (Paramount Pictures 1912)
...,...,...,...,...,...,...,...,...
116,CMS,CMS Energy,Utilities,Multi-Utilities,"Jackson, Michigan",1957-03-04,811156,1886
165,EIX,Edison International,Utilities,Electric Utilities,"Rosemead, California",1957-03-04,827052,1886
22,AMZN,Amazon,Consumer Discretionary,Broadline Retail,"Seattle, Washington",2005-11-18,1018724,1994
39,AAPL,Apple Inc.,Information Technology,"Technology Hardware, Storage & Peripherals","Cupertino, California",1982-11-30,320193,1977


In [33]:
# Add crypto pairs to the download list.
# Guarded so the cell is idempotent: a plain `extend` appends the pairs again on every
# re-run, which would make the download loop fetch each of them multiple times.
CRYPTO_TICKERS = ['BTC/USD', 'ETH/USD', 'XRP/USD', 'LTC/USD', 'ADA/USD', 'MATIC/USD']
tickers.extend([symbol for symbol in CRYPTO_TICKERS if symbol not in tickers])

In [None]:
# Twelve Data download: fetch the full daily history for every symbol in `tickers`,
# attach the constituent metadata from `df_tickers`, and write data/<today>.csv.
#
# NOTE(review): `twelve_data_api` is not defined in any visible cell, so it must already
# exist in the kernel. Load it from an environment variable or getpass; never hardcode it.
api_key = twelve_data_api

TWELVE_DATA_URL = 'https://api.twelvedata.com'
REQUEST_TIMEOUT = 30    # seconds; without a timeout one stalled connection hangs the whole run
BATCH_DAYS = 4990       # calendar days per request window (step size kept from the original loop;
                        # it was sized against 5000, presumably the API's per-response row cap: confirm)
RATE_LIMIT_PAUSE = 61   # seconds to wait once the per-minute API credits are (almost) used up


def _safe_reason(exc):
    """Describe an exception for logging without leaking the API key.

    `requests` exceptions embed the full request URL (including `apikey=...`) in their
    message, so only their class name is reported; other exceptions are shown via repr.
    """
    if isinstance(exc, requests.RequestException):
        return type(exc).__name__
    return repr(exc)


def _pause_if_out_of_credits(response):
    """Sleep when the 'Api-Credits-Left' response header reports one credit or fewer.

    A missing or non-numeric header is ignored, so an unexpected response never aborts
    a download for this reason alone.
    """
    credits_left = response.headers.get('Api-Credits-Left')
    if credits_left is not None and credits_left.strip().isdigit() and int(credits_left) <= 1:
        time.sleep(RATE_LIMIT_PAUSE)


def _fetch_earliest_date(ticker):
    """Return the earliest available daily timestamp for `ticker` as a string.

    Raises:
        ValueError: if the JSON response has no 'datetime' field (e.g. an API error payload).
        requests.RequestException: on network failure, timeout, or a non-JSON body.
    """
    response = requests.get(
        f'{TWELVE_DATA_URL}/earliest_timestamp',
        params={'symbol': ticker, 'interval': '1day', 'apikey': api_key},
        timeout=REQUEST_TIMEOUT,
    )
    _pause_if_out_of_credits(response)
    payload = response.json()
    if not isinstance(payload, dict) or 'datetime' not in payload:
        raise ValueError(f'no earliest timestamp in response: {payload!r}')
    return payload['datetime']


def _batch_date_ranges(start_date, end_date, step_days=BATCH_DAYS):
    """Split [start_date, end_date] into non-overlapping windows, newest first.

    Every window spans at most `step_days` + 1 calendar days, and consecutive windows
    are one day apart so no date is requested twice. Returns a list of
    (window_start, window_end) strings formatted as YYYY-MM-DD; the list is empty when
    `start_date` is after `end_date`.
    """
    first_day = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)
    windows = []
    while window_end >= first_day:
        window_start = max(first_day, window_end - timedelta(days=step_days))
        windows.append((window_start.strftime('%Y-%m-%d'), window_end.strftime('%Y-%m-%d')))
        window_end = window_start - timedelta(days=1)
    return windows


def _fetch_batch(ticker, batch_start, batch_end):
    """Download one window of daily bars for `ticker` and return it as a DataFrame.

    The response is requested as ';'-delimited CSV. A body without a 'datetime' column
    (e.g. an error payload) is rejected instead of being returned as a junk frame.

    Raises:
        ValueError: on a non-OK HTTP status or a body that is not the expected CSV.
        requests.RequestException: on network failure or timeout.
    """
    response = requests.get(
        f'{TWELVE_DATA_URL}/time_series',
        params={
            'start_date': batch_start,
            'end_date': batch_end,
            'symbol': ticker,
            'format': 'CSV',
            'interval': '1day',
            'apikey': api_key,
        },
        timeout=REQUEST_TIMEOUT,
    )
    # Throttle before validating so an error response still respects the rate limit
    _pause_if_out_of_credits(response)
    if not response.ok:
        raise ValueError(f'HTTP {response.status_code}')

    # read_csv accepts the raw bytes through a file-like wrapper; no decode/encode round trip needed
    data = pd.read_csv(BytesIO(response.content), delimiter=';')
    if 'datetime' not in data.columns:
        raise ValueError(f'unexpected response body: {response.text[:200]!r}')
    data['ticker'] = ticker
    return data


# One snapshot date for the whole run so every ticker is requested up to the same day
end_date = date.today().strftime('%Y-%m-%d')

data_frames = []
failed_tickers = []

for ticker in tickers:
    try:
        start_date = _fetch_earliest_date(ticker)
        windows = _batch_date_ranges(start_date, end_date)
    except Exception as exc:  # not a bare except: Ctrl-C must still interrupt the loop
        failed_tickers.append(ticker)
        print(f"Couldn't download {ticker}. ({_safe_reason(exc)})")
        continue

    ticker_frames = []
    for batch_start, batch_end in windows:
        try:
            ticker_frames.append(_fetch_batch(ticker, batch_start, batch_end))
        except Exception as exc:
            # Best effort per window: report the failure and keep the windows that worked
            print(f"{ticker}: batch {batch_start}..{batch_end} failed ({_safe_reason(exc)})")

    if not ticker_frames:
        failed_tickers.append(ticker)
        print(f"Couldn't download {ticker}.")
        continue

    data_frames.extend(ticker_frames)
    # One-line progress summary instead of printing a whole frame
    print(f"{ticker}: {sum(len(frame) for frame in ticker_frames)} rows in {len(ticker_frames)} batch(es)")

if failed_tickers:
    print(f"Failed tickers ({len(failed_tickers)}): {failed_tickers}")

non_empty_frames = [frame for frame in data_frames if len(frame) > 0]
if not non_empty_frames:
    raise ValueError('No price data was downloaded; nothing to merge or save.')
df_final = pd.concat(non_empty_frames)

# Not used in this cell; kept because cells outside this view may reference it
merge_cols = ['GICS Sector', 'GICS Sub-Industry', 'Headquarters Location', 'Date added', 'Founded'] # 'CIK'

# `df_tickers` exposes the symbol as 'ticker' once the rename in the sampling cell has run;
# fall back to 'Symbol' so the merge also works against an un-renamed table.
# Symbols missing from `df_tickers` (e.g. the crypto pairs) keep NaN metadata (left join).
metadata_key = 'ticker' if 'ticker' in df_tickers.columns else 'Symbol'
df_model = df_final.merge(df_tickers, left_on='ticker', right_on=metadata_key, how='left')

# Create the output directory if needed so a long download is not lost to a missing folder
output_path = Path('data') / f'{date.today().strftime("%Y-%m-%d")}.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
df_model.to_csv(output_path, index=False)

In [60]:
# Display the merged frame: downloaded daily bars joined with the constituent metadata
df_model

Unnamed: 0,datetime,open,high,low,close,volume,ticker,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,2024-04-19,126.05000,127.46000,125.74,127.27,1869200,EA,EA,Electronic Arts,Communication Services,Interactive Home Entertainment,"Redwood City, California",2002-07-22,712515,1982
1,2024-04-18,127.09000,127.39000,125.11,125.63,1389500,EA,EA,Electronic Arts,Communication Services,Interactive Home Entertainment,"Redwood City, California",2002-07-22,712515,1982
2,2024-04-17,126.87000,127.53000,126.29,126.31,1699200,EA,EA,Electronic Arts,Communication Services,Interactive Home Entertainment,"Redwood City, California",2002-07-22,712515,1982
3,2024-04-16,127.72000,128.77000,126.36,126.55,1899900,EA,EA,Electronic Arts,Communication Services,Interactive Home Entertainment,"Redwood City, California",2002-07-22,712515,1982
4,2024-04-15,128.21001,128.39999,126.55,127.05,1419700,EA,EA,Electronic Arts,Communication Services,Interactive Home Entertainment,"Redwood City, California",2002-07-22,712515,1982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1472992,1997-05-30,21.12000,22.12000,21.12,22.00,220400,ARE,ARE,Alexandria Real Estate Equities,Real Estate,Office REITs,"Pasadena, California",2017-03-20,1035443,1994
1472993,1997-05-29,21.00000,21.38000,21.00,21.38,296800,ARE,ARE,Alexandria Real Estate Equities,Real Estate,Office REITs,"Pasadena, California",2017-03-20,1035443,1994
1472994,1997-05-28,20.62000,21.38000,20.62,21.00,1862700,ARE,ARE,Alexandria Real Estate Equities,Real Estate,Office REITs,"Pasadena, California",2017-03-20,1035443,1994
1472995,1970-01-01,152.53000,152.53000,152.50,152.50,164699,ARE,ARE,Alexandria Real Estate Equities,Real Estate,Office REITs,"Pasadena, California",2017-03-20,1035443,1994
