# Scrape the Alpha Vantage API and collect daily prices in the background

## Get daily financial data of all companies, going a quarter back

In [30]:
try:
    import alpha_vantage
except:
    ! pip install alpha_vantage
    import alpha_vantage

import requests
from pprint import pprint
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
import json
import time
from tqdm import tqdm
import calendar
import os
import logging
# set up logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# logging.getLogger returns the same logger object every time this cell runs,
# so attach the file handler only once; otherwise each re-run of the cell adds
# another handler and every record is written to the log file multiple times
if not logger.handlers:
    # create a file handler
    handler = logging.FileHandler('price_scrape.log')
    handler.setLevel(logging.INFO)
    # create a logging format
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(handler)
# apply seaborn's default plot theme
sns.set()
# raise pandas' column display width to 1000 characters so long text cells are not truncated
pd.set_option('display.max_colwidth',1000)

Load all keys for the Alpha Vantage API and repeat the list 5 times, since each key is allowed up to 5 requests per minute

In [31]:
# read the stored API keys, then repeat the loaded value 5 times
# (presumably keys.json holds a JSON array of key strings -- TODO confirm)
with open('../keys.json') as key_file:
    keys = 5 * json.load(key_file)

## Download the dataset for all stocks

Load the CSV of all tickers in the US stock market and use it to get the prices of all stocks

## <span style="color:red">TODO: order the stock scraping process by highest exchange volume or market cap, as listed in the company overview dataset</span>

In [32]:
import string
import random

def get_stockprice(company_symbol: str = 'MSFT'):
    """Fetch the compact daily adjusted price series for one ticker from Alpha Vantage.

    Makes up to 100 attempts, each with a freshly generated random
    15-character ``apikey`` value, and stops at the first HTTP 200 response
    whose JSON body has no ``'Note'`` field.

    Args:
        company_symbol: Ticker symbol to query.

    Returns:
        The decoded JSON body of the first accepted response, or ``None`` when
        no attempt succeeded (the caller treats ``None`` as a failed fetch).
    """
    endpoint = "https://www.alphavantage.co/query"
    parameters = {
        "function": "TIME_SERIES_DAILY_ADJUSTED",
        "symbol": company_symbol,
        "outputsize": 'compact'
    }
    for _ in range(100):
        # NOTE(review): the apikey is a random string, not one of the keys loaded
        # from ../keys.json -- confirm this is intended and permitted by the API terms.
        parameters['apikey'] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=15))
        # Send a GET request to the API endpoint. The timeout keeps one stalled
        # connection from hanging the whole sweep, and network errors are
        # retried instead of aborting it.
        try:
            response = requests.get(endpoint, params=parameters, timeout=30)
        except requests.RequestException as exc:
            logger.error(f'API key {parameters["apikey"]} request failed: {exc}')
            time.sleep(1)
            continue
        # Check if the request was successful
        if response.status_code == 200:
            try:
                data = response.json()
            except ValueError:
                # requests raises a ValueError subclass when the body is not valid JSON
                logger.error(f'API key {parameters["apikey"]} returned a non-JSON body: {response.text[:200]}')
                time.sleep(1)
                continue
            if 'Note' not in data:
                # NOTE(review): any other 200 payload is returned as-is, including
                # bodies that may carry an error message instead of prices -- TODO
                # confirm whether callers should filter those.
                return data
            logger.warning(f'API key {parameters["apikey"]} has been used too many times. response note: {data["Note"]}')
            time.sleep(1)
        else:
            # Error bodies are not guaranteed to be JSON; fall back to the raw text
            # so that logging the failure cannot itself raise.
            try:
                detail = response.json()
            except ValueError:
                detail = response.text
            logger.error(f'API key {parameters["apikey"]} has returned an error. response note: {detail}')
            # pause before retrying, as the rate-limit branch does
            time.sleep(1)
    # Every attempt failed. Returning None explicitly avoids the UnboundLocalError
    # that occurred when no attempt ever reached the HTTP 200 branch.
    return None

# pprint(get_stockprice('MSFT'))

Sweep over all stock tickers and pull daily data

### <span style="color:red">TODO: later on, combine the price dataset with the sentiment dataset so that each stock price also has the daily averaged sentiment</span>

In [33]:
# get stock price for all tickers
def get_stockprice_all(stocks_to_watch: list):
    """Download and cache the price data of every ticker that is not saved yet.

    Each ticker's response is written to ``prices/<ticker>.json``. Tickers that
    already have a ``.json`` file in ``prices`` are skipped, so an interrupted
    sweep can be resumed by calling the function again. Tickers whose fetch
    returns ``None`` are logged and skipped.

    Args:
        stocks_to_watch: Ticker symbols, in the order they should be fetched.
    """
    os.makedirs('prices', exist_ok=True)
    # only get stock price for stocks that are not in the directory.
    # splitext removes just the final extension, so dotted tickers (e.g. "BRK.B")
    # keep their full name; a set makes each membership test O(1).
    seen_stocks = {
        os.path.splitext(f)[0]
        for f in os.listdir('prices')
        if f.endswith('.json') and os.path.isfile(os.path.join('prices', f))
    }
    pending = [t for t in stocks_to_watch if t not in seen_stocks]
    for ticker in tqdm(pending):
        data = get_stockprice(ticker)
        if data is None:
            logger.error(f'Unnable to fetch data for {ticker}')
            continue
        target_path = f'prices/{ticker}.json'
        # Write to a temporary file and rename it into place, so an interrupted
        # run cannot leave a truncated JSON that later counts as already fetched.
        tmp_path = target_path + '.tmp'
        with open(tmp_path, 'w') as outfile:
            json.dump(data, outfile, indent=4)
        os.replace(tmp_path, target_path)

In [34]:
# load pd dataframe from csv
# keep_default_na=False with na_values=[''] makes only empty fields count as missing,
# so a symbol spelled like one of pandas' default NaN markers (e.g. "NA") stays a string
tickers = pd.read_csv('../tickers.csv', keep_default_na=False, na_values=['']).sort_values('symbol')
tickers.head()

Unnamed: 0,symbol,name,exchange,assetType,ipoDate,ipoYear
1757,A,Agilent Technologies Inc,NYSE,Stock,1999-11-18,1999
5188,AA,Alcoa Corp,NYSE,Stock,2016-10-18,2016
7179,AAA,AXS FIRST PRIORITY CLO BOND ETF,NYSE ARCA,ETF,2020-09-09,2020
6080,AAAU,Goldman Sachs Physical Gold ETF,BATS,ETF,2018-08-15,2018
8092,AAC,Ares Acquisition Corporation - Class A,NYSE,Stock,2021-03-25,2021


Load the news sentiment dataset and sort the tickers according to most frequent mentions

In [35]:
# read the news sentiment records
sentiment_df = pd.read_csv('../news_sentiment_dataset/sentiments.csv')
# value_counts orders tickers from most to least frequent; keep that order as a plain list
ticker_counts = sentiment_df['ticker'].value_counts()
ticker_list = ticker_counts.index.to_list()
len(ticker_list)

6203

In [36]:
all_symbols = tickers['symbol'].unique()
# sets give O(1) membership tests; the original rescanned a numpy array / list
# for every ticker inside the comprehensions
known_symbols = set(all_symbols)
mentioned_symbols = set(ticker_list)
# sort tickers_to_watch according to the order of ticker_list and add the rest of the tickers
tickers_to_watch = (
    [t for t in ticker_list if t in known_symbols]
    + [t for t in all_symbols if t not in mentioned_symbols]
)
print(tickers_to_watch[:10])
get_stockprice_all(tickers_to_watch)

['TSLA', 'META', 'MSFT', 'GOOG', 'AAPL', 'BLK', 'BCS', 'AMZN', 'BBBY', 'NVDA']


  2%|▏         | 175/11114 [28:20<29:31:04,  9.71s/it]


KeyboardInterrupt: 