## Get all fundamental data of a company

Including:

* Overview
* Income Statement
* Balance Sheet
* Cash Flow
* Earnings
* Earnings Calendar

In [1]:
try:
    import alpha_vantage
except:
    ! pip install alpha_vantage
    import alpha_vantage

import requests
from pprint import pprint
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
import json
import time
from tqdm import tqdm
import calendar
import os
import string
import random
import logging
# set up logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# logging.getLogger returns the same object every time this cell is executed, so
# attach the file handler only once; otherwise each re-run of the cell would add
# another handler and every record would be written to the log file repeatedly.
if not any(isinstance(h, logging.FileHandler) for h in logger.handlers):
    # create a file handler
    handler = logging.FileHandler('company_overview_scrape.log')
    handler.setLevel(logging.INFO)
    # create a logging format
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    # add the handlers to the logger
    logger.addHandler(handler)
# plotting / display defaults
sns.set()
pd.set_option('display.max_colwidth',1000)

## Scrape data according to the function requested

### <span style="color:red">TODO later on, pull only data about companies that are approaching their earnings call/quarter end</span>

In [2]:
# Values of the Alpha Vantage `function` query parameter that get_companies_data
# requests, in this order, for every ticker.
# NOTE(review): the notebook intro also lists an earnings calendar, but no such
# entry appears here - confirm whether that omission is intentional.
functions = ['OVERVIEW', 'INCOME_STATEMENT', 'BALANCE_SHEET', 'CASH_FLOW', 'EARNINGS']

def company_data(function: str = 'OVERVIEW', company_symbol: str = 'MSFT',
                 max_retries: int = 100, timeout: float = 30.0):
    """Fetch one fundamental-data payload for a company from Alpha Vantage.

    Every attempt sends a GET request carrying a freshly generated random
    15-character ``apikey``. A 200 response whose JSON has no ``'Note'`` key is
    returned; a response that carries ``'Note'`` is logged as a warning and the
    request is retried after one second. Network failures, undecodable bodies
    and non-200 responses are logged and retried as well.

    Args:
        function: value sent as the ``function`` query parameter.
        company_symbol: value sent as the ``symbol`` query parameter.
        max_retries: maximum number of attempts (defaults to 100, the value
            that used to be hard-coded).
        timeout: seconds passed to ``requests.get`` so that a stalled
            connection cannot block the caller forever.

    Returns:
        The decoded JSON payload of the first accepted response, or ``None``
        when no attempt produced one.
    """
    endpoint = "https://www.alphavantage.co/query"
    parameters = {
        "function": function,
        "symbol": company_symbol,
        "horizon": "12month"
    }
    # Bound up front so the final `return` is valid even when no attempt succeeds
    # (previously an all-error run raised UnboundLocalError).
    data = None
    for _ in range(max_retries):
        parameters['apikey'] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=15))
        try:
            # Send a GET request to the API endpoint
            response = requests.get(endpoint, params=parameters, timeout=timeout)
        except requests.RequestException as exc:
            # A transient network problem should cost one attempt, not abort the caller.
            logger.error(f'API key {parameters["apikey"]} request failed: {exc}')
            time.sleep(1)
            continue
        # Check if the request was successful
        if response.status_code == 200:
            try:
                payload = response.json()
            except ValueError:
                logger.error(f'API key {parameters["apikey"]} returned a body that is not valid JSON')
                time.sleep(1)
                continue
            if 'Note' not in payload:
                data = payload
                break
            logger.warning(f'API key {parameters["apikey"]} has been used too many times. response note: {payload["Note"]}')
            time.sleep(1)
        else:
            # Log the raw text: an error page is not guaranteed to be JSON, and
            # decoding it here could raise and mask the original failure.
            logger.error(f'API key {parameters["apikey"]} has returned an error. response note: {response.text}')
            # Back off like the rate-limit branch instead of retrying immediately.
            time.sleep(1)
    return data

# Sanity check: pull the OVERVIEW payload for Microsoft and pretty-print it.
msft_data_horizon = company_data('OVERVIEW', 'MSFT')
pprint(msft_data_horizon)

{'200DayMovingAverage': '254.02',
 '50DayMovingAverage': '243.38',
 '52WeekHigh': '313.66',
 '52WeekLow': '212.83',
 'Address': 'ONE MICROSOFT WAY, REDMOND, WA, US',
 'AnalystTargetPrice': '284.08',
 'AssetType': 'Common Stock',
 'Beta': '0.916',
 'BookValue': '24.59',
 'CIK': '789019',
 'Country': 'USA',
 'Currency': 'USD',
 'Description': 'Microsoft Corporation is an American multinational technology '
                'company which produces computer software, consumer '
                'electronics, personal computers, and related services. Its '
                'best known software products are the Microsoft Windows line '
                'of operating systems, the Microsoft Office suite, and the '
                'Internet Explorer and Edge web browsers. Its flagship '
                'hardware products are the Xbox video game consoles and the '
                'Microsoft Surface lineup of touchscreen personal computers. '
                'Microsoft ranked No. 21 in the 2020 Fortu

Sweep over all stock tickers and pull their fundamental data

In [3]:
# fetch and cache the fundamental data for all tickers
def get_companies_data(stocks_to_watch: list):
    """Download every `functions` payload for each ticker and cache it as JSON.

    Tickers that already have a file in the ``overview`` directory are skipped.
    For each remaining ticker, `company_data` is called once per entry of
    `functions`; the responses are collected in that order and written to
    ``overview/<ticker>.json``. If a request returns ``None`` the failure is
    logged and the ticker is skipped without writing a file.

    Args:
        stocks_to_watch: ticker symbols to process, in processing order.
    """
    os.makedirs('overview', exist_ok=True)
    # only fetch data for stocks that are not in the directory yet.
    # os.path.splitext removes just the final extension, so a dotted ticker such
    # as 'BRK.B' (stored as 'BRK.B.json') is recognised as done; splitting on the
    # first '.' would yield 'BRK' and the ticker would be re-fetched on every run.
    # A set keeps the membership test below O(1) per ticker.
    seen_stocks = {
        os.path.splitext(f)[0]
        for f in os.listdir('overview')
        if os.path.isfile(os.path.join('overview', f))
    }
    for ticker in tqdm([t for t in stocks_to_watch if t not in seen_stocks]):
        data = []
        for func in functions:
            result = company_data(func, ticker)
            if result is None:
                # Stop at the first failure: the ticker is skipped anyway, so
                # requesting the remaining functions would only waste retries.
                logger.error(f'Unnable to fetch {func} data for {ticker}')
                break
            data.append(result)
        else:
            # Reached only when every function returned a payload.
            with open(f'overview/{ticker}.json', 'w') as outfile:
                json.dump(data, outfile, indent=4)

# Single-ticker run of the fetch-and-cache sweep; nothing is fetched when a file
# for MSFT already exists in the overview directory.
get_companies_data(['MSFT'])

0it [00:00, ?it/s]


Create a dataset based on the most mentioned stocks

In [4]:
# Ticker universe, ordered by symbol
tickers = pd.read_csv('../tickers.csv').sort_values(by='symbol')
# News-sentiment records
sentiment_df = pd.read_csv('../news_sentiment_dataset/sentiments.csv')
# Tickers ranked from most to least frequent in the sentiment data
ticker_list = sentiment_df['ticker'].value_counts().index.to_list()
len(ticker_list)

6187

In [6]:
# Unique symbols of the ticker universe, in the order they appear in `tickers`.
all_symbols = tickers['symbol'].unique()
# Sets make each membership test O(1); testing against the array / list inside
# the comprehensions rescanned the whole collection for every ticker.
known_symbols = set(all_symbols)
mentioned_symbols = set(ticker_list)
# Tickers found in the sentiment data come first, in ticker_list order,
# followed by the remaining symbols that do not appear in ticker_list.
tickers_to_watch = (
    [t for t in ticker_list if t in known_symbols]
    + [t for t in all_symbols if t not in mentioned_symbols]
)
print(tickers_to_watch[:10])
get_companies_data(tickers_to_watch)

['TSLA', 'META', 'MSFT', 'GOOG', 'AAPL', 'BLK', 'BCS', 'AMZN', 'BBBY', 'NVDA']


  1%|          | 73/11100 [59:39<150:54:53, 49.27s/it]