# SEC EDGAR Data Collection

### Imports and Constants

In [1]:
import asyncio
import concurrent.futures
import json
import os
import random
import time
import zipfile
from datetime import datetime, timedelta
from typing import Dict, List, Set, Tuple, Union

import aiohttp
import numpy as np
import orjson
import pandas as pd
import requests
import smart_open
from bs4 import BeautifulSoup
from dateutil import relativedelta
from smart_open import open
from tqdm import tqdm


In [2]:
heading = {"User-Agent": "locke@gatech.edu"}
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

### Create ticker to CIK mapping & get SP500 tickers

In [3]:
url = "https://www.sec.gov/files/company_tickers.json"
r = requests.get(url, headers=heading)
ticker_cik = pd.DataFrame(r.json()).T
ticker_cik.set_index("ticker", inplace=True)

In [4]:
def get_cik(ticker: str) -> str:
    return str(ticker_cik.loc[ticker, "cik_str"]).zfill(10)

In [5]:
def get_tickers_and_sectors() -> List[Tuple[str, str]]:
    """Get a list of tickers and their sectors for all stocks in the S&P 500.

    Returns:
        List[Tuple[str, str]]: List of tuples containing ticker and sector.
    """
    url = "http://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    response = requests.get(url)
    source = BeautifulSoup(response.text, "lxml")
    table = source.find("table", {"class": "wikitable sortable"})
    tickers_and_sectors = []
    for row in table.findAll("tr")[1:]:
        ticker = row.findAll("td")[0].text.replace("\n", "")
        sector = row.findAll("td")[3].text.replace("\n", "")
        tickers_and_sectors.append((ticker, sector))
    return tickers_and_sectors

### Download and process bulk data from SEC

In [6]:
DATA_DIR = "data"
SEC_DATA_DIR = os.path.join(DATA_DIR, "raw")
PROCESSED_DATA_DIR = os.path.join(DATA_DIR, "processed")

os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(SEC_DATA_DIR, exist_ok=True)
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

In [7]:
def download_sec_data(force_update: bool = False):
    """
    Download all company facts from SEC.

    Args:
        force_update (bool, optional): If True, download updated data regardless of whether it already exists. Defaults to False.
    """
    # if data/companyfacts.zip does not exist, download it
    if force_update or not os.path.exists(os.path.join(SEC_DATA_DIR, "companyfacts.zip")):
        print("Downloading companyfacts.zip...")
        url = "https://www.sec.gov/Archives/edgar/daily-index/xbrl/companyfacts.zip"
        r = requests.get(url, headers=heading)
        with open("data/raw/companyfacts.zip", "wb") as f:
            f.write(r.content)
    # if JSON files do not exist, unzip them
    if force_update or not os.path.exists(os.path.join(SEC_DATA_DIR, "CIK000032019.json")):
        print("Unzipping companyfacts.zip...")
        with zipfile.ZipFile(os.path.join(SEC_DATA_DIR, "companyfacts.zip"), "r") as zip_ref:
            zip_ref.extractall("data/raw")

In [8]:
def process_item(item: dict) -> dict:
    """
    Process a single item in the SEC company facts JSON file.

    Args:
        item (dict): The item to process, e.g. "dei:EntityCommonStockSharesOutstanding".

    Returns:
        dict: The processed item.
    """

    processed_item = {}
    processed_item["label"] = item["label"]
    processed_item["description"] = item["description"]

    for unit in item["units"]:

        # store data by timeframe (i.e. FY2022, Q32019) and filing date
        # for each category, seperate annual data from quarterly data
        processed_item[unit] = {}
        processed_item[unit]["by_timeframe"] = {"annual": {}, "quarterly": {}}
        processed_item[unit]["by_filing_date"] = {"annual": {}, "quarterly": {}}
        for entry in item["units"][unit]:
            fy = entry["fy"]
            fp = entry["fp"]
            filing_date = entry["filed"]
            if entry["fp"] == "FY":
                processed_item[unit]["by_timeframe"]["annual"][fy] = entry
                processed_item[unit]["by_filing_date"]["annual"][filing_date] = entry
            else:
                if fy not in processed_item[unit]["by_timeframe"]["quarterly"]:
                    processed_item[unit]["by_timeframe"]["quarterly"][fy] = {}
                processed_item[unit]["by_timeframe"]["quarterly"][fy][fp] = entry
                processed_item[unit]["by_filing_date"]["quarterly"][filing_date] = entry

    return processed_item

In [9]:
def process_json(ticker: str) -> None:
    """
    Process the SEC company facts JSON file for a given ticker, creating a new JSON file.

    Args:
        ticker (str): The ticker symbol for the company to process.
    """

    # find SEC data for the company
    cik = get_cik(ticker)
    try:
        with open(os.path.join(SEC_DATA_DIR, f"CIK{cik}.json")) as f:
            sec_data = orjson.loads(f.read())
    except FileNotFoundError:
        raise FileNotFoundError(f"CIK{cik}.json not found in raw data directory. Use download_sec_data().")

    # process the data
    data = {"ticker": ticker, "cik": sec_data["cik"], "entityName": sec_data["entityName"]}
    for item in sec_data["facts"]["dei"]:
        data[item] = process_item(sec_data["facts"]["dei"][item])
    for item in sec_data["facts"]["us-gaap"]:
        data[item] = process_item(sec_data["facts"]["us-gaap"][item])

    # write the data to a JSON file
    with open(os.path.join(PROCESSED_DATA_DIR, f"{ticker}.json"), "w") as f:
        f.write(json.dumps(data, indent=4))

Only run the below cells if you want to download and process the most recent data from the SEC.

In [10]:
# get up to date data from SEC
#download_sec_data(force_update=True)

In [11]:
# process data for stocks in S&P 500
#tickers_and_sectors = get_tickers_and_sectors()
#for ticker, sector in tqdm(tickers_and_sectors):
    #try:
        #process_json(ticker)
    #except Exception as e:
        #print(f"Could not process data for {ticker}. Error: {e}")

### Generic methods to retrieve metrics from processed JSON data

In [12]:
def get_financials(ticker: str) -> dict:
    """
    Get the processed financials for a given ticker.

    Args:
        ticker (str): The ticker symbol for the company to get financials for.

    Returns:
        dict: The processed financials.
    """
    with open(os.path.join(PROCESSED_DATA_DIR, f"{ticker}.json")) as f:
        financials = json.loads(f.read())
    return financials

In [13]:
def get_metric_by_timeframe(financials: dict, metric: str, units: str, year: str, quarter: str = None) -> float:
    """
    Get a metric for a company by timeframe.

    Args:
        financials (dict): The processed financials for the company.
        metric (str): The metric to get.
        year (str): The year to get the metric for.
        quarter (str, optional): The quarter to get the metric for. Defaults to None, which gets the annual metric.

    Returns:
        float: The metric value.
    """
    try:
        if quarter is None:
            return financials[metric][units]["by_timeframe"]["annual"][year]["val"]
        else:
            return financials[metric][units]["by_timeframe"]["quarterly"][year][quarter]["val"]
    except KeyError:
        ticker = financials["ticker"]
        if quarter is None:
            raise KeyError(f"Metric {metric} not found for {ticker} in {year}.")
        else:
            raise KeyError(f"Metric {metric} not found for {ticker} in {year} Q{quarter}.")

In [14]:
def get_closest_filing_date(financials: dict, metric: str, units: str, date: str, quarterly: bool = False) -> str:
    """
    Get the closest filing date before a given date.

    Args:
        financials (dict): The processed financials for the company.
        metric (str): The metric to get.
        units (str): The units of the metric.
        date (str): The date to get the closest filing date for.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.

    Returns:
        str: The closest filing date.
    """

    if quarterly:
        filing_dates = list(financials[metric][units]["by_filing_date"]["quarterly"].keys())
    else:
        filing_dates = list(financials[metric][units]["by_filing_date"]["annual"].keys())

    closest_date = None
    left = 0
    right = len(filing_dates) - 1
    while left <= right:
        mid = (left + right) // 2
        if filing_dates[mid] <= date:
            closest_date = filing_dates[mid]
            left = mid + 1
        else:
            right = mid - 1

    if closest_date is None:
        ticker = financials["ticker"]
        raise KeyError(f"Metric {metric} not found for {ticker} before {date}.")

    return closest_date

In [15]:
def get_metric(financials: dict, metric: str, units: str, query_date: str = None, quarterly: bool = False, tolerance: int = 52):
    """
    Get a metric for a company from the most recent filing date before a given date.

    Args:
        financials (dict): The processed financials for the company.
        metric (str): The metric to get.
        units (str): The units of the metric.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The metric value.
    """

    if query_date is None:
        query_date = datetime.now().strftime("%Y-%m-%d")

    filing_date = get_closest_filing_date(financials, metric, units, query_date, quarterly)
    if datetime.strptime(query_date, "%Y-%m-%d") - datetime.strptime(filing_date, "%Y-%m-%d") > timedelta(weeks=tolerance):
        ticker = financials["ticker"]
        raise KeyError(f"Metric {metric} not found for {ticker} within {tolerance} weeks of {query_date}.")

    if quarterly:
        return financials[metric][units]["by_filing_date"]["quarterly"][filing_date]["val"]
    else:
        return financials[metric][units]["by_filing_date"]["annual"][filing_date]["val"]


In [16]:
def get_concept(
        financials: dict,
        concept: str,
        tags: List[str],
        units: str,
        query_date: str = None,
        quarterly: bool = False,
        tolerance: int = 52
    ) -> float:
    """
    Get a concept for a company by trying several different XRBL tags. Returns the first valid value found.

    Args:
        financials (dict): The processed financials for the company.
        concept (str): The name of the concept, strictly used for error messages.
        tags (List[str]): The XRBL tags to check for the concept.
        units (str): The units of the concept. Assumes the same units for all tags.
        query_date (str, optional): The date to get the concept for in Y-m-d format. Defaults to None, which gets the latest concept.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The concept value.
    """
    for tag in tags:
        try:
            return get_metric(financials, tag, units, query_date, quarterly, tolerance)
        except KeyError:
            continue
    period = "quarterly" if quarterly else "annual"
    ticker = financials["ticker"]
    if query_date is None: query_date = "today"
    raise ValueError(f"Could not find {period} {concept} for {ticker} within {tolerance} weeks of {query_date}.")

### Methods to get specific financials from processed JSON data 

In [17]:
def get_shares_outstanding(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the shares outstanding for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The shares outstanding.
    """
    tags = ["EntityCommonStockSharesOutstanding", "WeightedAverageNumberOfSharesOutstandingBasic", "CommonStockSharesOutstanding"]
    return get_concept(
        financials,
        "shares outstanding",
        tags,
        "shares",
        query_date,
        quarterly,
        tolerance
    )

In [18]:
def get_net_income(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the net income for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The net income.
    """
    tags = ["NetIncomeLoss", "ProfitLoss", "NetIncomeLossAvailableToCommonStockholdersBasic", "IncomeLossFromContinuingOperations"]
    return get_concept(
        financials,
        "net income",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )
    

In [19]:
def get_interest_expense(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the interest expense for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The interest expense.
    """
    tags = ["InterestExpense", "InterestExpenseDebt", "InterestAndDebtExpense", "InterestExpenseBorrowings", "InterestIncomeExpenseNonoperatingNet", "InterestCostsIncurred", "InterestIncomeExpenseNet", "InterestPaidNet", "InterestPaid"]
    return abs(get_concept(
        financials,
        "interest expense",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    ))

In [20]:
def get_tax_expense(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the tax expense for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The tax expense.
    """
    try:
        tags = ["IncomeTaxExpenseBenefit", "IncomeTaxExpenseBenefitContinuingOperations", "CurrentIncomeTaxExpenseBenefit"]
        return get_concept(
            financials,
            "tax expense",
            tags,
            "USD",
            query_date,
            quarterly,
            tolerance
        )
    except:
        pretax_income = get_concept(
            financials,
            "pretax income",
            ["ProfitLoss"],
            "USD",
            query_date,
            quarterly,
            tolerance
        )
        net_income = get_net_income(financials, query_date, quarterly, tolerance)
        return pretax_income - net_income

In [21]:
def get_revenue(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the revenue for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The revenue.
    """
    tags = ["Revenues", "SalesRevenueNet", "SalesRevenueGoodsNet", "SalesRevenueServicesNet", "SalesRevenueNetOfInterestExpense", "RevenueFromContractWithCustomerExcludingAssessedTax", "RevenueFromContractWithCustomerIncludingAssessedTax", "RevenuesNetOfInterestExpense", "OperatingLeasesIncomeStatementLeaseRevenue", "OperatingLeaseLeaseIncome", "RegulatedAndUnregulatedOperatingRevenue"]
    return get_concept(
        financials,
        "revenue",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [22]:
def get_cost_of_revenue(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the cost of revenue for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The cost of revenue.
    """
    tags = ["CostOfRevenue", "CostOfGoodsAndServicesSold", "CostOfGoodsSold", "CostOfServices"]
    return get_concept(
        financials,
        "cost of revenue",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [23]:
def get_operating_expenses(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the operating expenses for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The operating expenses.
    """
    tags = ["OperatingExpenses", "OperatingCostsAndExpenses"]
    return get_concept(
        financials,
        "operating expenses",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [24]:
def get_depreciation(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the depreciation for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The depreciation.
    """
    tags = ["Depreciation"]
    return get_concept(
        financials,
        "depreciation",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [25]:
def get_amortization(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the amortization for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The amortization.
    """
    tags = ["Amortization", "AmortizationOfIntangibleAssets", "AmortizationOfDebtDiscountPremium"]
    return get_concept(
        financials,
        "amortization",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [26]:
def get_depreciation_and_amortization(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the depreciation and amortization for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The depreciation and amortization.
    """
    try:
        tags = ["DepreciationAndAmortization", "DepreciationDepletionAndAmortization", "DepreciationAmortizationAndAccretionNet", "OtherDepreciationAndAmortization"]
        return get_concept(
            financials,
            "depreciation and amortization",
            tags,
            "USD",
            query_date,
            quarterly,
            tolerance
        )
    except:
        try:
            depreciation = get_depreciation(financials, query_date, quarterly, tolerance)
        except:
            depreciation = 0
        try:
            amortization = get_amortization(financials, query_date, quarterly, tolerance)
        except:
            amortization = 0
        if depreciation == 0 and amortization == 0:
            period = "quarterly" if quarterly else "annual"
            ticker = financials["ticker"]
            if query_date is None: query_date = "today"
            raise ValueError(f"Could not find {period} depreciation and amortization for {ticker} within {tolerance} weeks of {query_date}.")
        return depreciation + amortization


In [27]:
def get_current_debt(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the current debt for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The current debt.
    """
    tags = ["LongTermDebtCurrent", "LongTermDebtAndCapitalLeaseObligationsCurrent", "DebtCurrent", "LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths", "LinesOfCreditCurrent", "LineOfCredit", "OperatingLeaseLiabilityCurrent"]
    return get_concept(
        financials,
        "current debt",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [28]:
def get_noncurrent_debt(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the noncurrent debt for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The noncurrent debt.
    """
    tags = ["LongTermDebtNoncurrent", "ConvertibleLongTermNotesPayable", "OperatingLeaseLiabilityNoncurrent", "UnsecuredLongTermDebt", "LongTermDebtAndCapitalLeaseObligations"]
    return get_concept(
        financials,
        "noncurrent debt",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [29]:
def get_total_debt(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the total debt for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The total debt.
    """
    try:
        tags = ["LongTermDebt", "DebtAndCapitalLeaseObligations"]
        return get_concept(
            financials,
            "total debt",
            tags,
            "USD",
            query_date,
            quarterly,
            tolerance
        )
    except:
        try:
            current_debt = get_current_debt(financials, query_date, quarterly, tolerance)
        except:
            current_debt = 0
        try:
            noncurrent_debt = get_noncurrent_debt(financials, query_date, quarterly, tolerance)
        except:
            noncurrent_debt = 0
        if current_debt == 0 and noncurrent_debt == 0:
            period = "quarterly" if quarterly else "annual"
            ticker = financials["ticker"]
            if query_date is None: query_date = "today"
            raise ValueError(f"Could not find {period} total debt for {ticker} within {tolerance} weeks of {query_date}.")
        return current_debt + noncurrent_debt

In [30]:
def get_property_plant_equipment(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the property, plant, and equipment for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The property, plant, and equipment.
    """
    # TODO: investigate if we can use the "PropertyPlantAndEquipmentNet" tag (may require changes to depreciation and amortization below)
    tags = ["PropertyPlantAndEquipmentGross"]
    return get_concept(
        financials,
        "property, plant, and equipment",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [31]:
def get_capex(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the capital expenditures for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The capital expenditures.
    """
    try:
        tags = ["CapitalExpenditures", "PaymentsToAcquirePropertyPlantAndEquipment", "PaymentsToAcquireProductiveAssets"]
        return get_concept(
            financials,
            "capital expenditures",
            tags,
            "USD",
            query_date,
            quarterly,
            tolerance
        )
    except:
        # TODO: check if use of depreciation and amortization is correct
        if query_date is None:
            query_date = datetime.today().strftime("%Y-%m-%d")
        one_year_ago = datetime.strptime(query_date, "%Y-%m-%d") - relativedelta.relativedelta(years=1)
        current_ppe = get_property_plant_equipment(financials, query_date, quarterly, tolerance)
        previous_ppe = get_property_plant_equipment(financials, one_year_ago.strftime("%Y-%m-%d"), quarterly, tolerance)
        d_and_a = get_depreciation_and_amortization(financials, query_date, quarterly, tolerance)
        return current_ppe - previous_ppe + d_and_a

In [32]:
def get_cash(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the cash for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The cash.
    """
    tags = ["Cash", "CashAndCashEquivalentsAtCarryingValue", "CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsIncludingDisposalGroupAndDiscontinuedOperations", "CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents"]
    return get_concept(
        financials,
        "cash",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [33]:
def get_marketable_securities(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the marketable securities for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The marketable securities.
    """
    tags = ["MarketableSecurities", "MarketableSecuritiesCurrent"]
    return get_concept(
        financials,
        "marketable securities",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [34]:
def get_accounts_receivable(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the accounts receivable for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The accounts receivable.
    """
    tags = ["AccountsReceivableNetCurrent"]
    return get_concept(
        financials,
        "accounts receivable",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [35]:
def get_inventory(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the inventory for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The inventory.
    """
    tags = ["InventoryNet", "InventoryNetCurrent"]
    return get_concept(
        financials,
        "inventory",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [36]:
def get_other_current_assets(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the other current assets for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The other current assets.
    """
    tags = ["OtherAssetsCurrent", "PrepaidExpenseAndOtherAssetsCurrent"]
    return get_concept(
        financials,
        "other current assets",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [37]:
def get_current_assets(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the current assets for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The current assets.
    """
    try:
        tags = ["AssetsCurrent"]
        return get_concept(
            financials,
            "current assets",
            tags,
            "USD",
            query_date,
            quarterly,
            tolerance
        )
    except:
        try:
            cash = get_cash(financials, query_date, quarterly, tolerance)
        except:
            cash = 0
        try:
            marketable_securities = get_marketable_securities(financials, query_date, quarterly, tolerance)
        except:
            marketable_securities = 0
        try:
            accounts_receivable = get_accounts_receivable(financials, query_date, quarterly, tolerance)
        except:
            accounts_receivable = 0
        try:
            inventory = get_inventory(financials, query_date, quarterly, tolerance)
        except:
            inventory = 0
        try:
            other_current_assets = get_other_current_assets(financials, query_date, quarterly, tolerance)
        except:
            other_current_assets = 0
        current_assets = cash + marketable_securities + accounts_receivable + inventory + other_current_assets
        if current_assets == 0:
            period = "quarterly" if quarterly else "annual"
            ticker = financials["ticker"]
            if query_date is None: query_date = "today"
            raise ValueError(f"Could not find {period} current assets for {ticker} within {tolerance} weeks of {query_date}.")
        return current_assets

In [38]:
def get_total_assets(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the total assets for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The total assets.
    """
    tags = ["Assets"]
    return get_concept(
        financials,
        "total assets",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [39]:
def get_accounts_payable(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the accounts payable for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The accounts payable.
    """
    tags = ["AccountsPayableCurrent", "OtherAccountsPayableAndAccruedLiabilities", "AccountsPayableTradeCurrent", "AccountsPayableTradeCurrentAndNoncurrent"]
    return get_concept(
        financials,
        "accounts payable",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [40]:
def get_taxes_payable(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the taxes payable for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The taxes payable.
    """
    tags = ["TaxesPayableCurrent", "TaxesPayableCurrentAndNoncurrent", "AccruedIncomeTaxesCurrent", "AccruedIncomeTaxes", "AccrualForTaxesOtherThanIncomeTaxesCurrent"]
    return get_concept(
        financials,
        "taxes payable",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [41]:
def get_accrued_salaries(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the accrued salaries for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The accrued salaries.
    """
    tags = ["AccruedSalariesAndWagesCurrent", "AccruedSalariesCurrent"]
    return get_concept(
        financials,
        "accrued salaries",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [42]:
def get_interest_payable(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the interest payable for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The interest payable.
    """
    tags = ["InterestPayableCurrent", "InterestPayableCurrentAndNoncurrent"]
    return get_concept(
        financials,
        "interest payable",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [43]:
def get_deferred_revenues(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the deferred revenues for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The deferred revenues.
    """
    tags = ["DeferredRevenueCurrent", "ContractWithCustomerLiability", "ContractWithCustomerLiabilityCurrent"]
    return get_concept(
        financials,
        "deferred revenues",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [44]:
def get_accrued_liabilities(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the accrued liabilities for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The accrued liabilities.
    """
    tags = ["AccruedLiabilitiesCurrent", "AccruedInsuranceCurrent", "AccruedLiabilitiesCurrentAndNoncurrent"]
    return get_concept(
        financials,
        "accrued liabilities",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [45]:
def get_other_current_liabilities(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the other current liabilities for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The other current liabilities.
    """
    tags = ["OtherLiabilitiesCurrent", "OtherAccruedLiabilitiesCurrent", "LiabilitiesOfDisposalGroupIncludingDiscontinuedOperationCurrent", "DerivativeLiabilitiesCurrent", "LiabilitiesOfDisposalGroupIncludingDiscontinuedOperation"]
    return get_concept(
        financials,
        "other current liabilities",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [46]:
def get_current_liabilities(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the current liabilities for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The current liabilities.
    """
    try:
        tags = ["LiabilitiesCurrent"]
        return get_concept(
            financials,
            "current liabilities",
            tags,
            "USD",
            query_date,
            quarterly,
            tolerance
        )
    except:
        try:
            current_debt = get_current_debt(financials, query_date, quarterly, tolerance)
        except:
            current_debt = 0
        try:
            accounts_payable = get_accounts_payable(financials, query_date, quarterly, tolerance)
        except:
            accounts_payable = 0
        try:
            taxes_payable = get_taxes_payable(financials, query_date, quarterly, tolerance)
        except:
            taxes_payable = 0
        try:
            accrued_salaries = get_accrued_salaries(financials, query_date, quarterly, tolerance)
        except:
            accrued_salaries = 0
        try:
            interest_payable = get_interest_payable(financials, query_date, quarterly, tolerance)
        except:
            interest_payable = 0
        try:
            deferred_revenues = get_deferred_revenues(financials, query_date, quarterly, tolerance)
        except:
            deferred_revenues = 0
        try:
            accrued_liabilities = get_accrued_liabilities(financials, query_date, quarterly, tolerance)
        except:
            accrued_liabilities = 0
        try:
            other_liabilities = get_other_current_liabilities(financials, query_date, quarterly, tolerance)
        except:
            other_liabilities = 0
        current_liabilities = current_debt + accounts_payable + taxes_payable + accrued_salaries + interest_payable + deferred_revenues + accrued_liabilities + other_liabilities
        if current_liabilities == 0:
            period = "quarterly" if quarterly else "annual"
            ticker = financials["ticker"]
            if query_date is None: query_date = "today"
            raise ValueError(f"Could not find {period} current liabilities for {ticker} within {tolerance} weeks of {query_date}.")
        return current_liabilities

In [47]:
def get_liabilities_and_equity(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the liabilities and equity for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The liabilities and equity.
    """
    tags = ["LiabilitiesAndStockholdersEquity"]
    return get_concept(
        financials,
        "liabilities and equity",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [48]:
def get_stockholders_equity(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the shareholders equity for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The shareholders equity.
    """
    tags = ["StockholdersEquity", "StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest"]
    return get_concept(
        financials,
        "shareholders equity",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [49]:
def get_preferred_stock(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the preferred stock for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The preferred stock.
    """
    tags = ["PreferredStockValue", "PreferredStockValueIncludingPortionAttributableToNoncontrollingInterest"]
    return get_concept(
        financials,
        "preferred stock",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [50]:
def get_preferred_dividends(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the preferred dividends for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The preferred dividends.
    """
    tags = ["DividendsPreferredStock"]
    return get_concept(
        financials,
        "preferred dividends",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [51]:
def get_total_liabilities(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the total liabilities for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The total liabilities.
    """
    try:
        tags = ["Liabilities"]
        return get_concept(
            financials,
            "total liabilities",
            tags,
            "USD",
            query_date,
            quarterly,
            tolerance
        )
    except:
        liabilities_and_equity = get_liabilities_and_equity(financials, query_date, quarterly, tolerance)
        equity = get_stockholders_equity(financials, query_date, quarterly, tolerance)
        return liabilities_and_equity - equity


In [52]:
def get_book_value(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the book value for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The book value.
    """
    equity = get_stockholders_equity(financials, query_date, quarterly, tolerance)
    try:
        preferred_stock = get_preferred_stock(financials, query_date, quarterly, tolerance)
    except:
        preferred_stock = 0
    return equity - preferred_stock

In [53]:
def get_book_value_per_share(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the book value per share for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The book value per share.
    """
    book_value = get_book_value(financials, query_date, quarterly, tolerance)
    shares = get_shares_outstanding(financials, query_date, quarterly, tolerance)
    return book_value / shares

In [54]:
def get_earnings_per_share(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the earnings per share for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The earnings per share.
    """
    net_income = get_net_income(financials, query_date, quarterly, tolerance)
    try:
        preferred_dividends = get_preferred_dividends(financials, query_date, quarterly, tolerance)
    except:
        preferred_dividends = 0
    shares = get_shares_outstanding(financials, query_date, quarterly, tolerance)
    return (net_income - preferred_dividends) / shares

In [55]:
def get_sales_per_share(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the sales per share for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The sales per share.
    """
    sales = get_revenue(financials, query_date, quarterly, tolerance)
    shares = get_shares_outstanding(financials, query_date, quarterly, tolerance)
    return sales / shares

In [56]:
def get_working_capital(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the working capital for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The working capital.
    """
    current_assets = get_current_assets(financials, query_date, quarterly, tolerance)
    current_liabilities = get_current_liabilities(financials, query_date, quarterly, tolerance)
    return current_assets - current_liabilities

In [57]:
def get_change_in_working_capital(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the change in working capital for a company over a one year period.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The change in working capital.
    """
    if query_date is None:
        query_date = datetime.today().strftime("%Y-%m-%d")
    one_year_ago = datetime.strptime(query_date, "%Y-%m-%d") - relativedelta.relativedelta(years=1)
    current_wc = get_working_capital(financials, query_date, quarterly, tolerance)
    previous_wc = get_working_capital(financials, one_year_ago.strftime("%Y-%m-%d"), quarterly, tolerance)
    return current_wc - previous_wc

In [58]:
def get_operating_cash_flow(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the operating cash flow for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The operating cash flow.
    """
    net_income = get_net_income(financials, query_date, quarterly, tolerance)
    d_and_a = get_depreciation_and_amortization(financials, query_date, quarterly, tolerance)
    change_in_wc = get_change_in_working_capital(financials, query_date, quarterly, tolerance)
    return net_income + d_and_a - change_in_wc

In [59]:
def get_ocf_per_share(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the operating cash flow per share for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The operating cash flow per share.
    """
    ocf = get_operating_cash_flow(financials, query_date, quarterly, tolerance)
    shares = get_shares_outstanding(financials, query_date, quarterly, tolerance)
    return ocf / shares

In [60]:
# TODO: test and fix
def get_cash_dividends(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the cash dividends for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The cash dividends.
    """
    tags = ["DividendsCommonStockCash", "DividendsCash"]
    return get_concept(
        financials,
        "cash dividends",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

In [61]:
def get_ebit(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the EBIT for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The EBIT.
    """
    try:
        tags = ["OperatingIncomeLoss"]
        return get_concept(
            financials,
            "ebit",
            tags,
            "USD",
            query_date,
            quarterly,
            tolerance
        )
    except:
        try:
            net_income = get_net_income(financials, query_date, quarterly, tolerance)
            interest_expense = get_interest_expense(financials, query_date, quarterly, tolerance)
            tax_expense = get_tax_expense(financials, query_date, quarterly, tolerance)
            return net_income + interest_expense + tax_expense
        except:
            revenue = get_revenue(financials, query_date, quarterly, tolerance)
            cost_of_revenue = get_cost_of_revenue(financials, query_date, quarterly, tolerance)
            operating_expenses = get_operating_expenses(financials, query_date, quarterly, tolerance)
            return revenue - cost_of_revenue - operating_expenses

In [62]:
def get_ebitda(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the EBITDA for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The EBITDA.
    """
    ebit = get_ebit(financials, query_date, quarterly, tolerance)
    d_and_a = get_depreciation_and_amortization(financials, query_date, quarterly, tolerance)
    return ebit + d_and_a

In [63]:
def get_ufcf(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the unlevered free cash flow for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The unlevered free cash flow.
    """
    ebit = get_ebit(financials, query_date, quarterly, tolerance)
    tax_expense = get_tax_expense(financials, query_date, quarterly, tolerance)
    d_and_a = get_depreciation_and_amortization(financials, query_date, quarterly, tolerance)
    capex = get_capex(financials, query_date, quarterly, tolerance)
    change_in_wc = get_change_in_working_capital(financials, query_date, quarterly, tolerance)
    return ebit - tax_expense + d_and_a - capex - change_in_wc

In [64]:
def get_research_and_development(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the research and development costs for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The reserach and development costs.
    """
    tags = ["ResearchAndDevelopmentExpense", "ResearchAndDevelopmentExpenseExcludingAcquiredInProcessCost", "ResearchAndDevelopmentExpenseSoftwareExcludingAcquiredInProcessCost"]
    return get_concept(
        financials,
        "r&d",
        tags,
        "USD",
        query_date,
        quarterly,
        tolerance
    )

### External calls

In [65]:
POLYGON_KEY = "uwQtl3txGt5BLbecq7ZbIu0ZbuitCGjc"

In [66]:
async def get_price(ticker: str, query_date: str = None, timeout: int = 20) -> float:
    """Get the actual price of a stock.

    Args:
        ticker (str): Ticker symbol of the stock.
        query_date (str, optional): Date to query. Defaults to None, which queries today's date.
        timeout (int): time to wait before raising TimeoutError.
    Returns:
        float: Price of the stock on the given date.
    """
    if query_date is None or query_date == datetime.date.today().strftime("%Y-%m-%d"):
        url = f"https://api.polygon.io/v2/aggs/ticker/{ticker}/prev?adjusted=true&apiKey={POLYGON_KEY}"
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url, timeout=timeout) as resp:
                    response = await resp.json()
            except concurrent.futures.TimeoutError:
                raise TimeoutError(
                    f"{ticker}: Timed out while retrieving price")

        return response["results"][0]["c"]

    else:
        url = f"https://api.polygon.io/v1/open-close/{ticker}/{query_date}?adjusted=true&apiKey={POLYGON_KEY}"
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url, timeout=timeout) as resp:
                    response = await resp.json()
            except concurrent.futures.TimeoutError:
                raise TimeoutError(
                    f"{ticker}: Timed out while retrieving price")

            i = 0
            while response["status"] != "OK":
                # markets will not close for more than 3 days at a time
                # if price not found within 3 days, price likely does not exist for that time period
                if i >= 2:
                    raise ValueError(f"Could not find price for {ticker}")
                i += 1
                curr_date = datetime.strptime(query_date, "%Y-%m-%d").date()
                query_date = (curr_date - datetime.timedelta(days=1)
                              ).strftime("%Y-%m-%d")
                url = f"https://api.polygon.io/v1/open-close/{ticker}/{query_date}?adjusted=true&apiKey={POLYGON_KEY}"
                try:
                    async with session.get(url, timeout=timeout) as resp:
                        response = await resp.json()
                except concurrent.futures.TimeoutError:
                    raise TimeoutError(
                        f"{ticker}: Timed out while retrieving price")

        return response["close"]


In [67]:
# TODO: find way to get historic WACC data (and ideally without using valueinvesting.io)
# also want to find a range / good stdev for WACC
async def get_wacc(ticker: str) -> float:
    """
    Get the weighted average cost of capital debt for a given ticker.

    Args:
        ticker (str): Ticker symbol of the stock.

    Returns:
        float: WACC.
    """
    fingerprint = random.randint(100000, 999999999)
    url = f"https://valueinvesting.io/get_1_company?inp={ticker}&finger_print={fingerprint}"
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url) as resp:
                response = await resp.json()
        except concurrent.futures.TimeoutError:
            raise TimeoutError(f"{ticker}: Timed out while retrieving WACC and debt data.")
        try:
            data = json.loads(response["most"][39]["value_text"])
            return float(data["R12"][1].replace("%", "")) / 100
        except:
            try:
                data = json.loads(response["most"][28]["value_text"])
                return float(data["R10"][1].replace("%", "")) / 100
            except Exception as e:
                raise ValueError(f"Cannot retrieve WACC and debt data for {ticker}. Response: {response}")

In [68]:
# TODO: find good mean/stdev for growth rate by company or industry

### Methods to calculate fundamental ratios

In [69]:
async def get_pb_ratio(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the price to book ratio for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The price to book ratio.
    """
    price = await get_price(financials["ticker"], query_date)
    bvps = get_book_value_per_share(financials, query_date, quarterly, tolerance)
    return price / bvps

In [70]:
async def get_pe_ratio(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the price to earnings ratio for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The price to earnings ratio.
    """
    price = await get_price(financials["ticker"], query_date)
    eps = get_earnings_per_share(financials, query_date, quarterly, tolerance)
    return price / eps

In [71]:
async def get_ps_ratio(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the price to sales ratio for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The price to sales ratio.
    """
    price = await get_price(financials["ticker"], query_date)
    sales = get_sales_per_share(financials, query_date, quarterly, tolerance)
    return price / sales

In [72]:
async def get_pcf_ratio(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the price to cash flow ratio for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The price to cash flow ratio.
    """
    price = await get_price(financials["ticker"], query_date)
    cfps = get_ocf_per_share(financials, query_date, quarterly, tolerance)
    return price / cfps

In [73]:
async def get_market_cap(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the enterprise value for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The enterprise value.
    """
    shares = get_shares_outstanding(financials, query_date, quarterly, tolerance)
    price = await get_price(financials["ticker"], query_date)
    return shares * price

In [74]:
async def get_enterprise_value(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the enterprise value for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The enterprise value.
    """
    market_cap = await get_market_cap(financials, query_date, quarterly, tolerance)
    debt = get_total_debt(financials, query_date, quarterly, tolerance)
    cash = get_cash(financials, query_date, quarterly, tolerance)
    return market_cap + debt - cash

In [75]:
async def get_ev_to_ebitda(financials: dict, query_date: str = None, quarterly: bool = False, tolerance: int = 52) -> float:
    """
    Get the enterprise value to EBITDA ratio for a company.

    Args:
        financials (dict): The processed financials for the company.
        query_date (str, optional): The date to get the metric for in Y-m-d format. Defaults to None, which gets the latest metric.
        quarterly (bool, optional): Whether to get the quarterly metric. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.

    Returns:
        float: The enterprise value to EBITDA ratio.
    """
    ev = await get_enterprise_value(financials, query_date, quarterly, tolerance)
    ebitda = get_ebitda(financials, query_date, quarterly, tolerance)
    return ev / ebitda

### Testbed

In [76]:
async def get_data(
        tickers_and_sectors: List[Tuple[str, str]],
        query_date: str = None,
        quarterly: bool = False,
        tolerance: int = 52,
        silent: bool = False
    ) -> List[dict]:
    """
    Get all necessary data to run a DCF model for a given list of tickers and sectors.

    Args:
        tickers_and_sectors (List[Tuple[str, str]]): List of tuples containing ticker and sector.
        query_date (str, optional): The date to get the data for in Y-m-d format. Defaults to None, which gets the latest data.
        quarterly (bool, optional): Whether to get quarterly data. Defaults to False.
        tolerance (int, optional): The max number of weeks to look back for a filing date. Defaults to 52.
        silent: True to suppress exception output, False to print exception output.
    Returns:
        List[dict]: List of dictionaries containing data for each ticker.
    """

    rows = []
    for ticker, sector in tqdm(tickers_and_sectors):

        data = {"ticker": ticker, "sector": sector}
        try:
            financials = get_financials(ticker)
        except Exception as e:
            if not silent: print(f"Could not find processed financials data for {ticker}.")
            #rows.append(data)
            continue

        try:
            data["shares_outstanding"] = get_shares_outstanding(financials, query_date, quarterly, tolerance)
        except Exception as e:
            data["shares_outstanding"] = None
            if not silent: print(e)

        try:
            data["net_income"] = get_net_income(financials, query_date, quarterly, tolerance)
        except Exception as e:
            data["net_income"] = None
            if not silent: print(e)

        try:
            data["current_liabilities"] = get_current_liabilities(financials, query_date, quarterly, tolerance)
        except Exception as e:
            data["current_liabilities"] = None
            if not silent: print(e)

        try:
            data["current_assets"] = get_current_assets(financials, query_date, quarterly, tolerance)
        except Exception as e:
            data["current_assets"] = None
            if not silent: print(e)
        
        try:
            data["ebit"] = get_ebit(financials, query_date, quarterly, tolerance)
        except Exception as e:
            data["ebit"] = None
            if not silent: print(e)
        
        try:
            data["depreciation_and_amortization"] = get_depreciation_and_amortization(financials, query_date, quarterly, tolerance)
        except Exception as e:
            data["depreciation_and_amortization"] = None
            if not silent: print(e)
        
        try:
            data["ufcf"] = get_ufcf(financials, query_date, quarterly, tolerance)
        except Exception as e:
            data["ufcf"] = None
            if not silent: print(e)

        try:
            data["interest_expense"] = get_interest_expense(financials, query_date, quarterly, tolerance)
        except Exception as e:
            data["interest_expense"] = None
            if not silent: print(e)

        try:
            data["tax_expense"] = get_tax_expense(financials, query_date, quarterly, tolerance)
        except Exception as e:
            data["tax_expense"] = None
            if not silent: print(e)

        try:
            data["research_and_development"] = get_research_and_development(financials, query_date, quarterly, tolerance)
        except Exception as e:
            data["research_and_development"] = None
            if not silent: print(e)

        # add data to rows
        rows.append(data)

    return rows

### Build financials dataframe

In [80]:
tickers_and_sectors = get_tickers_and_sectors()
rows = get_data(tickers_and_sectors, query_date="2009-01-01",silent=True)

KeyboardInterrupt: 

In [79]:
financials_df = pd.DataFrame(rows).set_index("ticker")
financials_df.head()

ValueError: DataFrame constructor not properly called!

In [None]:
# exploration to see which tickers are missing data
print(financials_df[financials_df["research_and_development"].isna()].shape[0])
financials_df[financials_df["research_and_development"].isna()]

498


Unnamed: 0_level_0,sector,shares_outstanding,net_income,current_liabilities,current_assets,ebit,depreciation_and_amortization,ufcf,interest_expense,tax_expense,research_and_development
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
MMM,Industrial Conglomerates,,,,,,,,,,
AOS,Building Products,,,,,,,,,,
ABT,Health Care Equipment,,,,,,,,,,
ABBV,Pharmaceuticals,,,,,,,,,,
ACN,IT Consulting & Other Services,,,,,,,,,,
ATVI,Interactive Home Entertainment,,,,,,,,,,
ADM,Agricultural Products,,,,,,,,,,
ADBE,Application Software,,,,,,,,,,
ADP,Data Processing & Outsourced Services,,,,,,,,,,
AAP,Automotive Retail,,,,,,,,,,
