Net stuff

In [None]:
import httpx
import asyncio
from fredapi import Fred
import requests
import numpy as np
import json
import pandas as pd
import pickle
import yahoo_fin.stock_info as si
# We use normal datetime for fred info and pandas datetime for data
from datetime import datetime
from dateutil.relativedelta import relativedelta
from itertools import islice
import math
import os
import matplotlib.pyplot as plt
from functools import lru_cache
from scipy.stats import hmean

# FRED client used by fred_info(). The key can be supplied through the
# FRED_API_KEY environment variable; the literal is kept only as a
# backward-compatible fallback.
# NOTE(review): an API key committed to source should be rotated and removed.
fred = Fred(api_key=os.environ.get("FRED_API_KEY", '0c34c4dd2fd6943f6549f1c990a8a0f0'))

async def fetch(url, url_headers, semaphore, client, timeout, max_retries, start_retry_delay):
    """GET ``url`` with bounded concurrency, retrying on failure.

    Args:
        url: Address to request.
        url_headers: Headers passed to ``client.get``.
        semaphore: Async context manager limiting concurrent requests. It is
            held only while a request is in flight, not while backing off.
        client: Object exposing an awaitable
            ``get(url, timeout=..., headers=...)`` (presumably an
            ``httpx.AsyncClient``, since httpx exceptions are handled here).
        timeout: Per-request timeout forwarded to ``client.get``.
        max_retries: Maximum number of attempts that are made.
        start_retry_delay: Base delay in seconds; attempt ``k`` (1-based)
            backs off for ``k * start_retry_delay`` seconds.

    Returns:
        The response on success, the string ``"del"`` when the server
        answers 404, or ``0`` when every attempt failed.
    """
    # range(1, max_retries + 1) so that exactly `max_retries` attempts are made
    # (the previous upper bound silently dropped the last one).
    for attempt in range(1, max_retries + 1):
        backoff = attempt * start_retry_delay
        try:
            async with semaphore:
                response = await client.get(url, timeout=timeout, headers=url_headers)
                response.raise_for_status()
                return response  # Successful request, exit the loop
        except httpx.HTTPStatusError as e:
            # Sometimes a Retry-After header is returned
            retry_after = e.response.headers.get('Retry-After')
            if retry_after is not None:
                # Just for debugging
                print(retry_after)
                try:
                    wait_seconds = int(retry_after)
                except ValueError:
                    # Retry-After may also be an HTTP date; use the normal backoff then
                    wait_seconds = backoff
                await asyncio.sleep(wait_seconds)
                continue
            if e.response.status_code == 404:
                return "del"
            print(f"Error response {e.response.status_code} for {url}")
            # Back off before retrying instead of hammering the server immediately
            await asyncio.sleep(backoff)
        except httpx.TimeoutException as e:
            print(f"Timeout reached: {e}")
            print(f"Retrying in {backoff} seconds...")
            await asyncio.sleep(backoff)
        except httpx.RequestError as e:
            print(f"An error occurred: {e}.")
            await asyncio.sleep(backoff)
    return 0
                

def fred_info(ids: list, start: datetime, end: datetime):
    """Download several FRED series and return them side by side.

    Args:
        ids: FRED series identifiers; they become the column names.
        start: First observation date (any object with ``strftime``,
            e.g. ``datetime``).
        end: Last observation date (same type as ``start``).

    Returns:
        A DataFrame with one column per id, outer-joined on the index.
        An empty DataFrame is returned when ``ids`` is empty.
    """
    if not ids:
        # pd.concat refuses an empty list; return an empty frame instead
        return pd.DataFrame()
    start = start.strftime('%Y-%m-%d')
    end = end.strftime('%Y-%m-%d')
    series_list = [
        fred.get_series(series_id, observation_start=start, observation_end=end)
        for series_id in ids
    ]
    frame = pd.concat(series_list, axis=1, join="outer")
    frame.columns = ids
    return frame

async def yahoo_fetch(ticker, start_year, end_year, semaphore, max_retries, start_retry_delay):
    """Fetch price data for ``ticker`` via ``si.get_data`` in a worker thread.

    The range requested spans from the earlier of ``start_year["static"]`` /
    ``start_year["dynamic"]`` to the later of ``end_year["static"]`` /
    ``end_year["dynamic"]``. The semaphore is held for the whole retry loop,
    including the waits between attempts. Up to ``max_retries + 1`` attempts
    are made; after failed attempt ``k`` (0-based) the coroutine sleeps
    ``k * start_retry_delay`` seconds.

    Returns the value produced by ``si.get_data`` or ``0`` if all attempts fail.
    """
    async with semaphore:
        tries_done = 0
        while tries_done <= max_retries:
            pause = tries_done * start_retry_delay
            try:
                first = min(start_year["static"], start_year["dynamic"])
                last = max(end_year["static"], end_year["dynamic"])
                return await asyncio.to_thread(si.get_data, ticker, first, last)
            except requests.exceptions.ConnectionError:
                print("Yahoo connection error.")
            except Exception as err:
                print(f"Yahoo error:{err}")
            await asyncio.sleep(pause)
            tries_done += 1
        return 0
    
# Settings handed to fetch() by companyfacts() and companysubmissions().
TIMEOUT = 8  # forwarded as fetch()'s `timeout` argument
RETRIES = 2  # forwarded as fetch()'s `max_retries` argument
START_RETRY_DELAY = 3  # forwarded as `start_retry_delay`; fetch() sleeps attempt * this value

Plotting

In [None]:
# Exploratory cell: list AAPL "GrossProfit" datapoints that look like duplicates
# of another datapoint (same value, start and end each within 5 days).
comp = comp_load("AAPL")
data_points = comp.converted_data["GrossProfit"]
# with open("examples/grossprofit.json","w") as file:
#     json.dump(data_points,file, indent=1)
duplicates = set()
# Every ordered pair (i, j) is compared, so both members of a matching pair end up in the set.
for i, datapoint in enumerate(data_points):
    for j, datapoint2 in enumerate(data_points):
        if i != j:  # Avoid comparing the item with itself
            if abs((datapoint["end"] - datapoint2["end"]).days) < 5  and abs((datapoint["start"] - datapoint2["start"]).days) <  5 and datapoint["val"] == datapoint2["val"]:
                # Convert datapoint2 to a tuple of its relevant attributes
                duplicate_tuple = (datapoint2["start"], datapoint2["end"], datapoint2["val"])
                duplicates.add(duplicate_tuple)

# Print each distinct (start, end, value) triple once; set order is arbitrary.
for start, end, val in duplicates:
    print(f"{start.strftime('%Y-%m-%d')} -> {end.strftime('%Y-%m-%d')}, Value: {val}")


In [None]:
# Exploratory cell: write NVDA's "GrossProfit" entry to examples/grossNVDA.json for inspection.
comp = comp_load("NVDA")
example_save(comp.data["GrossProfit"], "grossNVDA")

# company_wordsearch("UNP", "OperatingExpenses")

In [None]:
#Different names
# Maps a measure name to a list of other names for it.
# Presumably the listed names are treated as exact synonyms of the key; the
# consumer is not visible here, so confirm before relying on that.
# NOTE(review): the key "CostofRevenue" (lower-case "of") differs from the
# "CostOfRevenue" spelling used as a value in the "CostOfGoodsAndServicesSold"
# entry below; verify which spelling the consumer looks up.
measure_conversion = {"Assets":["EquityAndLiabilities", "LiabilitiesAndStockholdersEquity"],
                    "Liabilities":[],
                    "AssetsCurrent":["CurrentAssets"],
                    "LiabilitiesCurrent":["CurrentLiabilities"],
                    "AssetsNoncurrent":["NoncurrentAssets"],
                    "LiabilitiesNoncurrent":["NoncurrentLiabilities"],
                    "Revenues": [],
                    "AccountsPayableCurrent": ["AccountsPayableTradeCurrent"],
                    "EntityCommonStockSharesOutstanding": ["CommonStockSharesOutstanding","NumberOfSharesOutstanding"],
                    "StockholdersEquity":["Equity","StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest"],
                    "DerivativeLiabilities":["FairValueLiabilitiesMeasuredOnRecurringBasisDerivativeFinancialInstrumentsLiabilities"],
                    "DerivativeAssets":["FairValueAssetsMeasuredOnRecurringBasisDerivativeFinancialInstrumentsAssets"],
                    "ShortTermBorrowings":["DebtCurrent"],
                    "CostofRevenue":["CostOfGoodsAndServicesSold"],
                    "CostOfGoodsAndServicesSold": ["CostOfRevenue","CostOfGoodsAndServiceExcludingDepreciationDepletionAndAmortization"],
                    "CostOfGoodsSold": ["CostOfGoodsSoldExcludingDepreciationDepletionAndAmortization"],
                    "Amortization": ["FiniteLivedIntangibleAssetsAmortizationExpenseNextTwelveMonths","AmortizationOfIntangibleAssets"],
                    "IncomeTaxesPaidNet":["IncomeTaxesPaid"],
                    "NetIncomeLoss": ["ProfitLoss"],
                    "OperatingExpenses": ["OperatingCostsAndExpenses"],
                    "PaymentsToAcquirePropertyPlantAndEquipment": ["PaymentsForCapitalImprovements","CapitalExpenditure","PurchasesOfPropertyPlantAndEquipment", "PaymentsForPropertyPlantAndEquipment"],
                    "NetCashProvidedByUsedInOperatingActivities" : ["CashAndCashEquivalentsFromOperatingActivities", "OperatingActivitiesNetCashInflowsOutflows","CashFlowsFromOperatingActivities"],
                    "InterestIncome" : ["InvestmentIncomeInterest"],
                    "LongTermDebtCurrent" : ["LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths"]
}
# Maps a measure name to a list of names that only approximate it.
# Presumably used as a fallback when no exact name is available; the consumer
# is not visible here, so confirm before relying on that.
# "AccountsReceivableNet" used to appear twice in this literal; a dict keeps only
# the last occurrence, so "AccountsReceivableNetCurrent" was silently dropped.
# Both alternatives now live in a single entry.
# NOTE(review): the key "CostofRevenue" uses a lower-case "of"; verify the spelling
# the consumer looks up.
approximate_measure_conversion = {
                    "Liabilities":["LiabilitiesFairValueDisclosure"],
                    "AssetsCurrent":["CurrentAssetsOtherThanAssetsOrDisposalGroupsClassifiedAsHeldForSaleOrAsHeldForDistributionToOwners",],
                    "LiabilitiesCurrent":["CurrentLiabilitiesOtherThanLiabilitiesIncludedInDisposalGroupsClassifiedAsHeldForSale"],
                    "EntityCommonStockSharesOutstanding": ["WeightedAverageNumberOfSharesOutstandingBasic", "WeightedAverageNumberOfDilutedSharesOutstanding"],
                    "PrepaidExpenseAndOtherAssetsCurrent":["OtherAssetsCurrent"],
                    "CapitalLeaseObligationsNoncurrent": ["OperatingLeaseLiabilityNoncurrent"],
                    "CapitalLeaseObligationsCurrent": ["LesseeOperatingLeaseLiabilityPaymentsDueNextTwelveMonths"],
                    "IntangibleAssetsNetExcludingGoodwill":["FiniteLivedIntangibleAssetsNet"],
                    "AccountsReceivableNet": ["AccountsReceivableNetCurrent", "AccountsReceivableGrossCurrent"],
                    "Revenues": ["SalesRevenueNet","RevenueFromContractWithCustomerExcludingAssessedTax"],
                    "CostOfGoodsSold":["CostOfGoodsAndServicesSold", "CostOfRevenue"],
                    "CostofRevenue":["CostOfGoodsSold"],
                    "CostOfGoodsAndServicesSold": ["CostOfGoodsSold"],
                    "AccountsPayableAndAccruedLiabilitiesCurrent": ["AccountsPayableCurrent"],
                    "AccountsPayableAndAccruedLiabilities": ["AccountsPayable"],
                    "Depreciation":["DepreciationAndAmortization"],
                    "ShortTermBorrowings":["LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths","LongTermDebtCurrent"],
                    "DepreciationAndAmortization" : ["DepreciationDepletionAndAmortization"],
}
# Maps a measure name to a list of component measures.
# Presumably the key equals the sum of the listed components; the consumer is
# not visible here, so confirm before relying on that.
# NOTE(review): "AccountsPayableAndAccruedLiabilitiesCurrent" and "LongTermDebt"
# each appear twice in this literal. A dict literal keeps only the LAST
# occurrence of a key, so the first entry of each pair is silently discarded.
# NOTE(review): the key "Revenue" is singular while the other tables use
# "Revenues"; verify which spelling the consumer looks up.
additive_conversion = {"Assets":["AssetsCurrent", "AssetsNoncurrent"],
                    "Liabilities":["LiabilitiesCurrent", "LiabilitiesNoncurrent"],
                    # Overridden by the later entry with the same key (same components, other order).
                    "AccountsPayableAndAccruedLiabilitiesCurrent":["AccountsPayableCurrent", "AccruedLiabilitiesCurrent"],
                    "CashCashEquivalentsAndShortTermInvestments": ["CashAndCashEquivalentsAtCarryingValue","ShortTermInvestments"],
                    "AccountsReceivableNet": ["AccountsReceivableNetCurrent", "AccountsReceivableNetNoncurrent"],
                    "CostsAndExpenses":["OperatingExpenses", "CostOfGoodsAndServicesSold"],
                    "GrossProfit":["OperatingIncomeLoss","DepreciationAndAmortization"],
                    "AccountsPayableAndAccruedLiabilities": ["AccruedLiabilities", "AccountsPayable"],
                    "AccountsPayableAndAccruedLiabilitiesCurrent": ["AccruedLiabilitiesCurrent", "AccountsPayableCurrent"],
                    # Overridden by the maturity-schedule "LongTermDebt" entry at the end of this dict.
                    "LongTermDebt": ["LongTermDebtCurrent","LongTermDebtNoncurrent"],
                    "DepreciationAndAmortization": ["Depreciation", "AmortizationOfIntangibleAssets"],
                    "OperatingIncomeLoss":["NetIncomeLoss","IncomeTaxesPaidNet", "DepreciationAndAmortization","InterestExpense"],
                    "PaymentsToAcquirePropertyPlantAndEquipment": ["PaymentsToAcquireProductiveAssets", "PaymentsToAcquireOtherPropertyPlantAndEquipment"],
                    "CapitalExpenditure":["PaymentsToAcquireEquipmentOnLease","PaymentsToAcquireOilAndGasPropertyAndEquipment","PaymentsToAcquireOtherProductiveAssets","PaymentsToAcquireProductiveAssets"],
                    "Revenue": ["GrossProfit", "CostOfGoodsAndServicesSold"],
                    "InterestExpense": ["InterestPaidNet", "InterestIncome"],
                    # This is the "LongTermDebt" entry that actually takes effect.
                    "LongTermDebt": ["LongTermDebtMaturitiesRepaymentsOfPrincipalAfterYearFive","LongTermDebtMaturitiesRepaymentsOfPrincipalInYearFive", "LongTermDebtMaturitiesRepaymentsOfPrincipalInYearFour","LongTermDebtMaturitiesRepaymentsOfPrincipalInYearThree","LongTermDebtMaturitiesRepaymentsOfPrincipalInYearTwo","LongTermDebtMaturitiesRepaymentsOfPrincipalInNextTwelveMonths"]
}       
# Maps a measure name to a list of components that only approximately make it up.
# Presumably the key is estimated as the sum of the listed components; the
# consumer is not visible here, so confirm before relying on that.
# The keys "LiabilitiesCurrent" and "LiabilitiesNoncurrent" were misspelled
# ("Liabilites..."), so they could never match the spelling used in every other
# conversion table. "CapitalLeaseObligationsNoncurrent" was also listed twice
# under "LiabilitiesNoncurrent"; it is now listed once.
# NOTE(review): "CapitalLeaseObligationsCurrent" is listed as a component of
# "AssetsCurrent" although its name suggests a liability - verify.
approximate_additive_conversion = {
                    "LiabilitiesCurrent":["ShortTermBorrowings","AccountsPayableAndAccruedLiabilitiesCurrent","TaxesPayableCurrent", "DividendsPayableCurrent","OtherLiabilitiesCurrent"],
                    "AssetsCurrent":["CashCashEquivalentsAndShortTermInvestments","AccountsReceivableNetCurrent", "CapitalLeaseObligationsCurrent","InventoryNet","PrepaidExpenseAndOtherAssetsCurrent"],
                    "LiabilitiesNoncurrent":["LongTermDebtNoncurrent","CapitalLeaseObligationsNoncurrent","DeferredTaxLiabilities","PensionAndOtherPostretirementDefinedBenefitPlansLiabilitiesNoncurrent","DeferredRevenue","OtherLiabilitiesNoncurrent"],
                    "AssetsNoncurrent": ["PropertyPlantAndEquipmentNet","IntangibleAssetsNetExcludingGoodwill","AccountsReceivableNetNoncurrent","OtherAssetsNoncurrent"], #FINISH WITH GPT-4

}
# Maps a measure name to a two-element list of other measures.
# Presumably the key equals the first listed measure minus the second; the
# consumer is not visible here, so confirm before relying on that.
# NOTE(review): this table spells "CostOfRevenue" with a capital "O", while
# measure_conversion and approximate_measure_conversion use the key "CostofRevenue".
subtract_conversion = {"Liabilities":["Assets","StockholdersEquity"],
                        "LiabilitiesNoncurrent": ["Liabilities","LiabilitiesCurrent"],
                        "AssetsNoncurrent": ["Assets", "AssetsCurrent"],
                        "CostOfRevenue":["Revenues","GrossProfit"],
                        "GrossProfit": ["Revenues","CostOfGoodsAndServicesSold"],
                        "OperatingIncomeLoss":["GrossProfit", "OperatingExpenses"],
                        "Depreciation":["DepreciationDepletionAndAmortization","AmortizationOfIntangibleAssets"],
                        "CostsAndExpenses":["Revenues","NetIncomeLoss"],
                        "CostOfGoodsAndServicesSold": ["CostsAndExpenses","OperatingExpenses"],
                        "FreeCashFlow": ["NetCashProvidedByUsedInOperatingActivities", "PaymentsToAcquirePropertyPlantAndEquipment"]
}
# Measure names flagged as optional. Every name here also appears as a component
# in additive_conversion or approximate_additive_conversion.
# Presumably a missing optional component does not block a sum - TODO confirm
# against the consumer, which is not visible here.
optional = ["DividendsPayableCurrent",
            "CapitalLeaseObligationsCurrent",
            "PensionAndOtherPostretirementDefinedBenefitPlansLiabilitiesNoncurrent",
            "CapitalLeaseObligationsNoncurrent",
            "DeferredRevenue",
            "AccountsReceivableNetNoncurrent",
            "PaymentsToAcquireOtherPropertyPlantAndEquipment"
            ]

#used in unitrun
# unitrun() walks one of these lists front to back and returns the datapoints of the
# first unit that is present, so the ORDER of each list is the lookup priority.
# valid_units is used when unitrun() is called with all=False (the default).
valid_units = ['USD','shares','USD/shares', 'Year', 'Entity', 'Segment', 'USD/Contract', 'Job',  'pure', 'USD/Investment', 'Position']
# all_units is used when unitrun() is called with all=True (as flatten() does).
all_units = ['Patent', 'USD', 'Restaurant', 'CNY', 'count', 'former_employee', 'State', 'state', 'membership', 'USD/rights', 'companies', 'Contracts', 'JPY/USD', 'EUR/shares', 'Cases', 'CHF/EUR', 'reportable_unit', 'businesses', 'stores', 'USD/warrant', 'employees', 'reportable_segments', 'derivative', 'Property', 'Employees', 'interest_rate_swap', 'USD/EUR', 'positions', 'Store', 'country', 'USD/Investment', 'CNY/shares', 'USD/shares_unit', 'Reporting_Unit', 'MXN/USD', 'item', 'day', 'uSDollarPerTonne', 'Rate', 'BRL', 'reportablesegments', 'LegalMatter', 'business_segment', 'Interest_Rate_Swap', 'JPY/EUR', 'plan', 'INR/shares', 'JPY', 'JPY/shares', 'numberofprojects', 'EUR', 'unit', 'Years', 'Job', 'years', 'USD/Decimal', 'instrument', 'GBP/EUR', 'reportable_segment', 'percentage', 'Contract', 'Plaintiff', 'TWD/shares', 'TWD', 'Account', 't', 'businessSegment', 'CHF', 'year', 'Positions', 'Projects', 'acquisitions', 'TWD/EUR', 'shares', 'GBP/shares', 'classAction', 'Interest_Rate_Swaps', 'reporting_unit', 'Investment', 'segement', 'Wells', 'segments', 'warehouse', 'AUD/EUR', 'Ground', 'project', 'Segments', 'reportableSegments', 'USD/Contract', 'GBP', 'Derivative', 'case_filed', 'BusinessSegment', 'DKK/shares', 'Acquisition', 'position', 'CNY/USD', 'location', 'defendant', 'operatingSegment', 'Year', 'Operating_segments', 'Y', 'company', 'Business', 'Tonne', 'plaintiff', 'businesscombinations', 'Derivatives', 'Reportable_Segments', 'HKD', 'MYR', 'units', 'operating_segments', 'employee', 'CHF/shares', 'EUR/USD', 'AED', 'patent', 'USD/Right', 'numberofyears', 'Segment', 'customer', 'subsidiaries', 'MXN/shares', 'legalmatter', 'NumberofBusinesses', 'acre', 'DKK', 'Option', 'sqft', 'Entity', 'Business_Segments', 'Employee', 'Person', 'claims', 'states', 'MXN', 'CAD/EUR', 'CAD', 'Location', 'account', 'reportableSegment', 'securities', 'Project', 'VEF/USD', 'business_unit', 'Country', 'Acquistions', 'Operating_Segment', 'Position', 'current_employees', 'Land', 
'segment', 'CAD/shares', 'business', 'AUD', 'entity', 'acquisition', 'legal_action', 'countries', 'claim', 'D', 'Day', 'lawsuit', 'USD/PartnershipUnit', 'security', 'Percent', 'ZAR', 'INR/EUR', 'individuals', 'Stock_options', 'USD/shares', 'INR', 'pure', 'Businesses']

Datapoint reshape

In [None]:
def timediff(a,b):
    """Return the absolute value of the ``.days`` component of ``a - b``."""
    delta = a - b
    day_part = delta.days
    return abs(day_part)

def reshape(measure,datapoint_list, annual = False):
    """Index a list of datapoints by their "end" date.

    Args:
        measure: Name of the measure; not used inside this function.
        datapoint_list: Non-empty list of dicts. Each dict is read for "end",
            "val" and "filed"; period data additionally for "start", and for
            "form" when ``annual`` is True. Designed to be used after the dates
            were converted to datetime objects, and the quarterly branch assumes
            the list is sorted by "end".
        annual: For period data, keep yearly datapoints instead of building a
            chain of quarters.

    Returns:
        ``(reshaped_data, dynamic)``. ``reshaped_data`` maps each "end" date to
        a list of dicts with "val" and "filed" (plus "start" and "special" for
        period data). ``dynamic`` is False when the first datapoint has no
        "start" key or its "start" is NaN/NaT, True otherwise.

    NOTE(review): an empty ``datapoint_list`` raises IndexError at the first
    ``datapoint_list[0]`` access.
    """
    #Reshapes the datapoint list so that its indexed by end and each item retains its attrs
    #Designed to be used after data is converted to datetime
    #dynamic stays True only when the first datapoint carries a usable "start"; otherwise the data is static
    dynamic = True
    if "start" not in datapoint_list[0]: 
        dynamic = False
    elif pd.isna(datapoint_list[0]["start"]):
        dynamic = False
    if dynamic ==False:
        # Static data: simply group the datapoints by their end date
        reshaped_data = {}
        for item in datapoint_list:
            date = item["end"]
            if date not in reshaped_data:
                reshaped_data[date] = []
            reshaped_data[date].append({
                "val": item["val"],
                # "accn": item["accn"],
                # "fy": item["fy"],
                # "fp": item["fp"],
                # "form": item["form"],
                "filed": item["filed"],
                # "frame": item.get("frame") 
            })
        return reshaped_data, dynamic
    else:
        if annual:
        #We need the yearly values
            # Keep datapoints from the listed form types whose period is longer than 340 days
            connected = []
            for datapoint in datapoint_list:
                if datapoint["form"] in ["8-K","10-K/A", "10-K", "20-F", "6-K"] and timediff(datapoint["start"], datapoint["end"])>340:
                    connected.append(datapoint)
        else:      
            #We need to get quartertly data for all of the quarters 
            #This should create data that is connected, whenever there is a link missing, we construct it
            #The data is sorted by the end date 
            #When two points are used to infer data the later filing date is asigned to the new point 
            # Annotate every datapoint with its period length in days (mutates the input dicts)
            for datapoint in datapoint_list:
                datapoint["dur"] = timediff(datapoint["end"],datapoint["start"])
            connected = []
            # wanted_end is the end date of the next quarter we are looking for
            wanted_end = datapoint_list[0]["end"]
            total_end = datapoint_list[-1]["end"]
            while(wanted_end < total_end):
                # Pass 1: take reported datapoints shorter than 100 days that end within 10 days of wanted_end
                for datapoint in datapoint_list:
                    # NOTE(review): `missing` is reset on every iteration, so after the loop it
                    # only reflects whether the LAST datapoint matched, not whether any did.
                    missing = True
                    if timediff(wanted_end,datapoint["end"]) < 10 and datapoint["dur"] < 100:
                        connected.append(datapoint)
                        wanted_end = datapoint["end"] + pd.Timedelta(days=91)
                        missing = False
                #missing 
                #Find the interval of possible points to use to infer
                useful_ends = (wanted_end -pd.Timedelta(days=100),wanted_end + pd.Timedelta(days=370))
                pieces =[]
                for datapoint in datapoint_list:
                    if useful_ends[0]<datapoint["end"] <useful_ends[1]:
                        pieces.append(datapoint)
                synthesised = False
                candidates = []
                # Pass 2: try to synthesise the quarter as the difference of two overlapping periods.
                # enumerate(start=1) makes pieces[i:] the pieces after piece1, so each pair is visited once.
                for i,piece1 in enumerate(pieces,start=1):
                    for piece2 in pieces[i:]:
                        if abs(piece1["dur"] - piece2["dur"]) <100: #If the periods have a difference representing a quarter
                            if piece1["dur"] > piece2["dur"]: #Piece one is the longer duration 
                                if timediff(piece1["end"],piece2["end"]) <6: #If they match by their ends 
                                    candidates.append({"end": piece2["start"], "start":piece1["start"], "val": piece1["val"]-piece2["val"], "filed": max([piece1["filed"], piece2["filed"]])})

                                elif (timediff(piece1["start"], piece2["start"])) < 6: #If they match by their starts
                                    candidates.append({"end": piece1["end"], "start": piece2["end"], "val": piece1["val"]-piece2["val"], "filed": max([piece1["filed"], piece2["filed"]])})

                            elif piece1["dur"] <piece2["dur"]:
                                if timediff(piece1["end"],piece2["end"]) <6: #If they match by their ends 
                                    candidates.append({"end": piece1["start"], "start":piece2["start"], "val": piece2["val"]-piece1["val"], "filed": max([piece1["filed"], piece2["filed"]])})

                                elif (timediff(piece1["start"], piece2["start"])) < 6: #If they match by their starts
                                    candidates.append({"end": piece2["end"], "start": piece1["end"], "val": piece2["val"]-piece1["val"], "filed": max([piece1["filed"], piece2["filed"]])})
                            # This selection runs after every qualifying pair, on all candidates collected so far
                            if candidates != []:
                                # Pick the candidate with the earliest filing date
                                filed = candidates[0]["filed"]
                                index=0
                                # NOTE(review): this loop reuses the name `i` of the outer enumerate;
                                # the outer loop reassigns it on its next iteration.
                                for i, candidate in enumerate(candidates[1:],start=1):
                                    if candidate["filed"] < filed:
                                        filed = candidate["filed"]
                                        index = i
                                diff = candidates[index]
                                if timediff(diff["end"], wanted_end) <10: #If the ends match we have the point
                                    diff["special"] = "synth"
                                    connected.append(diff)
                                    synthesised = True
                                    wanted_end = diff["end"] + pd.Timedelta(days=91)
                if synthesised == False and missing == True:
                    # Nothing reported and nothing synthesised: insert a NaN placeholder and move on one quarter.
                    # NOTE(review): the placeholder's "start" is 91 days AFTER its "end" - confirm this is intended.
                    connected.append({"end": wanted_end, "start": wanted_end + pd.Timedelta(days=91), "val": np.nan, "filed": pd.Timestamp(year=1993, month=1, day=1)})
                    wanted_end = wanted_end + pd.Timedelta(days=91)
        #After getting the connected data, treat it the same as static so that you can use the same method and have consistency
        reshaped_data = {}
        for item in connected:
            date = item["end"]
            if date not in reshaped_data:
                reshaped_data[date] = []
            reshaped_data[date].append({
                "val": item["val"],
                "start": item["start"],
                # "accn": item["accn"],
                # "fy": item["fy"],
                # "fp": item["fp"],
                # "form": item["form"],
                "filed": item["filed"],
                # "special" is "synth" for synthesised points and None otherwise
                "special": item.get("special")
                # "frame": item.get("frame") 
            })
        return reshaped_data, dynamic



# apple = comp_load("AAPL")
# data= apple.converted_data["GrossProfit"]
# # point_list, unit = unitrun(data["units"], apple.ticker)
# reshaped, dynamic = reshape("GrossProfit", data, annual=True)
# # print(reshaped)
# prev_key = pd.Timestamp(year=1993, month=1, day=1)
# for key,value in list(reshaped.items())[1:]:
#         # difff = timediff(key,prev_key)
#         # if difff == 98:
#         #     print(value)
#         # print(difff)
#         # prev_key = key
#         print(value["GrossProfit"][0])
#         print(timediff(value["GrossProfit"][0]["filed"],key))



In [None]:
# Exploratory cell: compute one measure for AAPL and dump the resulting frame to CSV.
comp = comp_load("AAPL")
#"NetCashProvidedByUsedInOperatingActivities", "PaymentsToAcquirePropertyPlantAndEquipment"
# recursive_fact is defined outside this section; it is unpacked here as (frame, unit).
frame, unit =recursive_fact(comp,"NetCashProvidedByUsedInOperatingActivities", dynamic_tolerance=pd.DateOffset(years=1))
frame.to_csv("examples/doubleinit.csv")
# print(comp.start_year)
# print(comp.end_year)

Functions

In [272]:
# Folders from which Stock.__init__ reads the per-company "CIK<cik>.json" files
FACTS_PATH = r"C:\Edgar_data"
SUBMISSIONS_PATH = r"C:\Submissions_data"

# Relative paths below are built with os.path.join instead of string literals
# containing backslashes: sequences such as "\{" and "\c" are invalid escapes
# that newer Python versions warn about.

#Unavailable stuff
with open(os.path.join("other_pickle", "unavailable.json"), "r") as file:
    Unavailable_Measures = json.load(file)

#Lookup table for the undeprecated version of a measure
with open(os.path.join("other_pickle", "deprecated_to_current.json"), "r") as file:
    deprecate_conversion = json.load(file)

#Categories
with open(os.path.join("categories", "categories.json"), "r") as file:
    categories = json.load(file)

#Make sure the necessary folders exist
for category, num_range in categories.items():
    os.makedirs(os.path.join("companies_data", category), exist_ok=True)
    os.makedirs(os.path.join("companies_data_missing", category), exist_ok=True)

for path in ["checkout", "companies", "companies_data", "companies_data_missing", "units-checkout"]:
    os.makedirs(path, exist_ok=True)

with open(os.path.join("categories", "category_measures.json"), "r") as file:
    category_measures = json.load(file)

#The first entry date into the EDGAR database
START = datetime.strptime('1993-01-01', r"%Y-%m-%d")

#Easily load a company into a variable
def comp_load(ticker):
    """Unpickle and return the object stored in ``companies/<ticker>.pkl``.

    NOTE: ``pickle.load`` can execute arbitrary code; only load files that
    were produced by this project.
    """
    # os.path.join avoids the invalid "\{" escape of the former path literal
    with open(os.path.join("companies", f"{ticker}.pkl"), "rb") as file:
        company = pickle.load(file)
    return company

def example_save(data, name):
    """Write ``data`` as JSON (indent 1) to ``examples/<name>.json`` and print "Saved"."""
    target = f"examples/{name}.json"
    with open(target, "w") as handle:
        json.dump(data, handle, indent=1)
    print("Saved")
    
#Manually figure out which measure is used with some company
def company_wordsearch(ticker, word):
    """Find the measures of a pickled company whose name contains ``word``.

    The search is case-insensitive. The matches (measure name -> description)
    are written to ``checkout/<ticker>.json`` and printed with a blank line
    between rows.

    NOTE: ``pickle.load`` can execute arbitrary code; only load files that
    were produced by this project.
    """
    # os.path.join avoids the invalid "\{" escape of the former path literals
    with open(os.path.join("companies", f"{ticker}.pkl"), "rb") as file:
        company = pickle.load(file)

    needle = word.lower()
    matching_elements = {
        name: facts["description"]
        for name, facts in company.data.items()
        if needle in name.lower()
    }

    with open(os.path.join("checkout", f"{ticker}.json"), "w") as file:
        json.dump(matching_elements, file, indent=1)
    formatted_json = json.dumps(matching_elements, indent=4)
    # Double the newlines so the printed entries are easier to scan
    formatted_with_newlines = formatted_json.replace('\n', '\n\n')
    print(formatted_with_newlines)

@lru_cache(maxsize=None)
def closest_date(dates, target_date, ticker, fallback=False):
    """Binary-search ``dates`` for the latest entry not after ``target_date``.

    ``dates`` must be sorted ascending and, like every argument, hashable
    because the function is memoised with ``lru_cache``. ``ticker`` is not
    used in the search itself.

    Returns the matching date, or ``None`` when every date is later than the
    target. With ``fallback=True`` a pair ``(match, entry_before_match)`` is
    returned instead, using ``None`` for whichever part does not exist.
    """
    lo = 0
    hi = len(dates) - 1
    while lo <= hi:
        mid = lo + (hi - lo) // 2
        if dates[mid] < target_date:
            lo = mid + 1
        elif dates[mid] > target_date:
            hi = mid - 1
        else:
            # Exact hit
            pos = mid
            break
    else:
        # No exact hit: the entry just left of the insertion point is the match
        pos = lo - 1

    if pos < 0:
        # All dates are greater than the target
        return (None, None) if fallback else None
    if not fallback:
        return dates[pos]
    earlier = dates[pos - 1] if pos - 1 >= 0 else None
    return dates[pos], earlier

def unitrun(dict, ticker, all=False, debug=False):
    """Return ``(datapoints, unit)`` for the first known unit present in ``dict``.

    Units are tried in the order of ``all_units`` when ``all`` is true and of
    ``valid_units`` otherwise. If none is present a message is printed and
    ``(False, False)`` is returned; with ``debug`` the unit names found in
    ``dict`` are first written to ``units-checkout\\<ticker>.json``.
    """
    candidates = all_units if all else valid_units
    for unit in candidates:
        if unit in dict:
            return dict[unit], unit
    print(f"No unit available for {ticker}")
    if debug:
        found_units = [name for name in dict.keys()]
        found_units = list(set(found_units))
        with open(f"units-checkout\\{ticker}.json", "w") as file:
            json.dump(found_units, file)
    return False, False

#Removes the top layer of the dict and returns the flattened version
#We are then left with only the measures, no layer on top 
def _drop_consecutive_duplicates(datapoints):
    """Return ``datapoints`` without entries that repeat their immediate predecessor."""
    # "start" takes part in the comparison only when both the first and the last
    # datapoint carry it
    compare_start = "start" in datapoints[0] and "start" in datapoints[-1]
    kept = [datapoints[0]]  # the first element is always kept
    previous = datapoints[0]
    for current in datapoints[1:]:
        repeated = current["end"] == previous["end"] and current["val"] == previous["val"]
        if repeated and compare_start:
            repeated = current.get("start") == previous.get("start")
        if not repeated:
            kept.append(current)
        # Always compare against the directly preceding datapoint, kept or not
        previous = current
    return kept


def flatten(data, ticker):
    """Merge all groups under ``data["facts"]`` into one measure -> entry dict.

    For every measure with at least three datapoints in the unit chosen by
    ``unitrun(..., all=True)``, datapoints that repeat their immediate
    predecessor (same "end" and "val", and same "start" for period data) are
    removed. The measure entries are shared with ``data``, so the input is
    modified in place. The share of removed datapoints is printed.

    Returns:
        The flattened dict.
    """
    flat_data = {}
    for group in data["facts"].values():
        flat_data.update(group)

    filtered_count = 0
    duplicate_count = 0
    for measure, datapoints_units in flat_data.items():
        datapoints, unit = unitrun(datapoints_units["units"], ticker, all=True)
        # unitrun returns (False, False) when no known unit is present
        if datapoints is False or len(datapoints) < 3:
            continue
        filtered = _drop_consecutive_duplicates(datapoints)
        # The first element is not counted, matching the original statistics
        filtered_count += len(filtered) - 1
        duplicate_count += len(datapoints) - len(filtered)
        datapoints_units["units"][unit] = filtered

    compared = filtered_count + duplicate_count
    # Guard against ZeroDivisionError when no measure had enough datapoints
    share = (duplicate_count / compared) * 100 if compared else 0.0
    print(f"{ticker}:{share}%")
    return flat_data

def getcik(ticker):
    """Return the CIK for ``ticker`` as a 10-character, zero-padded string.

    ``cikdata`` is scanned for the first entry whose "ticker" equals ``ticker``.

    Raises:
        ValueError: If no entry matches (previously this surfaced as an
            UnboundLocalError because ``cik`` was never assigned).
    """
    for entry in cikdata.values():
        if entry["ticker"] == ticker:
            return str(entry["cik_str"]).zfill(10)
    raise ValueError(f"No CIK found for ticker {ticker!r}")

#Headers for EDGAR call
headers = {
    "User-Agent":"ficakc@seznam.cz",
    "Accept-Encoding":"gzip, deflate",
}

# cik_url =  "https://www.sec.gov/files/company_tickers.json"
# cikdata = requests.get(cik_url, headers=headers).json()

# Local copy of the ticker -> CIK table; getcik() reads "ticker" and "cik_str" from its values.
# The with-statement closes the file itself, so no explicit close() is needed.
with open(os.path.join("other_pickle", "cik.json"), "r") as file:
    cikdata = json.load(file)
    
def sync_companyfacts(ticker:str):
    """Blocking request for the SEC company-facts JSON of ``ticker``.

    Returns the raw ``httpx`` response; no retry or status check is done here.
    """
    padded_cik = getcik(ticker)
    return httpx.get(
        f"https://data.sec.gov/api/xbrl/companyfacts/CIK{padded_cik}.json",
        headers=headers,
    )
    
async def companyfacts(ticker:str, client, semaphore):
    """Fetch the SEC company-facts document for ``ticker`` through ``fetch``.

    Returns whatever ``fetch`` returns for the company-facts URL.
    """
    padded_cik = getcik(ticker)
    endpoint = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{padded_cik}.json"
    return await fetch(
        endpoint,
        headers,
        semaphore,
        client,
        TIMEOUT,
        RETRIES,
        START_RETRY_DELAY,
    )

async def companysubmissions(ticker:str, client, semaphore):
    """Fetch the SEC submissions document for ``ticker`` through ``fetch``.

    Returns whatever ``fetch`` returns for the submissions URL.
    """
    padded_cik = getcik(ticker)
    endpoint = f"https://data.sec.gov/submissions/CIK{padded_cik}.json"
    return await fetch(
        endpoint,
        headers,
        semaphore,
        client,
        TIMEOUT,
        RETRIES,
        START_RETRY_DELAY,
    )


class Stock:
    """Per-ticker container built from locally stored EDGAR JSON files.

    Attributes set by ``__init__`` (when both files are found):
        ticker, cik           -- upper-cased ticker and its padded CIK
        data                  -- result of ``flatten`` on the companyfacts JSON
        sic, sic_desc         -- SIC code (int) and its description
        foreign               -- True if a 20-F, 40-F or 6-K appears in the recent filings
        success               -- return value of ``time_init`` or ``"del"``
    Attributes set by ``time_init``: start_year, end_year, date_range (dicts
    keyed by ``"static"`` / ``"dynamic"``), start_dates, end_dates,
    measures_and_dates, initialized_measures and converted_data.
    """

    def __init__(self, ticker:str, standard_measures):
        """Load the stored facts and submissions for ``ticker`` and run ``time_init``.

        ``success`` is set to ``"del"`` when a required file is missing
        (FileNotFoundError anywhere inside the loading block).
        """
        self.initialized_measures = {"static":[], "dynamic":[]}
        self.ticker = ticker.upper()
        self.cik = getcik(self.ticker)
        try:
            with open(os.path.join(FACTS_PATH, f"CIK{self.cik}.json"), "r") as file:
                self.data = flatten(json.load(file), self.ticker)
            with open(os.path.join(SUBMISSIONS_PATH, f"CIK{self.cik}.json"), "r") as file:
                data = json.load(file)
            self.sic = int(data["sic"])
            self.sic_desc = data["sicDescription"]
            filed_forms = data["filings"]["recent"]["form"]
            self.foreign = any(form in filed_forms for form in ("20-F", "40-F", "6-K"))

            self.success = self.time_init(standard_measures)
        except FileNotFoundError:
            self.success = "del"

    def _set_window(self, motion, start_dates, end_dates, start_threshold, end_threshold):
        """Pick the start/end date for ``motion`` from sorted date lists using the thresholds."""
        start_index = math.ceil(start_threshold*len(start_dates)) -1
        end_index = math.ceil(end_threshold*len(end_dates)) -1
        self.start_year[motion] = pd.to_datetime(start_dates[start_index], format=r"%Y-%m-%d")
        self.end_year[motion] = pd.to_datetime(end_dates[end_index], format=r"%Y-%m-%d")
        self.date_range[motion] = pd.date_range(start=self.start_year[motion], end=self.end_year[motion])

    def time_init(self, standard_measures, static_start_threshold = 1, static_end_threshold = 1, dynamic_start_threshold = 1, dynamic_end_threshold = 1):
        """Compute the usable date window per motion and pre-convert the needed datapoints.

        Args:
            standard_measures: dict with ``"static"`` and ``"dynamic"`` lists of measure names.
            *_threshold: fraction (of the sorted start/end dates) used to pick the window edge.

        Returns:
            1 on success, ``"del"`` when a measure reports ``"del"`` or a motion
            has no dated measure at all (this also filters out companies with
            wrong currencies, per the original comment).
        """
        start_thresholds = {"static": static_start_threshold, "dynamic":dynamic_start_threshold}
        end_thresholds = {"static": static_end_threshold, "dynamic": dynamic_end_threshold}
        motions = ("static", "dynamic")
        self.start_year = {}
        self.end_year = {}
        self.date_range = {}

        # Fast path: every requested measure was already initialised, so only
        # the thresholds may differ. Re-derive the windows from the sorted
        # dates remembered by the previous full run. (The earlier code read
        # start_dates/end_dates here before they were ever assigned and never
        # reset its `missing` flag between motions.)
        cached_dates = getattr(self, "_motion_dates", {})
        already_initialized = all(
            measure in self.initialized_measures[motion]
            for motion in motions
            for measure in standard_measures[motion]
        )
        if already_initialized and all(motion in cached_dates for motion in motions):
            for motion in motions:
                start_dates, end_dates = cached_dates[motion]
                self._set_window(motion, start_dates, end_dates, start_thresholds[motion], end_thresholds[motion])
            return 1

        dates = {"static":[], "dynamic":[]}
        needed_measures = []
        motion_dates = {}
        for motion in motions:
            # Get all the constituent measures needed through recursive_fact
            for measure in standard_measures[motion]:
                if measure in deprecate_conversion:
                    measure = deprecate_conversion[measure]
                # NOTE(review): this relies on recursive_fact(date_gather=True) returning a
                # frame carrying attrs["date"] and attrs["measures"], while fact() below
                # returns None / "del" / (start, end) in date_gather mode -- confirm that
                # recursive_fact adapts that shape.
                value, unit = recursive_fact(self, measure, approx = True, printout =False, date_gather= True)
                # Check the delete marker first: it can arrive together with an empty frame.
                if unit == "del":
                    print("returned delete")
                    return "del"
                if value.empty:
                    continue
                dates[motion].append(value.attrs["date"])
                # attrs["measures"] is a list
                needed_measures += value.attrs["measures"]
            if dates[motion] == []:
                return "del"

            start_dates, end_dates = (sorted(column) for column in zip(*dates[motion]))
            self.start_dates = start_dates
            self.end_dates = end_dates
            motion_dates[motion] = (start_dates, end_dates)
            self._set_window(motion, start_dates, end_dates, start_thresholds[motion], end_thresholds[motion])

        # remove duplicates
        needed_measures = list(set(needed_measures))
        # NOTE(review): this pairs the keys of standard_measures with the dates of the
        # last motion processed -- confirm this is the intended content.
        # Stored as a list so it can be iterated more than once.
        self.measures_and_dates = list(zip(standard_measures,start_dates,end_dates))
        self.initialized_measures = standard_measures
        self._motion_dates = motion_dates

        # Change the strings to datetime for the initialized measures:
        # flatten all batches into one DataFrame tagged with the measure name,
        # convert the date columns once, then split back per measure.
        all_data = []
        for measure in needed_measures:
            if measure in deprecate_conversion:
                measure = deprecate_conversion[measure]
            datapoints, unit = unitrun(self.data[measure]["units"], self.ticker)
            for datapoint in datapoints:
                datapoint["batch_name"] = measure  # Add identifier
                all_data.append(datapoint)

        df = pd.DataFrame(all_data)
        df['filed'] = pd.to_datetime(df['filed'], format='%Y-%m-%d')
        df['end'] = pd.to_datetime(df['end'], format='%Y-%m-%d')
        # 'start' is coerced: unparseable/missing values become NaT instead of raising
        df['start'] = pd.to_datetime(df['start'], format='%Y-%m-%d', errors='coerce')
        separated_batches = {name: df[df['batch_name'] == name].drop(columns=['batch_name']).to_dict('records') for name in needed_measures}
        self.converted_data = separated_batches
        return 1

    def date_reset(self):
        """Forget which measures were initialised so the next ``time_init`` recomputes everything."""
        self.initialized_measures = {"static":[], "dynamic":[]}
        self._motion_dates = {}

    async def async_init(self,client, semaphore, standard_measures):
        """Fetch the submissions document and store the SIC code and description.

        Returns ``"del"`` when the fetch result is a string, 1 on success and
        0 when no usable response was received (None or an int).
        ``standard_measures`` is accepted but not used.
        """
        data = await companysubmissions(self.ticker, client, semaphore)
        # If the response wasn't received, skip the rest of the code
        if type(data) == str:
            return "del"
        if data is None or type(data) == int:
            return 0
        data = data.json()
        raw_sic = data["sic"]
        # Store an int like __init__ does; keep the raw value if it is not numeric.
        self.sic = int(raw_sic) if str(raw_sic).isdigit() else raw_sic
        # sic_desc is the name used by __init__; sicDescription is kept for older readers.
        self.sic_desc = data["sicDescription"]
        self.sicDescription = data["sicDescription"]
        return 1

    async def price_init(self,semaphore):
        """Fetch prices through ``yahoo_fetch`` and store ``fullprice`` and ``price``.

        Returns 0 when ``yahoo_fetch`` hands back an int, otherwise 1 after
        storing the forward/backward-filled ``close`` / ``adjclose`` columns.
        """
        # NOTE(review): start_year / end_year are dicts keyed by motion after time_init --
        # confirm yahoo_fetch expects that shape.
        self.fullprice = await yahoo_fetch(self.ticker,self.start_year, self.end_year, semaphore, RETRIES, START_RETRY_DELAY)
        if type(self.fullprice) == int:
            return 0
        Price = self.fullprice[["close", "adjclose"]].copy()
        # Price = Price.reindex(self.date_range)
        self.price = Price.ffill().bfill()
        return 1

    def fact(self, measure, row_delta = pd.Timedelta(days=1), column_delta = pd.Timedelta(days=365),static_tolerance=pd.Timedelta(days =0), dynamic_row_delta=pd.Timedelta(days=1), dynamic_tolerance=pd.Timedelta(days=91), lookbehind =5, annual=False, date_gather=False):
        """
        Build the lookbehind frame for one measure, or report its date span.

        If date_gather: returns None when the measure is not in ``self.data``,
        "del" when ``unitrun`` yields False, otherwise the tuple
        (first "end", last "end") of the measure's datapoints.

        Otherwise returns ``(frame, unit)``: rows are indexed row_delta apart,
        with lookbehind columns that are column_delta apart. If the data is
        dynamic the row delta is dynamic_row_delta and the column delta is fixed
        at 95 days. The tolerance (dynamic_tolerance or static_tolerance) is how
        far past the row date a filing date may lie and still be used.
        An empty frame with unit None is returned when the measure has no
        converted data or when an error is caught.
        """
        #Propagate the 0
        if self.data == 0:
            return 0
        # What callers get when an error is caught: recursive_date_gather expects None,
        # every other caller unpacks a (frame, unit) pair.
        try:
            #If the measure is deprecated switch to the undeprecated version
            if measure in deprecate_conversion:
                measure = deprecate_conversion[measure]
            if date_gather:
                try:
                    data= self.data[measure]
                except KeyError:
                    return None
                point_list, unit = unitrun(data["units"], self.ticker)
                if point_list == False:
                    return "del"
                return (point_list[0]["end"], point_list[-1]["end"])

            if measure in self.converted_data:
                data= self.converted_data[measure]
            else:
                return pd.DataFrame(), None
            reshaped, dynamic = reshape(measure, data, annual)
            if dynamic:
                motion = "dynamic"
            else:
                motion = "static"
            if dynamic == True:
                row_delta = dynamic_row_delta
                column_delta = pd.Timedelta(days=95)
            tolerance = dynamic_tolerance * int(dynamic) + static_tolerance * (int(not dynamic))
            # Hoisted out of the per-date lookup below: the key never changes.
            date_key = tuple(reshaped.keys())

            # Row dates run backwards from the end of the window to its start.
            base_dates = pd.date_range(start=self.end_year[motion], end=self.start_year[motion], freq=-row_delta)
            i_values = np.arange(lookbehind)  # An array [0, 1, ..., lookbehind-1]
            adjustments = i_values * column_delta
            # Broadcasted subtraction: for each date in base_dates, subtract each value in adjustments
            row_dates = np.subtract.outer(base_dates, adjustments)
            dimensions = row_dates.shape
            date_series = pd.Series(row_dates.flatten())
            # Run closest_date on each date; each result is an (index, fallback_index) pair
            results_series = date_series.apply(lambda x: closest_date(date_key, x, self.ticker, fallback=True))
            row_indexes = results_series.to_numpy().reshape(dimensions)
            row_indexes = pd.DataFrame(row_indexes, index =base_dates)

            frame_dict = {row_index:[] for row_index in base_dates}
            for row_index, row in row_indexes.iterrows():
                # Only filings dated on or before this barrier may be used for the row
                barrier = row_index + tolerance
                for index_tuple in row:
                    index, fallback_index = index_tuple
                    nearest_filed = pd.Timestamp.min
                    uptodate = {"val": np.nan}
                    if index is not None:
                        for value in reshaped[index]:
                            # keep the most recently filed value that is not past the barrier
                            if nearest_filed < value["filed"] <= barrier:
                                uptodate = value
                                nearest_filed = value["filed"]
                    if np.isnan(uptodate["val"]):
                        if fallback_index is not None:
                            for value in reshaped[fallback_index]:
                                if nearest_filed < value["filed"] <= barrier:
                                    uptodate = value
                                    nearest_filed = value["filed"]
                    frame_dict[row_index].append(uptodate["val"])

            frame = pd.DataFrame.from_dict(frame_dict, orient='index')
            # base_dates were generated newest-first; flip to chronological order
            frame = frame.iloc[::-1]
            frame.columns = [f"{measure}-{i}" for i in range(0,lookbehind)]
            return frame, "Some_unit"
        except KeyError as e:
            print(f"in fact keyerror: {e}")
            print(f"Fact {measure} not available for {self.ticker}.")
        except Exception as e:
            print(f"in fact: {e}")
        # Reached only after a caught error: return the "unavailable" value for the
        # mode instead of falling through with a bare None that callers cannot unpack.
        if date_gather:
            return None
        return pd.DataFrame(), None

def _rf_date_marker(date, measures):
    """Build the one-row marker frame used in date_gather mode (attrs carry the payload)."""
    marker = pd.DataFrame({"A": [1]})
    marker.attrs["date"] = date
    marker.attrs["measures"] = measures
    return marker


def _rf_call_fact(comp, measure, fact_args, date_gather):
    """Call comp.fact and normalise whatever it returns to a (frame, unit) pair."""
    result = comp.fact(measure, *fact_args, date_gather)
    if isinstance(result, str) and result == "del":
        return pd.DataFrame(), "del"
    if isinstance(result, tuple) and len(result) == 2:
        if isinstance(result[0], pd.DataFrame):
            return result
        if date_gather:
            # fact() reported a (start, end) span: wrap it in a marker frame
            name = deprecate_conversion[measure] if measure in deprecate_conversion else measure
            return _rf_date_marker(tuple(result), [name]), None
    # None, 0 or anything else means "not available"
    return pd.DataFrame(), None


def _rf_gather_parts(part_names, recurse):
    """Resolve every part; return (frames, unit, del_result), frames is None on abort."""
    frames = []
    unit = None
    for part in part_names:
        part_value, part_unit = recurse(part)
        if part_unit == "del":
            return None, None, (part_value, part_unit)
        if part_value.empty:
            if part not in optional:
                # a mandatory part is missing: the derivation cannot be used
                return None, None, None
            continue
        frames.append(part_value)
        unit = part_unit
    return frames, unit, None


def _rf_combine(measure, frames, lookbehind, date_gather, subtract=False):
    """Combine part frames column by column: sum of all, or first minus the rest."""
    if date_gather:
        # No arithmetic in date_gather mode: the span is the overlap of all parts
        # and the needed measures are the union of what the parts need.
        starts = [frame.attrs["date"][0] for frame in frames]
        ends = [frame.attrs["date"][1] for frame in frames]
        measures = [name for frame in frames for name in frame.attrs["measures"]]
        return _rf_date_marker((max(starts), min(ends)), measures)
    base = frames[0]
    others = []
    for other in frames[1:]:
        # grow the base index to the union of all part indexes
        base, other = base.align(other, join="outer", axis=0)
        others.append(other)
    result_frame = pd.DataFrame(index=base.index)
    for i, col in enumerate(base.columns):
        result_frame[f'{measure} {i - lookbehind +1}'] = base[col]
    # The base already holds the first part, so only the remaining parts are applied.
    for other in others:
        for i, col in enumerate(other.columns):
            name = f'{measure} {i - lookbehind +1}'
            if subtract:
                result_frame[name] = result_frame[name].sub(other[col])
            else:
                result_frame[name] = result_frame[name].add(other[col])
    return result_frame


def recursive_fact(comp, measure, depth=0, approx = True, row_delta = pd.Timedelta(days=1), column_delta = pd.Timedelta(days=365),static_tolerance=pd.Timedelta(days =0), dynamic_row_delta=pd.Timedelta(days=1), dynamic_tolerance=pd.Timedelta(days=91), lookbehind =5 , annual=False, printout =False, date_gather= False):
    """Resolve ``measure`` for ``comp`` directly or through the conversion tables.

    Tried in order: the measure itself, ``measure_conversion`` (same value under
    another name), ``additive_conversion`` (sum of parts), ``subtract_conversion``
    (first minus second) and, if ``approx``, ``approximate_measure_conversion``
    and ``approximate_additive_conversion``.

    Returns ``(frame, unit)``. An empty frame means "not available"; a unit of
    ``"del"`` is propagated straight back to the caller. Recursion stops beyond
    depth 4, which also terminates loops in the tables.

    With ``date_gather=True`` no arithmetic is done: the returned frame is a
    marker whose ``attrs["date"]`` is the (start, end) span shared by all parts
    and whose ``attrs["measures"]`` lists the reported measures needed.
    ``printout`` enables debug traces.
    """
    if depth>4:
        return pd.DataFrame(), None
    if printout:
        print(f"Entering recursive with {measure}")
    fact_args = (row_delta, column_delta, static_tolerance, dynamic_row_delta, dynamic_tolerance, lookbehind, annual)

    def recurse(name):
        # Single place that forwards the options, so the argument order cannot drift
        # between branches (the approximate-additive branch used to swap
        # static_tolerance and dynamic_row_delta).
        return recursive_fact(comp, name, depth+1, approx, row_delta, column_delta, static_tolerance, dynamic_row_delta, dynamic_tolerance, lookbehind, annual, printout, date_gather)

    value, unit = _rf_call_fact(comp, measure, fact_args, date_gather)
    if unit == "del":
        return value,unit
    if not value.empty:
        return value,unit
    #Just a different name
    if measure in measure_conversion:
        for replacement in measure_conversion[measure]:
            value, unit = recurse(replacement)
            if not value.empty:
                return value, unit
    #Not stated, but can be derived by adding
    if measure in additive_conversion and additive_conversion[measure] != []:
        frames, unit, deleted = _rf_gather_parts(additive_conversion[measure], recurse)
        if deleted is not None:
            return deleted
        if frames:
            return _rf_combine(measure, frames, lookbehind, date_gather), unit
    #Not stated, but can be inferred by subtracting
    if measure in subtract_conversion:
        add, unit = recurse(subtract_conversion[measure][0])
        #Things in subtract should have the same unit so one check is ok.
        if unit == "del":
            return add,unit
        sub, unit = recurse(subtract_conversion[measure][1])
        if not add.empty and not sub.empty:
            return _rf_combine(measure, [add, sub], lookbehind, date_gather, subtract=True), unit
    if approx:
        if measure in approximate_measure_conversion:
            for replacement in approximate_measure_conversion[measure]:
                value, unit = recurse(replacement)
                if not value.empty:
                    print(f"approximate_conversion {measure} to {replacement}")
                    return value, unit
        if measure in approximate_additive_conversion and approximate_additive_conversion[measure] != []:
            frames, unit, deleted = _rf_gather_parts(approximate_additive_conversion[measure], recurse)
            if deleted is not None:
                return deleted
            if frames:
                result_frame = _rf_combine(measure, frames, lookbehind, date_gather)
                print(f"appoximate_addition {measure}")
                return result_frame, unit
    if printout:
        print(f"{measure} not available for {comp.ticker}")
    return pd.DataFrame(), None

def _ps_combine(measure, frames, lookbehind, subtract=False):
    """Combine part frames column by column: sum of all, or first minus the rest."""
    base = frames[0]
    others = []
    for other in frames[1:]:
        # grow the base index to the union of all part indexes
        base, other = base.align(other, join="outer", axis=0)
        others.append(other)
    result_frame = pd.DataFrame(index=base.index)
    for i, col in enumerate(base.columns):
        result_frame[f'{measure} {i - lookbehind +1}'] = base[col]
    # The base already holds the first part, so only the remaining parts are applied
    # (adding every part again would count the first one twice).
    for other in others:
        for i, col in enumerate(other.columns):
            name = f'{measure} {i - lookbehind +1}'
            if subtract:
                result_frame[name] = result_frame[name].sub(other[col])
            else:
                result_frame[name] = result_frame[name].add(other[col])
    return result_frame


def path_selector(comp, measure, path,row_delta = pd.Timedelta(days=1), column_delta = pd.Timedelta(days=365),static_tolerance=pd.Timedelta(days =0), dynamic_row_delta=pd.Timedelta(days=1), dynamic_tolerance=pd.Timedelta(days=91), lookbehind =5 , annual=False):
    """
    Take the desired path to the data and return ``(frame, unit)`` for it.

    ``path`` is a tuple of steps such as the first element of the pair
    returned by recursive_date_gather at depth 0:
      ()              -- read ``measure`` directly, falling back to measure_conversion
      ("add",)        -- sum the parts listed in additive_conversion[measure]
      ("sub",)        -- subtract_conversion[measure][0] minus [1]
      ("approx_add",) -- sum the parts in approximate_additive_conversion[measure]
      (name, ...)     -- switch to measure ``name`` and continue with the rest
    An empty or None path is treated as a direct read. An empty frame with
    unit None is returned when nothing could be produced.
    """
    fact_args = (row_delta, column_delta, static_tolerance, dynamic_row_delta, dynamic_tolerance, lookbehind, annual)
    if not path:
        value, unit = comp.fact(measure, *fact_args)
        #We need to account for measure conversion after add and sub
        if value.empty and measure in measure_conversion:
            for replacement in measure_conversion[measure]:
                value, unit = comp.fact(replacement, *fact_args)
                if not value.empty:
                    break
        return value, unit

    step = path[0]
    if step == "add" or step == "approx_add":
        table = additive_conversion if step == "add" else approximate_additive_conversion
        frames = []
        unit = None
        for part in table[measure]:
            part_frame, part_unit = path_selector(comp, part, (), *fact_args)
            if part_frame.empty:
                # a part without data contributes nothing; skipping it also keeps an
                # empty first part from leaving the result without columns
                continue
            frames.append(part_frame)
            unit = part_unit
        if not frames:
            return pd.DataFrame(), None
        return _ps_combine(measure, frames, lookbehind), unit

    if step == "sub":
        add, unit = path_selector(comp, subtract_conversion[measure][0], (), *fact_args)
        sub, unit = path_selector(comp, subtract_conversion[measure][1], (), *fact_args)
        return _ps_combine(measure, [add, sub], lookbehind, subtract=True), unit

    # Any other step names a replacement measure: continue down the path from there.
    # (The earlier code passed an undefined extra argument here and dropped the result.)
    return path_selector(comp, step, path[1:], *fact_args)

    
        

def _rdg_parts_window(part_names, gather):
    """Return the (latest start, earliest end) shared by all parts, or None if unusable."""
    starts, ends = [], []
    for part in part_names:
        part_dates = gather(part)
        if part_dates is None:
            if part not in optional:
                # a mandatory part is missing: the derivation cannot be used
                return None
            continue
        start, end = part_dates
        starts.append(start)
        ends.append(end)
    if not starts:
        # every part was optional and missing
        return None
    return (max(starts), min(ends))


def recursive_date_gather(comp, measure, depth=0, current_path= [], path_date = None, blind = False, approx = True, printout =False):
    """Explore every way of obtaining ``measure`` and record the date span of each.

    ``path_date`` maps a path tuple (see path_selector) to the (start, end)
    span available along it; it is shared by the whole recursion and also holds
    the ``"del"`` flag. ``blind`` suppresses recording, which is used while
    resolving the parts of an add/sub derivation so that no partial paths are
    stored. ``current_path`` is never mutated (new lists are built per call).

    Returns:
        depth 0: ``"del"`` if any visited measure reported ``"del"``, otherwise
            ``(best_path, best_dates)`` for the recorded path with the longest
            span (both None when nothing was recorded).
        depth > 0: the found span with the earliest start, or None.
    """
    if path_date is None and depth==0:
        path_date = {"del": False}
    if depth>4:
        return None
    if printout:
        print(f"Entering recursive with {measure}")

    def gather(name, tag, child_blind):
        return recursive_date_gather(comp, name, depth+1, current_path + [tag], path_date, child_blind, approx, printout)

    date_list = []
    dates = comp.fact(measure, date_gather = True)
    if isinstance(dates, str) and dates == "del":
        # Flag the deletion and stop here: "del" is not a date pair, so it must
        # not be recorded as a span or unpacked by the caller.
        path_date["del"] = True
        return "del" if depth == 0 else None
    if dates is not None:
        if not blind:
            # The key is the path so far; an empty tuple means "read the measure directly"
            path_date[tuple(current_path)] = dates
        date_list.append(dates)

    if measure in measure_conversion:
        for replacement in measure_conversion[measure]:
            dates = gather(replacement, replacement, blind)
            if dates is not None:
                date_list.append(dates)

    if measure in additive_conversion and additive_conversion[measure] != []:
        # blind=True for the parts to avoid adding false paths
        window = _rdg_parts_window(additive_conversion[measure], lambda part: gather(part, "add", True))
        if window is not None:
            if not blind:
                path_date[tuple(current_path + ["add"])] = window
            date_list.append(window)

    if measure in subtract_conversion:
        dates_add = gather(subtract_conversion[measure][0], "sub", True)
        dates_sub = gather(subtract_conversion[measure][1], "sub", True)
        if dates_add is not None and dates_sub is not None:
            dates = (max(dates_add[0], dates_sub[0]), min(dates_add[1], dates_sub[1]))
            if not blind:
                path_date[tuple(current_path + ["sub"])] = dates
            date_list.append(dates)

    if approx:
        if measure in approximate_measure_conversion:
            for replacement in approximate_measure_conversion[measure]:
                # NOTE(review): unlike measure_conversion above, the span found here is
                # only recorded in path_date by the callee, not added to date_list --
                # confirm whether that is intended.
                dates = gather(replacement, replacement, blind)

        if measure in approximate_additive_conversion and approximate_additive_conversion[measure] != []:
            window = _rdg_parts_window(approximate_additive_conversion[measure], lambda part: gather(part, "approx_add", True))
            if window is not None:
                if not blind:
                    path_date[tuple(current_path + ["approx_add"])] = window
                date_list.append(window)
    # Debug trace emitted at the end of every call when printout is set
    if printout:
        print(f"{measure} not available for {comp.ticker}")
    if depth ==0:
        if path_date.get("del") == True:
            return "del"
        # Pick the recorded path with the longest span
        best_interval = pd.Timedelta(days=0)
        best_path = None
        best_dates = None
        for path, dates in path_date.items():
            if path == "del":
                # the deletion flag lives in the same dict; skip it by key
                continue
            interval = pd.to_datetime(dates[1]) - pd.to_datetime(dates[0])
            if interval > best_interval:
                best_interval = interval
                best_dates = dates
                best_path = path
        #We can also use the path from here to get the needed measures.
        return (best_path,best_dates)
    if date_list == []:
        return None
    # Earliest start wins; min() keeps the first entry on ties
    return min(date_list, key=lambda span: span[0])

def get_category(sic):
    """Return the name of the first category whose ranges contain ``sic``.

    ``categories`` maps a category name to a list of ranges; the first and last
    element of each range are used as inclusive bounds. ``"Uncategorized"`` is
    returned when no range matches.
    """
    for label, spans in categories.items():
        if any(span[0] <= sic <= span[-1] for span in spans):
            return label
    return "Uncategorized"

#(comp, measure, depth=0, approx = True, row_delta = pd.Timedelta(days=1), column_delta = pd.Timedelta(days=365),static_tolerance=pd.Timedelta(days =0), dynamic_row_delta=pd.Timedelta(days=1), dynamic_tolerance=pd.Timedelta(days=91), lookbehind =5 , annual=False, printout =False, date_gather= False
def acquire_frame(comp, measures:dict, indicator_frame, static_start_threshold = 1, static_end_threshold = 1, dynamic_start_threshold = 1, dynamic_end_threshold = 1, approx= True, threshold = 1, row_delta = pd.Timedelta(days=1), column_delta = pd.Timedelta(days=365), static_tolerance=pd.Timedelta(days =0), dynamic_row_delta=pd.Timedelta(days=1), dynamic_tolerance=pd.Timedelta(days=91),  lookbehind =5, annual=False, printout =False):
    """Build one DataFrame per motion from the saved data of a stock.

    ``comp.time_init`` is run first with the given thresholds. For every
    measure in ``measures["static"]`` / ``measures["dynamic"]`` the frame from
    ``recursive_fact`` is collected; a measure without data gets a single NaN
    column over that motion's date range and is logged in the module-level
    ``Unavailable_Measures`` under the company's category. The frames of a
    motion are concatenated column-wise and ``comp.price`` is left-joined.

    Returns:
        dict ``{"static": frame, "dynamic": frame}``; each frame carries
        ``attrs["units"]`` (one entry per measure of that motion, "missing"
        for unavailable ones) and ``attrs["category"]``.

    ``indicator_frame`` and ``threshold`` are accepted but currently unused.
    """
    comp.time_init(measures, static_start_threshold, static_end_threshold , dynamic_start_threshold , dynamic_end_threshold)
    catg = get_category(comp.sic)
    df = {}
    for motion in ["static","dynamic"]:
        # Fresh lists per motion: each frame must only hold its own measures.
        frames_list = []
        unit_list = []
        for measure in measures[motion]:
            data, unit = recursive_fact(comp, measure, 0, approx, row_delta , column_delta , static_tolerance, dynamic_row_delta,dynamic_tolerance, lookbehind, annual, printout)
            if not data.empty:
                data.name = measure
                frames_list.append(data)
                unit_list.append(unit)
            else:
                # comp.date_range is keyed by motion; use this motion's index for the placeholder
                frames_list.append(pd.DataFrame(data={measure: np.nan}, index=comp.date_range[motion]))
                unit_list.append("missing")
                catandticker =  comp.ticker +":"+ comp.sic_desc
                Unavailable_Measures[catg].setdefault(catandticker, []).append(measure)

        if frames_list:
            df[motion] = pd.concat(frames_list, axis =1, join="outer")
        else:
            # no measures requested for this motion: nothing to concatenate
            df[motion] = pd.DataFrame(index=comp.date_range[motion])
        #If units are necessary
        # columns_multiindex = pd.MultiIndex.from_tuples([(col, unit) for col, unit in zip(df.columns, unit_list)],names=['Variable', 'Unit'])
        # df.columns = columns_multiindex
        #Economic indicators
        # indicator_frame = indicator_frame.reindex(comp.date_range)
        # indicator_frame = indicator_frame.ffill().bfill()
        # df = df.join(indicator_frame, how="left")
        df[motion] = df[motion].join(comp.price, how= "left")
        df[motion].attrs["units"] = unit_list
        df[motion].attrs["category"] = catg
    return df

#Initializes and appends the stock object
async def async_task(ticker, client, semaphore_edgar, semaphore_yahoo, measures):
    """Build the Stock for ``ticker``, fetch its prices and pickle it to disk.

    ``measures`` is handed to ``Stock`` to determine the dates for which the
    financial data is available. ``client`` and ``semaphore_edgar`` are
    currently unused (the EDGAR fetch is commented out).

    Returns:
        ``(ticker, "del")`` when the Stock reports ``"del"``, otherwise
        ``(ticker, [data availability, price availability])``.
    """
    print(f"Loading {ticker}")
    stock = Stock(ticker, measures)
    if stock.success == "del":
        return (ticker, "del")
    # successful_sic = await stock.async_init(client,semaphore_edgar,measures)
    # if successful_sic == "del":
    #     return (ticker, "del")
    if stock.success:
        print(f"Price pinging {ticker}$")
        succesful_price = await stock.price_init(semaphore_yahoo)
    else:
        succesful_price = 0
    # The backslash is escaped explicitly: '\{' is an invalid escape sequence.
    # The resulting path is unchanged.
    with open(f'companies\\{ticker}.pkl', 'wb') as file:
        pickle.dump(stock,file)
    success = stock.success
    del stock
    #Return (ticker, availability of data, availability of price)
    print(f"||Done {ticker}||")
    return (ticker, [success, succesful_price])

# Print the share of tickers for which the Edgar and Yahoo calls succeeded
def success_rate(availability_list):
    """Print the Edgar and Yahoo success rates for a batch of results.

    Args:
        availability_list: Sequence of ``(ticker, available)`` pairs, where
            ``available`` is either ``"del"`` or an indexable pair
            ``[edgar, yahoo]``.

    An entry marked ``"del"`` counts as a success for both sources. For an
    empty list ``"No list to analyze"`` is printed instead of the rates.
    """
    edgar_hits = 0
    yahoo_hits = 0
    for _ticker, status in availability_list:
        if status == "del":
            edgar_hits += 1
            yahoo_hits += 1
            continue
        edgar_hits += status[0]
        yahoo_hits += status[1]

    total = len(availability_list)
    if total == 0:
        print("No list to analyze")
        return
    print(f"Edgar success rate: {edgar_hits / total}")
    print(f"Yahoo success rate: {yahoo_hits / total}")
            
# Collect the tickers that still need to be fetched again
def ticker_fill(company_frames_availability):
    """Return the tickers whose data is not fully available yet.

    Args:
        company_frames_availability: Mapping of ticker to an indexable pair
            of availability flags.

    Returns:
        List of tickers, in mapping order, for which at least one of the two
        flags is falsy.
    """
    return [
        ticker
        for ticker, flags in company_frames_availability.items()
        if not (flags[0] and flags[1])
    ]

In [273]:
# Exploratory check of the date-gathering and path-selection helpers on one measure
comp = comp_load("AAPL")
path_date= recursive_date_gather(comp, "FreeCashFlow")
# Show the first element of the gathered result; the same element is passed to path_selector
print(path_date[0])
data, unit = path_selector(comp,"FreeCashFlow", path_date[0])
# Negative n: display every row except the last five
data.head(-5)


('sub',)


Unnamed: 0,FreeCashFlow -4,FreeCashFlow -3,FreeCashFlow -2,FreeCashFlow -1,FreeCashFlow 0
2016-09-24,,,,,
2016-09-25,,,,,
2016-09-26,,,,,
2016-09-27,,,,,
2016-09-28,,,,,
...,...,...,...,...,...
2023-12-21,1.943500e+10,2.428700e+10,2.564400e+10,3.021800e+10,2.083800e+10
2023-12-22,1.943500e+10,2.428700e+10,2.564400e+10,3.021800e+10,2.083800e+10
2023-12-23,1.943500e+10,2.428700e+10,2.564400e+10,3.021800e+10,2.083800e+10
2023-12-24,1.943500e+10,2.428700e+10,2.564400e+10,3.021800e+10,2.083800e+10


In [191]:
# Build the frame for a single company from the measures stored on disk
comp = comp_load("AAPL")
# Raw string: "\m" is an invalid escape sequence in a normal string literal
with open(r"other_pickle\measures.json", "r") as file:
    fund_measures = json.load(file)
#{"dynamic":["FreeCashFlow"], "static":[ "Assets", "Liabilities",""]}
comp.date_reset()
# No indicator frame is passed (None)
aframe = acquire_frame(comp, fund_measures, None, static_tolerance=pd.Timedelta(days=365))

KeyboardInterrupt: 

Indicator frame


In [None]:
# Download the indicator series from START until today and cache them on disk
indicators = ["TB3MS", "DCOILWTICO"]
indicator_frame = fred_info(indicators, START, datetime.now().date())
# Raw string: "\i" is an invalid escape sequence in a normal string literal
with open(r"other_pickle\indicator_frame.pkl", "wb") as file:
    pickle.dump(indicator_frame, file)

Data grab

In [None]:
#write out measures based on importance in descending order
with open("other_pickle\measures.json", "r") as file:
    fund_measures = json.load(file)
#Load out the indicators 
with open("other_pickle\indicator_frame.pkl", "rb") as file:
    indicator_frame = pickle.load(file)

#Load the info about the companies that we already have (ticker,edgar,yahoo)
with open(r"other_pickle\frame_availability.pkl", "rb") as file:
    company_frames_availability = pickle.load(file)

#Load the unavailable data so far 
with open(r'other_pickle\unavailable.json', 'r') as file:
        Unavailable_Measures = json.load(file)

edgar_client =  httpx.AsyncClient()
sem_edgar = asyncio.Semaphore(9)
#Separate sem for yahoo to spread the work and connections
sem_yahoo = asyncio.Semaphore(9)

#Create tasks to ge the first companies_num companies by valuation
# company_frames_availability = {"TSM":(0,0)}
companies_num = 100
ticker_dict = dict(islice(company_frames_availability.items(), companies_num))
ticker_list = ticker_fill(ticker_dict)
tasks = []
for ticker in ticker_list:
    tasks.append(async_task(ticker, edgar_client, sem_edgar, sem_yahoo, fund_measures))

availability_list = await asyncio.gather(*tasks)

for ping in availability_list:
    ticker, avail = ping
    company_frames_availability[ticker] = avail
    if avail == "del":
         del company_frames_availability[ticker]

success_rate(availability_list)

print(company_frames_availability)
with open(r"other_pickle\frame_availability.pkl", "wb") as file:
    pickle.dump(company_frames_availability, file)

#Save the unavailable measures for later use
with open(r'other_pickle\unavailable.json', 'w') as file:
        json.dump(Unavailable_Measures,file, indent= 1)

Resets

In [None]:
# BIG RESET: rebuild the availability map from cikdata with every ticker marked
# as not yet fetched ([edgar, yahoo] = [0, 0]) and overwrite the pickle on disk.
company_frames_availability = {
    values["ticker"]: [0, 0] for values in cikdata.values()
}
print(company_frames_availability)
with open(r"other_pickle\frame_availability.pkl", "wb") as file:
    pickle.dump(company_frames_availability, file)


In [None]:
# UNAVAILABLE RESET: start over with one empty dict per category and
# overwrite the stored unavailable-measures file.
with open(r"categories\categories.json", "r") as file:
    categories = json.load(file)
Unavailable_Measures = {category: {} for category, _ in categories.items()}
with open(r'other_pickle\unavailable.json', 'w') as file:
    json.dump(Unavailable_Measures, file, indent=1)

In [None]:
# Print the TSM object, then the `foreign` attribute of every company whose
# edgar and yahoo flags are both 1.
comp = comp_load("TSM")
print(comp)
for ticker, (edgar, yahoo) in company_frames_availability.items():
    if not (edgar == 1 and yahoo == 1):
        continue
    comp = comp_load(ticker)
    print(comp.foreign)

Statistics

In [None]:
# Exploratory P/E computation per ticker: price / (net income / shares outstanding)
PEs = []
companies_count = 0
missing_tickers = []
# NOTE(review): the loop runs over a hard-coded single-ticker dict; the original
# comment names company_frames_availability as the real input — confirm before reuse
for i,(ticker, availability) in enumerate({"GE": (1,1)}.items()):
    print(ticker)
    edgar, yahoo = availability
    if edgar == 1 and yahoo ==1:
        companies_count +=1
        comp = comp_load(ticker)
        if comp.foreign == False:
            earnings, unit = recursive_fact(comp, "NetIncomeLoss", lookbehind =1, annual=True)
            shares, unit = recursive_fact(comp, "EntityCommonStockSharesOutstanding", lookbehind =1, static_tolerance=pd.DateOffset(years =1))
            try:
                # Work with the first column of each returned frame
                shares = shares[shares.columns[0]]
                earnings = earnings[earnings.columns[0]]
                price = comp.price["close"]
                EPS = earnings / shares
                PE = price / EPS
                PE = PE.dropna()
                # Nothing left after dropping NaN: remember the ticker
                if PE.empty == True:
                    missing_tickers.append(ticker)
                # Keep only non-negative ratios
                PE = PE[PE >= 0]
                # NOTE(review): the append below is commented out, so PEs stays
                # empty and the print loop after this block outputs nothing
                # PEs.append(PE.mean())
                if np.isnan(PE.mean()):
                    print(PE)
                # print("|Done|")
            except Exception as e:
                # Best effort: report the problem and move on to the next ticker
                print(e)
                continue
for i in PEs:
    print(i)
print(missing_tickers)

In [None]:
# Mean of the collected P/E values, keeping only np.float64 entries below 100
pure = [pe for pe in PEs if type(pe) == np.float64 and 100 > pe]
mean = sum(pure, np.float64(0)) / len(pure)
print(mean)
print(len(pure))
print(len(PEs))

# Number of companies whose edgar and yahoo flags are both 1
companies_count = sum(
    1
    for edgar, yahoo in company_frames_availability.values()
    if edgar == 1 and yahoo == 1
)
print(companies_count)

Get the frames 

In [None]:
# Build and export one frame per company that has both edgar and yahoo data.
# Backslashes in the Windows paths are raw or escaped explicitly so that none
# of them is read as an (invalid) escape sequence.
with open(r"other_pickle\indicator_frame.pkl", "rb") as file:
    indicator_frame = pickle.load(file)

# Load the info about the companies that we already have (ticker, edgar, yahoo)
with open(r"other_pickle\frame_availability.pkl", "rb") as file:
    company_frames_availability = pickle.load(file)

with open(r'other_pickle\unavailable.json', 'r') as file:
    Unavailable_Measures = json.load(file)

with open(r"other_pickle\measures.json", "r") as file:
    measures = json.load(file)
# NOTE(review): this hard-coded list replaces the measures loaded just above — confirm it is intended
measures = ["Assets", "Liabilities", "AssetsCurrent", "AssetsNoncurrent"]
for ticker, availability in company_frames_availability.items():
    edgar, yahoo = availability
    if not (edgar and yahoo):
        continue
    try:
        with open(f'companies\\{ticker}.pkl', 'rb') as file:
            comp = pickle.load(file)
    except FileNotFoundError:
        print(f"{ticker} is not available for loading")
        # Without a loaded company there is nothing to process; previously the
        # loop carried on and failed on the undefined `comp`.
        continue
    catg = get_category(comp.sic)
    frame = acquire_frame(comp, measures, indicator_frame, approx=True, threshold=0.8, lookbehind =5) #category_measures[catg]
    # Save it again because of time_init and also save some memory
    with open(f'companies\\{ticker}.pkl', 'wb') as file:
        pickle.dump(comp, file)
    del comp
    # NOTE(review): a frame with missing measures is written to both folders — confirm that is wanted
    if "missing" in frame.attrs["units"]:
        frame.to_csv(f"companies_data_missing\\{catg}\\{ticker}.csv")
        print(f"{ticker} missing")
    frame.to_csv(f"companies_data\\{catg}\\{ticker}.csv")
    print(f"{ticker} saved.")

with open(r'other_pickle\unavailable.json', 'w') as file:
    json.dump(Unavailable_Measures, file, indent=1)
        

XBRL 

In [None]:
# Table of XBRL elements that xbrl_wordsearch looks through
Elementframe = pd.read_csv("ListOfElements.csv")
# Elementframe = Elementframe[Elementframe["approvalStatus"].apply(lambda rem: rem=="Final")]
# Elementframe.to_csv("ListOfElements.csv")
def xbrl_wordsearch(word, length):
    """Print the elements whose name contains ``word`` and is shorter than ``length``.

    Args:
        word: Pattern handed to ``Series.str.contains`` on the element names.
        length: Names with ``len(name) >= length`` are blanked out before the
            search.

    Prints ``"No match"`` when nothing is found, otherwise a JSON dump of the
    ``elementName``, ``definition`` and ``Deprecated`` columns of the matches.
    Always returns ``None``.
    """
    def _blank_if_long(name):
        # Too-long names are replaced by an empty string before matching
        return name if (len(name) < length) else ""

    short_names = Elementframe["elementName"].apply(_blank_if_long)
    hit_mask = short_names.str.contains(word, na=False)
    matches = Elementframe[hit_mask]
    if matches.empty:
        print("No match")
        return
    records = matches[["elementName","definition","Deprecated"]].to_dict(orient="records")
    print(json.dumps(records, indent = 2))

# Load the pickled MSFT company object
with open(r"companies\MSFT.pkl", "rb") as file:
    company = pickle.load(file)

# Write compdict to units.json
# NOTE(review): compdict is not assigned in this cell, so it must already exist in the session — confirm
with open(r"units.json", "w") as file:
    json.dump(compdict,file, indent= 1)

In [None]:
# Apple.data["facts"]["dei"]["EntityCommonStockSharesOutstanding"]["units"]["shares"]
# Fetch two indicator series for a fixed two-year window and dump them as JSON records
frame = fred_info(["TB3MS", "DCOILWTICO"], '2015-02-24', '2017-02-24')
frame.head(40)
# print(frame)

# The "index" column is stringified so json can serialize it
frame["index"] = frame["index"].astype(str)
# The with-block closes the file on exit
with open("FRED.json", "w") as file:
    json.dump(frame.to_dict(orient="records"), file, indent=1)


Deprecated.

In [None]:
# Collect the us-gaap facts of the listed tickers, without their "units"
# payload, and write them to deprecated.json. The first ticker that provides
# a key wins.
dictionary = {}
tickers = ["META"]
for ticker in tickers:
    data = sync_companyfacts(ticker).json()["facts"]["us-gaap"]
    for key, value in data.items():
        value.pop("units")
        dictionary.setdefault(key, value)

# The with-block closes the file on exit
with open("deprecated.json", "w") as file:
    json.dump(dictionary, file, indent=1)

*Get all the measure names*

In [None]:
# Gather every unit name used in the stored company data and print the distinct ones
ticker = "MSFT"
unit_list = []
for ticker, availability in company_frames_availability.items():
    edgar, yahoo = availability
    if not (edgar and yahoo):
        continue
    # Backslash escaped explicitly: "\{" is an invalid escape sequence
    with open(f"companies\\{ticker}.pkl", "rb") as file:
        company = pickle.load(file)
    # The file is only needed for loading, so it is closed before processing
    data = company.data
    # NOTE(review): compdict is reset here but never filled in this cell — confirm whether it is still needed
    compdict = {}
    for value in data.values():
        # Iterating the "units" mapping yields its keys, i.e. the unit names
        unit_list.extend(value["units"])

print(set(unit_list))

In [None]:
# Dump several views of MSFT's USD "Revenues" datapoints into examples\ for inspection
comp = comp_load("MSFT")
for measure_name in comp.data:
    print(measure_name)
comp_data = comp.data['Revenues']["units"]["USD"]

# Keep the datapoints whose 'end' equals the 'end' of a neighbouring datapoint.
# The first and the last datapoint are always kept.
filtered = [comp_data[0]]
end_prev = comp_data[0]["end"]
for i in range(1, len(comp_data) - 1):
    end = comp_data[i]["end"]
    end_next = comp_data[i + 1]["end"]
    # Check if current 'end' matches either the previous or next 'end'
    if end == end_prev or end == end_next:
        filtered.append(comp_data[i])
    # Update 'end_prev' for the next iteration
    end_prev = end
# A single-element series has no separate last element to add
if len(comp_data) > 1:
    filtered.append(comp_data[-1])

form_and_date = [{datapoint["fp"]: datapoint["end"]} for datapoint in comp_data]
val = [datapoint["val"] for datapoint in comp_data]
cleaner = [
    (datapoint["end"], datapoint["val"], datapoint["form"])
    for datapoint in comp_data
]

# Changes the data in place, so keep this last. `filtered` holds the same dict
# objects, so its entries lose these keys as well.
for datapoint in comp_data:
    del datapoint["form"]
    del datapoint["accn"]
    del datapoint["fy"]

# Raw strings so that no backslash in the Windows paths is read as an escape
with open(r"examples\clearer.json", "w") as file:
    json.dump(comp_data, file, indent=1)

with open(r"examples\quintuple_fuckers_filtered.json", "w") as file:
    json.dump(filtered, file, indent=1)
# NOTE(review): written after the key removal above, so this has the same content as clearer.json — confirm
with open(r"examples\quintuple_fuckers_unfiltered.json", "w") as file:
    json.dump(comp_data, file, indent=1)
with open(r"examples\forms.json", "w") as file:
    json.dump(form_and_date, file, indent=0)
with open(r"examples\cleaner.json", "w") as file:
    json.dump(cleaner, file, indent=0)
with open(r"examples\value.json", "w") as file:
    json.dump(val, file, indent=0)

