Network helpers

In [1]:
import httpx
import asyncio
from fredapi import Fred
import requests
import numpy as np
import json
import pandas as pd
import pydash
import pickle
import yahoo_fin.stock_info as si
from datetime import datetime
import copy

#Module-level FRED client used by fred_info.
#NOTE(review): the API key is hardcoded in the notebook; consider loading it from an
#environment variable or secrets store, and rotate this key if the notebook is shared.
fred = Fred(api_key='0c34c4dd2fd6943f6549f1c990a8a0f0') 

async def fetch(url, url_headers, semaphore, client, timeout, max_retries, start_retry_delay):
    """GET ``url`` with retries, limited by ``semaphore``.

    Parameters:
        url: address to request.
        url_headers: headers sent with every attempt.
        semaphore: async context manager bounding the number of concurrent requests.
        client: object exposing an awaitable ``get(url, timeout=..., headers=...)``.
        timeout: timeout forwarded to ``client.get``.
        max_retries: total number of attempts that are made.
        start_retry_delay: base delay; attempt ``k`` waits ``k * start_retry_delay``
            before the next attempt.

    Returns:
        The response of the first successful attempt, or the integer 0 when every
        attempt failed (callers test for an ``int`` to detect failure).
    """
    async with semaphore:
        #Inclusive upper bound: range(1, max_retries) dropped the last attempt and
        #made no attempt at all for max_retries=1
        for attempt in range(1, max_retries + 1):
            backoff = attempt*start_retry_delay
            delay = backoff
            try:
                response = await client.get(url, timeout=timeout, headers=url_headers)
                response.raise_for_status()
                return response  # Successful request, exit the loop
            except httpx.HTTPStatusError as e:
                print(f"Error response {e.response.status_code} for {url}")
                #Sometimes a retry-after header is returned
                retry_after = e.response.headers.get('Retry-After')
                if retry_after is not None:
                    #Just for debugging
                    print(retry_after)
                    try:
                        #Header values are strings, so convert with int()
                        delay = int(retry_after)
                    except ValueError:
                        #Retry-After may also be an HTTP date; fall back to the back-off
                        delay = backoff
            except httpx.TimeoutException as e:
                print(f"Timeout reached: {e}")
                print(f"Retrying in {backoff} seconds...")
            except httpx.RequestError as e:
                print(f"An error occurred: {e}.")
            #No point in waiting after the final attempt
            if attempt < max_retries:
                await asyncio.sleep(delay)
        return 0
                

def fred_info(ids:list, start:str, end:str):
    """Download several FRED series and return them as one gap-filled frame.

    Parameters:
        ids: FRED series identifiers; each becomes a column of the result.
        start, end: observation bounds, either ``YYYY-MM-DD`` strings or objects
            with a ``strftime`` method (``date`` / ``datetime``).

    Returns:
        A DataFrame with the observation dates as strings in an ``"index"`` column
        and one column per series, forward- then backward-filled.
    """
    def _as_fred_date(value):
        #Strings are passed through unchanged; they are expected to be YYYY-MM-DD already
        if isinstance(value, str):
            return value
        return value.strftime('%Y-%m-%d')

    start = _as_fred_date(start)
    end = _as_fred_date(end)
    frame = pd.DataFrame()
    for series_id in ids:
        #NOTE(review): column assignment aligns every later series to the dates of the
        #first one, so list the series with the densest calendar first - confirm intent
        frame[series_id] = fred.get_series(series_id, observation_start=start, observation_end=end)
    frame = frame.reset_index()
    frame["index"] = frame["index"].astype(str)
    return frame.ffill().bfill()


#Kinda obsolete
async def fred_fetch(ids:list, start:str, end:str):
    """Run the blocking ``fred_info`` call in a worker thread and return its result."""
    return await asyncio.to_thread(fred_info, ids, start, end)

async def yahoo_fetch(ticker, start_year, end_year, semaphore, max_retries, start_retry_delay):
    """Download price data for ``ticker`` through ``si.get_data`` with retries.

    The blocking download runs in a worker thread, limited by ``semaphore``.

    Parameters:
        ticker: symbol forwarded to ``si.get_data``.
        start_year, end_year: date bounds forwarded to ``si.get_data``.
        semaphore: async context manager bounding the number of concurrent downloads.
        max_retries: total number of attempts that are made.
        start_retry_delay: base delay; attempt ``k`` waits ``k * start_retry_delay``
            before the next attempt.

    Returns:
        Whatever ``si.get_data`` returns, or the integer 0 when every attempt failed.
    """
    async with semaphore:
        #Inclusive upper bound: range(1, max_retries) dropped the last attempt and
        #made no attempt at all for max_retries=1
        for attempt in range(1, max_retries + 1):
            try:
                return await asyncio.to_thread(si.get_data, ticker, start_year, end_year)
            except requests.exceptions.ConnectionError:
                print("Yahoo connection error.")
            except Exception as e:
                #Best effort: any other failure is logged and retried
                print(f"Yahoo error:{e}")
            #No point in waiting after the final attempt
            if attempt < max_retries:
                await asyncio.sleep(attempt*start_retry_delay)
        return 0

Functions

In [4]:
#Decompose the measure into its constituents
measure_conversion = {"Assets":[["Assets"],["AssetsNoncurrent", "AssetsCurrent"]],
                    "Liabilities":[["Liabilities"],["LiabilitiesCurrent", "LiabilitiesNoncurrent"]],
                    "AssetsCurrent":[["AssetsCurrent"]],
                    "LiabilitiesCurrent":[["LiabilitiesCurrent"]],
                    "revenues": [["revenues"]]
}

#Lookup table for the undeprecated version of a measure
#(the with-block closes the file, so no explicit close() is needed)
with open(r"C:\Programming\Python\Finance\EDGAR\deprecated_to_current.json", "r") as file:
    deprecate_conversion = json.load(file)

#The first entry date into the EDGAR database
START = datetime.strptime('1993-01-01', r"%Y-%m-%d")

def runlist(dict, nameslist):
    """Return ``dict[name]`` for the first name of ``nameslist`` that is a key of ``dict``.

    Returns ``False`` when none of the names is present.
    """
    for candidate in nameslist:
        try:
            return dict[candidate]
        except KeyError:
            pass
    return False

def searchdict(dict, nameslist:list, ticker):
    """Look up the first available name of ``nameslist`` across the fact sections.

    The sections "dei", "us-gaap", "ifrs-full" and "invest" of ``dict`` are tried in
    that order; missing sections are skipped. Returns the matching entry, or
    ``False`` (after printing a notice mentioning ``ticker``) when nothing matches.
    """
    sections = ("dei", "us-gaap", "ifrs-full", "invest")
    for section in sections:
        try:
            match = runlist(dict[section], nameslist)
            if match != False:
                return match
        except KeyError:
            pass
    print(f"{nameslist} not available for {ticker}")
    return False


def getcik(ticker):
    """Convert ``ticker`` into its CIK, zero-padded to 10 characters.

    The lookup scans the module-level ``cikdata`` table for an entry whose
    ``"ticker"`` equals ``ticker`` exactly.

    Raises:
        ValueError: when no entry matches (previously this surfaced as an
            UnboundLocalError on the return line).
    """
    for entry in cikdata.values():
        if entry["ticker"] == ticker:
            return str(entry["cik_str"]).zfill(10)
    raise ValueError(f"No CIK found for ticker {ticker!r}")

#Headers for EDGAR call
headers = {
    "User-Agent":"ficakc@seznam.cz",
    "Accept-Encoding":"gzip, deflate",
}

#Defaults handed to fetch / yahoo_fetch: request timeout, attempt count and base retry delay
TIMEOUT = 8
RETRIES = 2
START_RETRY_DELAY = 1
# cik_url =  "https://www.sec.gov/files/company_tickers.json"
# cikdata = requests.get(cik_url, headers=headers).json()

#Ticker/CIK table read from a local file (presumably a saved copy of the URL above - confirm)
#The with-blocks close the files, so no explicit close() is needed
with open(r"C:\Programming\Python\Finance\EDGAR\cik.json","r") as file:
    cikdata = json.load(file)
#Locally saved JSON for Apple, used by the manual inspection cells further down
with open(r"C:\Programming\Python\Finance\EDGAR\apple.json","r") as file:
    Apple = json.load(file)


def sync_companyfacts(ticker:str):
    """Blocking request for the SEC companyfacts JSON of ``ticker``; returns the httpx response."""
    padded_cik = getcik(ticker)
    return httpx.get(
        f"https://data.sec.gov/api/xbrl/companyfacts/CIK{padded_cik}.json",
        headers=headers,
    )
    
async def companyfacts(ticker:str, client, semaphore):
    """Fetch all the financial data for ``ticker`` from the SEC companyfacts endpoint.

    Delegates to ``fetch`` with the module-level headers and retry settings and
    returns whatever ``fetch`` returns.
    """
    endpoint = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{getcik(ticker)}.json"
    return await fetch(endpoint, headers, semaphore, client, TIMEOUT, RETRIES, START_RETRY_DELAY)

def endtodatetime(dataframe):
    """Convert the ``"end"`` column of ``dataframe`` to datetimes, in place.

    Returns the same (mutated) frame so the call can be chained.
    """
    #pd.datetime was only an alias of datetime.datetime (removed in pandas 2.0) and
    #cannot convert a column; pd.to_datetime is the conversion function
    dataframe["end"] = pd.to_datetime(dataframe["end"])
    return dataframe



class Stock:
    """EDGAR company facts plus price history for a single ticker.

    Intended flow: construct with a ticker, await ``async_init`` to download the
    EDGAR facts, await ``price_init`` to download prices, then build frames with
    ``fact`` and ``shares``.

    Attributes:
        ticker, cik: set by ``__init__``.
        data: the companyfacts JSON as a dict, or 0 while it has not been downloaded.
        share_name_list: fact names tried, in order, when looking up the share count.
        start_year: latest of the first reported dates of the shares and the measures.
        end_year: the date on which ``async_init`` ran.
        fullprice, price, date_range: set by ``price_init``.
    """
    def __init__(self, ticker:str):
        self.ticker = ticker.upper()
        self.cik = getcik(self.ticker)
        #0 marks "no EDGAR data"; fact() and shares() test for it
        self.data = 0
        self.share_name_list = ["EntityCommonStockSharesOutstanding", "CommonStockSharesOutstanding", "WeightedAverageNumberOfSharesOutstandingBasic", "WeightedAverageNumberOfDilutedSharesOutstanding"]

    async def async_init(self,client, semaphore, standard_measures):
        """Download the company facts and work out the usable date range.

        Returns 1 on success and 0 when the download failed or no share fact
        could be found.
        """
        #Get all of the data for the company, ALL of it
        data = await companyfacts(self.ticker, client, semaphore)
        #fetch reports failure with an int; skip the rest of the code in that case
        if isinstance(data, int):
            self.data = 0
            return 0
        self.data = data.json()
        #searchdict only reads the dict, so no defensive copy is needed
        facts = self.data["facts"]
        #searches the company dict for the first occurence of something in the names list
        share_info = searchdict(facts, self.share_name_list, self.ticker)
        if share_info is False:
            return 0
        share_date = datetime.strptime(share_info["units"]["shares"][0]["end"], r"%Y-%m-%d")
        #Get the earliest date with all the info about the company
        start_dates = []
        for measure in standard_measures:
            #If the measure is deprecated switch to the undeprecated version
            if measure in deprecate_conversion:
                measure = deprecate_conversion[measure]
            fact = searchdict(facts, [measure], self.ticker)
            if fact is False:
                continue
            try:
                first_end = fact["units"]["USD"][0]["end"]
            except (KeyError, IndexError):
                #Measure exists but has no USD data points; treat it as unavailable
                continue
            start_dates.append(datetime.strptime(first_end, r"%Y-%m-%d"))
        self.start_year = max([share_date] + start_dates)
        self.end_year = datetime.now().date()
        return 1

    async def price_init(self,semaphore):
        """Download the price history and store a daily, gap-filled ``self.price``.

        Requires a successful ``async_init`` (it reads ``start_year``/``end_year``).
        Returns 1 on success and 0 when the download failed.
        """
        self.fullprice = await yahoo_fetch(self.ticker,self.start_year, self.end_year, semaphore, RETRIES, START_RETRY_DELAY)
        if isinstance(self.fullprice, int):
            return 0
        self.fullprice = self.fullprice.reset_index()
        #The former index is the first column whatever its name is ("index" only when unnamed)
        date_column = self.fullprice.columns[0]
        Price = self.fullprice[[date_column,"close", "adjclose"]].rename(columns={date_column: "end"})
        Price["end"] = Price["end"].astype(str)
        date_range = pd.date_range(start=self.start_year, end=self.end_year).astype(str)
        self.date_range = pd.DataFrame(date_range, columns=['end'])
        #One row per calendar day; days without a quote take the neighbouring values
        Price = pd.merge(self.date_range, Price, on = ["end"],how="left" )
        self.price = Price.ffill().bfill()
        return 1

    def fact(self,measure,simple=True):
        """Return the USD values of a us-gaap ``measure`` on the daily date range.

        With ``simple`` only the ``"end"`` and measure columns are kept. Returns
        ``None`` when there is no EDGAR data or the measure is unavailable.
        """
        #Propagate the 0
        if self.data == 0:
            return None
        try:
            #If the measure is deprecated switch to the undeprecated version
            if measure in deprecate_conversion:
                measure = deprecate_conversion[measure]
            point_list = self.data["facts"]["us-gaap"][measure]["units"]["USD"]
            frame = pd.DataFrame(point_list)
            frame = frame.drop_duplicates(subset='end', keep='last')
            frame[measure] = frame["val"]
            if simple:
                frame = frame[["end", measure]]
            frame = pd.merge(self.date_range,frame,on="end",how="left")
            return frame.ffill().bfill()
        except KeyError:
            print(f"Measure {measure} not available for {self.ticker}.")
            return None

    def shares(self,simple=True):
        """Return the share count on the daily date range (columns ``end``, ``shares``).

        Only the first reported data point is used and forward-filled. Returns 0
        when there is no EDGAR data or no share fact.

        Raises:
            NotImplementedError: for ``simple=False``, which never had an implementation.
        """
        #Propagate the 0
        if self.data == 0:
            return 0
        if not simple:
            raise NotImplementedError("shares(simple=False) is not implemented")
        #searchdict needs the ticker for its "not available" message
        share_info = searchdict(self.data["facts"], self.share_name_list, self.ticker)
        if share_info is False:
            return 0
        first_point = share_info["units"]["shares"][0]
        share_count = pd.DataFrame(first_point, index=[0])[["end","val"]]
        share_count = share_count.rename(columns={"val": "shares"})
        share_count = share_count.drop_duplicates(subset="end", keep="last")
        #NOTE(review): the single data point only lands in the result when its date is
        #inside date_range; otherwise the whole column stays empty - confirm intent
        share_count = pd.merge(self.date_range, share_count, on=["end"], how="left")
        return share_count.ffill()
    

#Initializes and appends the stock object
async def async_task(ticker, client, semaphore_edgar, semaphore_yahoo, measures):
    """Download one company, pickle the resulting ``Stock`` and report what succeeded.

    Parameters:
        ticker: company ticker.
        client: HTTP client handed to ``Stock.async_init``.
        semaphore_edgar, semaphore_yahoo: concurrency limits for the two data sources.
        measures: used to get the date when all the financial info is available.

    Returns:
        ``(ticker, availability of data, availability of price)``; price is only
        requested when the EDGAR download succeeded.
    """
    stock = Stock(ticker)
    print(f"Currently pinging {ticker}")
    successful_edgar = await stock.async_init(client,semaphore_edgar,measures)
    if successful_edgar:
        print(f"Price pinging {ticker}$")
        succesful_price = await stock.price_init(semaphore_yahoo)
    else:
        #Same 0/1 convention as the values returned by async_init and price_init
        succesful_price = 0
    #Raw string: the Windows path contains backslashes that are not valid escapes
    with open(rf'C:\Programming\Python\Finance\EDGAR\companies\{ticker}.pkl', 'wb') as file:
        pickle.dump(stock,file)
    return (ticker, successful_edgar, succesful_price)



def acquire_frame(ticker, measures, indicator_frame):
    """Build one frame from the saved data of a stock.

    Loads the pickled ``Stock`` for ``ticker`` and merges, on the ``"end"`` date
    column: shares outstanding, prices, one column per available measure and the
    economic indicators (matched on their ``"index"`` column).

    Returns:
        The merged DataFrame, or ``None`` when the pickle does not exist or the
        stock lacks the required data.
    """
    try:
        #Raw string: the Windows path contains backslashes that are not valid escapes
        with open(rf'C:\Programming\Python\Finance\EDGAR\companies\{ticker}.pkl', 'rb') as file:
            #NOTE: pickle.load can execute arbitrary code; only load files written by this notebook
            stock = pickle.load(file)
    except FileNotFoundError:
        print(f"{ticker} is not available for loading")
        #Nothing was loaded, so there is nothing to merge
        return None
    try:
        #Price and shares oustanding
        shares = stock.shares().copy()
        stock_price = stock.price.copy()
        df = pd.merge(shares, stock_price, on=["end"], how = "left")
        for measure in measures:
            frame = stock.fact(measure)
            #fact() returns None for measures the company does not report
            if frame is None:
                continue
            df = pd.merge(df,frame, on=["end"], how="left")
    except AttributeError:
        #Raised when the stock was pickled without share or price data
        return None
    #Economic indicators
    return pd.merge(df, indicator_frame, left_on =["end"], right_on=["index"], how="left")
    
#Get the success rate for the api call
def success_rate(company_frames_availability):
    """Print the share of companies whose EDGAR and Yahoo downloads succeeded.

    ``company_frames_availability`` is a sequence of ``(ticker, edgar, yahoo)``
    tuples whose flags are truthy/1 on success. Nothing is returned.
    """
    total = len(company_frames_availability)
    if total == 0:
        #Avoid dividing by zero when no company was processed
        print("No companies to compute a success rate for.")
        return
    edgar_success = sum(edgar for _, edgar, _ in company_frames_availability)/total
    yahoo_success = sum(yahoo for _, _, yahoo in company_frames_availability)/total
    print(f"Edgar success rate: {edgar_success}")
    print(f"Yahoo success rate: {yahoo_success}")

#Function to call again for missing data
async def async_filler(company_frames_availability):
    """Sort the tickers with missing data into "everything" and "price only" lists.

    Unfinished: the two lists are built but nothing is fetched or returned yet.
    """
    ticker_list = []
    ticker_list_price = []
    for ticker, edgar, yahoo in company_frames_availability:
        if not edgar:
            ticker_list.append(ticker)
        elif yahoo == 0:
            #only price ping
            ticker_list_price.append(ticker)
    return None


Indicator frame


In [4]:
#Download the economic indicators from the first EDGAR date until today and cache them on disk
indicators = ["TB3MS", "DCOILWTICO"]
indicator_frame = fred_info(indicators, START, datetime.now().date())
#Raw string: the Windows path contains backslashes that are not valid escapes
with open(r"C:\Programming\Python\Finance\EDGAR\indicator_frame.pkl", "wb") as file:
    pickle.dump(indicator_frame, file)

Data grab

In [5]:
#write out measures based on importance in descending order
# tracemalloc.start()
measures = ["Assets", "Liabilities", "AssetsCurrent", "LiabilitiesCurrent"]
#Load out the indicators 
with open("C:\Programming\Python\Finance\EDGAR\indicator_frame.pkl", "rb") as file:
    indicator_frame = pickle.load(file)
#Get the first n companies sorted by market cap 
companies_num = 3
comp = 0
edgar_client =  httpx.AsyncClient()
sem_edgar = asyncio.Semaphore(9)
#Separate sem for yahoo to spread the work and connections
sem_yahoo = asyncio.Semaphore(9)
#GATHER THE FIRST companies_num companies ciks and pass them to the gather with the tasks
ticker_list = []
for company, values in cikdata.items():
    if comp<companies_num:
        ticker_list.append(values["ticker"])
        comp+=1
    else:
        break
# A list of tuples with the availability
company_frames_availability = await asyncio.gather(*[async_task(ticker, edgar_client, sem_edgar, sem_yahoo, measures) for ticker in ticker_list])
success_rate(company_frames_availability)


    

Currently pinging AAPL
Currently pinging MSFT
Currently pinging GOOGL
Price pinging MSFT$
Price pinging AAPL$
Price pinging GOOGL$


NameError: name 'success_rate' is not defined

Get the frames 

In [None]:
#Build a {ticker: frame} mapping for every company whose EDGAR data and price were both fetched
company_frames_tuples = [
    (ticker, acquire_frame(ticker, measures, indicator_frame))
    for ticker, value_edg, value_yah in company_frames_availability
    if value_edg and value_yah
]
company_frames_dict = dict(company_frames_tuples)

In [6]:

# with open("C:\Programming\Python\Finance\EDGAR\companies\AAPL.pkl", "br") as file:
#     apple =  pickle.load(file)

# dictionary = copy.deepcopy(apple.data)
# searchdict(dictionary["facts"],["Assets"], "AAPL")
# frame = apple.fact("Assets")
# frame.head(5)
#Show the (ticker, EDGAR availability, price availability) tuples produced by the data grab
print(company_frames_availability)

[('AAPL', 1, 1), ('MSFT', 1, 1), ('GOOGL', 1, 1), ('AMZN', 1, 1), ('NVDA', 1, 1), ('META', 1, 1), ('TSLA', 1, 1), ('BRK-B', 1, 1), ('LLY', 1, 1), ('TSM', 0, False)]


In [None]:
# Apple.data["facts"]["dei"]["EntityCommonStockSharesOutstanding"]["units"]["shares"]
#fred_info formats its bounds with strftime, so pass datetime objects rather than strings
frame = fred_info(["TB3MS", "DCOILWTICO"], datetime(2015, 2, 24), datetime(2017, 2, 24))

#fred_info already returns the "index" column as strings, so the records can be dumped directly
#Raw string: the Windows path contains backslashes that are not valid escapes
with open(r"C:\Programming\Python\Finance\EDGAR\FRED.json", "w") as file:
    json.dump(frame.to_dict(orient="records"), file, indent=1)

#Last expression of the cell so the preview is actually displayed
frame.head(40)


Deprecated measures

In [38]:
#Collect the metadata (everything except the data points) of every us-gaap fact
dictionary = {}
tickers = ["META"]
for ticker in tickers:
    data = sync_companyfacts(ticker).json()
    data = data["facts"]["us-gaap"]
    for key,value in data.items():
        #pop tolerates entries without "units", unlike del
        value.pop("units", None)
        #The first ticker that reports a fact wins
        dictionary.setdefault(key, value)

#Raw string: the Windows path contains backslashes that are not valid escapes
with open(r"C:\Programming\Python\Finance\EDGAR\deprecated.json", "w") as file:
    json.dump(dictionary, file, indent= 1)

Manual testing section

In [4]:
def checkout(ticker, name):
    """Write the descriptions of all us-gaap and dei facts of ``ticker`` to ``<name>.json``.

    The facts are downloaded with ``sync_companyfacts``. When a fact name exists in
    both sections the dei entry wins. Sections or descriptions that are missing are
    tolerated (the description is then written as null).
    """
    facts = sync_companyfacts(ticker).json()["facts"]
    #Local name chosen so the builtin dict is not shadowed
    combined = {**facts.get("us-gaap", {}), **facts.get("dei", {})}
    compdict = {key: value.get("description") for key, value in combined.items()}

    #Raw string: the Windows path contains backslashes that are not valid escapes
    with open(rf"C:\Programming\Python\Finance\EDGAR\{name}.json", "w") as file:
        json.dump(compdict, file, indent =1)

# meta = sync_companyfacts("META").json()
# dictionary = amazon["facts"]["dei"].keys()
# dictionary  = [i for i in dictionary]
# with open(r"C:\Programming\Python\Finance\EDGAR\amazontotal.json", "w") as file:
#         json.dump(dictionary, file, indent =1)
#         file.close()


# print(amazon["facts"].keys())
#Dump the us-gaap and dei fact descriptions of META into meta.json
checkout("META","meta")


In [20]:
measures = ["Assets", "Liabilities", "AssetsCurrent", "LiabilitiesCurrent"]
Apple = Stock("aapl", measures)
stock = Apple
shares = stock.shares()
stock_num = stock.price
if isinstance(shares, int) or isinstance(stock_num, int): 
    pass
    # break
df = pd.merge(shares.copy(), stock_num.copy(), on=["end"], how = "left")
frames_list = [stock.fact(measure) for measure in measures]
for frame in frames_list:
    df = pd.merge(df,frame, on=["end"], how="left")
df.head(16)

TypeError: Stock.__init__() takes 2 positional arguments but 3 were given

*Get all the measure names*

In [None]:
# with open("C:\Programming\Python\Finance\EDGAR\stock.json", "w") as file:
#     json.dump(df.to_dict(orient='records'), file, indent=1)
#     file.close()

#Raw strings: the Windows paths contain backslashes that are not valid escapes
#The with-blocks close the files, so no explicit close() is needed
with open(r"C:\Programming\Python\Finance\EDGAR\shares.json", "w") as file:
    json.dump(stock.shares().copy().to_dict(orient='records'), file, indent=1)

#Apple is the companyfacts JSON loaded from apple.json
with open(r"C:\Programming\Python\Finance\EDGAR\assets.json", "w") as file:
    json.dump(Apple["facts"]["us-gaap"]["Assets"]["units"]["USD"], file, indent=1)


    

In [None]:
#Map every fact name (padded for readability) to its label.
#Kept under its own name so the `measures` list used by the other cells is not overwritten.
measure_labels = {}
for key,value in Apple["facts"]["us-gaap"].items() :
    measure_labels[key.ljust(100)] = value["label"]

#Visual separator between the us-gaap and the dei entries
measure_labels["METADATA".ljust(200,"/")] = ""

for key,value in Apple["facts"]["dei"].items():
    measure_labels[key.ljust(100)] = value["label"]


with open(r"C:\Programming\Python\Finance\EDGAR\measures.json","w") as file:
    json.dump(measure_labels, file, indent=1)


#create price reference list:
#Apple is the raw JSON dict and has no price; the prices live on the Stock object `stock`
with open(r"C:\Programming\Python\Finance\EDGAR\price.json", "w") as file:
    json.dump(stock.price.to_dict(orient='records'), file, indent=1)


In [None]:
#Print every AssetsCurrent USD data point after removing its optional "frame" field
for record in Apple["facts"]["us-gaap"]["AssetsCurrent"]["units"]["USD"]:
    record.pop("frame", None)
    print(record)



In [None]:
# print(Apple["facts"]["us-gaap"]["Assets"]["units"]["USD"][0]["end"])
