In [5]:
import os
import json
from src.utils.json_io import read_json

def load_all_stocks(path='data/stocks/raw'):
    """
    Generator function to load all stock data from JSON files in a given directory.

    Parameters
    ----------
    path : str, optional
        Path to the directory containing stock JSON files (default is 'stocks').

    Yields
    ------
    dict
        Parsed JSON content of each file as a Python dictionary.

    Notes
    -----
    - Each file in the directory is expected to be a valid JSON file.
    - Files are read one at a time, making this function memory-efficient for large datasets.
    """

    for fname in os.listdir(path):
        with open(os.path.join(path, fname)) as f:
            yield json.load(f)

In [None]:
stocks = load_all_stocks()
no_market_cap = 0
for stock in stocks:
    if stock["market_cap"] is None:
        no_market_cap+=1
        print(stock["symbol"])

In [4]:
no_market_cap

2321

In [None]:
stocks = load_all_stocks()
eod_market_not_equal = 0
for stock in stocks:
    eod = stock["eod"]
    cap = stock["market_cap"]
    if len(eod) != len(cap):
        eod_market_not_equal+=1
        print(stock["symbol"])
        cap_dates = set(entry["date"] for entry in cap)
        eod_dates = set(entry["date"] for entry in eod)
        in_eod_not_in_cap_dates = set(entry["date"] for entry in eod if entry["date"] not in cap_dates)
        in_cap_not_in_eod_dates = set(entry["date"] for entry in cap if entry["date"] not in eod_dates)
        print("in_eod_not_in_cap_dates", )
        
eod_market_not_equal

In [6]:
stocks = load_all_stocks()
eod_market_not_equal_dict = {}

for stock in stocks:
    print(stock["symbol"])
    eod = stock["eod"]
    cap = stock["market_cap"]
    if cap is None:
        eod_market_not_equal_dict[stock["symbol"]] = None
    else:
        cap_dates = set(entry["date"] for entry in cap)
        eod_dates = set(entry["date"] for entry in eod)
        in_eod_not_in_cap_dates = [entry["date"] for entry in eod if entry["date"] not in cap_dates]
        in_cap_not_in_eod_dates = [entry["date"] for entry in cap if entry["date"] not in eod_dates]
        eod_market_not_equal_dict[stock["symbol"]] = {"in_eod_not_in_cap_dates": in_eod_not_in_cap_dates, "in_cap_not_in_eod_dates":in_cap_not_in_eod_dates}
        
eod_market_not_equal_dict

A
AA
AAC-UN
AAC-WT
AAC
AACG
AACI
AACIU
AACIW
AACQU
AACT
AADI
AAGC
AAGH
AAGRW
AAIC-PB
AAIC-PC
AAIC
AAIN
AAL
AAM-PA
AAM-PB
AAM
AAMC
AAME
AAN
AAOI
AAON
AAP
AAPL
AAPT
AAQC-UN
AAQC-WT
AAQC
AAQCW
AASP
AAT
AATC
AATP
AAU
AAWH
AAWW
AB
ABB
ABBV
ABC
ABCB
ABCE
ABCL
ABCM
ABCP
ABCZF
ABEO
ABEV
ABG
ABGI
ABILF
ABIO
ABIT
ABL
ABLV
ABLVW
ABLZF
ABM
ABMC
ABMD
ABML
ABMT
ABNB
ABOS
ABQQ
ABR-PB
ABR-PC
ABR-PD
ABR-PE
ABR-PF
ABR
ABSI
ABST
ABT
ABTI
ABTS
ABTX
ABUS
ABVC
ABVE
ABVEW
ABVN
ABVX
ABWN
ABXXF
AC
ACA
ACAB
ACABU
ACABW
ACAC
ACACU
ACACW
ACAD
ACAH
ACAHU
ACAHW
ACAN
ACAQ-UN
ACAQ-WT
ACAQ
ACAX
ACAXR
ACAXU
ACAXW
ACB
ACBA
ACBAU
ACBAW
ACBI
ACBM
ACC
ACCA
ACCD
ACCO
ACCR
ACDC
ACDCW
ACDI-UN
ACDI
ACEL
ACER
ACET
ACEV
ACEVU
ACEVW
ACFN
ACGL
ACGLN
ACGLO
ACGLP
ACGN
ACGX
ACH
ACHC
ACHHY
ACHL
ACHN
ACHR-WT
ACHR
ACHV
ACI
ACIC
ACII-UN
ACII-WT
ACII
ACIU
ACIW
ACKIT
ACKIU
ACKIW
ACLS
ACLX
ACM
ACMB
ACMC
ACMR
ACMSY
ACN
ACNB
ACNT
ACNV
ACOG
ACON
ACONW
ACOR
ACPS
ACQR
ACQRU
ACQRW
ACR-PC
ACR-PD
ACR
ACRDF
ACRE
ACRG
ACRHF
ACRO-UN
ACRO-WT
ACRO
ACRS


{'A': {'in_eod_not_in_cap_dates': [], 'in_cap_not_in_eod_dates': []},
 'AA': {'in_eod_not_in_cap_dates': ['1975-04-20',
   '1975-04-13',
   '1975-04-06',
   '1975-03-30',
   '1975-03-23',
   '1975-03-16',
   '1975-03-09',
   '1975-03-02',
   '1975-02-23',
   '1974-04-21',
   '1974-04-14',
   '1974-04-07',
   '1974-03-31',
   '1974-03-24',
   '1974-03-17',
   '1974-03-10',
   '1974-03-03',
   '1974-02-24',
   '1974-02-18',
   '1974-02-10',
   '1974-02-03',
   '1974-01-27',
   '1974-01-20',
   '1974-01-13',
   '1974-01-06'],
  'in_cap_not_in_eod_dates': ['1975-04-25',
   '1975-04-18',
   '1975-04-11',
   '1975-04-04',
   '1975-03-27',
   '1975-03-21',
   '1975-03-14',
   '1975-03-07',
   '1975-02-28',
   '1974-04-26',
   '1974-04-19',
   '1974-04-11',
   '1974-04-05',
   '1974-03-29',
   '1974-03-22',
   '1974-03-15',
   '1974-03-08',
   '1974-03-01',
   '1974-02-22',
   '1974-02-15',
   '1974-02-08',
   '1974-02-01',
   '1974-01-25',
   '1974-01-18',
   '1974-01-11']},
 'AAC-UN': {'in_e

In [4]:
with open('aapl.json', 'r',encoding="utf-8") as file:
    aapl = json.load(file)
    
len(aapl["eod"]) == len(aapl["market_cap"])

True

In [7]:
with open("data/metadata/eod_cap_discrepancies.json", 'w') as f_out:
    json.dump(eod_market_not_equal_dict, f_out)

In [None]:
in_eod_not_in_cap_dates_count = 0
in_cap_not_in_eod_dates_count = 0
for symbol in eod_market_not_equal_dict:
    in_cap_not_in_eod_dates = eod_market_not_equal_dict["in_cap_not_in_eod_dates"]
    in_eod_not_in_cap_dates = eod_market_not_equal_dict["in_eod_not_in_cap_dates"]
    
    if len(in_eod_not_in_cap_dates)>0:
        in_eod_not_in_cap_dates_count+=1
        
    if  len(in_cap_not_in_eod_dates)>0:
        in_cap_not_in_eod_dates_count+=1

print("in_eod_not_in_cap_dates_count", in_eod_not_in_cap_dates_count)
print("in_cap_not_in_eod_dates_count", in_cap_not_in_eod_dates_count)

KeyError: 'in_cap_not_in_eod_dates'

In [3]:
stocks = load_all_stocks()
symbols = set()
for stock in stocks:
    symbols.add(stock["symbol"])

In [None]:
print(len(symbols))

13364


In [6]:
from src.config.settings import REFERENCE_DIR
stock_profiles_path = REFERENCE_DIR / "stock_profiles.json"
stock_profiles = read_json(stock_profiles_path)

In [17]:
missing_stocks = [stock["symbol"] for stock in stock_profiles if stock["symbol"] not in symbols]
missing_stocks

['AMBC-WT',
 'AXAC-RI',
 'MIMO-WT',
 'NICH',
 'OPA-WT',
 'PNTM-WT',
 'PSPC-WT',
 'TRAQ-WT']

In [None]:
from src.data_extraction.stock_info import get_stock_info

In [15]:
info = [get_stock_info(stock) for stock in missing_stocks]

IndexError: list index out of range

In [16]:
NICH_info = get_stock_info(missing_stocks[3])

OverflowError: cannot convert float infinity to integer