In [None]:
import pandas as pd
import requests
from datetime import datetime
import time

In [2]:
headers = {'User-Agent': "kotavenkateshgnanashri@gmail.com"}

In [3]:
company_tickers_all = requests.get("https://www.sec.gov/files/company_tickers.json", headers = headers)

In [5]:
first_ticker_preview = company_tickers_all.json()['0']
print(first_ticker_preview)

{'cik_str': 1045810, 'ticker': 'NVDA', 'title': 'NVIDIA CORP'}


In [6]:
cik_val_check = company_tickers_all.json()['0']['cik_str']
print(cik_val_check)


1045810


In [7]:
company_data = pd.DataFrame.from_dict(company_tickers_all.json(), orient='index')
company_data[:1]

Unnamed: 0,cik_str,ticker,title
0,1045810,NVDA,NVIDIA CORP


In [8]:
company_data['cik_str'] = company_data['cik_str'].astype(str).str.zfill(10)
company_data[:10]

Unnamed: 0,cik_str,ticker,title
0,1045810,NVDA,NVIDIA CORP
1,320193,AAPL,Apple Inc.
2,789019,MSFT,MICROSOFT CORP
3,1652044,GOOGL,Alphabet Inc.
4,1018724,AMZN,AMAZON COM INC
5,1730168,AVGO,Broadcom Inc.
6,1326801,META,"Meta Platforms, Inc."
7,1318605,TSLA,"Tesla, Inc."
8,1067983,BRK-B,BERKSHIRE HATHAWAY INC
9,59478,LLY,ELI LILLY & Co


In [9]:
len(company_data)

10196

In [10]:
company_data = company_data.head(200)
len(company_data)

200

In [11]:
cik = company_data['cik_str'].iloc[0]
print(cik)

0001045810


In [12]:

submission_dara_url =  requests.get(f'https://data.sec.gov/submissions/CIK{cik}.json', headers=headers)


In [13]:
print(submission_dara_url.json().keys())

dict_keys(['cik', 'entityType', 'sic', 'sicDescription', 'ownerOrg', 'insiderTransactionForOwnerExists', 'insiderTransactionForIssuerExists', 'name', 'tickers', 'exchanges', 'ein', 'lei', 'description', 'website', 'investorWebsite', 'category', 'fiscalYearEnd', 'stateOfIncorporation', 'stateOfIncorporationDescription', 'addresses', 'phone', 'flags', 'formerNames', 'filings'])


In [14]:
print(submission_dara_url.json()['cik'])
entityType = submission_dara_url.json()['entityType']
print(entityType)
print(submission_dara_url.json()['sicDescription'])
print(submission_dara_url.json()['name'])
print(submission_dara_url.json()['addresses']['business']["zipCode"])

0001045810
operating
Semiconductors & Related Devices
NVIDIA CORP
95051


In [15]:
company_facts = requests.get(
    f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json',
    headers=headers
    )

In [98]:
# print(company_facts.json().keys())
# print(company_facts.json()['facts'].keys())
# print(company_facts.json()['facts']['us-gaap'].keys())
# print(company_facts.json()['facts']['us-gaap'].keys())
# print(company_facts.json()['facts']['us-gaap']['Revenues'].keys())
# print(company_facts.json()['facts']['us-gaap']['Revenues']["label"])
# print(company_facts.json()['facts']['us-gaap']['Revenues']["units"]["USD"])
# revenues = company_facts.json()['facts']['us-gaap']['Revenues']["units"]["USD"]
# vals = [entry['val'] for entry in revenues]
# print(vals)

In [43]:
from collections import defaultdict

revenues = company_facts.json()['facts']['us-gaap']['Revenues']["units"]["USD"]

current_year = datetime.now().year

def frame_year(frame):
    """Extract calendar year from frame like 'CY2024' or 'CY2024Q1'."""
    if frame and frame.startswith("CY"):
        return int(frame[2:6])
    return None

calendar_revenue = {}

for entry in revenues:
    if 'frame' in entry:
        year = frame_year(entry['frame'])
        if year and current_year - 5 < year <= current_year:
            if entry['frame'] == f"CY{year}":
                calendar_revenue[year] = entry['val']

quarterly_sums = defaultdict(int)
for entry in revenues:
    if 'frame' in entry:
        year = frame_year(entry['frame'])
        if year and current_year - 5 < year <= current_year:
            if year not in calendar_revenue and entry['frame'].startswith(f"CY{year}Q"):
                quarterly_sums[year] += entry['val']

for year, val in quarterly_sums.items():
    calendar_revenue[year] = val

for year in sorted(calendar_revenue):
    print(f"{year}: {calendar_revenue[year]:,} USD")


2021: 26,914,000,000 USD
2022: 26,974,000,000 USD
2023: 60,922,000,000 USD
2024: 130,497,000,000 USD
2025: 147,811,000,000 USD


In [62]:
resp = requests.get("https://www.sec.gov/files/company_tickers.json", headers=headers)
sec_data = resp.json()

tickers_df = pd.DataFrame.from_dict(sec_data, orient='index')
tickers_df.columns = ["cik", "ticker", "company_name"]

company_list = tickers_df["ticker"].head(400).tolist()

print(company_list)



['NVDA', 'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'AVGO', 'META', 'TSLA', 'BRK-B', 'LLY', 'WMT', 'JPM', 'V', 'ORCL', 'SPY', 'XOM', 'JNJ', 'MA', 'NFLX', 'ABBV', 'COST', 'ASML', 'BAC', 'PLTR', 'BABA', 'PG', 'AMD', 'HD', 'GE', 'KO', 'CVX', 'CSCO', 'UNH', 'CYATY', 'AZN', 'SAP', 'IBM', 'WFC', 'CAT', 'MS', 'TM', 'PM', 'AXP', 'NVS', 'MRK', 'TMUS', 'GS', 'HSBC', 'RTX', 'QQQ', 'MU', 'MCD', 'ABT', 'CRM', 'TMO', 'NVO', 'SHEL', 'RY', 'PEP', 'ISRG', 'LIN', 'HDB', 'SHOP', 'DIS', 'AMGN', 'T', 'C', 'INTU', 'APP', 'LRCX', 'NEE', 'AMAT', 'UBER', 'QCOM', 'VZ', 'MUFG', 'BX', 'NOW', 'SONY', 'TJX', 'SCHW', 'INTC', 'PDD', 'APH', 'DHR', 'DTEGY', 'GILD', 'BLK', 'GEV', 'ANET', 'ACN', 'SPGI', 'SAN', 'BKNG', 'UL', 'KLAC', 'BSX', 'ARM', 'TD', 'TXN', 'PFE', 'SYK', 'TTE', 'HTHIY', 'BA', 'RTNTF', 'WELL', 'BHP', 'PGR', 'ADBE', 'UNP', 'UTX', 'COF', 'DE', 'LOW', 'ETN', 'MDT', 'HON', 'PANW', 'CRWD', 'SPOT', 'SNY', 'BTI', 'BUD', 'CB', 'BBVA', 'UBS', 'PLD', 'RIO', 'IBN', 'HCA', 'ADI', 'LMT', 'COP', 'VRTX', 'CEG', 'SMFG', 'KKR', 'E

In [83]:
def get_company_info(ticker):
    try:
        stock = yf.Ticker(ticker)
        info = stock.info

        company_name = info.get("longName")
        industry = info.get("industry")
        country = info.get("country")

        currency = info.get("financialCurrency") or info.get("currency")

        return company_name, industry, country, currency

    except Exception as e:
        print(f"Error in company info for {ticker}: {e}")
        return None, None, None, None


In [84]:
def get_financials(ticker):
    try:
        stock = yf.Ticker(ticker)
        fin = stock.income_stmt  

        if "Total Revenue" not in fin.index:
            print(f"No revenue found for {ticker}")
            return []

        revenue_series = fin.loc["Total Revenue"]

        rev_list = []
        for date, value in revenue_series.items():
            rev_list.append({
                "year": date.year,
                "revenue": value
            })
        rev_list = sorted(rev_list, key=lambda x: x["year"], reverse=True)
        return rev_list[:3]

    except Exception as e:
        print("Error in financials for", ticker, e)
        return []


In [85]:
def process_company(ticker):
    name, industry, country , currency= get_company_info(ticker)
    revenues = get_financials(ticker)

    rows = []

    for r in revenues:
        rows.append({
            "ticker": ticker,
            "company_name": name,
            "country": country,
            "industry": industry,
            "year": r["year"],
            "revenue": r["revenue"],
            "revenue_unit": currency
        })

    return rows

In [None]:
all_rows = []
for t in company_list:
    rows = process_company(t)
    all_rows.extend(rows)
    time.sleep(0.3)

df = pd.DataFrame(all_rows)


 → NVDA
 → AAPL
 → MSFT
 → GOOGL
 → AMZN
 → AVGO
 → META
 → TSLA
 → BRK-B
 → LLY
 → WMT
 → JPM
 → V
 → ORCL
 → SPY
No revenue found for SPY
 → XOM
 → JNJ
 → MA
 → NFLX
 → ABBV
 → COST
 → ASML
 → BAC
 → PLTR
 → BABA
 → PG
 → AMD
 → HD
 → GE
 → KO
 → CVX
 → CSCO
 → UNH
 → CYATY
 → AZN
 → SAP
 → IBM
 → WFC
 → CAT
 → MS
 → TM
 → PM
 → AXP
 → NVS
 → MRK
 → TMUS
 → GS
 → HSBC
 → RTX
 → QQQ
No revenue found for QQQ
 → MU
 → MCD
 → ABT
 → CRM
 → TMO
 → NVO
 → SHEL
 → RY
 → PEP
 → ISRG
 → LIN
 → HDB
 → SHOP
 → DIS
 → AMGN
 → T
 → C
 → INTU
 → APP
 → LRCX
 → NEE
 → AMAT
 → UBER
 → QCOM
 → VZ
 → MUFG
 → BX
 → NOW
 → SONY
 → TJX
 → SCHW
 → INTC
 → PDD
 → APH
 → DHR
 → DTEGY
 → GILD
 → BLK
 → GEV
 → ANET
 → ACN
 → SPGI
 → SAN
 → BKNG
 → UL
 → KLAC
 → BSX
 → ARM
 → TD
 → TXN
 → PFE
 → SYK
 → TTE
 → HTHIY
 → BA
 → RTNTF
 → WELL
 → BHP
 → PGR
 → ADBE
 → UNP
 → UTX
No revenue found for UTX
 → COF
 → DE
 → LOW
 → ETN
 → MDT
 → HON
 → PANW
 → CRWD
 → SPOT
 → SNY
 → BTI
 → BUD
 → CB
 → BBVA
 → UBS
 → PLD


In [87]:
print(df)

     ticker        company_name        country                   industry  \
0      NVDA  NVIDIA Corporation  United States             Semiconductors   
1      NVDA  NVIDIA Corporation  United States             Semiconductors   
2      NVDA  NVIDIA Corporation  United States             Semiconductors   
3      AAPL          Apple Inc.  United States       Consumer Electronics   
4      AAPL          Apple Inc.  United States       Consumer Electronics   
...     ...                 ...            ...                        ...   
1174    PUK      Prudential plc      Hong Kong           Insurance - Life   
1175    PUK      Prudential plc      Hong Kong           Insurance - Life   
1176   CRWV     CoreWeave, Inc.  United States  Software - Infrastructure   
1177   CRWV     CoreWeave, Inc.  United States  Software - Infrastructure   
1178   CRWV     CoreWeave, Inc.  United States  Software - Infrastructure   

      year       revenue revenue_unit  
0     2025  1.304970e+11          U

In [89]:
comapnies_number=len(df["ticker"].unique())
print(comapnies_number)

comapnies_number=df["country"].unique()
print(comapnies_number)

comapnies_number= df["revenue_unit"].unique()
print(comapnies_number)

393
['United States' 'Netherlands' 'Hong Kong' 'China' 'United Kingdom'
 'Germany' 'Japan' 'Switzerland' 'Denmark' 'Canada' 'India' 'Ireland'
 'Spain' 'France' 'Australia' 'Luxembourg' 'Belgium' 'Uruguay' 'Brazil'
 'Singapore' 'Mexico' 'Italy' 'Norway' 'Indonesia']
['USD' 'EUR' 'CNY' 'JPY' 'DKK' 'CAD' 'INR' 'GBP' 'BRL' 'MXN']


In [90]:
df.to_csv('financials.csv', index=False)

In [91]:
df.head()

Unnamed: 0,ticker,company_name,country,industry,year,revenue,revenue_unit
0,NVDA,NVIDIA Corporation,United States,Semiconductors,2025,130497000000.0,USD
1,NVDA,NVIDIA Corporation,United States,Semiconductors,2024,60922000000.0,USD
2,NVDA,NVIDIA Corporation,United States,Semiconductors,2023,26974000000.0,USD
3,AAPL,Apple Inc.,United States,Consumer Electronics,2025,416161000000.0,USD
4,AAPL,Apple Inc.,United States,Consumer Electronics,2024,391035000000.0,USD


In [95]:
df.isna().sum()

ticker           0
company_name    18
country          0
industry         0
year             0
revenue          2
revenue_unit     0
dtype: int64

In [96]:
df = df.dropna(subset=["company_name"]).copy()

In [97]:
df.isna().sum()

ticker          0
company_name    0
country         0
industry        0
year            0
revenue         2
revenue_unit    0
dtype: int64

TO DO

1. Add KPIs and calculate them
2. Create streamlit app for the same