<h1>
Stock ML Scraping, Cleaning, and Analysis
</h1>

<p>
No AI tools were used.
</p>


The architecture I am planning is:

yfinance (scrape most active tickers) -> Google Finance (get basic financial info) -> 12api (get stock price for a certain period)

Put these together into data to be analyzed with ML to try and define a strategy to "beat" the market.

In [2]:
# first step scrape yfinance for later ingesting with stock class.
from bs4 import BeautifulSoup
import requests
import time
import re



def get_yfinance_tickers():
    """
    input: None
    output: returns tickers of most active stocks, stops collecting when yfinance runs out (1-3 loops)
    """
    # list to hold data
    tickers = []

    # default starting index for 100 is ?start=0&count=100
    start_ind = 0
    count = 100

    
    

    #header for request
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    }

    while True:
        #base URL (needs to change every loop)
        url = f"https://finance.yahoo.com/markets/stocks/most-active/?start={start_ind}&count={count}"
        # stop on error, call soup
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # grab main container
        container = soup.select_one("div.container.yf-1bczin")

        # if not main container stop
        if not container:
            break


        # if no more data stop
        if container.select_one("div.no-data"):
            break

        links = soup.find_all("a", {"data-testid": "table-cell-ticker"})

        
        for link in links:
            href = link.get("href")

            # These are the link extensions to go to individual pages, 
            # in format /quote/SNAP/, which would be accessed like
            # https://finance.yahoo.com/quote/MU/
            # instead of drilling TWO levels deeper using BS4,
            # which is a buggy and annoying process,
            # the best practice I've found is to use RegEx.
            #
            # RegEx reduces time and memory complexity
            # by skpping over redundant soup layering.
            #
            # I am opting to use regex to remove the chars:
            #   /quote/ /

            match = re.search(r"/quote/([A-Z0-9.-]+)", href)
            if match:
                cleaned_title = match.group(1)
                tickers.append(cleaned_title)
        time.sleep(2)

        start_ind += 100

    return tickers

tickers = get_yfinance_tickers()
print(tickers)




['NVDA', 'U', 'F', 'INTC', 'HOOD', 'PLUG', 'LYFT', 'RIG', 'ONDS', 'DAY', 'TSLA', 'AMZN', 'SOFI', 'PLTR', 'SHOP', 'SNAP', 'GGB', 'MU', 'MARA', 'T', 'IREN', 'AAPL', 'MAT', 'WULF', 'BBAI', 'BMNR', 'LYG', 'VZ', 'BTG', 'OPEN', 'KHC', 'HIMS', 'GRAB', 'BBD', 'CIFR', 'MSFT', 'AMD', 'GOOGL', 'RGTI', 'NFLX', 'DNN', 'VALE', 'AVTR', 'PBR', 'PATH', 'APLD', 'QBTS', 'ORCL', 'COMP', 'SMR', 'NOW', 'NIO', 'ITUB', 'AAL', 'VRT', 'SNDK', 'BAC', 'EOSE', 'ACHR', 'CLSK', 'AG', 'SOUN', 'RKT', 'PFE', 'UUUU', 'NOK', 'GTM', 'MSTR', 'CRWV', 'TSM', 'NU', 'FRSH', 'CMG', 'UBER', 'UPST', 'SMCI', 'GOOG', 'SCHW', 'RKLB', 'SW', 'CLF', 'AXL', 'ALAB', 'RIVN', 'CYBR', 'PBR-A', 'IBRX', 'NET', 'CPNG', 'PYPL', 'KVUE', 'CSCO', 'HL', 'NBIS', 'MRNA', 'PSLV', 'IONQ', 'XOM', 'RIOT', 'ABEV', 'CDE', 'KO', 'BWA', 'ASTS', 'QXO', 'NVTS', 'OWL', 'WBD', 'HBAN', 'CMCSA', 'FIG', 'AEG', 'BE', 'RUN', 'KVYO', 'JOBY', 'SLB', 'AVGO', 'NKE', 'PANW', 'NGD', 'CFLT', 'COIN', 'FCX', 'USAR', 'OSCR', 'CRM', 'TDC', 'NCLH', 'CCL', 'STLA', 'CCC', 'QS', 'B

In [3]:
# scraping yfinance info
# base url https://finance.yahoo.com/quote/MU/

# price will be gotten later with twelve api
#

def scrape_yfinance_info(list_of_tickers):
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    }
    data_dic = {}
    for ticker in list_of_tickers:

        url = f"https://finance.yahoo.com/quote/{ticker}/"
        # stop on error, call soup
        response = requests.get(url, headers=headers)
        try:
            response.raise_for_status()
        except:
            continue
            
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the container div
        statistics_div = soup.find("div", {"data-testid": "quote-statistics"})

        if not statistics_div:
            print(f"No stats found for {ticker}")
            data_dic[ticker] = {}
            continue


        rows = statistics_div.find_all("li")

        data = {}
        
        for row in rows:

            label = row.find("span", class_="label")
            value = row.find("span", class_="value")

            if label and value:
                label_text = label.get_text(strip=True)
                value_text = value.get_text(strip=True)

                data[label_text] = value_text

        data_dic[ticker] = data
        time.sleep(2.5)

    return data_dic

enriched_tickers = scrape_yfinance_info(tickers)
print(enriched_tickers)

No stats found for DAY
{'NVDA': {'Previous Close': '188.54', 'Open': '192.42', 'Bid': '191.01 x 100', 'Ask': '191.27 x 200', "Day's Range": '188.77 - 193.26', '52 Week Range': '86.62 - 212.19', 'Volume': '77,313,358', 'Avg. Volume': '180,745,911', 'Market Cap (intraday)': '4.649T', 'Beta (5Y Monthly)': '2.31', 'PE Ratio (TTM)': '47.38', 'EPS (TTM)': '4.03', 'Earnings Date': 'Feb 25, 2026', 'Forward Dividend & Yield': '0.04 (0.02%)', 'Ex-Dividend Date': 'Dec 4, 2025', '1y Target Est': '253.79'}, 'U': {'Previous Close': '29.06', 'Open': '21.41', 'Bid': '20.84 x 20000', 'Ask': '20.82 x 10000', "Day's Range": '18.80 - 21.97', '52 Week Range': '15.33 - 52.15', 'Volume': '72,882,314', 'Avg. Volume': '11,062,314', 'Market Cap (intraday)': '8.67B', 'Beta (5Y Monthly)': '2.09', 'PE Ratio (TTM)': '--', 'EPS (TTM)': '-1.05', 'Earnings Date': 'Feb 11, 2026', 'Forward Dividend & Yield': '--', 'Ex-Dividend Date': '--', '1y Target Est': '45.74'}, 'F': {'Previous Close': '13.72', 'Open': '13.78', 'Bid

In [None]:
"""
'NVDA': {'Previous Close': '174.19', 'Open': '174.93', 'Bid': '171.76 x 5000', 'Ask': '180.37 x 100',
 "Day's Range": '171.03 - 176.81', '52 Week Range': '86.62 - 212.19', 'Volume': '201,785,799',
   'Avg. Volume': '181,588,861', 'Market Cap (intraday)': '4.185T', 'Beta (5Y Monthly)': '2.31',
     'PE Ratio (TTM)': '42.54', 'EPS (TTM)': '4.04', 'Earnings Date': 'Feb 25, 2026',
       'Forward Dividend & Yield': '0.04 (0.02%)', 'Ex-Dividend Date': 'Dec 4, 2025', '1y Target Est': '253.62'},
"""

# we are going to add a function now that cleans this data, that was gathered from yfinance... not sure why CIVI didn't
# get properly scraped, nothing seemed broken about the link, but I will investigate that at a later date.



def clean_yfinance_data(yfinance_data: dict):
    """
    input: yfinance data
    output cleaned yfinance data
    """
    for yfin_ent in yfinance_data.keys():
        try:
          print(yfinance_data[yfin_ent]["Previous Close"])
        except KeyError:
           print("KEY ERROR", yfin_ent)

clean_yfinance_data(enriched_tickers)


NVDA
U
F
INTC
HOOD
PLUG
LYFT
RIG
ONDS
DAY


KeyError: 'Previous Close'

In [5]:
# quick function to save the dict to a file after cleaning and date the file today's date.

In [6]:
# next up one by one we are going to assemble a dictionary that has key's stocks, and google finance
# info as a nested dictionary of information, e.g. {'F':{'q1_2025_profit':100000, 'market_cap':10000, etc.}}


# going to still do google also but google requires knowing the index too which is annoying, but just means
# I need to get this information from yfinance first.


def get_google_info_w_yfin_tickers(list_of_tickers):
    """
    input: List of tickers
    output: returns dictionary of dictionary of tickers and info.
    in the format of {'F':{'q1_2025_profit':100000, market_cap, etc.}}
    """
    #header for request, in the future state this can be a class var
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    }
    base_url = f'https://'
    for i in list_of_tickers:

IndentationError: expected an indented block (1024306176.py, line 20)