<h1>
Stock ML Scraping, Cleaning, and Analysis
</h1>

<p>
No AI tools were used.
</p>


The architecture I am planning is:

Yahoo Finance (scrape the most active tickers) -> Google Finance (get basic financial info) -> Twelve Data API (get stock prices for a given period)

Combine these sources into a single dataset and analyze it with ML to try to define a strategy that "beats" the market.

In [2]:
# Step 1: scrape Yahoo Finance for the most active tickers, to be ingested later by the stock class.
from bs4 import BeautifulSoup
import requests
import time
import re



def get_yfinance_tickers(count=100, pause_seconds=2, timeout=15):
    """
    Scrape the ticker symbols listed on Yahoo Finance's "most active" screener.

    input:
        count: rows requested per page; also the step used to advance ?start=
        pause_seconds: delay between consecutive page requests
        timeout: seconds to wait for each HTTP response before giving up
    output: list of unique tickers in the order they were first seen.
        Collection stops when the main container is missing, when the page
        shows its "no data" marker, or when a page contributes no ticker that
        has not already been collected (guards against looping forever if the
        site ignores the ?start= offset).
    raises: requests.RequestException if a request fails, times out, or
        returns an error status.
    """
    # header for request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    }

    tickers = []    # result, in first-seen order
    seen = set()    # fast membership check so pages cannot add duplicates
    start_ind = 0   # paging offset, i.e. ?start=0&count=100 for the first page

    while True:
        # URL changes every loop because the offset advances
        url = f"https://finance.yahoo.com/markets/stocks/most-active/?start={start_ind}&count={count}"

        # A timeout keeps a stalled connection from hanging the cell forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Main container. NOTE(review): "yf-1bczin" was copied from the page
        # markup; if that class name changes, the container is not found and
        # the loop ends with whatever has been collected so far.
        container = soup.select_one("div.container.yf-1bczin")
        if not container or container.select_one("div.no-data"):
            break

        new_on_page = 0
        for link in soup.find_all("a", {"data-testid": "table-cell-ticker"}):
            # Each href looks like /quote/SNAP/ (full page:
            # https://finance.yahoo.com/quote/SNAP/). A regex on the href pulls
            # the symbol out directly instead of walking deeper into the soup.
            # An anchor without an href is treated as a non-match.
            match = re.search(r"/quote/([A-Z0-9.-]+)", link.get("href") or "")
            if not match:
                continue

            symbol = match.group(1)
            if symbol in seen:
                continue

            seen.add(symbol)
            tickers.append(symbol)
            new_on_page += 1

        # Nothing new on this page: either the list is exhausted or the same
        # page is being served again, so stop instead of requesting forever.
        if new_on_page == 0:
            break

        start_ind += count
        time.sleep(pause_seconds)

    return tickers

# Run the scraper once and keep the result in `tickers`; the enrichment cell
# further down passes this list to scrape_yfinance_info.
tickers = get_yfinance_tickers()
print(tickers)




['NVDA', 'SNAP', 'NIO', 'PLUG', 'KVUE', 'INTC', 'ONDS', 'PLTR', 'CADE', 'GOOGL', 'F', 'SOFI', 'MARA', 'AMZN', 'GRAB', 'HIMS', 'OWL', 'TSLA', 'BMNR', 'PFE', 'CFLT', 'CCC', 'BBD', 'OPEN', 'AMD', 'MSFT', 'NU', 'SMCI', 'HOOD', 'ABEV', 'IREN', 'MSTR', 'NVO', 'ACHR', 'AAPL', 'VALE', 'NFLX', 'CPNG', 'GOOG', 'DNN', 'BTG', 'WULF', 'AAL', 'CIFR', 'PYPL', 'AVGO', 'CLSK', 'RIG', 'JOBY', 'GGB', 'ORCL', 'MU', 'BSX', 'ITUB', 'PATH', 'VZ', 'HBAN', 'QBTS', 'T', 'NOW', 'RGTI', 'SAN', 'UBER', 'RIOT', 'WMT', 'SOUN', 'DAY', 'PSLV', 'WBD', 'EOSE', 'ADT', 'CDE', 'QCOM', 'IONQ', 'APLD', 'COIN', 'BAC', 'LUMN', 'SNDK', 'B', 'RIVN', 'SIRI', 'CMCSA', 'NGD', 'CSCO', 'BMY', 'NOK', 'CRWV', 'ARM', 'SLB', 'HL', 'BTE', 'TTD', 'AG', 'CARR', 'RKT', 'AUR', 'SMR', 'HPE', 'HPQ', 'PTEN', 'U', 'RKLB', 'JBLU', 'MRK', 'CRM', 'USAR', 'FIG', 'PBR', 'FLNC', 'QS', 'VOD', 'FCX', 'CMG', 'BULL', 'KO', 'IBRX', 'EL', 'CORZ', 'UWMC', 'SHOP', 'TOST', 'APH', 'MRVL', 'RBLX', 'STLA', 'C', 'NBIS', 'CCL', 'XOM', 'LYFT', 'LYG', 'ASTS', 'KKR', '

In [None]:
# scraping yfinance info
# base url https://finance.yahoo.com/quote/MU/

# Price history will be fetched later from the Twelve Data API.
#

def scrape_yfinance_info(list_of_tickers, pause_seconds=2.5, timeout=15):
    """
    Scrape the quote-statistics panel of each ticker's Yahoo Finance page.

    input:
        list_of_tickers: iterable of ticker symbols, e.g. ['MU', 'F']
        pause_seconds: delay after every ticker, whether or not it succeeded
        timeout: seconds to wait for each HTTP response before giving up
    output: dictionary of dictionaries, {ticker: {label_text: value_text}},
        where both label and value are the raw strings shown on the page.
        A ticker whose page has no statistics panel maps to {}.
        A ticker whose request fails (connection error, timeout, or error
        status) is reported with a message and left out of the result.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    }
    data_dic = {}

    for ticker in list_of_tickers:
        url = f"https://finance.yahoo.com/quote/{ticker}/"

        try:
            # The request itself sits inside the try so that one unreachable
            # page cannot abort the whole run; the timeout bounds each wait.
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as err:
            # Only request-level failures are skipped; anything else
            # (including KeyboardInterrupt) still propagates.
            print(f"Request failed for {ticker}: {err}")
        else:
            soup = BeautifulSoup(response.text, "html.parser")

            # Container div holding the statistics list
            statistics_div = soup.find("div", {"data-testid": "quote-statistics"})

            if not statistics_div:
                print(f"No stats found for {ticker}")
                data_dic[ticker] = {}
            else:
                data = {}
                for row in statistics_div.find_all("li"):
                    label = row.find("span", class_="label")
                    value = row.find("span", class_="value")

                    # Rows missing either span are ignored.
                    if label and value:
                        data[label.get_text(strip=True)] = value.get_text(strip=True)

                data_dic[ticker] = data

        # Pause on every iteration, failures included, so an error streak
        # (for example rate limiting) does not turn into rapid-fire requests.
        time.sleep(pause_seconds)

    return data_dic

# Enrich every scraped ticker with its quote statistics. The result maps each
# ticker to a dict of label -> value strings taken from the page ({} when no
# statistics panel was found).
enriched_tickers = scrape_yfinance_info(tickers)
print(enriched_tickers)

No stats found for CIVI
{'NVDA': {'Previous Close': '174.19', 'Open': '174.93', 'Bid': '171.76 x 5000', 'Ask': '180.37 x 100', "Day's Range": '171.03 - 176.81', '52 Week Range': '86.62 - 212.19', 'Volume': '201,785,799', 'Avg. Volume': '181,588,861', 'Market Cap (intraday)': '4.185T', 'Beta (5Y Monthly)': '2.31', 'PE Ratio (TTM)': '42.54', 'EPS (TTM)': '4.04', 'Earnings Date': 'Feb 25, 2026', 'Forward Dividend & Yield': '0.04 (0.02%)', 'Ex-Dividend Date': 'Dec 4, 2025', '1y Target Est': '253.62'}, 'SNAP': {'Previous Close': '6.93', 'Open': '6.90', 'Bid': '6.69 x 2280000', 'Ask': '6.70 x 4930000', "Day's Range": '6.64 - 6.93', '52 Week Range': '6.64 - 11.71', 'Volume': '51,429,354', 'Avg. Volume': '44,972,706', 'Market Cap (intraday)': '11.451B', 'Beta (5Y Monthly)': '0.80', 'PE Ratio (TTM)': '--', 'EPS (TTM)': '-0.28', 'Earnings Date': 'Feb 4, 2026', 'Forward Dividend & Yield': '--', 'Ex-Dividend Date': '--', '1y Target Est': '9.81'}, 'NIO': {'Previous Close': '4.4400', 'Open': '4.7800

In [None]:
# Next, ticker by ticker, assemble a dictionary whose keys are stock tickers and whose values are nested
# dictionaries of Google Finance info, e.g. {'F':{'q1_2025_profit':100000, 'market_cap':10000, etc.}}


# Google Finance is still part of the plan, but it also requires knowing the index for each ticker,
# which is annoying; it just means that information has to be collected from yfinance first.


def get_google_info_w_yfin_tickers(list_of_tickers):
    """
    input: List of tickers
    output: returns dictionary of dictionary of tickers and info.
    in the format of {'F':{'q1_2025_profit':100000, market_cap, etc.}}
    """
    #header for request, in the future state this can be a class var
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    }
    base_url = f'https://'
    for i in list_of_tickers: