<h1>
Stock ML Scraping, Cleaning, and Analysis
</h1>

<p>
No AI tools were used.
</p>


The architecture I am planning is:

yfinance (scrape most active tickers) -> Google Finance (get basic financial info) -> 12api (get stock price for a certain period)

Combine these into a single dataset to be analyzed with ML, to try to define a strategy that "beats" the market.

In [1]:
# first step scrape yfinance for later ingesting with stock class.
from bs4 import BeautifulSoup
import requests
import time
import re



def get_yfinance_tickers():
    """
    Scrape the ticker symbols listed on Yahoo Finance's "most active" page.

    input: None
    output: list of ticker strings (e.g. "NVDA", "PBR-A") in the order they
            were first seen, with duplicates removed.

    Pages are requested ``count`` rows at a time. Paging stops when:
      * the main container is missing from the page,
      * the page shows the "no-data" marker, or
      * a page contributes no ticker that has not been seen already
        (guards against looping forever on an empty or repeated page).

    raises: whatever ``requests.get`` / ``raise_for_status`` raise on a
            network failure, timeout, or non-success HTTP status.
    """
    # collected symbols, plus a set for O(1) duplicate checks
    tickers = []
    seen = set()

    # paging parameters: ?start=<start_ind>&count=<count>
    start_ind = 0
    count = 100

    # seconds to wait for the server before giving up on a request
    request_timeout = 30

    # header for request
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    }

    # The ticker links look like /quote/SNAP/ ; capture just the symbol.
    # Compiled once because it is reused for every link on every page.
    ticker_pattern = re.compile(r"/quote/([A-Z0-9.-]+)")

    while True:
        # URL changes every loop because start_ind advances
        url = f"https://finance.yahoo.com/markets/stocks/most-active/?start={start_ind}&count={count}"

        # stop on HTTP error; the timeout keeps a stalled connection from hanging the cell
        response = requests.get(url, headers=headers, timeout=request_timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # grab main container
        # NOTE(review): "yf-1bczin" looks like a generated class name; if the
        # site changes it this lookup fails and the function returns early.
        container = soup.select_one("div.container.yf-1bczin")

        # if no main container, stop
        if not container:
            break

        # if no more data, stop
        if container.select_one("div.no-data"):
            break

        links = soup.find_all("a", {"data-testid": "table-cell-ticker"})

        added = 0
        for link in links:
            href = link.get("href")
            # an <a> without an href would make the regex search raise TypeError
            if not href:
                continue

            match = ticker_pattern.search(href)
            if not match:
                continue

            symbol = match.group(1)
            if symbol in seen:
                continue

            seen.add(symbol)
            tickers.append(symbol)
            added += 1

        # nothing new on this page: either it was empty or it repeated an
        # earlier page, so further paging cannot make progress
        if added == 0:
            break

        # be polite between page requests
        time.sleep(2)

        # advance by the page size actually requested
        start_ind += count

    return tickers

# Run the scraper and print the collected ticker symbols.
tickers = get_yfinance_tickers()
print(tickers)




['NVDA', 'F', 'RIG', 'U', 'INTC', 'PLUG', 'LYFT', 'SNAP', 'HOOD', 'AMZN', 'ONDS', 'GGB', 'TSLA', 'SHOP', 'BBAI', 'T', 'SOFI', 'PLTR', 'MARA', 'BBD', 'MU', 'VALE', 'AAL', 'PATH', 'KVUE', 'AAPL', 'VZ', 'RKT', 'BAC', 'OPEN', 'GOOGL', 'LYG', 'GRAB', 'MAT', 'WULF', 'DNN', 'ITUB', 'IREN', 'PFE', 'AVDL', 'NFLX', 'MSFT', 'COMP', 'KHC', 'ACHR', 'BMNR', 'AMD', 'AVTR', 'PBR', 'NOK', 'RGTI', 'HIMS', 'CIFR', 'BTG', 'CLSK', 'NU', 'SOUN', 'HBAN', 'NIO', 'CMCSA', 'CPNG', 'QBTS', 'UBER', 'SMR', 'GOOG', 'APLD', 'CMG', 'CSCO', 'GTM', 'MSTR', 'RIVN', 'FRSH', 'NOW', 'ORCL', 'ABEV', 'SCHW', 'SMCI', 'CLF', 'EOSE', 'WBD', 'SLB', 'PCG', 'SNDK', 'BSX', 'AG', 'AGNC', 'PYPL', 'JOBY', 'WMT', 'VRT', 'OWL', 'CRWV', 'XOM', 'KO', 'TSM', 'PBR-A', 'CNH', 'HL', 'SW', 'AVGO', 'UPST', 'AUR', 'CDE', 'NVTS', 'RKLB', 'IBRX', 'CRM', 'BTE', 'QXO', 'CCL', 'UUUU', 'RIOT', 'Z', 'ALAB', 'TOST', 'PR', 'STM', 'PANW', 'SAN', 'NET', 'RNG', 'NGD', 'DOC', 'CVE', 'CCC', 'META', 'MRNA', 'NCLH', 'BWA', 'NKE', 'JHX', 'FCX', 'RF', 'IONQ', 'ST

In [2]:
# scraping yfinance info
# base url https://finance.yahoo.com/quote/MU/

# price will be gotten later with twelve api
#

def scrape_yfinance_info(list_of_tickers):
    """
    Scrape the summary statistics shown on each ticker's Yahoo Finance quote page.

    input: iterable of ticker strings, e.g. ["NVDA", "F"]
    output: dict mapping ticker -> {label text: value text}; every value is
            the raw string from the page (numeric conversion happens later).

    Per-ticker outcomes:
      * request fails (network error, timeout, bad HTTP status): a message is
        printed and the ticker is left out of the result;
      * page loads but has no statistics block: a message is printed and the
        ticker maps to an empty dict;
      * otherwise the ticker maps to the scraped label/value pairs.
    """
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    }

    # seconds to wait for the server before giving up on a request
    request_timeout = 30

    data_dic = {}
    for position, ticker in enumerate(list_of_tickers):
        # Pause before every request except the first, so the delay also
        # applies after a ticker that failed or had no statistics.
        if position:
            time.sleep(2.5)

        url = f"https://finance.yahoo.com/quote/{ticker}/"

        # Keep the request itself inside the try: a single dropped connection
        # should skip one ticker, not throw away everything scraped so far.
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Request failed for {ticker}: {exc}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        # Find the container div
        statistics_div = soup.find("div", {"data-testid": "quote-statistics"})

        if not statistics_div:
            print(f"No stats found for {ticker}")
            data_dic[ticker] = {}
            continue

        # each <li> holds one label/value pair
        data = {}
        for row in statistics_div.find_all("li"):
            label = row.find("span", class_="label")
            value = row.find("span", class_="value")

            # skip rows that are missing either half of the pair
            if label and value:
                data[label.get_text(strip=True)] = value.get_text(strip=True)

        data_dic[ticker] = data

    return data_dic

# Scrape the quote-page statistics for every ticker collected earlier and print
# the raw result; the function sleeps between requests, so long lists take a while.
enriched_tickers = scrape_yfinance_info(tickers)
print(enriched_tickers)

No stats found for AXL
{'NVDA': {'Previous Close': '188.54', 'Open': '192.45', 'Bid': '189.96 x 300', 'Ask': '190.09 x 300', "Day's Range": '188.77 - 193.26', '52 Week Range': '86.62 - 212.19', 'Volume': '143,008,331', 'Avg. Volume': '180,745,911', 'Market Cap (intraday)': '4.627T', 'Beta (5Y Monthly)': '2.31', 'PE Ratio (TTM)': '47.16', 'EPS (TTM)': '4.03', 'Earnings Date': 'Feb 25, 2026', 'Forward Dividend & Yield': '0.04 (0.02%)', 'Ex-Dividend Date': 'Dec 4, 2025', '1y Target Est': '253.79'}, 'F': {'Previous Close': '13.72', 'Open': '13.78', 'Bid': '13.72 x 5490000', 'Ask': '13.73 x 19460000', "Day's Range": '13.67 - 13.88', '52 Week Range': '8.44 - 14.50', 'Volume': '23,003,662', 'Avg. Volume': '57,779,440', 'Market Cap (intraday)': '54.746B', 'Beta (5Y Monthly)': '1.67', 'PE Ratio (TTM)': '11.74', 'EPS (TTM)': '1.17', 'Earnings Date': 'Feb 10, 2026', 'Forward Dividend & Yield': '0.60 (4.37%)', 'Ex-Dividend Date': 'Feb 13, 2026', '1y Target Est': '13.76'}, 'RIG': {'Previous Close':

In [3]:
"""
'NVDA': {'Previous Close': '174.19', 'Open': '174.93', 'Bid': '171.76 x 5000', 'Ask': '180.37 x 100',
 "Day's Range": '171.03 - 176.81', '52 Week Range': '86.62 - 212.19', 'Volume': '201,785,799',
   'Avg. Volume': '181,588,861', 'Market Cap (intraday)': '4.185T', 'Beta (5Y Monthly)': '2.31',
     'PE Ratio (TTM)': '42.54', 'EPS (TTM)': '4.04', 'Earnings Date': 'Feb 25, 2026',
       'Forward Dividend & Yield': '0.04 (0.02%)', 'Ex-Dividend Date': 'Dec 4, 2025', '1y Target Est': '253.62'},
"""

# Next, add a function that cleans the data gathered from yfinance. Not sure why CIVI was not
# scraped properly (nothing seemed broken about the link); I will investigate that at a later date.




# this is a helper function to clean data values of "," such that they are properly removed before
# conversion to a float.

def safe_float(value):
    """
    Convert a scraped number to a float, tolerating thousands separators.

    input: a string such as "143,008,331" or "2.31", or a value that is
           already an int/float
    output: the value as a float, or None when it cannot be converted
            (e.g. "--", "N/A", None)
    """
    # Already numeric (for example when cleaning is run a second time on
    # already-cleaned data): pass it through instead of turning it into None.
    # bool is excluded because it is a subclass of int but is not a number here.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return float(value)

    try:
        return float(value.replace(',', ''))
    except (AttributeError, ValueError):
        # AttributeError: not a string; ValueError: string is not a number
        return None


# Fields whose scraped text is a plain number, possibly with thousands separators.
_NUMERIC_FIELDS = (
    "Previous Close",
    "Open",
    "Volume",
    "Avg. Volume",
    "Beta (5Y Monthly)",
    "PE Ratio (TTM)",
    "EPS (TTM)",
    "1y Target Est",
)

_MARKET_CAP_FIELD = "Market Cap (intraday)"

# Suffixes seen on the market-cap value: T(rillion), B(illion), M(illion).
_MARKET_CAP_MULTIPLIERS = {"T": 1e12, "B": 1e9, "M": 1e6}


def _parse_market_cap(raw_value):
    """Turn a market-cap string such as "4.627T" into a float, or None if unrecognised."""
    # already numeric: nothing to parse
    if isinstance(raw_value, (int, float)) and not isinstance(raw_value, bool):
        return float(raw_value)

    mcap_str = str(raw_value).replace(',', '')

    # Look only at the final character so a letter elsewhere in the text
    # cannot be mistaken for the magnitude suffix.
    multiplier = _MARKET_CAP_MULTIPLIERS.get(mcap_str[-1:])
    if multiplier is not None:
        try:
            return float(mcap_str[:-1]) * multiplier
        except ValueError:
            # suffix present but the rest is not a number; report it below
            pass

    # if M, B, T are not found (or the number is malformed), something went wrong
    print(f"Unexpected market cap format: {mcap_str}")
    return None


def clean_yfinance_data(yfinance_data: dict):
    """
    Convert the numeric fields of the scraped yfinance data from text to floats.

    input: dict mapping ticker -> {label: scraped string}
    output: a new dict with the same tickers; the fields listed in
            _NUMERIC_FIELDS and the market cap are floats (None when a value
            cannot be parsed), every other field is passed through unchanged.

    The input is not modified: each ticker's inner dict is copied before any
    value is replaced. When a ticker lacks one or more of the expected fields,
    "KEY ERROR <ticker>" is printed once and the fields that are present are
    still cleaned.
    """
    # dict.copy() alone would share the inner dicts with the caller, so copy
    # one level deeper to keep this function non-destructive.
    yfinance_data_b = {ticker: dict(fields) for ticker, fields in yfinance_data.items()}

    for yfin_ent, fields in yfinance_data_b.items():
        missing_field = False

        # plain numbers: strip commas and convert
        for field in _NUMERIC_FIELDS:
            if field in fields:
                fields[field] = safe_float(fields[field])
            else:
                missing_field = True

        # market cap carries a T/B/M magnitude suffix
        if _MARKET_CAP_FIELD in fields:
            fields[_MARKET_CAP_FIELD] = _parse_market_cap(fields[_MARKET_CAP_FIELD])
        else:
            missing_field = True

        if missing_field:
            print("KEY ERROR", yfin_ent)

    return yfinance_data_b

# Convert the scraped text fields to numbers and print the cleaned result.
cleaned_yfinance_data = clean_yfinance_data(enriched_tickers)

print(cleaned_yfinance_data)


KEY ERROR AXL
{'NVDA': {'Previous Close': 188.54, 'Open': 192.45, 'Bid': '189.96 x 300', 'Ask': '190.09 x 300', "Day's Range": '188.77 - 193.26', '52 Week Range': '86.62 - 212.19', 'Volume': 143008331.0, 'Avg. Volume': 180745911.0, 'Market Cap (intraday)': 4627000000000.0, 'Beta (5Y Monthly)': 2.31, 'PE Ratio (TTM)': 47.16, 'EPS (TTM)': 4.03, 'Earnings Date': 'Feb 25, 2026', 'Forward Dividend & Yield': '0.04 (0.02%)', 'Ex-Dividend Date': 'Dec 4, 2025', '1y Target Est': 253.79}, 'F': {'Previous Close': 13.72, 'Open': 13.78, 'Bid': '13.72 x 5490000', 'Ask': '13.73 x 19460000', "Day's Range": '13.67 - 13.88', '52 Week Range': '8.44 - 14.50', 'Volume': 23003662.0, 'Avg. Volume': 57779440.0, 'Market Cap (intraday)': 54746000000.0, 'Beta (5Y Monthly)': 1.67, 'PE Ratio (TTM)': 11.74, 'EPS (TTM)': 1.17, 'Earnings Date': 'Feb 10, 2026', 'Forward Dividend & Yield': '0.60 (4.37%)', 'Ex-Dividend Date': 'Feb 13, 2026', '1y Target Est': 13.76}, 'RIG': {'Previous Close': 4.97, 'Open': 4.83, 'Bid': '4

In [4]:
# quick function to save the cleaned dict to a file named with today's date.

In [5]:
# Next, one ticker at a time, we are going to assemble a dictionary whose keys are stock tickers and whose
# values are nested dictionaries of Google Finance info, e.g. {'F':{'q1_2025_profit':100000, 'market_cap':10000, etc.}}


# going to still do google also but google requires knowing the index too which is annoying, but just means
# I need to get this information from yfinance first.


def get_google_info_w_yfin_tickers(list_of_tickers):
    """
    input: List of tickers
    output: returns dictionary of dictionary of tickers and info.
    in the format of {'F':{'q1_2025_profit':100000, market_cap, etc.}}
    """
    #header for request, in the future state this can be a class var
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    }
    base_url = f'https://'
    for i in list_of_tickers:

IndentationError: expected an indented block (1024306176.py, line 20)