<h1>
Stock ML Scraping, Cleaning, and Analysis
</h1>

<p>
This project is a portfolio piece demonstrating my ability to write code and analyze data.

No AI tools were used.
</p>


The architecture I am planning is:

yfinance (scrape most active tickers) -> Google Finance (get basic financial info) -> 12api (get stock price for a certain period)

The results from these sources are then combined into a single dataset to be analyzed with ML, with the goal of defining a strategy to "beat" the market.

In [None]:
# First step: scrape yfinance for tickers that will later be ingested by the stock class.
from bs4 import BeautifulSoup
import requests
import time
import re



def get_yfinance_tickers():
    """
    Scrape the ticker symbols listed on Yahoo Finance's "most active" pages.

    input: None
    output: list of ticker strings (e.g. ['NVDA', 'SNAP', ...]) in the order
        they are first seen, with duplicates removed. Collection stops when
        yfinance runs out of rows (1-3 loops).
    raises: requests.HTTPError when a page answers with an error status, and
        other requests exceptions (e.g. a timeout) on network failure.
    """
    # list to hold data, plus a set so duplicate checks stay cheap
    tickers = []
    seen = set()

    # paging is driven by ?start=<offset>&count=<page size>
    start_ind = 0
    count = 100

    # seconds to wait on a request before giving up; without a timeout a
    # stalled connection would block this function forever
    request_timeout = 10

    # header for request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    }

    # The ticker links carry hrefs in the format /quote/SNAP/ (the page is
    # reachable at https://finance.yahoo.com/quote/SNAP/). Instead of drilling
    # further into the nested tags with BS4, the symbol is pulled straight
    # out of the href with a regex. Compiled once, outside the loops.
    ticker_pattern = re.compile(r"/quote/([A-Z0-9.-]+)")

    while True:
        # base URL (needs to change every loop)
        url = f"https://finance.yahoo.com/markets/stocks/most-active/?start={start_ind}&count={count}"
        # stop on error, call soup
        response = requests.get(url, headers=headers, timeout=request_timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # grab main container
        # NOTE(review): "yf-1bczin" looks like a generated class name; if the
        # site changes it this function returns an empty list - confirm.
        container = soup.select_one("div.container.yf-1bczin")

        # if not main container stop
        if not container:
            break

        # if no more data stop
        if container.select_one("div.no-data"):
            break

        links = soup.find_all("a", {"data-testid": "table-cell-ticker"})

        new_on_page = 0
        for link in links:
            href = link.get("href")
            # an anchor without an href has nothing to parse
            if not href:
                continue

            match = ticker_pattern.search(href)
            if not match:
                continue

            symbol = match.group(1)
            # pages can overlap, so only keep the first sighting of a symbol
            if symbol in seen:
                continue
            seen.add(symbol)
            tickers.append(symbol)
            new_on_page += 1

        # A page that adds nothing new (no ticker links, or only repeats of
        # earlier pages) means further requests cannot add anything either;
        # stopping here keeps the loop from running forever.
        if new_on_page == 0:
            break

        # be polite between page requests
        time.sleep(2)

        # advance by the page size actually requested
        start_ind += count

    return tickers

# Run the scraper once and keep the result in the module-level `tickers` list.
tickers = get_yfinance_tickers()
# Show the collected symbols as the cell output.
print(tickers)


['NVDA', 'SNAP', 'NIO', 'PLUG', 'INTC', 'CADE', 'ONDS', 'KVUE', 'PLTR', 'GOOGL', 'SOFI', 'HIMS', 'F', 'CFLT', 'MARA', 'TSLA', 'OWL', 'AMZN', 'AMD', 'GRAB', 'BMNR', 'PFE', 'OPEN', 'SMCI', 'NVO', 'ABEV', 'NU', 'BBD', 'IREN', 'NFLX', 'ACHR', 'GOOG', 'HOOD', 'VALE', 'MSFT', 'DNN', 'MSTR', 'CPNG', 'AAPL', 'CIFR', 'BTG', 'PYPL', 'MU', 'CCC', 'AVGO', 'DAY', 'WULF', 'GGB', 'AAL', 'CLSK', 'RIG', 'ORCL', 'BSX', 'UBER', 'ITUB', 'QBTS', 'EOSE', 'JOBY', 'RGTI', 'VZ', 'SAN', 'T', 'SOUN', 'PSLV', 'NOW', 'CDE', 'SNDK', 'PATH', 'BAC', 'QCOM', 'APLD', 'RIOT', 'HBAN', 'ADT', 'SIRI', 'IONQ', 'COIN', 'WMT', 'NGD', 'BTE', 'B', 'HL', 'ARM', 'RIVN', 'SMR', 'NOK', 'AG', 'SLB', 'WBD', 'CRWV', 'CMCSA', 'HPQ', 'LUMN', 'VOD', 'RKLB', 'FLNC', 'USAR', 'AUR', 'BMY', 'HPE', 'TTD', 'RKT', 'JBLU', 'PTEN', 'U', 'PBR', 'MRK', 'CRM', 'CARR', 'FCX', 'EL', 'PCH', 'KO', 'CSCO', 'IBRX', 'LYG', 'QS', 'ASTS', 'PHYS', 'SHOP', 'CMG', 'RBLX', 'APH', 'TOST', 'NBIS', 'C', 'CCL', 'MRVL', 'STLA', 'FIG', 'TSM', 'META', 'BULL', 'KGC', 'A

In [None]:
# Next, going through the tickers one by one, we assemble a dictionary whose keys are
# the stock tickers and whose values are nested dictionaries of Google Finance info.

def get_google_info_w_yfin_tickers(list_of_tickers):
    """
    input: List of tickers
    output: returns dictionary of dictionary of tickers and info.
    in the format of {'F':{'q1_2025_profit':100000, market_cap, etc.}}
    """
    #header for request, in the future state this can be a class var
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    }