In [17]:
# %pip install lxml
# %pip install html5lib
# %pip install selenium
# %pip install pandas
# %pip install numpy
# %pip install bs4
import csv
import os
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [18]:
def select_companies(group_number): #get the group number from your lecturer
    """Return the company sample assigned to a group, creating it on first use.

    The first call for a group draws up to 10 sectors and up to 20 companies
    per sector from ``filtered_sp-500.csv`` using a random generator seeded
    with ``group_number``, then stores the result in
    ``selected_companies_group_<group_number>.txt``.  Later calls read that
    file back instead of drawing again.

    Args:
        group_number: Integer used both as the random seed and in the name of
            the cache file.

    Returns:
        A list of company records.  Each record is a list of strings holding
        one row of ``filtered_sp-500.csv`` in its column order.
    """
    # File to store selected companies
    selection_file = f'selected_companies_group_{group_number}.txt'

    # Reuse an earlier selection so repeated runs scrape the same companies.
    if os.path.exists(selection_file):
        with open(selection_file, 'r', newline='') as f:
            # csv.reader honours quoting, so a field that itself contains a
            # comma is read back as one field; blank lines are skipped.
            return [record for record in csv.reader(f) if record]

    # If not, perform the selection
    rng = np.random.default_rng(group_number)  # Use group number as seed
    companies = pd.read_csv("filtered_sp-500.csv")
    # Sort the sectors: iterating a set of strings gives a different order in
    # different interpreter sessions, which would defeat the fixed seed.
    industries = sorted(companies['Sector'].dropna().unique())
    # Select up to 10 industries.
    selected_industries = rng.choice(industries, size=min(10, len(industries)), replace=False)

    selected_companies = []
    for industry in selected_industries:
        industry_companies = companies[companies['Sector'] == industry].values.tolist()
        # Select up to 20 companies in each industry.
        chosen = rng.choice(industry_companies, size=min(20, len(industry_companies)), replace=False)
        # Store plain strings so a fresh selection has the same shape as one
        # read back from the cache file.
        selected_companies.extend([str(value) for value in row] for row in chosen)

    # Save the selection; csv.writer quotes fields that contain commas.
    with open(selection_file, 'w', newline='') as f:
        csv.writer(f, lineterminator='\n').writerows(selected_companies)

    return selected_companies

In [19]:
def scrape_company_data(selected_companies, url, chromedriver_path=None):
    """Scrape the table rows that belong to the selected companies.

    Opens ``url`` in headless Chrome (no browser window is shown), waits up to
    20 seconds for ``#dataTable`` to contain a row, then walks through the
    table's pages with the ``#dataTable_next`` button.  A row is kept when the
    text of its first cell, up to the first ``-``, is one of the selected
    company codes.

    Args:
        selected_companies: Sequence of company records; element 0 of each
            record is the company code to look for.
        url: Address of the page that holds the table.
        chromedriver_path: Optional path to a chromedriver executable.  When
            omitted, ``/opt/homebrew/bin/chromedriver`` is used if it exists;
            otherwise Selenium is left to locate a driver on its own.

    Returns:
        A pandas DataFrame with one row per matching table row and the table
        headers as columns.  An empty DataFrame is returned when nothing
        matched, the table could not be found, or waiting for the page timed
        out.
    """
    # Keep the original Homebrew location as the first choice so existing
    # setups behave as before, but do not require it on other machines.
    default_driver = '/opt/homebrew/bin/chromedriver'
    if chromedriver_path is None and os.path.exists(default_driver):
        chromedriver_path = default_driver
    service = Service(chromedriver_path) if chromedriver_path else Service()

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Runs Chrome in headless mode (no UI)
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=service, options=options) # or webdriver.Firefox()

    try:
        # Navigation happens inside the try block so the browser is always
        # closed by the finally clause, even if loading the page fails.
        driver.get(url)
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#dataTable tbody tr"))
        )

        # A set gives constant-time membership checks for every table row.
        company_codes = {company[0] for company in selected_companies}
        headers = None
        matched_rows = []

        while True:
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # The column titles are read from the scroll-head div; the data
            # rows are read from the table with id "dataTable".
            header_table = soup.find('div', {'class': 'dataTables_scrollHead'})
            table_body = soup.find("table", id='dataTable')
            if header_table is None or table_body is None:
                print("Table not found. Please check the HTML structure.")
                break
            headers = [header.text.strip() for header in header_table.find_all('th')]

            for row in table_body.find_all('tr'):
                cols = [ele.text.strip() for ele in row.find_all('td')]
                # Rows without <td> cells (e.g. header rows) are skipped.
                if cols and cols[0].split('-')[0] in company_codes:
                    matched_rows.append(cols)

            # An enabled "next" button means there is another page.
            try:
                next_button = driver.find_element(By.CSS_SELECTOR, "#dataTable_next:not(.disabled)")
            except NoSuchElementException:
                break  # No more pages
            try:
                next_button.click()
            except WebDriverException as exc:
                # Keep what was collected so far, but say why paging stopped.
                print(f"Could not move to the next page, stopping early: {exc}")
                break
            time.sleep(1)  # Wait for the page to load

        if matched_rows:
            return pd.DataFrame(matched_rows, columns=headers)
        return pd.DataFrame()  # Return empty DataFrame if no data is found

    except TimeoutException:
        print("Timed out waiting for page to load")
        return pd.DataFrame()

    finally:
        driver.quit()

In [20]:
def main(group_number, url):
    """Select a group's companies, scrape their data and save it as CSV.

    Args:
        group_number: Group identifier passed to ``select_companies`` and used
            in the output file name ``group_<group_number>_data.csv``.
        url: Address of the page passed to ``scrape_company_data``.

    Side effects:
        Prints one line per selected company in the form
        ``<element 0> (<element 2>)``, sorted by element 0, and writes the
        scraped DataFrame to ``group_<group_number>_data.csv`` without the
        index column.
    """
    selected_companies = select_companies(group_number)
    company_data = scrape_company_data(selected_companies, url)

    for company in sorted(selected_companies, key=lambda x: x[0]):
        print(f"{company[0]} ({company[2]})")

    output_file = f"group_{group_number}_data.csv"
    if company_data.empty:
        # An empty result would otherwise be reported as a normal success.
        print("Warning: no matching rows were scraped; the csv file will be empty")
    company_data.to_csv(output_file, index=False)

    # Announce completion only after the file has actually been written.
    print("Data Scraping finished, please check your data in the csv file")
    print(f"Data saved to {output_file}")

# Address of the page that scrape_company_data opens and reads the table from.
url = "https://unsw-yahoo-finance.github.io/ACCT5943/" # Replace with the actual website
# TODO: update the group_number to your assigned group number, then click run-all
# The group number seeds the random company selection and is part of the names
# of the selection cache file and of the output csv file.
group_number = 53
# Runs the whole pipeline: select companies, scrape them, write the csv file.
main(group_number, url)

ABT (Health Care)
ADM (Consumer Staples)
AEE (Utilities)
AEP (Utilities)
AES (Utilities)
AKAM (Information Technology)
ALB (Materials)
ALGN (Health Care)
ALLE (Industrials)
AMGN (Health Care)
AMT (Real Estate)
AMZN (Consumer Discretionary)
ANET (Information Technology)
ANSS (Information Technology)
APH (Information Technology)
APTV (Consumer Discretionary)
AVB (Real Estate)
AVY (Materials)
AWK (Utilities)
AXP (Financials)
BAC (Financials)
BALL (Materials)
BIO (Health Care)
BK (Financials)
BKNG (Consumer Discretionary)
BLK (Financials)
BMY (Health Care)
BSX (Health Care)
BWA (Consumer Discretionary)
BXP (Real Estate)
CB (Financials)
CCI (Real Estate)
CDNS (Information Technology)
CDW (Information Technology)
CE (Materials)
CF (Materials)
CHD (Consumer Staples)
CHRW (Industrials)
CHTR (Communication Services)
CL (Consumer Staples)
CMCSA (Communication Services)
CME (Financials)
CMG (Consumer Discretionary)
CNC (Health Care)
CPT (Real Estate)
CSGP (Real Estate)
CTSH (Information Technolog