In [3]:
import sys
sys.path.append('/Projects/regionintelligenceai/dev_llm/modules/q_and_a_generator/')

In [4]:
import selenium 
# Import driver configuration
import time
import numpy as np
from src.driver_config import get_chrome_driver, navigate_and_print_title

from src.const import CALIFORNIA_UPCODES_URL, LOS_ANGELES_UPCODES_URL, LOS_ANGELES_COUNTY_UPCODES_URL, SAN_FRANCISCO_UPCODES_URL, SAN_JOSE_UPCODES_URL
from src.paths import RAW_DATA_DIR, PROCESSED_DATA_DIR

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import os
from datetime import datetime

# Get the driver
driver = get_chrome_driver()

# Set the URL
URL = CALIFORNIA_UPCODES_URL

# Navigate to the URL and print its title
navigate_and_print_title(driver, URL)

# If you need to close the driver after use (recommended if not using further in the notebook):
driver.quit()

California Building Codes | UpCodes


## Scrape California Codes

In [10]:
class CaliforniaUpCodesScraper:
    def __init__(self, base_url):
        self.driver = get_chrome_driver()
        self.base_url = base_url
        
    def navigate_and_get_title(self):
        print(f"Navigating to {self.base_url}...")
        navigate_and_print_title(self.driver, self.base_url)

    def extract_links_from_main(self, css_selector):
        print("Extracting main links...")
        main = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        time.sleep(1)
        links = main.find_elements(By.CSS_SELECTOR, ".group.font-inter")
        extracted_links = [link.get_attribute("href") for link in links]
        print(f"Extracted {len(extracted_links)} links from main page.")
        return extracted_links

    def extract_text_from_sublinks(self, urls):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        self.output_file_path = RAW_DATA_DIR / 'california_building_codes' / f"california_{timestamp_str}.txt"
        section_div = []
        for index, url in enumerate(urls):
            print(f"Processing URL {index + 1}/{len(urls)}: {url}")
            self.driver.get(url)
            elements = self.driver.find_elements(By.CSS_SELECTOR, "a.block.w-full")
            sublinks = [element.get_attribute("href") for element in elements]
            
            for sublink in sublinks:
                self.driver.get(sublink)
                elements = self.driver.find_elements(By.XPATH, "//*[@id='__next']/div[2]/div[2]/div/main/div[1]/div/div[2]")
                new_sections = [element.text for element in elements]
                if new_sections:
                    print(f"Extracted {len(new_sections)} section(s) from {sublink}")
                else:
                    print(f"No sections extracted from {sublink}. This might indicate a scraping issue.")
                section_div.extend(new_sections)
                self._print_sections(new_sections)
            
            #self._save_sections_to_file(section_div, index)
        #self._combine_files(len(urls))
        self._save_all_sections_to_file(section_div)
        
    def _print_sections(self, sections):
        print("Printing extracted sections...")

    def _save_all_sections_to_file(self, sections):
        print(f"Saving data to {self.output_file_path}...")
        with open(self.output_file_path, "w", encoding="utf-8") as f:
            for section in sections:
                f.write(section + "\n\n")
        print("Data saved successfully!")
                
    def _combine_files(self, num_files):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        directory_path = str(RAW_DATA_DIR / 'california_building_codes')
        output_file_path = os.path.join(directory_path, f"california_{timestamp_str}.txt")
        
        with open(output_file_path, "w", encoding="utf-8") as outfile:
            for i in range(num_files):
                file_path = os.path.join(directory_path, f"testing{i}.txt")
                
                if not os.path.exists(file_path):
                    print(f"File {file_path} does not exist. Skipping.")
                    continue
                
                with open(file_path, "r", encoding="utf-8") as infile:
                    outfile.write(infile.read())
                    
        print(f"Combined file saved to {output_file_path}")
    
    def close(self):
        print("Closing the browser...")
        self.driver.quit()

# Usage:
scraper = CaliforniaUpCodesScraper(CALIFORNIA_UPCODES_URL)
scraper.navigate_and_get_title()
urls = scraper.extract_links_from_main("div.flex.flex-row")  # limiting to first URL for testing
scraper.extract_text_from_sublinks(urls)
scraper.close()


Navigating to https://up.codes/codes/california...
California Building Codes | UpCodes
Extracting main links...
Extracted 45 links from main page.
Processing URL 1/2: https://up.codes/viewer/california/ca-building-code-2022
Extracted 1 section(s) from https://up.codes/viewer/california/ca-building-code-2022/chapter/1/scope-and-administration#1
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/california/ca-building-code-2022/chapter/2/definitions#2
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/california/ca-building-code-2022/chapter/3/occupancy-classification-and-use#3
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/california/ca-building-code-2022/chapter/4/special-detailed-requirements-based-on-occupancy-and-use#4
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/california/ca-building-code-2022/chapter/5/general-building-heights-and-areas#5
Print

## Scrape Los Angeles Codes

In [5]:
class LosAngelesUpCodesScraper:
    def __init__(self, base_url):
        self.driver = get_chrome_driver()
        self.base_url = base_url
        
    def navigate_and_get_title(self):
        print(f"Navigating to {self.base_url}...")
        navigate_and_print_title(self.driver, self.base_url)

    def extract_links_from_main(self, css_selector):
        print("Extracting main links...")
        main = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        time.sleep(1)
        links = main.find_elements(By.CSS_SELECTOR, ".group.font-inter")
        extracted_links = [link.get_attribute("href") for link in links]
        print(f"Extracted {len(extracted_links)} links from main page.")
        return extracted_links

    def extract_text_from_sublinks(self, urls):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        self.output_file_path = RAW_DATA_DIR / 'los_angeles_building_codes' / f"los_angeles_{timestamp_str}.txt"
        section_div = []
        for index, url in enumerate(urls):
            print(f"Processing URL {index + 1}/{len(urls)}: {url}")
            self.driver.get(url)
            elements = self.driver.find_elements(By.CSS_SELECTOR, "a.block.w-full")
            sublinks = [element.get_attribute("href") for element in elements]
            
            for sublink in sublinks:
                self.driver.get(sublink)
                elements = self.driver.find_elements(By.XPATH, "//*[@id='__next']/div[2]/div[2]/div/main/div[1]/div/div[2]")
                new_sections = [element.text for element in elements]
                if new_sections:
                    print(f"Extracted {len(new_sections)} section(s) from {sublink}")
                else:
                    print(f"No sections extracted from {sublink}. This might indicate a scraping issue.")
                section_div.extend(new_sections)
                self._print_sections(new_sections)
            
            #self._save_sections_to_file(section_div, index)
        #self._combine_files(len(urls))
        self._save_all_sections_to_file(section_div)
        
    def _print_sections(self, sections):
        print("Printing extracted sections...")

    def _save_all_sections_to_file(self, sections):
        print(f"Saving data to {self.output_file_path}...")
        with open(self.output_file_path, "w", encoding="utf-8") as f:
            for section in sections:
                f.write(section + "\n\n")
        print("Data saved successfully!")
                
    def _combine_files(self, num_files):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        directory_path = str(RAW_DATA_DIR / 'los_angeles_building_codes')
        output_file_path = os.path.join(directory_path, f"los_angeles_{timestamp_str}.txt")
        
        with open(output_file_path, "w", encoding="utf-8") as outfile:
            for i in range(num_files):
                file_path = os.path.join(directory_path, f"testing{i}.txt")
                
                if not os.path.exists(file_path):
                    print(f"File {file_path} does not exist. Skipping.")
                    continue
                
                with open(file_path, "r", encoding="utf-8") as infile:
                    outfile.write(infile.read())
                    
        print(f"Combined file saved to {output_file_path}")
    
    def close(self):
        print("Closing the browser...")
        self.driver.quit()

# Usage:
scraper = LosAngelesUpCodesScraper(LOS_ANGELES_UPCODES_URL)
scraper.navigate_and_get_title()
urls = scraper.extract_links_from_main("div.flex.flex-row")[:2]  # limiting to first URL for testing
scraper.extract_text_from_sublinks(urls)
scraper.close()


Navigating to https://up.codes/codes/los_angeles...
Los Angeles City Building Codes | UpCodes
Extracting main links...
Extracted 36 links from main page.
Processing URL 1/2: https://up.codes/viewer/los_angeles/ca-building-code-2022
Extracted 1 section(s) from https://up.codes/viewer/los_angeles/ca-building-code-2022/chapter/new_1/administration#new_1
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/los_angeles/ca-building-code-2022/chapter/2/definitions#2
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/los_angeles/ca-building-code-2022/chapter/3/occupancy-classification-and-use#3
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/los_angeles/ca-building-code-2022/chapter/4/special-detailed-requirements-based-on-occupancy-and-use#4
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/los_angeles/ca-building-code-2022/chapter/5/general-building-heights-and-ar

In [None]:
class LosAngelesCountyUpCodesScraper:
    def __init__(self, base_url):
        self.driver = get_chrome_driver()
        self.base_url = base_url
        
    def navigate_and_get_title(self):
        print(f"Navigating to {self.base_url}...")
        navigate_and_print_title(self.driver, self.base_url)

    def extract_links_from_main(self, css_selector):
        print("Extracting main links...")
        main = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        time.sleep(1)
        links = main.find_elements(By.CSS_SELECTOR, ".group.font-inter")
        extracted_links = [link.get_attribute("href") for link in links]
        print(f"Extracted {len(extracted_links)} links from main page.")
        return extracted_links

    def extract_text_from_sublinks(self, urls):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        self.output_file_path = RAW_DATA_DIR / 'los_angeles_county_building_codes' / f"los_angeles_county_{timestamp_str}.txt"
        section_div = []
        for index, url in enumerate(urls):
            print(f"Processing URL {index + 1}/{len(urls)}: {url}")
            self.driver.get(url)
            elements = self.driver.find_elements(By.CSS_SELECTOR, "a.block.w-full")
            sublinks = [element.get_attribute("href") for element in elements]
            
            for sublink in sublinks:
                self.driver.get(sublink)
                elements = self.driver.find_elements(By.XPATH, "//*[@id='__next']/div[2]/div[2]/div/main/div[1]/div/div[2]")
                new_sections = [element.text for element in elements]
                if new_sections:
                    print(f"Extracted {len(new_sections)} section(s) from {sublink}")
                else:
                    print(f"No sections extracted from {sublink}. This might indicate a scraping issue.")
                section_div.extend(new_sections)
                self._print_sections(new_sections)
            
            #self._save_sections_to_file(section_div, index)
        #self._combine_files(len(urls))
        self._save_all_sections_to_file(section_div)
        
    def _print_sections(self, sections):
        print("Printing extracted sections...")

    def _save_all_sections_to_file(self, sections):
        print(f"Saving data to {self.output_file_path}...")
        with open(self.output_file_path, "w", encoding="utf-8") as f:
            for section in sections:
                f.write(section + "\n\n")
        print("Data saved successfully!")
                
    def _combine_files(self, num_files):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        directory_path = str(RAW_DATA_DIR / 'los_angeles_county_building_codes')
        output_file_path = os.path.join(directory_path, f"los_angeles_county_{timestamp_str}.txt")
        
        with open(output_file_path, "w", encoding="utf-8") as outfile:
            for i in range(num_files):
                file_path = os.path.join(directory_path, f"testing{i}.txt")
                
                if not os.path.exists(file_path):
                    print(f"File {file_path} does not exist. Skipping.")
                    continue
                
                with open(file_path, "r", encoding="utf-8") as infile:
                    outfile.write(infile.read())
                    
        print(f"Combined file saved to {output_file_path}")
    
    def close(self):
        print("Closing the browser...")
        self.driver.quit()

LosAngelesCountyUpCodesScraper(LOS_ANGELES_COUNTY_UPCODES_URL)
scraper.navigate_and_get_title()
urls = scraper.extract_links_from_main("div.flex.flex-row")  # limiting to first URL for testing
scraper.extract_text_from_sublinks(urls)
scraper.close()


# Scrape San Francisco Codes

In [None]:
class SanFranciscoUpCodesScraper:
    def __init__(self, base_url):
        self.driver = get_chrome_driver()
        self.base_url = base_url
        
    def navigate_and_get_title(self):
        print(f"Navigating to {self.base_url}...")
        navigate_and_print_title(self.driver, self.base_url)

    def extract_links_from_main(self, css_selector):
        print("Extracting main links...")
        main = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        time.sleep(1)
        links = main.find_elements(By.CSS_SELECTOR, ".group.font-inter")
        extracted_links = [link.get_attribute("href") for link in links]
        print(f"Extracted {len(extracted_links)} links from main page.")
        return extracted_links

    def extract_text_from_sublinks(self, urls):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        self.output_file_path = RAW_DATA_DIR / 'san_francisco_building_codes' / f"san_francisco_{timestamp_str}.txt"
        section_div = []
        for index, url in enumerate(urls):
            print(f"Processing URL {index + 1}/{len(urls)}: {url}")
            self.driver.get(url)
            elements = self.driver.find_elements(By.CSS_SELECTOR, "a.block.w-full")
            sublinks = [element.get_attribute("href") for element in elements]
            
            for sublink in sublinks:
                self.driver.get(sublink)
                elements = self.driver.find_elements(By.XPATH, "//*[@id='__next']/div[2]/div[2]/div/main/div[1]/div/div[2]")
                new_sections = [element.text for element in elements]
                if new_sections:
                    print(f"Extracted {len(new_sections)} section(s) from {sublink}")
                else:
                    print(f"No sections extracted from {sublink}. This might indicate a scraping issue.")
                section_div.extend(new_sections)
                self._print_sections(new_sections)
            
            #self._save_sections_to_file(section_div, index)
        #self._combine_files(len(urls))
        self._save_all_sections_to_file(section_div)
        
    def _print_sections(self, sections):
        print("Printing extracted sections...")

    def _save_all_sections_to_file(self, sections):
        print(f"Saving data to {self.output_file_path}...")
        with open(self.output_file_path, "w", encoding="utf-8") as f:
            for section in sections:
                f.write(section + "\n\n")
        print("Data saved successfully!")
                
    def _combine_files(self, num_files):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        directory_path = str(RAW_DATA_DIR / 'san_francisco_building_codes')
        output_file_path = os.path.join(directory_path, f"san_francisco_{timestamp_str}.txt")
        
        with open(output_file_path, "w", encoding="utf-8") as outfile:
            for i in range(num_files):
                file_path = os.path.join(directory_path, f"testing{i}.txt")
                
                if not os.path.exists(file_path):
                    print(f"File {file_path} does not exist. Skipping.")
                    continue
                
                with open(file_path, "r", encoding="utf-8") as infile:
                    outfile.write(infile.read())
                    
        print(f"Combined file saved to {output_file_path}")
    
    def close(self):
        print("Closing the browser...")
        self.driver.quit()

# Usage:
scraper = SanFranciscoUpCodesScraper(SAN_FRANCISCO_UPCODES_URL)
scraper.navigate_and_get_title()
urls = scraper.extract_links_from_main("div.flex.flex-row")  # limiting to first URL for testing
scraper.extract_text_from_sublinks(urls)
scraper.close()


# Scrape San Jose Codes

In [7]:
class SanJoseUpCodesScraper:
    def __init__(self, base_url):
        self.driver = get_chrome_driver()
        self.base_url = base_url
        
    def navigate_and_get_title(self):
        print(f"Navigating to {self.base_url}...")
        navigate_and_print_title(self.driver, self.base_url)

    def extract_links_from_main(self, css_selector):
        print("Extracting main links...")
        main = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        time.sleep(1)
        links = main.find_elements(By.CSS_SELECTOR, ".group.font-inter")
        extracted_links = [link.get_attribute("href") for link in links]
        print(f"Extracted {len(extracted_links)} links from main page.")
        return extracted_links

    def extract_text_from_sublinks(self, urls):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        self.output_file_path = RAW_DATA_DIR / 'san_jose_building_codes' / f"san_jose_{timestamp_str}.txt"
        section_div = []
        for index, url in enumerate(urls):
            print(f"Processing URL {index + 1}/{len(urls)}: {url}")
            self.driver.get(url)
            elements = self.driver.find_elements(By.CSS_SELECTOR, "a.block.w-full")
            sublinks = [element.get_attribute("href") for element in elements]
            
            for sublink in sublinks:
                self.driver.get(sublink)
                elements = self.driver.find_elements(By.XPATH, "//*[@id='__next']/div[2]/div[2]/div/main/div[1]/div/div[2]")
                new_sections = [element.text for element in elements]
                if new_sections:
                    print(f"Extracted {len(new_sections)} section(s) from {sublink}")
                else:
                    print(f"No sections extracted from {sublink}. This might indicate a scraping issue.")
                section_div.extend(new_sections)
                self._print_sections(new_sections)
            
            #self._save_sections_to_file(section_div, index)
        #self._combine_files(len(urls))
        self._save_all_sections_to_file(section_div)
        
    def _print_sections(self, sections):
        print("Printing extracted sections...")

    def _save_all_sections_to_file(self, sections):
        print(f"Saving data to {self.output_file_path}...")
        with open(self.output_file_path, "w", encoding="utf-8") as f:
            for section in sections:
                f.write(section + "\n\n")
        print("Data saved successfully!")
                
    def _combine_files(self, num_files):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        directory_path = str(RAW_DATA_DIR / 'san_jose_building_codes')
        output_file_path = os.path.join(directory_path, f"san_jose_{timestamp_str}.txt")
        
        with open(output_file_path, "w", encoding="utf-8") as outfile:
            for i in range(num_files):
                file_path = os.path.join(directory_path, f"testing{i}.txt")
                
                if not os.path.exists(file_path):
                    print(f"File {file_path} does not exist. Skipping.")
                    continue
                
                with open(file_path, "r", encoding="utf-8") as infile:
                    outfile.write(infile.read())
                    
        print(f"Combined file saved to {output_file_path}")
    
    def close(self):
        print("Closing the browser...")
        self.driver.quit()

# Usage:
scraper = SanJoseUpCodesScraper(SAN_JOSE_UPCODES_URL)
scraper.navigate_and_get_title()
urls = scraper.extract_links_from_main("div.flex.flex-row")  # limiting to first URL for testing
scraper.extract_text_from_sublinks(urls)
scraper.close()


Navigating to https://up.codes/codes/san-jose...
San José Building Codes | UpCodes
Extracting main links...
Extracted 33 links from main page.
Processing URL 1/33: https://up.codes/viewer/san-jose/ca-building-code-2022
Extracted 1 section(s) from https://up.codes/viewer/san-jose/ca-building-code-2022/chapter/1/scope-and-administration#1
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/san-jose/ca-building-code-2022/chapter/2/definitions#2
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/san-jose/ca-building-code-2022/chapter/3/occupancy-classification-and-use#3
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/san-jose/ca-building-code-2022/chapter/4/special-detailed-requirements-based-on-occupancy-and-use#4
Printing extracted sections...


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=117.0.5938.150)
Stacktrace:
	GetHandleVerifier [0x00007FF7F8DD7D12+55474]
	(No symbol) [0x00007FF7F8D477C2]
	(No symbol) [0x00007FF7F8BFE0EB]
	(No symbol) [0x00007FF7F8BDE528]
	(No symbol) [0x00007FF7F8C63B77]
	(No symbol) [0x00007FF7F8C775BF]
	(No symbol) [0x00007FF7F8C5EF33]
	(No symbol) [0x00007FF7F8C33D41]
	(No symbol) [0x00007FF7F8C34F84]
	GetHandleVerifier [0x00007FF7F913B762+3609346]
	GetHandleVerifier [0x00007FF7F9191A80+3962400]
	GetHandleVerifier [0x00007FF7F9189F0F+3930799]
	GetHandleVerifier [0x00007FF7F8E73CA6+694342]
	(No symbol) [0x00007FF7F8D52218]
	(No symbol) [0x00007FF7F8D4E484]
	(No symbol) [0x00007FF7F8D4E5B2]
	(No symbol) [0x00007FF7F8D3EE13]
	BaseThreadInitThunk [0x00007FF96F29257D+29]
	RtlUserThreadStart [0x00007FF97022AA78+40]


# Parallelize the scraping

In [18]:
class CaliforniaUpCodesScraper:
    def __init__(self, base_url):
        self.driver = get_chrome_driver()
        self.base_url = base_url
        
    def navigate_and_get_title(self):
        print(f"Navigating to {self.base_url}...")
        navigate_and_print_title(self.driver, self.base_url)

    def extract_links_from_main(self, css_selector):
        print("Extracting main links...")
        main = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        time.sleep(1)
        links = main.find_elements(By.CSS_SELECTOR, ".group.font-inter")
        extracted_links = [link.get_attribute("href") for link in links]
        print(f"Extracted {len(extracted_links)} links from main page.")
        return extracted_links

    def extract_text_from_sublinks(self, urls):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        self.output_file_path = RAW_DATA_DIR / 'california_building_codes' / f"california_{timestamp_str}.txt"
        section_div = []
        for index, url in enumerate(urls):
            if not url:
                print("Encountered empty URL, skipping...")
                continue
            print(f"Processing URL {index + 1}/{len(urls)}: {url}")
            self.driver.get(url)
            elements = self.driver.find_elements(By.CSS_SELECTOR, "a.block.w-full")
            sublinks = [element.get_attribute("href") for element in elements if element.get_attribute("href")]
            
            for sublink in sublinks:
                self.driver.get(sublink)
                elements = self.driver.find_elements(By.XPATH, "//*[@id='__next']/div[2]/div[2]/div/main/div[1]/div/div[2]")
                new_sections = [element.text for element in elements]
                if new_sections:
                    print(f"Extracted {len(new_sections)} section(s) from {sublink}")
                else:
                    print(f"No sections extracted from {sublink}. This might indicate a scraping issue.")
                section_div.extend(new_sections)
                self._print_sections(new_sections)
            
            #self._save_sections_to_file(section_div, index)
        #self._combine_files(len(urls))
        self._save_all_sections_to_file(section_div)
    
    def _print_sections(self, sections):
        print("Printing extracted sections...")

    def _save_all_sections_to_file(self, sections):
        print(f"Saving data to {self.output_file_path}...")
        with open(self.output_file_path, "w", encoding="utf-8") as f:
            for section in sections:
                f.write(section + "\n\n")
        print("Data saved successfully!")
                
    def _combine_files(self, num_files):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        directory_path = str(RAW_DATA_DIR / 'california_building_codes')
        output_file_path = os.path.join(directory_path, f"california_{timestamp_str}.txt")
        
        with open(output_file_path, "w", encoding="utf-8") as outfile:
            for i in range(num_files):
                file_path = os.path.join(directory_path, f"testing{i}.txt")
                
                if not os.path.exists(file_path):
                    print(f"File {file_path} does not exist. Skipping.")
                    continue
                
                with open(file_path, "r", encoding="utf-8") as infile:
                    outfile.write(infile.read())
                    
        print(f"Combined file saved to {output_file_path}")
    
    def close(self):
        print("Closing the browser...")
        self.driver.quit()
        print("Browser closed successfully!")

def main_california_upcodes():
    scraper = CaliforniaUpCodesScraper(CALIFORNIA_UPCODES_URL)
    scraper.navigate_and_get_title()
    urls = scraper.extract_links_from_main("div.flex.flex-row")
    scraper.extract_text_from_sublinks(urls)
    scraper.close()

# Scrape Los Angeles Codes
class LosAngelesUpCodesScraper:
    def __init__(self, base_url):
        self.driver = get_chrome_driver()
        self.base_url = base_url
        
    def navigate_and_get_title(self):
        print(f"Navigating to {self.base_url}...")
        navigate_and_print_title(self.driver, self.base_url)

    def extract_links_from_main(self, css_selector):
        print("Extracting main links...")
        main = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        time.sleep(1)
        links = main.find_elements(By.CSS_SELECTOR, ".group.font-inter")
        extracted_links = [link.get_attribute("href") for link in links]
        print(f"Extracted {len(extracted_links)} links from main page.")
        return extracted_links

    def extract_text_from_sublinks(self, urls):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        self.output_file_path = RAW_DATA_DIR / 'los_angeles_building_codes' / f"los_angeles_{timestamp_str}.txt"
        section_div = []
        for index, url in enumerate(urls):
            if not url:
                print("Encountered empty URL, skipping...")
                continue
            print(f"Processing URL {index + 1}/{len(urls)}: {url}")
            self.driver.get(url)
            elements = self.driver.find_elements(By.CSS_SELECTOR, "a.block.w-full")
            sublinks = [element.get_attribute("href") for element in elements if element.get_attribute("href")]
            
            for sublink in sublinks:
                self.driver.get(sublink)
                elements = self.driver.find_elements(By.XPATH, "//*[@id='__next']/div[2]/div[2]/div/main/div[1]/div/div[2]")
                new_sections = [element.text for element in elements]
                if new_sections:
                    print(f"Extracted {len(new_sections)} section(s) from {sublink}")
                else:
                    print(f"No sections extracted from {sublink}. This might indicate a scraping issue.")
                section_div.extend(new_sections)
                self._print_sections(new_sections)
            
            #self._save_sections_to_file(section_div, index)
        #self._combine_files(len(urls))
        self._save_all_sections_to_file(section_div)
        
    def _print_sections(self, sections):
        print("Printing extracted sections...")

    def _save_all_sections_to_file(self, sections):
        print(f"Saving data to {self.output_file_path}...")
        with open(self.output_file_path, "w", encoding="utf-8") as f:
            for section in sections:
                f.write(section + "\n\n")
        print("Data saved successfully!")
                
    def _combine_files(self, num_files):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        directory_path = str(RAW_DATA_DIR / 'los_angeles_building_codes')
        output_file_path = os.path.join(directory_path, f"los_angeles_{timestamp_str}.txt")
        
        with open(output_file_path, "w", encoding="utf-8") as outfile:
            for i in range(num_files):
                file_path = os.path.join(directory_path, f"testing{i}.txt")
                
                if not os.path.exists(file_path):
                    print(f"File {file_path} does not exist. Skipping.")
                    continue
                
                with open(file_path, "r", encoding="utf-8") as infile:
                    outfile.write(infile.read())
                    
        print(f"Combined file saved to {output_file_path}")
    
    def close(self):
        print("Closing the browser...")
        self.driver.quit()
        print("Browser closed successfully!")

def main_los_angeles_upcodes():
    scraper = LosAngelesUpCodesScraper(LOS_ANGELES_UPCODES_URL)
    scraper.navigate_and_get_title()
    urls = scraper.extract_links_from_main("div.flex.flex-row")
    scraper.extract_text_from_sublinks(urls)
    scraper.close()

# Scrape Los Angeles County Codes
class LosAngelesCountyUpCodesScraper:
    def __init__(self, base_url):
        self.driver = get_chrome_driver()
        self.base_url = base_url
        
    def navigate_and_get_title(self):
        print(f"Navigating to {self.base_url}...")
        navigate_and_print_title(self.driver, self.base_url)

    def extract_links_from_main(self, css_selector):
        print("Extracting main links...")
        main = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        time.sleep(1)
        links = main.find_elements(By.CSS_SELECTOR, ".group.font-inter")
        extracted_links = [link.get_attribute("href") for link in links]
        print(f"Extracted {len(extracted_links)} links from main page.")
        return extracted_links

    def extract_text_from_sublinks(self, urls):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        self.output_file_path = RAW_DATA_DIR / 'los_angeles_county_building_codes' / f"los_angeles_county_{timestamp_str}.txt"
        section_div = []
        for index, url in enumerate(urls):
            if not url:
                print("Encountered empty URL, skipping...")
                continue
            print(f"Processing URL {index + 1}/{len(urls)}: {url}")
            self.driver.get(url)
            elements = self.driver.find_elements(By.CSS_SELECTOR, "a.block.w-full")
            sublinks = [element.get_attribute("href") for element in elements if element.get_attribute("href")]
            
            
            for sublink in sublinks:
                self.driver.get(sublink)
                elements = self.driver.find_elements(By.XPATH, "//*[@id='__next']/div[2]/div[2]/div/main/div[1]/div/div[2]")
                new_sections = [element.text for element in elements]
                if new_sections:
                    print(f"Extracted {len(new_sections)} section(s) from {sublink}")
                else:
                    print(f"No sections extracted from {sublink}. This might indicate a scraping issue.")
                section_div.extend(new_sections)
                self._print_sections(new_sections)
            
            #self._save_sections_to_file(section_div, index)
        #self._combine_files(len(urls))
        self._save_all_sections_to_file(section_div)
        
    def _print_sections(self, sections):
        print("Printing extracted sections...")

    def _save_all_sections_to_file(self, sections):
        print(f"Saving data to {self.output_file_path}...")
        with open(self.output_file_path, "w", encoding="utf-8") as f:
            for section in sections:
                f.write(section + "\n\n")
        print("Data saved successfully!")
                
    def _combine_files(self, num_files):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        directory_path = str(RAW_DATA_DIR / 'los_angeles_county_building_codes')
        output_file_path = os.path.join(directory_path, f"los_angeles_county_{timestamp_str}.txt")
        
        with open(output_file_path, "w", encoding="utf-8") as outfile:
            for i in range(num_files):
                file_path = os.path.join(directory_path, f"testing{i}.txt")
                
                if not os.path.exists(file_path):
                    print(f"File {file_path} does not exist. Skipping.")
                    continue
                
                with open(file_path, "r", encoding="utf-8") as infile:
                    outfile.write(infile.read())
                    
        print(f"Combined file saved to {output_file_path}")
    
    def close(self):
        print("Closing the browser...")
        self.driver.quit()
        print("Browser closed successfully!")

def main_los_angeles_county_upcodes():
    scraper = LosAngelesCountyUpCodesScraper(LOS_ANGELES_COUNTY_UPCODES_URL)
    scraper.navigate_and_get_title()
    urls = scraper.extract_links_from_main("div.flex.flex-row")
    scraper.extract_text_from_sublinks(urls)
    scraper.close()

# Scrape San Francisco Codes

class SanFranciscoUpCodesScraper:
    def __init__(self, base_url):
        self.driver = get_chrome_driver()
        self.base_url = base_url
        
    def navigate_and_get_title(self):
        print(f"Navigating to {self.base_url}...")
        navigate_and_print_title(self.driver, self.base_url)

    def extract_links_from_main(self, css_selector):
        print("Extracting main links...")
        main = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        time.sleep(1)
        links = main.find_elements(By.CSS_SELECTOR, ".group.font-inter")
        extracted_links = [link.get_attribute("href") for link in links]
        print(f"Extracted {len(extracted_links)} links from main page.")
        return extracted_links

    def extract_text_from_sublinks(self, urls):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        self.output_file_path = RAW_DATA_DIR / 'san_francisco_building_codes' / f"san_francisco_{timestamp_str}.txt"
        section_div = []
        for index, url in enumerate(urls):
            if not url:
                print("Encountered empty URL, skipping...")
                continue
            print(f"Processing URL {index + 1}/{len(urls)}: {url}")
            self.driver.get(url)
            elements = self.driver.find_elements(By.CSS_SELECTOR, "a.block.w-full")
            sublinks = [element.get_attribute("href") for element in elements if element.get_attribute("href")]
            
            for sublink in sublinks:
                self.driver.get(sublink)
                elements = self.driver.find_elements(By.XPATH, "//*[@id='__next']/div[2]/div[2]/div/main/div[1]/div/div[2]")
                new_sections = [element.text for element in elements]
                if new_sections:
                    print(f"Extracted {len(new_sections)} section(s) from {sublink}")
                else:
                    print(f"No sections extracted from {sublink}. This might indicate a scraping issue.")
                section_div.extend(new_sections)
                self._print_sections(new_sections)
            
            #self._save_sections_to_file(section_div, index)
        #self._combine_files(len(urls))
        self._save_all_sections_to_file(section_div)
        
    def _print_sections(self, sections):
        print("Printing extracted sections...")

    def _save_all_sections_to_file(self, sections):
        print(f"Saving data to {self.output_file_path}...")
        with open(self.output_file_path, "w", encoding="utf-8") as f:
            for section in sections:
                f.write(section + "\n\n")
        print("Data saved successfully!")
                
    def _combine_files(self, num_files):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        directory_path = str(RAW_DATA_DIR / 'san_francisco_building_codes')
        output_file_path = os.path.join(directory_path, f"san_francisco_{timestamp_str}.txt")
        
        with open(output_file_path, "w", encoding="utf-8") as outfile:
            for i in range(num_files):
                file_path = os.path.join(directory_path, f"testing{i}.txt")
                
                if not os.path.exists(file_path):
                    print(f"File {file_path} does not exist. Skipping.")
                    continue
                
                with open(file_path, "r", encoding="utf-8") as infile:
                    outfile.write(infile.read())
                    
        print(f"Combined file saved to {output_file_path}")
    
    def close(self):
        print("Closing the browser...")
        self.driver.quit()
        print("Browser closed successfully!")

def main_san_francisco_upcodes():
    scraper = SanFranciscoUpCodesScraper(SAN_FRANCISCO_UPCODES_URL)
    scraper.navigate_and_get_title()
    urls = scraper.extract_links_from_main("div.flex.flex-row")
    scraper.extract_text_from_sublinks(urls)
    scraper.close()

# Scrape San Jose Codes

class SanJoseUpCodesScraper:
    def __init__(self, base_url):
        self.driver = get_chrome_driver()
        self.base_url = base_url
        
    def navigate_and_get_title(self):
        print(f"Navigating to {self.base_url}...")
        navigate_and_print_title(self.driver, self.base_url)

    def extract_links_from_main(self, css_selector):
        print("Extracting main links...")
        main = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))
        )
        time.sleep(1)
        links = main.find_elements(By.CSS_SELECTOR, ".group.font-inter")
        extracted_links = [link.get_attribute("href") for link in links if link.get_attribute("href")]
        print(f"Extracted {len(extracted_links)} links from main page.")
        return extracted_links

    def extract_text_from_sublinks(self, urls):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        self.output_file_path = RAW_DATA_DIR / 'san_jose_building_codes' / f"san_jose_{timestamp_str}.txt"
        section_div = []
        for index, url in enumerate(urls):
            if not url:
                print("Encountered empty URL, skipping...")
                continue
            print(f"Processing URL {index + 1}/{len(urls)}: {url}")
            self.driver.get(url)
            elements = self.driver.find_elements(By.CSS_SELECTOR, "a.block.w-full")
            sublinks = [element.get_attribute("href") for element in elements if element.get_attribute("href")]
            
            for sublink in sublinks:
                self.driver.get(sublink)
                elements = self.driver.find_elements(By.XPATH, "//*[@id='__next']/div[2]/div[2]/div/main/div[1]/div/div[2]")
                new_sections = [element.text for element in elements]
                if new_sections:
                    print(f"Extracted {len(new_sections)} section(s) from {sublink}")
                else:
                    print(f"No sections extracted from {sublink}. This might indicate a scraping issue.")
                section_div.extend(new_sections)
                self._print_sections(new_sections)
            
            #self._save_sections_to_file(section_div, index)
        #self._combine_files(len(urls))
        self._save_all_sections_to_file(section_div)
        #return section_div
        
    def _print_sections(self, sections):
        print("Printing extracted sections...")

    def _save_all_sections_to_file(self, sections):
        print(f"Saving data to {self.output_file_path}...")
        with open(self.output_file_path, "w", encoding="utf-8") as f:
            for section in sections:
                f.write(section + "\n\n")
        print("Data saved successfully!")
                
    def _combine_files(self, num_files):
        timestamp_str = datetime.now().strftime("%Y%m%d")
        directory_path = str(RAW_DATA_DIR / 'san_jose_building_codes')
        output_file_path = os.path.join(directory_path, f"san_jose_{timestamp_str}.txt")
        
        with open(output_file_path, "w", encoding="utf-8") as outfile:
            for i in range(num_files):
                file_path = os.path.join(directory_path, f"testing{i}.txt")
                
                if not os.path.exists(file_path):
                    print(f"File {file_path} does not exist. Skipping.")
                    continue
                
                with open(file_path, "r", encoding="utf-8") as infile:
                    outfile.write(infile.read())
                    
        print(f"Combined file saved to {output_file_path}")
    
    def close(self):
        print("Closing the browser...")
        self.driver.quit()
        print("Browser closed successfully!")
    
def main_san_jose_upcodes():
    scraper = SanJoseUpCodesScraper(SAN_JOSE_UPCODES_URL)
    scraper.navigate_and_get_title()
    urls = scraper.extract_links_from_main("div.flex.flex-row")
    scraper.extract_text_from_sublinks(urls)
    scraper.close()


In [19]:

from concurrent.futures import ThreadPoolExecutor

def run_all_code_scrapers():
    with ThreadPoolExecutor() as executor:
        executor.submit(main_los_angeles_county_upcodes())
        executor.submit(main_california_upcodes())
        executor.submit(main_san_francisco_upcodes())
        executor.submit(main_los_angeles_upcodes())
        executor.submit(main_san_jose_upcodes())

run_all_code_scrapers()

Navigating to https://up.codes/codes/los-angeles-county...
Los Angeles County Building Codes | UpCodes
Extracting main links...
Extracted 12 links from main page.
Processing URL 1/12: https://up.codes/viewer/los-angeles-county/ca-building-code-2022
Extracted 1 section(s) from https://up.codes/viewer/los-angeles-county/ca-building-code-2022/chapter/new_duped_1/administration#new_duped_1
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/los-angeles-county/ca-building-code-2022/chapter/2/definitions#2
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/los-angeles-county/ca-building-code-2022/chapter/3/occupancy-classification-and-use#3
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/los-angeles-county/ca-building-code-2022/chapter/4/special-detailed-requirements-based-on-occupancy-and-use#4
Printing extracted sections...
Extracted 1 section(s) from https://up.codes/viewer/los-angeles-county