In [52]:
import sys
sys.path.append('/Projects/regionintelligenceai/')


In [53]:
# Import driver configuration
import time
import numpy as np
from src.config.driver_config import get_chrome_driver, navigate_and_print_title, main

driver = get_chrome_driver()

URL = "https://www.santa-ana.org/major-planning-projects-and-monthly-development-project-reports/"

# Navigate to the URL and print its title
navigate_and_print_title(driver, URL)

# If you need to close the driver after use (recommended if not using further in the notebook):
driver.quit()

Major planning projects and monthly development reports - City of Santa Ana


In [54]:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def extract_listing_names_from_url(driver, url):
    """
    Navigates to a specified URL using the given driver and extracts listing names (or project names) from a table.
    
    Parameters:
    - driver: Selenium WebDriver
    - url: Webpage URL

    Returns:
    - list: A list of extracted listing names
    """
    
    # Navigate to the URL
    driver.get(url)
    
    listing_names = []
    
    # Wait for the table to be present and then locate it
    main = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#projectList"))
    )
    
    # Extract the names from the first cell of each row
    titles = main.find_elements(By.CSS_SELECTOR, "#projectList tr td:first-of-type")
    for title in titles:
        listing_names.append(title.text)

    return listing_names

# Example usage:
driver = get_chrome_driver()
URL = "https://www.santa-ana.org/major-planning-projects-and-monthly-development-project-reports/"
listing_names = extract_listing_names_from_url(driver, URL)
for name in listing_names:
    print(name)
driver.quit()


1st and Harbor Mix-Use Development
3rd and Broadway Mix-Use Development
4th and Mortimer Mixed-Used Development
5th and Harbor Mixed-Use Development
7-Eleven Service Station (Euclid Street)
AMG Senior Housing
AMG First Point Family Affordable Apartments
Bella Terra Residential Community and Temple
Bewley Townhomes
Billboard Ordinance
Warner Redhill Mixed-Use Development (formerly the Bowery)
Bristol Corridor Specific Plan Amendment
Cabrillo Crossing Townhomes
Cabrillo Town Center
Calvary Church
Central Pointe Mixed-Use Development
Climate Action Plan
Complete Streets Plans
Community Engagement Plan
Coptic Orthodox Church
Crossroads at Washington
First American Title Company Mixed-Use Development
FX Residences
Garry Avenues Business Park
General Plan Update
Grand and Grovemont Development
Haphan Residential
Harbor Boulevard Streetscape Plan
The Heritage
Illumination Foundation Renovation Project
Innovative Housing Opportunities Mixed-Use Project
Legacy Square
Legacy Sunflower
Legado at 

In [55]:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def extract_addresses_from_url(driver, url):
    """
    Navigates to a specified URL using the given driver and extracts addresses from a table.
    
    Parameters:
    - driver: Selenium WebDriver
    - url: Webpage URL

    Returns:
    - list: A list of extracted addresses
    """
    
    # Navigate to the URL
    driver.get(url)
    
    project_locations = []
    
    # Wait for the table to be present and then locate it
    main = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#projectList"))
    )
    
    # Extract the addresses from the second cell of each row
    addresses = main.find_elements(By.CSS_SELECTOR, "#projectList tr td:nth-of-type(2)")
    for address in addresses:
        project_locations.append(address.text)

    return project_locations

# Example usage:
driver = get_chrome_driver()
url = "https://www.santa-ana.org/major-planning-projects-and-monthly-development-project-reports/"
project_addresses = extract_addresses_from_url(driver, url)
for address in project_addresses:
    print(address)
driver.quit()


101 N. Harbor Boulevard (Ward 5)
201 W. 3rd Street (Ward 5)
409 E. 4th Street (Ward 6)
419 N. Harbor Blvd (Ward 5)
813 N. Euclid Street (Ward 1)
2202 E. 1st Street (Ward 3)
2114 E. 1st Street (Ward 3)
4006 W. Hazard Avenue
1122 N. Bewley Street (Ward 1)
Citywide
2300 S. Red Hill Avenue (Ward 6)
N/A
1814 E. First Street (Ward 3)
1901 E. Fourth Street (Ward 3)
1010 N. Tustin Avenue (Ward 3)
1801 E. 4th Street (Ward 3)
N/A
N/A
N/A
4405 W. Edinger Avenue (Ward 1)
1126 E. Washington Avenue (Ward 3)
114 E. 5Th Street (Ward 6)
801 E. Santa Ana Blvd. (Ward 6)
1700 E. Garry Avenue (Ward 6)
N/A
2525 N. Grand Avenue (Ward 3)
3025 W. Edinger Avenue (Ward 1)
Harbor Boulevard
2001 E. Dyer Road (Ward 6)
918 N. Bewley Street   (Ward 5)
2021 E. 4th Street (Ward 3)
609 N. Spurgeon Street (Ward 6)
651 W. Sunflower Avenue (Ward 6)
200 E. First American Way (Ward 6)
200 N. Cabrillo Park Drive (Ward 3)
2800 N. Main Street (Ward 3)
2101 E. Santa Clara Avenue  (Ward 3)
N/A
301 N. Mountain View Street (Ward 1)

In [56]:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def extract_applicants_from_url(driver, url):
    """
    Navigates to a specified URL using the given driver and extracts applicants from a table.
    
    Parameters:
    - driver: Selenium WebDriver
    - url: Webpage URL

    Returns:
    - list: A list of extracted applicants
    """
    
    # Navigate to the URL
    driver.get(url)
    
    planner_leads = []
    
    # Wait for the table to be present and then locate it
    main = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#projectList"))
    )
    
    # Extract the applicants from the third cell of each row
    applicants = main.find_elements(By.CSS_SELECTOR, "#projectList tr td:nth-of-type(3)")
    for applicant in applicants:
        planner_leads.append(applicant.text)

    return planner_leads

# Example usage:
driver = get_chrome_driver()
url = "https://www.santa-ana.org/major-planning-projects-and-monthly-development-project-reports/"
planners = extract_applicants_from_url(driver, url)
for planner in planners:
    print(planner)
driver.quit()


Charles "Chuck" Minyard
Mike Harrah, Caribou Industries
Andrew Nelson, Red Oak Investments, LLC.
Excel Property Management Services, Inc.
Adan Madrid, ASI Development
Kimberly Calica, AMG & Associates
Alexis Gavorgian, AMG & Associates
Vince Fregoso, studio 2 design + partners
Ada Rose, YNG Architects
City of Santa Ana
Ryan Gahagan, Arrimus Capital
City of Santa Ana
Brandywine Acquisition Group, LLC.
Grant Williams, FRH Realty, LLC. (Fairfield Residential)
Michael Welles, Calvary Church
Sean Rawson, Waterford Property Company
City of Santa Ana
City of Santa Ana
City of Santa Ana
Archangel Michael Coptic Orthodox
Related Companies of California
Pam Sapetto, Sapetto Real Estate Solutions
HomeAid Mercy House
Nick Chen
City of Santa Ana
Eric Higuchi, Grand Avenue, LLC.
Quoc Phan, Haphan Group Inc.
City of Santa Ana
N/A
Illumination Foundation
Terri Dickerhoff, CGR Development
Alexa Washburn, National Community Renaissance of California
Legacy Partners
Ki Ryu, Legado at the Met, LLC.
Robert

In [57]:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def extract_property_owners_from_url(driver, url):
    """
    Navigates to a specified URL using the given driver and extracts property owners from a table.
    
    Parameters:
    - driver: Selenium WebDriver
    - url: Webpage URL

    Returns:
    - list: A list of extracted property owners
    """
    
    # Navigate to the URL
    driver.get(url)
    
    property_owner = []
    
    # Wait for the table to be present and then locate it
    main = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#projectList"))
    )
    
    # Extract the property owners from the fourth cell of each row
    owners = main.find_elements(By.CSS_SELECTOR, "#projectList tr td:nth-of-type(4)")
    for owner in owners:
        property_owner.append(owner.text)

    return property_owner

# Example usage:
driver = get_chrome_driver()
url = "https://www.santa-ana.org/major-planning-projects-and-monthly-development-project-reports/"
owners_list = extract_property_owners_from_url(driver, url)
for owner_name in owners_list:
    print(owner_name)
driver.quit()


Primior, Inc.
City of Santa Ana
Los Altos XXI, LP GON-REY, LP.
M&A Gabaee, LP.
Euclid Hazard Capital LLC.
Executive Car Leasing Co.
Broomell Commercial Properties, LP.
Nguyen, Long V.
Nguyen, Kimloan J
N/A
RHW Holdings LLC.
N/A
The Provider Fund, LP.
Mr. Dave Colton, The Colton Company
Church Calvary of Santa Ana Inc.
Park Center Santa Ana Associates LP.
N/A
N/A
N/A
Archangel Michael Coptic Orthodox
Housing Authority of the City of Santa Ana
QOZB IIII LLC.
Housing Authority of the City of Santa Ana
Garry Owners, LLC.
N/A
Grand Avenue Plaza, LLC.
Haphan Group Inc.
N/A
LD Acquisition
IHLLC Bewley, LLC.
Orange County Community Housing Corporation
Santa Ana United Methodist Church
RAR2 651 Sunflower Owner LLC.
Legado at the Met, LLC.
First Credit Bank
Mainplace Shoppingtown LLC.
SRP/Stater Bros, LLC.
N/A
Mountainview Real Estate Investments LLC.
One Broadway Plaza LLC.
Greenlaw Partners, LLC.
MCG Bristol West LLC.
Russell Fischer Partnership, LP.
C.J. Segerstrom & Sons
N/A
KB Home Coastal 

In [58]:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def extract_project_status_from_url(driver, url):
    """
    Navigates to a specified URL using the given driver and extracts project statuses from a table.
    
    Parameters:
    - driver: Selenium WebDriver
    - url: Webpage URL

    Returns:
    - list: A list of extracted project statuses
    """
    
    # Navigate to the URL
    driver.get(url)
    
    project_status = []
    
    # Wait for the table to be present and then locate it
    main = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#projectList"))
    )
    
    # Extract the project statuses from the fifth cell of each row
    status = main.find_elements(By.CSS_SELECTOR, "#projectList tr td:nth-of-type(5)")
    for stats in status:
        project_status.append(stats.text)

    return project_status

# Example usage:
driver = get_chrome_driver()
url = "https://www.santa-ana.org/major-planning-projects-and-monthly-development-project-reports/"
status_list = extract_project_status_from_url(driver, url)
for status_name in status_list:
    print(status_name)
driver.quit()


Under Development Project Review
Under Plan Check Review
Under Plan Check Review
Under Development Project Review
Under Construction
Under Development Project Review
Completed
Under Development Project Review
Under Plan Check Review
Under Review
Under Construction
Completed
Under Plan Check Review
Project Denied
Under Public Hearings
Under Plan Check Review
Completed
Completed
Completed
Under Public Hearing
Under Construction
Under Construction
Under Construction
Under Plan Check Review
In Process
Under Development Project Review
Under Plan Check Review
In Process
Completed
Under Review
Under Development Project Review
Completed
Completed
Under Plan Check Review
Under Plan Check Review
Under Plan Check Review
Under Development Project Review
Completed
Under Plan Check Review
Under Plan Check Review
Under Review
Under Development Project Review
Under Construction
Under Development Project Review
Under Development Project Review
Under Plan Check Review
Under Development Project Review
Un

In [59]:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd

def extract_table_data_from_url(driver, url):
    """
    Navigates to a specified URL using the given driver and extracts data from the table.
    
    Parameters:
    - driver: Selenium WebDriver
    - url: Webpage URL

    Returns:
    - DataFrame: A pandas DataFrame containing the extracted table data.
    """
    
    # Navigate to the URL
    driver.get(url)
    
    # Wait for the table to be present and then locate it
    main = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#projectList"))
    )
    
    # Define a helper function to extract column data
    def extract_column_data(column_index):
        return [cell.text for cell in main.find_elements(By.CSS_SELECTOR, f"#projectList tr td:nth-of-type({column_index})")]
    
    # Extract data from the table
    project_names = extract_column_data(1)
    addresses = extract_column_data(2)
    applicants = extract_column_data(3)
    property_owners = extract_column_data(4)
    project_statuses = extract_column_data(5)
    
    # Collate the data into a pandas DataFrame
    df = pd.DataFrame({
        "Project Names": project_names,
        "Addresses": addresses,
        "Applicants": applicants,
        "Property Owners": property_owners,
        "Project Status": project_statuses
    })

    return df

# Example usage:
driver = get_chrome_driver()
url = "https://www.santa-ana.org/major-planning-projects-and-monthly-development-project-reports/"
data_df = extract_table_data_from_url(driver, url)
print(data_df.head())  # To display the first few rows of the DataFrame



                              Project Names                         Addresses  \
0        1st and Harbor Mix-Use Development  101 N. Harbor Boulevard (Ward 5)   
1      3rd and Broadway Mix-Use Development        201 W. 3rd Street (Ward 5)   
2   4th and Mortimer Mixed-Used Development        409 E. 4th Street (Ward 6)   
3      5th and Harbor Mixed-Use Development       419 N. Harbor Blvd (Ward 5)   
4  7-Eleven Service Station (Euclid Street)     813 N. Euclid Street (Ward 1)   

                                 Applicants                 Property Owners  \
0                   Charles "Chuck" Minyard                   Primior, Inc.   
1           Mike Harrah, Caribou Industries               City of Santa Ana   
2  Andrew Nelson, Red Oak Investments, LLC.  Los Altos XXI, LP GON-REY, LP.   
3  Excel Property Management Services, Inc.                 M&A Gabaee, LP.   
4              Adan Madrid, ASI Development      Euclid Hazard Capital LLC.   

                     Project Status  


## Now that data is extracted, we can add more information through the project links on the page. 

In [76]:
import time
import numpy as np
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class SantaAnaScraper:
    def __init__(self, driver):
        self.driver = driver
        self.listing_names = []
        self.project_locations = []
        self.planner_leads = []
        self.property_owner = []
        self.project_status = []
        self.project_descriptions = []
        self.contact_information = []
        self.last_project_update = []
        self.all_images_urls = []

    def connect(self, url):
        self.driver.get(url)
        print(self.driver.title)

    def scrape_base_directory(self):
        try:
            main = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#projectList"))
            )
            self._scrape_titles(main)
            self._scrape_addresses(main)
            self._scrape_applicants(main)
            self._scrape_owners(main)
            self._scrape_status(main)
        finally:
            pass

    def _scrape_titles(self, main):
        titles = main.find_elements(By.CSS_SELECTOR, "#projectList tr td:first-of-type")
        for title in titles:
            self.listing_names.append(title.text)

    def _scrape_addresses(self, main):
        addresses = main.find_elements(By.CSS_SELECTOR, "#projectList tr td:nth-of-type(2)")
        for address in addresses:
            self.project_locations.append(address.text)

    def _scrape_applicants(self, main):
        applicants = main.find_elements(By.CSS_SELECTOR, "#projectList tr td:nth-of-type(3)")
        for applicant in applicants:
            self.planner_leads.append(applicant.text)

    def _scrape_owners(self, main):
        owners = main.find_elements(By.CSS_SELECTOR, "#projectList tr td:nth-of-type(4)")
        for owner in owners:
            self.property_owner.append(owner.text)

    def _scrape_status(self, main):
        status = main.find_elements(By.CSS_SELECTOR, "#projectList tr td:nth-of-type(5)")
        for stats in status:
            self.project_status.append(stats.text)

    def scrape_detailed_info(self):
        for link in self.listing_names:
            link = link.strip()
            link = self.driver.find_element(By.LINK_TEXT, link)
            self.driver.execute_script("arguments[0].scrollIntoView();", link)
            link.click()
            self._scrape_project_details()
            self.driver.back()

    def _scrape_project_details(self):
        try:
            element = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ".so-widget-sow-editor.so-widget-sow-editor-base")
                )
            )
            self._scrape_project_description(element)
            self._scrape_contact_info(element)
            self._scrape_last_updated(element)
            self._scrape_image_urls()
        finally:
            pass

    def _scrape_project_description(self, element):
        try:
            description = element.find_element(By.CSS_SELECTOR, "ul:first-of-type li:first-child")
            self.project_descriptions.append(description.text)
        except:
            self.project_descriptions.append("NA")

    def _scrape_contact_info(self, element):
        try:
            contact_info = element.find_element(By.CSS_SELECTOR, "p")
            self.contact_information.append(contact_info.text)
        except:
            self.contact_information.append("NA")

    def _scrape_last_updated(self, element):
        try:
            last_updated = element.find_element(By.CSS_SELECTOR, "p > em")
            self.last_project_update.append(last_updated.text)
        except:
            self.last_project_update.append("NA")

    def _scrape_image_urls(self):
        image_url = None
        # Locate div elements
        div_elements = self.driver.find_elements(
            By.CSS_SELECTOR, "div.siteorigin-widget-tinymce.textwidget"
        )

        for div in div_elements:
            image_urls = []
            try:
                # Locate every img tag that is inside a p element following an a tag within the current div
                images_in_div = div.find_elements(By.CSS_SELECTOR, "p a img")

                # Extract the src attribute of each img tag found
                for img in images_in_div:
                    image_url = img.get_attribute("src")
                    if image_url:
                        image_urls.append(image_url)
                    else:
                        image_urls.append("NA")
            except Exception as e:
                print("Exception occurred: ", e)
                image_urls.append("NA")

            self.all_images_urls.append(image_urls)

    def create_dataframe(self):
        return pd.DataFrame(
            {
                "Listing Names": self.listing_names,
                "Project Locations": self.project_locations,
                "Planner Leads": self.planner_leads,
                "Owner": self.property_owner,
                "Image Url": self.all_images_urls,
                "Project Status": self.project_status,
                "Description": self.project_descriptions,
                "Planner/Manager": self.contact_information,
                "Last Project Update": self.last_project_update,
            }
        )


# Usage example
driver = get_chrome_driver()
scraper = SantaAnaScraper(driver)
URL = "https://www.santa-ana.org/major-planning-projects-and-monthly-development-project-reports/"
scraper.connect(URL)
scraper.scrape_base_directory()
scraper.scrape_detailed_info()
df = scraper.create_dataframe()


Major planning projects and monthly development reports - City of Santa Ana


In [77]:
df

Unnamed: 0,Listing Names,Project Locations,Planner Leads,Owner,Image Url,Project Status,Description,Planner/Manager,Last Project Update
0,1st and Harbor Mix-Use Development,101 N. Harbor Boulevard (Ward 5),"Charles ""Chuck"" Minyard","Primior, Inc.",[https://storage.googleapis.com/proudcity/sant...,Under Development Project Review,Applicant is proposing to construct a 10-story...,Case Planner: Jerry C. Guevara — Senior Planner,"Updated: September 12, 2023"
1,3rd and Broadway Mix-Use Development,201 W. 3rd Street (Ward 5),"Mike Harrah, Caribou Industries",City of Santa Ana,[https://storage.googleapis.com/proudcity/sant...,Under Plan Check Review,"On September 16, 2014, the City Council direct...","Project Manager: Ali Pezeshkpour, AICP — Princ...","Updated: September 23, 2021"
2,4th and Mortimer Mixed-Used Development,409 E. 4th Street (Ward 6),"Andrew Nelson, Red Oak Investments, LLC.","Los Altos XXI, LP GON-REY, LP.",[https://storage.googleapis.com/proudcity/sant...,Under Plan Check Review,Applicant proposes to construct a new commerci...,"Project Manager: Pedro Gomez, AICP — Associate...","Updated: March 30, 2021"
3,5th and Harbor Mixed-Use Development,419 N. Harbor Blvd (Ward 5),"Excel Property Management Services, Inc.","M&A Gabaee, LP.",[https://storage.googleapis.com/proudcity/sant...,Under Development Project Review,"The applicant, Excel Property Management Servi...","Project Manager: Ali Pezeshkpour, AICP – Princ...","Updated: May 25, 2021"
4,7-Eleven Service Station (Euclid Street),813 N. Euclid Street (Ward 1),"Adan Madrid, ASI Development",Euclid Hazard Capital LLC.,[https://storage.googleapis.com/proudcity/sant...,Under Construction,"7-Eleven proposes to construct a 3,045-square ...","Project Manager: Ali Pezeshkpour, AICP - Princ...","Updated: March 31, 2021"
5,AMG Senior Housing,2202 E. 1st Street (Ward 3),"Kimberly Calica, AMG & Associates",Executive Car Leasing Co.,[https://storage.googleapis.com/proudcity/sant...,Under Development Project Review,Applicant is proposing to construct a new 5-st...,Case Planner: Jerry C. Guevara\nPhone: (714) 6...,"Updated: December 16, 2022"
6,AMG First Point Family Affordable Apartments,2114 E. 1st Street (Ward 3),"Alexis Gavorgian, AMG & Associates","Broomell Commercial Properties, LP.",[https://storage.googleapis.com/proudcity/sant...,Completed,AMG & Associates proposes to construct a mixed...,"Project Manager: Ali Pezeshkpour, AICP - Princ...","Updated: April 1, 2021"
7,Bella Terra Residential Community and Temple,4006 W. Hazard Avenue,"Vince Fregoso, studio 2 design + partners","Nguyen, Long V.",[https://storage.googleapis.com/proudcity/sant...,Under Development Project Review,"The owner of 4006, 4010, and 4018 West Hazard ...","Updated: June 5th, 2023","Updated: June 5th, 2023"
8,Bewley Townhomes,1122 N. Bewley Street (Ward 1),"Ada Rose, YNG Architects","Nguyen, Kimloan J",[https://storage.googleapis.com/proudcity/sant...,Under Plan Check Review,The applicant is proposing to construct a 10-u...,Project Manager: Jerry C. Guevara – Assistant ...,"Updated: December 15, 2020"
9,Billboard Ordinance,Citywide,City of Santa Ana,,[],Under Review,"Approved by City Council, second reading, on J...","Updated: May 10th, 2023","Updated: May 10th, 2023"


In [None]:
listing_names[20:50]