In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define data sources: the project-listing page URLs that the scraping loop
# requests one by one and hands to extract_project_data_from_source.
# NOTE(review): only the Caltrans URL (dot.ca.gov) currently has extraction
# logic in extract_project_data_from_source; the remaining entries appear to
# be placeholders until their parsers are implemented -- confirm before
# relying on their output.
data_sources = [
    "https://dot.ca.gov/projects/",
    "https://cironline.com/projects/",
    "https://dcnonline.com/projects/",
    "https://planitcalifornia.org/projects/",
    "https://sphinxintelligence.com/projects/california/",
]

# Define data standardization function
def standardize_data(data):
    """Normalize scraped project records into a consistent tabular form.

    Args:
        data: A pandas DataFrame, or anything ``pd.DataFrame`` accepts
            (typically the list of dicts produced by the extractor), holding
            the columns ``project_name``, ``project_location``,
            ``project_start_date`` and ``project_end_date``. ``None`` or an
            empty collection is treated as "no records".

    Returns:
        A DataFrame in which the project name is stripped of surrounding
        whitespace, the location is stripped and lower-cased, and both date
        columns are converted with ``pd.to_datetime``. A DataFrame argument
        is modified in place and returned; any other input is converted to a
        new DataFrame first.

    Raises:
        KeyError: If non-empty input lacks one of the expected columns.
        Exceptions raised by ``pd.to_datetime`` for unparseable dates
        propagate unchanged.
    """
    expected_columns = [
        "project_name",
        "project_location",
        "project_start_date",
        "project_end_date",
    ]

    # The extractor returns a plain list of dicts, so accept that as well as
    # a ready-made DataFrame.
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame([] if data is None else data)

    # A frame built from zero records has no columns at all; give it the
    # expected schema so the steps below (and a later concat) still work.
    if data.empty and len(data.columns) == 0:
        for column in expected_columns:
            data[column] = pd.Series(dtype="object")

    # Clean project name. The .str accessor leaves missing values as missing
    # instead of raising AttributeError the way x.strip() does on None/NaN.
    data["project_name"] = data["project_name"].str.strip()

    # Parse the start/end dates into pandas datetime values (missing values
    # become NaT). Date-only values are written as YYYY-MM-DD by to_csv.
    data["project_start_date"] = pd.to_datetime(data["project_start_date"])
    data["project_end_date"] = pd.to_datetime(data["project_end_date"])

    # Standardize project location: trim whitespace and lower-case.
    data["project_location"] = data["project_location"].str.strip().str.lower()

    return data

# Scrape and standardize data from each source.
# NOTE: extract_project_data_from_source is defined further down in this file,
# so that definition must have been executed before this loop runs.
scraped_data = []
for source in data_sources:
    try:
        # requests has no default timeout; without one a stalled server
        # would hang the whole run.
        response = requests.get(source, timeout=30)
        # Treat HTTP 4xx/5xx as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable source should not abort the remaining ones.
        print(f"Skipping {source}: {exc}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")

    # Extract project data based on source-specific structure
    project_data = extract_project_data_from_source(soup, source)

    # Sources without an implemented parser, or pages with no matching
    # elements, yield nothing to standardize.
    if project_data is None or len(project_data) == 0:
        print(f"No project data extracted from {source}")
        continue

    # Standardize extracted data. The extractor returns a list of dicts,
    # while standardize_data operates on DataFrame columns.
    standardized_data = standardize_data(pd.DataFrame(project_data))
    scraped_data.append(standardized_data)

# Combine scraped data into a single DataFrame. pd.concat raises on an empty
# list, so fall back to an empty frame with the expected columns.
if scraped_data:
    all_data = pd.concat(scraped_data, ignore_index=True)
else:
    all_data = pd.DataFrame(
        columns=[
            "project_name",
            "project_location",
            "project_start_date",
            "project_end_date",
        ]
    )

# Save standardized data to a CSV file
all_data.to_csv("construction_projects_california.csv", index=False)


In [None]:
# Example function for extracting project data from a specific source

def extract_project_data_from_source(soup, source):
    """Extract project records from a parsed listing page.

    Args:
        soup: Parsed page (a BeautifulSoup-style object providing
            ``find_all``) for ``source``.
        source: URL the page was fetched from; it selects the
            source-specific parsing logic by exact string match.

    Returns:
        A list of dicts with the keys ``project_name``,
        ``project_location``, ``project_start_date`` and
        ``project_end_date``. A field whose element is absent from the page
        is recorded as ``None``. Sources without parsing logic yield an
        empty list.
    """

    def _text_of(parent, tag, **attrs):
        # find() returns None when nothing matches; report that as a missing
        # value instead of failing on ``None.text``.
        node = parent.find(tag, **attrs)
        return node.text.strip() if node is not None else None

    if source == "https://dot.ca.gov/projects/":
        # Extract project data from Caltrans website.
        # NOTE(review): the tag/class selectors below are the ones this
        # example assumes -- verify them against the live page markup.
        return [
            {
                "project_name": _text_of(project_element, "h3"),
                "project_location": _text_of(
                    project_element, "p", class_="location"
                ),
                "project_start_date": _text_of(
                    project_element, "p", class_="start"
                ),
                "project_end_date": _text_of(
                    project_element, "p", class_="end"
                ),
            }
            for project_element in soup.find_all(
                "div", class_="project-list-item"
            )
        ]

    # Implement similar logic for other data sources.
    # Until then, report "no records" explicitly rather than returning None.
    return []
