In [None]:
import time
import random
import logging
import os
import pandas as pd
from datetime import date, timedelta, datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    ElementNotInteractableException,
    StaleElementReferenceException,
    WebDriverException
)
from bs4 import BeautifulSoup
# Using webdriver-manager simplifies driver setup
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

In [None]:
# --- Configuration ---
# Entry page of the MTCF query tool and the inclusive range of years to scrape.
MTCF_URL = "https://www.michigantrafficcrashfacts.org/data/querytool"
START_YEAR = 2013
END_YEAR = 2022
OUTPUT_DIR = "mtcf_scraped_data_person_level_daily"  # yearly CSV files are written here

# Throttling: a random pause between MIN_DELAY_SECONDS and MAX_DELAY_SECONDS
# separates consecutive daily queries (longer is safer against blocking).
# With these defaults the pauses alone add up to roughly
# (45 + 120) / 2 * 365 * 10 ~= 301,000 seconds, i.e. about 3.5 days.
MIN_DELAY_SECONDS = 45
MAX_DELAY_SECONDS = 120
PAGE_LOAD_TIMEOUT = 60          # seconds an explicit wait may block on a page element
RETRY_MAX_ATTEMPTS = 3          # attempts per day before the day is skipped
RETRY_BASE_DELAY_SECONDS = 30   # first retry pause; doubled for each further retry

# --- Logging Setup ---
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
LOG_FILENAME = 'mtcf_scraper.log'

# Re-running this cell in the same kernel must not stack duplicate handlers,
# so the root logger is only configured while it has none.
logger = logging.getLogger()
if len(logger.handlers) == 0:
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler(LOG_FILENAME)   # persistent log on disk
    console_handler = logging.StreamHandler()          # live output in the notebook
    for _handler in (file_handler, console_handler):
        _handler.setLevel(logging.INFO)
        _handler.setFormatter(logging.Formatter(LOG_FORMAT))
        logger.addHandler(_handler)
    del _handler

In [None]:
# --- Helper Functions ---

def get_api_field_names(user_attribute_list, api_metadata_fields):
    """
    Maps user-friendly attribute names to actual API field names found in metadata.

    Each requested attribute is resolved in two steps:
      1. Look it up in the predefined ``mapping`` dictionary (keys are compared
         case-insensitively) and accept the mapped name if the metadata
         contains it (also compared case-insensitively).
      2. Otherwise compare the attribute with every metadata field name after
         dropping any "(...)" suffix and all non-alphanumeric characters.

    The default OBJECTID / CRASH_YEAR / CRASH_DATE fields are appended when the
    metadata contains them; warnings are printed when they are absent.

    Args:
        user_attribute_list: iterable of human-readable attribute labels
            (``None`` is treated as empty).
        api_metadata_fields: iterable of dicts that each carry a ``'name'`` key
            (``None`` is treated as empty).

    Returns:
        ``(final_field_list, missing_attributes)`` where ``final_field_list`` is
        a sorted list of metadata field names (original casing) and
        ``missing_attributes`` lists the labels that could not be resolved, in
        input order. If nothing could be mapped the result is
        ``(None, missing_attributes)``.

    TODO: This function provides a basic mapping attempt. You MUST review
          the actual API field names from the metadata (the original note points
          at FEATURE_SERVICE_URL + '?f=json'; that constant is not defined in
          the visible part of this notebook) and update this logic or the
          'mapping' dictionary below to ensure accuracy.
          Pay close attention to case sensitivity and exact naming conventions.
    """
    # --- Placeholder Mapping (Update based on actual metadata) ---
    # Add more known mappings here as you discover them from the API metadata
    mapping = {
        "Crash Year": "CRASH_YEAR",
        "County": "COUNTY_NAME",
        "Crash Month": "CRASH_MONTH",
        "Crash Day": "CRASH_DAY",
        "Person Age": "PERSON_AGE",
        "Person Gender": "PERSON_GENDER",
        "Person Degree of Injury": "PERSON_INJURY_SEVERITY",
        "Driver Age": "DRIVER_AGE",
        "Driver Gender": "DRIVER_GENDER",
        "Worst Injury in Crash": "MAX_INJURY_SEVERITY",
        "Vehicle Type": "VEHICLE_TYPE",
        "City or Township": "CITY_TOWNSHIP_NAME",
        "Day of Week": "DAY_OF_WEEK",
        "Lighting Conditions": "LIGHTING_CONDITION",
        "Road Conditions": "ROAD_CONDITION",
        "Weather Conditions (2016+)": "WEATHER_CONDITION", # Assuming one field covers recent weather
        "Time of Day": "CRASH_TIME", # Often part of a datetime field
        "Posted Speed Limit": "POSTED_SPEED_LIMIT",
        "Traffic Control": "TRAFFIC_CONTROL_TYPE",
        "Worst Injury in Unit": "UNIT_MAX_INJURY_SEVERITY",
        "Party Type": "PARTY_TYPE",
        "Person Position": "PERSON_POSITION",
        "Person Ejection": "PERSON_EJECTION",
        "Person Restraint": "PERSON_RESTRAINT_USED",
        "Driver Drinking": "DRIVER_DRINKING_INDICATOR",
        "Crash: Drinking": "ALCOHOL_INVOLVED_CRASH", # Example guess
        "Crash: Drug Use": "DRUG_INVOLVED_CRASH", # Example guess
        "Crash: Fatal Crash": "FATAL_CRASH_INDICATOR",
        "Crash: Injury Crash": "INJURY_CRASH_INDICATOR",
        "Crash Type": "CRASH_TYPE_DESC",
        #... add ALL other required fields based on API metadata inspection...
    }

    def _normalize(text):
        """Upper-case `text` and keep only its alphanumeric characters."""
        return ''.join(filter(str.isalnum, text)).upper()

    # Upper-cased keys make the predefined lookup case-insensitive.
    mapping_by_upper_key = {key.upper(): value for key, value in mapping.items()}

    api_fields_to_request = set()
    missing_attributes = []
    # Map upper case to original case so matches keep the API's exact spelling.
    available_api_field_names_map = {
        field['name'].upper(): field['name'] for field in (api_metadata_fields or [])
    }

    print("\n--- Attribute Mapping Attempt ---")
    for user_attr in (user_attribute_list or []):
        found = False

        # 1. Try predefined mapping (case-insensitive key check)
        mapped_name = mapping_by_upper_key.get(user_attr.strip().upper())
        if mapped_name is not None:
            api_name = available_api_field_names_map.get(mapped_name.upper())
            if api_name is not None:
                api_fields_to_request.add(api_name)
                print(f"  Mapped '{user_attr}' -> '{api_name}' (predefined)")
                found = True

        # 2. Try direct name match (case-insensitive, ignoring spaces/punctuation/years)
        if not found:
            # Only the text before the first '(' counts, so "(2016+)" style suffixes are ignored.
            normalized_user_attr = _normalize(user_attr.split('(')[0])
            if normalized_user_attr:  # an empty key must never match anything
                for original_api_name in available_api_field_names_map.values():
                    if normalized_user_attr == _normalize(original_api_name):
                        api_fields_to_request.add(original_api_name)
                        print(f"  Mapped '{user_attr}' -> '{original_api_name}' (normalized name match)")
                        found = True
                        break # Found a match

        if not found:
            missing_attributes.append(user_attr)

    if missing_attributes:
        print("\n--- Attributes Not Found/Mapped ---")
        for attr in missing_attributes:
            print(f"  - {attr}")
        print("  Please inspect API metadata and update the 'mapping' dictionary or logic.")
        print("---------------------------------")

    if not api_fields_to_request:
        print("\nError: No attributes could be mapped. Cannot proceed without knowing which fields to request.")
        print("Please update the get_api_field_names function with correct mappings based on API metadata.")
        return None, missing_attributes # Indicate failure

    # Ensure essential fields for filtering and pagination are included
    # TODO: Verify these default names ('OBJECTID', 'CRASH_YEAR', 'CRASH_DATE') against metadata
    oid_field_default = "OBJECTID"
    year_field_default = "CRASH_YEAR"
    date_field_default = "CRASH_DATE"

    # The metadata map is keyed by upper-cased name, so a direct lookup suffices.
    oid_field = available_api_field_names_map.get(oid_field_default.upper())
    year_field = available_api_field_names_map.get(year_field_default.upper())
    date_field = available_api_field_names_map.get(date_field_default.upper())
    essential_fields_to_add = {
        name for name in (oid_field, year_field, date_field) if name is not None
    }

    if oid_field is None:
        print(f"Warning: Default OBJECTID field '{oid_field_default}' not found. Pagination might fail.")
    if year_field is None and date_field is None:
        print(f"Warning: Default Year ('{year_field_default}') or Date ('{date_field_default}') field not found. Filtering will likely fail.")

    # Sorted so the result is deterministic instead of depending on set iteration order.
    final_field_list = sorted(api_fields_to_request.union(essential_fields_to_add))
    print(f"\nFinal list of API fields to request: {', '.join(final_field_list)}")
    print("---------------------------------")

    return final_field_list, missing_attributes

In [None]:
# --- Placeholder Selectors (MUST BE VERIFIED/UPDATED BY INSPECTING THE LIVE SITE) ---
# Every value below is a guess. The scraping code hands these strings to Selenium
# as CSS selectors (By.CSS_SELECTOR) and, for the RESULTS_TABLE_* group, to
# BeautifulSoup's select()/select_one().
# Use CSS Selectors (e.g., '#elementId', '.className', 'tag[attribute="value"]')
# Or XPath (less preferred due to fragility)

# Dropdown that is wrapped in selenium's Select helper, and the option value chosen on it.
ANALYSIS_LEVEL_DROPDOWN_SELECTOR = 'select[aria-label="Analysis Level"]' # Placeholder - VERIFY
PERSONS_OPTION_VALUE = "persons" # Placeholder value for the 'Persons' option - VERIFY

# Date inputs; both are filled by clearing the field and typing a YYYY-MM-DD string.
# NOTE(review): ids such as 'input-41' look auto-generated and may not be stable
# between page loads - confirm on the live site.
START_DATE_INPUT_SELECTOR = '#input-41' # Placeholder - VERIFY
END_DATE_INPUT_SELECTOR = '#input-44[aria-label="End Date"]' # Placeholder - VERIFY
# Date picker interaction might be complex - inspect how dates are selected (e.g., calendar clicks, direct input)

# NOTE(review): ':contains("...")' is a jQuery extension, not standard CSS. A
# browser-evaluated CSS selector is expected to reject it as invalid, so this
# value needs to become a valid CSS selector or an XPath text match.
SUBMIT_BUTTON_SELECTOR = 'button:contains("Update Query")' # Placeholder - VERIFY (This might need adjustment based on actual text/structure)

# Results table and the pieces read from it; header/body are looked up inside the
# table, rows inside the body, and cells inside each row.
RESULTS_TABLE_SELECTOR = 'table.dataTable' # Placeholder - VERIFY (Look for a table with results)
RESULTS_TABLE_HEADER_SELECTOR = 'thead tr th' # Placeholder - VERIFY
RESULTS_TABLE_BODY_SELECTOR = 'tbody' # Placeholder - VERIFY
RESULTS_TABLE_ROW_SELECTOR = 'tr' # Placeholder - VERIFY
RESULTS_TABLE_CELL_SELECTOR = 'td' # Placeholder - VERIFY

# Matches the "next page" link only while it is enabled; no match ends pagination.
PAGINATION_NEXT_BUTTON_SELECTOR = 'a.paginate_button.next:not(.disabled)' # Placeholder - VERIFY (Check for enabled 'next' link)
# Element indicating results have loaded/updated (e.g., a results count span, or the table itself)
RESULTS_LOADED_INDICATOR_SELECTOR = RESULTS_TABLE_SELECTOR # Placeholder - VERIFY

In [None]:
# --- Helper Functions ---

def setup_driver():
    """Sets up the Selenium Chrome WebDriver.

    Chrome is started with a fixed 1920x1080 window, GPU disabled, a user agent
    picked at random from a small built-in list, and DevTools logging
    suppressed. The driver binary is resolved through webdriver-manager.

    Returns:
        The initialized ``webdriver.Chrome`` instance.

    Raises:
        Exception: whatever driver installation or start-up raised; it is
            logged at CRITICAL level and then re-raised unchanged.
    """
    options = Options()
    # options.add_argument("--headless")  # Run headless (without opening browser window) - may be detected more easily
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    # Rotate User Agent (optional, add more legitimate agents)
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    ]
    options.add_argument(f'user-agent={random.choice(user_agents)}')
    options.add_experimental_option('excludeSwitches', ['enable-logging']) # Suppress DevTools messages

    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)
        logging.info("WebDriver initialized successfully.")
        return driver
    except Exception as e:
        # Nothing can proceed without a browser: record the traceback, then propagate.
        logging.critical(f"Failed to initialize WebDriver: {e}", exc_info=True)
        raise

def set_date_fields_simple_input(driver, wait, date_selector, target_date_str):
    """Type a date string into the input matched by a CSS selector.

    The field is awaited until clickable, cleared, clicked, and then filled
    via send_keys, with a half-second pause before and after typing. The
    `driver` argument is accepted but not used. May need adjustment for
    calendar-style date pickers.

    Returns:
        True when every step completed, False when any step raised (the
        error is logged and swallowed).
    """
    settle_pause = 0.5  # seconds to let the field react around the key presses
    try:
        locator = (By.CSS_SELECTOR, date_selector)
        field = wait.until(EC.element_to_be_clickable(locator))
        field.clear()
        # A click after clearing can help activate the field before typing.
        field.click()
        time.sleep(settle_pause)
        field.send_keys(target_date_str)
        # The resulting field value is not read back for verification.
        time.sleep(settle_pause)
        logging.info(f"Set date field {date_selector} to {target_date_str}")
    except Exception as err:
        logging.error(f"Could not set date field {date_selector} to {target_date_str}: {err}")
        outcome = False
    else:
        outcome = True
    return outcome

def extract_data_from_html(html_content, expected_headers=None):
    """Parses the results table out of a page's HTML using BeautifulSoup.

    Args:
        html_content: HTML source of the results page.
        expected_headers: column names captured from an earlier page; used
            when this page's table has no header cells.

    Returns:
        ``(headers, records)``. ``headers`` is the list of column names taken
        from the table header, falling back to ``expected_headers`` and then to
        an empty list. ``records`` is a list of dicts (header -> cell text), one
        per body row whose cell count equals the header count; other rows are
        skipped with a warning. When the table itself is missing, the records
        list is empty.
    """
    # Imported here because the notebook's import cell only pulls in BeautifulSoup.
    from bs4 import FeatureNotFound

    try:
        soup = BeautifulSoup(html_content, 'lxml')
    except FeatureNotFound:
        # lxml is an optional extra; fall back to the parser bundled with Python.
        logging.warning("lxml parser not available, falling back to 'html.parser'.")
        soup = BeautifulSoup(html_content, 'html.parser')

    table = soup.select_one(RESULTS_TABLE_SELECTOR)
    if table is None:
        logging.warning("Results table not found in HTML content.")
        # Return empty data, keep headers if known
        return (expected_headers if expected_headers else []), []

    header_elements = table.select(RESULTS_TABLE_HEADER_SELECTOR)
    if header_elements:
        headers = [header_cell.get_text(strip=True) for header_cell in header_elements]
    elif expected_headers:
        logging.warning("Table header row not found, using previously known headers.")
        headers = expected_headers
    else:
        logging.warning("Could not extract table headers.")
        # Without headers no row can be keyed, so every row below will be skipped.
        headers = []

    data_records = []
    table_body = table.select_one(RESULTS_TABLE_BODY_SELECTOR)
    if table_body is not None:
        for row in table_body.select(RESULTS_TABLE_ROW_SELECTOR):
            cells = [cell.get_text(strip=True) for cell in row.select(RESULTS_TABLE_CELL_SELECTOR)]
            # Basic validation: a data row must have exactly one cell per header.
            if cells and len(cells) == len(headers):
                data_records.append(dict(zip(headers, cells)))
            elif cells: # Log if cell count mismatch only if cells were found
                logging.warning(f"Row skipped due to mismatch: {len(cells)} cells vs {len(headers)} headers. Row data: {cells}")
    else:
        logging.warning("Table body not found.")

    # Prefer the headers resolved above; otherwise the ones passed in; otherwise none.
    final_headers = headers if headers else (expected_headers if expected_headers else [])
    return final_headers, data_records

def safe_click(driver, wait, selector):
    """Wait for the element matched by a CSS selector to be clickable, then click it.

    The `driver` argument is accepted but not used. No exception escapes:
    timeouts and non-interactable elements are logged as warnings, anything
    else as an error with traceback.

    Returns:
        True if the click was issued, False otherwise.
    """
    try:
        target = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
        target.click()
    except (TimeoutException, ElementNotInteractableException) as exc:
        # Expected failure modes: element never became clickable or refused the click.
        logging.warning(f"Could not click element {selector}: {exc}")
        clicked = False
    except Exception as exc:
        logging.error(f"Unexpected error clicking element {selector}: {exc}", exc_info=True)
        clicked = False
    else:
        clicked = True
    return clicked

In [None]:
# --- Main Script ---

def _locator_for(selector):
    """Translate a selector string into a Selenium ``(by, value)`` locator.

    Selectors of the simple form ``tag:contains("text")`` use a jQuery-only
    pseudo-class that is not valid CSS, so they are rewritten as an XPath
    text match. Every other selector is passed through as a CSS selector.
    """
    tag, marker, remainder = selector.partition(':contains(')
    if marker and remainder.endswith(')'):
        text = remainder[:-1].strip().strip('"\'')
        simple_tag = not tag or tag.isalnum()
        # Only rewrite the unambiguous case; a double quote would break the XPath literal.
        if simple_tag and text and '"' not in text:
            return By.XPATH, f'//{tag or "*"}[contains(normalize-space(.), "{text}")]'
    return By.CSS_SELECTOR, selector


def _js_click(driver, element):
    """Click `element` through JavaScript (can bypass overlay issues)."""
    # execute_script exposes its extra arguments as the JS `arguments` array.
    driver.execute_script("arguments[0].click();", element)


def _is_retryable_webdriver_error(error):
    """Tell whether a WebDriverException looks transient (timeout / connection reset)."""
    if isinstance(error, TimeoutException):
        # The message of a TimeoutException need not contain the word "timeout".
        return True
    message = str(error)
    return "ERR_CONNECTION_RESET" in message or "timeout" in message.lower()


def _scrape_day(driver, wait, day_str, year, known_headers):
    """Run the query for one day and collect the records of every result page.

    Args:
        driver: the Selenium WebDriver.
        wait: WebDriverWait bound to `driver`.
        day_str: date in YYYY-MM-DD form, used as both start and end date.
        year: year being processed (used in log messages only).
        known_headers: column names captured earlier this year, or None.

    Returns:
        ``(records, headers)`` where ``headers`` is `known_headers`, or the
        headers captured on this day when none were known before.

    Raises:
        WebDriverException (including TimeoutException) from Selenium calls
        made before pagination starts, and RuntimeError when a date field
        cannot be set. Errors during pagination are logged and end pagination.
    """
    driver.get(MTCF_URL)

    # 1. Select Analysis Level: Persons
    logging.info("Waiting for Analysis Level dropdown...")
    analysis_dropdown_element = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ANALYSIS_LEVEL_DROPDOWN_SELECTOR))
    )
    Select(analysis_dropdown_element).select_by_value(PERSONS_OPTION_VALUE)
    logging.info("Selected 'Persons' analysis level.")
    time.sleep(random.uniform(0.5, 1.5)) # Small pause after selection

    # 2. Set Date Range (Start and End = the processed day)
    logging.info(f"Setting Start Date to {day_str}")
    if not set_date_fields_simple_input(driver, wait, START_DATE_INPUT_SELECTOR, day_str):
        raise RuntimeError(f"Failed to set Start Date for {day_str}")
    time.sleep(random.uniform(0.5, 1.5))

    logging.info(f"Setting End Date to {day_str}")
    if not set_date_fields_simple_input(driver, wait, END_DATE_INPUT_SELECTOR, day_str):
        raise RuntimeError(f"Failed to set End Date for {day_str}")
    time.sleep(random.uniform(0.5, 1.5))

    # 3. Submit Query
    logging.info("Submitting query...")
    submit_button_element = wait.until(
        EC.element_to_be_clickable(_locator_for(SUBMIT_BUTTON_SELECTOR))
    )
    _js_click(driver, submit_button_element)
    logging.info("Query submitted.")

    # 4. Wait for results to load/update
    logging.info("Waiting for results to load...")
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, RESULTS_LOADED_INDICATOR_SELECTOR)))
    logging.info("Results loaded indicator found.")
    # Extra pause in case content keeps loading after the table structure appears.
    time.sleep(random.uniform(3, 7))

    # --- Data Extraction and Pagination for the Day ---
    daily_records_list = []
    current_page = 1
    while True: # Loop for pagination
        logging.info(f"Extracting data from page {current_page} for {day_str}...")
        try:
            # Keep a reference to the current table to detect its replacement later.
            old_table_element = driver.find_element(By.CSS_SELECTOR, RESULTS_TABLE_SELECTOR)
        except NoSuchElementException:
            logging.warning(f"Results table disappeared before extraction on page {current_page}. Assuming no data.")
            old_table_element = None # Ensure staleness check doesn't block indefinitely

        headers, page_data = extract_data_from_html(driver.page_source, known_headers)

        if not known_headers and headers: # Store headers if found for the first time this year
            known_headers = headers
            logging.info(f"Captured headers for year {year}: {known_headers}")
        elif headers and known_headers and headers != known_headers:
            # The first headers of the year stay authoritative; they define the CSV columns.
            logging.warning(f"Header mismatch detected for {day_str}! Previous: {known_headers}, Current: {headers}")

        if page_data:
            daily_records_list.extend(page_data)
            logging.info(f"Extracted {len(page_data)} records from page {current_page}.")
        else:
            logging.info(f"No data records found on page {current_page}.")

        # Check for and click the "Next" button
        try:
            next_button_element = driver.find_element(By.CSS_SELECTOR, PAGINATION_NEXT_BUTTON_SELECTOR)
            logging.info("Next page button found. Clicking...")
            _js_click(driver, next_button_element) # JS click often more reliable
            current_page += 1

            # Wait for the table to update after clicking next
            # NOTE(review): this assumes the site replaces the <table> element on
            # page change; if only the rows are redrawn this wait times out and
            # pagination stops early - confirm against the live site.
            logging.info("Waiting for table to update after pagination...")
            if old_table_element:
                wait.until(EC.staleness_of(old_table_element))
            # Add extra wait for content to actually render
            time.sleep(random.uniform(3, 6))
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, RESULTS_LOADED_INDICATOR_SELECTOR)))
            logging.info("Table updated for next page.")

        except (NoSuchElementException, ElementNotInteractableException):
            logging.info(f"No more 'Next' page button found or interactable for {day_str}. End of results for this day.")
            break # Exit pagination loop
        except TimeoutException:
            logging.warning(f"Timeout waiting for table update after pagination click for {day_str}. Assuming end of results.")
            break # Exit pagination loop
        except Exception as page_e:
            logging.error(f"Unexpected error during pagination for {day_str}: {page_e}", exc_info=True)
            break # Exit pagination loop

    return daily_records_list, known_headers


def _save_year_data(year, records, headers, partial=False):
    """Write one year's records to CSV; `partial` marks a year that did not finish."""
    if not records:
        logging.warning(f"No data collected for year {year}.")
        return

    logging.info(f"Aggregating and saving data for year {year}...")
    try:
        # Use headers captured during the year, or let Pandas infer if none found
        final_yearly_df = pd.DataFrame(records, columns=headers)

        suffix = "_partial" if partial else ""
        if partial:
            logging.warning(f"Year {year} was not fully processed; saving the records collected so far.")
        output_filename = os.path.join(OUTPUT_DIR, f"mtcf_person_data_{year}{suffix}.csv")
        final_yearly_df.to_csv(output_filename, index=False, encoding='utf-8')
        logging.info(f"Successfully saved {len(final_yearly_df)} records for {year} to {output_filename}")
    except Exception as e:
        logging.error(f"Failed to create or save DataFrame for year {year}: {e}", exc_info=True)


if __name__ == "__main__":
    logging.info("="*50)
    logging.info("Starting MTCF Scraping Script (Daily Iteration)")
    logging.warning("DISCLAIMER: This script attempts to scrape data from the MTCF Data Query Tool.")
    logging.warning("This likely violates their Terms of Use and carries a HIGH RISK of IP blocking.")
    logging.warning("Use of this script is at your own risk. Official data requests are recommended.")
    logging.warning(f"Estimated runtime with delays ({MIN_DELAY_SECONDS}-{MAX_DELAY_SECONDS}s/day): VERY LONG (potentially days).")
    logging.info("Ensure all placeholder selectors (e.g., 'YOUR_SELECTOR_HERE') are updated!")
    logging.info("="*50)

    # Warn when any selector still carries the literal placeholder marker.
    selector_constants = (
        ANALYSIS_LEVEL_DROPDOWN_SELECTOR,
        START_DATE_INPUT_SELECTOR,
        END_DATE_INPUT_SELECTOR,
        SUBMIT_BUTTON_SELECTOR,
        RESULTS_TABLE_SELECTOR,
        PAGINATION_NEXT_BUTTON_SELECTOR,
        RESULTS_LOADED_INDICATOR_SELECTOR,
    )
    if any("YOUR_SELECTOR_HERE" in selector for selector in selector_constants):
        # Deliberately not fatal: the run continues, but the warning is loud.
        logging.critical("Placeholder selectors detected. Please update them by inspecting the MTCF website.")

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    driver = None # Initialize driver variable so the finally block can test it

    try:
        driver = setup_driver()
        base_wait = WebDriverWait(driver, PAGE_LOAD_TIMEOUT)

        for year in range(START_YEAR, END_YEAR + 1):
            logging.info(f"===== Processing Year: {year} =====")
            yearly_data_list = []
            yearly_headers_list = None # Store headers found for the year
            end_date_year = date(year, 12, 31)
            current_process_date = date(year, 1, 1)

            try:
                while current_process_date <= end_date_year:
                    day_str = current_process_date.strftime('%Y-%m-%d') # Format needed for input
                    logging.info(f"--- Processing Date: {day_str} ---")
                    attempt = 0
                    success = False

                    while attempt < RETRY_MAX_ATTEMPTS and not success:
                        attempt += 1
                        if attempt > 1:
                            wait_time = RETRY_BASE_DELAY_SECONDS * (2 ** (attempt - 2)) # Exponential backoff
                            logging.info(f"Retry attempt {attempt}/{RETRY_MAX_ATTEMPTS}. Waiting for {wait_time} seconds...")
                            time.sleep(wait_time)

                        try:
                            logging.info(f"Navigating to MTCF tool for {day_str} (Attempt {attempt})")
                            daily_records_list, yearly_headers_list = _scrape_day(
                                driver, base_wait, day_str, year, yearly_headers_list
                            )
                            logging.info(f"Total records extracted for {day_str}: {len(daily_records_list)}")
                            yearly_data_list.extend(daily_records_list)
                            success = True # Mark day as successful

                        except WebDriverException as e:
                            logging.error(f"WebDriverException on {day_str} (Attempt {attempt}): {e}")
                            if _is_retryable_webdriver_error(e):
                                logging.warning("Connection error detected, will retry.")
                            else:
                                # Not a known transient error: further attempts are unlikely to help.
                                logging.error("Unknown WebDriverException, stopping retries for this day.")
                                break
                        except Exception as e:
                            logging.error(f"Unexpected error processing {day_str} (Attempt {attempt}): {e}", exc_info=True)
                            # Consider if driver needs restart on certain errors

                    # --- End of Retry Loop ---
                    if not success:
                        logging.error(f"Failed to process date {day_str} after {attempt} attempt(s). Skipping.")

                    # Move to next day
                    current_process_date += timedelta(days=1)

                    # Long delay between daily queries; pointless after the very last day of the run.
                    run_finished = year == END_YEAR and current_process_date > end_date_year
                    if not run_finished:
                        delay = random.uniform(MIN_DELAY_SECONDS, MAX_DELAY_SECONDS)
                        logging.info(f"--- Waiting for {delay:.1f} seconds before processing next day ---")
                        time.sleep(delay)
            finally:
                # --- End of Year Processing ---
                # Runs on interruption or crash too, so hours of collected data are not lost.
                _save_year_data(
                    year,
                    yearly_data_list,
                    yearly_headers_list,
                    partial=current_process_date <= end_date_year,
                )

            logging.info(f"===== Finished processing year: {year} =====")

    except KeyboardInterrupt:
        logging.warning("Script interrupted by user.")
    except Exception as e:
        logging.critical(f"A critical error occurred during the scraping process: {e}", exc_info=True)
    finally:
        if driver:
            driver.quit()
            logging.info("WebDriver closed.")
        logging.info("="*50)
        logging.info("Script finished.")
        logging.info(f"Log file saved to: {LOG_FILENAME}")
        logging.info(f"CSV files (if any) saved in directory: {os.path.abspath(OUTPUT_DIR)}")
        logging.warning("REMINDER: Review the log file for any errors or warnings.")
        logging.warning("REMINDER: Scraping MTCF is against their Terms of Use and may lead to blocking.")
        logging.info("="*50)