In [8]:
# STEP 1: Install dependencies
!pip install selenium webdriver-manager requests groq pandas

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting groq
  Downloading groq-0.30.0-py3-none-any.whl.metadata (16 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading groq-0.30.0-py3-none-any.whl (131 kB)
Installing collected packages: webdriver-manager, groq

   -------------------- ------------------- 1/2 [groq]
   -------------------- ------------------- 1/2 [groq]
   -------------------- ------------------- 1/2 [groq]
   ---------------------------------------- 2/2 [groq]

Successfully installed groq-0.30.0 webdriver-manager-4.0.2


In [10]:
# STEP 2: Import libraries
import json, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import requests

In [2]:
# STEP 3: Groq API setup 
GROQ_API_KEY = "<GROQ_API_KEY>"
GROQ_MODEL = "llama3-8b-8192"

In [28]:
# STEP 4: Material Matching Prompt Template
def classify_materials(description):
    prompt = f"""
You are a classification assistant. Given this facility description, return:
- materials_category: categories from this list [Electronics, Batteries, Paint & Chemicals, Medical Sharps, Textiles/Clothing, Other Important Materials]
- materials_accepted: relevant accepted items from the standard list (e.g. 'Computers', 'Lithium-ion Batteries', etc.)

Description:
{description}

Return as JSON:
{{
  "materials_category": [...],
  "materials_accepted": [...]
}}
"""
    response = requests.post(
        "https://api.groq.com/openai/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {GROQ_API_KEY}",
            "Content-Type": "application/json"
        },
        json={
            "model": GROQ_MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.2
        }
    )
    return json.loads(response.json()['choices'][0]['message']['content'])

In [66]:
# STEP 5: Scraper Function
import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import json
import time

# Define the mapping for materials
# Key is the category name from your requirements
# Value is a list of tuples: (keyword_from_earth911, corresponding_accepted_item_from_your_list)
# We'll try to match keywords from Earth911 material names
MATERIAL_MAPPING = {
    "Electronics": [
        ("Computer", "Computers, Laptops, Tablets"),
        ("Laptop", "Computers, Laptops, Tablets"),
        ("Tablet", "Computers, Laptops, Tablets"),
        ("Monitor", "Monitors, TVs (CRT & Flat Screen)"),
        ("TV", "Monitors, TVs (CRT & Flat Screen)"),
        ("Television", "Monitors, TVs (CRT & Flat Screen)"),
        ("Cell Phone", "Cell Phones, Smartphones"),
        ("Smartphone", "Cell Phones, Smartphones"),
        ("Printer", "Printers, Copiers, Fax Machines"),
        ("Copier", "Printers, Copiers, Fax Machines"),
        ("Fax Machine", "Printers, Copiers, Fax Machines"),
        ("Audio", "Audio/Video Equipment"),
        ("Video", "Audio/Video Equipment"),
        ("DVD", "Audio/Video Equipment"),
        ("CD Player", "Audio/Video Equipment"),
        ("Game Console", "Gaming Consoles"),
        ("Microwave", "Small Appliances (Microwaves, Toasters, etc.)"),
        ("Toaster", "Small Appliances (Microwaves, Toasters, etc.)"),
        ("Blender", "Small Appliances (Microwaves, Toasters, etc.)"),
        ("Kitchen Appliance", "Small Appliances (Microwaves, Toasters, etc.)"),
        ("Keyboard", "Computer Peripherals (Keyboards, Mice, Cables, etc.)"),
        ("Mouse", "Computer Peripherals (Keyboards, Mice, Cables, etc.)"),
        ("Cable", "Computer Peripherals (Keyboards, Mice, Cables, etc.)"),
        ("Peripheral", "Computer Peripherals (Keyboards, Mice, Cables, etc.)"),
        ("E-waste", "Other Electronics"),
        ("VCR", "Audio/Video Equipment"),
        ("Camera", "Audio/Video Equipment"),
        ("Webcam", "Audio/Video Equipment"),
        ("Projector", "Audio/Video Equipment"),
        # --- NEW ADDITIONS FOR PRINTERS ---
        ("Inkjet Printer", "Printers, Copiers, Fax Machines"),
        ("Cartridge Printer", "Printers, Copiers, Fax Machines"),
        ("Printer Cartridge", "Printers, Copiers, Fax Machines"), # General cartridge handling
        ("Toner Cartridge", "Printers, Copiers, Fax Machines"),
        ("Ink Cartridge", "Printers, Copiers, Fax Machines")
    ],
    "Batteries": [
        ("Household Battery", "Household Batteries (AA, AAA, 9V, etc.)"),
        ("AA", "Household Batteries (AA, AAA, 9V, etc.)"),
        ("AAA", "Household Batteries (AA, AAA, 9V, etc.)"),
        ("C Battery", "Household Batteries (AA, AAA, 9V, etc.)"),
        ("D Battery", "Household Batteries (AA, AAA, 9V, etc.)"),
        ("9V Battery", "Household Batteries (AA, AAA, 9V, etc.)"),
        ("Rechargeable Battery", "Rechargeable Batteries"),
        ("Lithium-ion Battery", "Lithium-ion Batteries"),
        ("Button Battery", "Button/Watch Batteries"),
        ("Watch Battery", "Button/Watch Batteries"),
        ("Power Tool Battery", "Power Tool Batteries"),
        ("E-bike Battery", "E-bike/Scooter Batteries"),
        ("Scooter Battery", "E-bike/Scooter Batteries"),
        ("Car Battery", "Car/Automotive Batteries"),
        ("Automotive Battery", "Car/Automotive Batteries")
    ],
    "Paint & Chemicals": [
        ("Latex Paint", "Latex/Water-based Paint"),
        ("Water-based Paint", "Latex/Water-based Paint"),
        ("Oil-based Paint", "Oil-based Paint and Stains"),
        ("Stain", "Oil-based Paint and Stains"),
        ("Spray Paint", "Spray Paint"),
        ("Paint Thinner", "Paint Thinners and Solvents"),
        ("Solvent", "Paint Thinners and Solvents"),
        ("Household Cleaner", "Household Cleaners"),
        ("Cleaning Product", "Household Cleaners"),
        ("Pool Chemical", "Pool Chemicals"),
        ("Pesticide", "Pesticides and Herbicides"),
        ("Herbicide", "Pesticides and Herbicides"),
        ("Automotive Fluid", "Automotive Fluids (Oil, Antifreeze)"),
        ("Motor Oil", "Automotive Fluids (Oil, Antifreeze)"),
        ("Antifreeze", "Automotive Fluids (Oil, Antifreeze)"),
        ("Chemical", "Other Chemicals")
    ],
    "Medical Sharps": [
        ("Needle", "Needles and Syringes"),
        ("Syringe", "Needles and Syringes"),
        ("Lancet", "Lancets"),
        ("Auto-injector", "Auto-injectors (EpiPens)"),
        ("EpiPen", "Auto-injectors (EpiPens)"),
        ("Insulin Pen", "Insulin Pens"),
        ("Home Dialysis", "Home Dialysis Equipment")
    ],
    "Textiles/Clothing": [
        ("Clothing", "Clothing and Shoes"),
        ("Shoe", "Clothing and Shoes"),
        ("Textile", "Household Textiles (Towels, Bedding)"),
        ("Towel", "Household Textiles (Towels, Bedding)"),
        ("Bedding", "Household Textiles (Towels, Bedding)"),
        ("Fabric Scraps", "Fabric Scraps"),
        ("Accessory", "Accessories (Belts, Bags, etc.)"),
        ("Belt", "Accessories (Belts, Bags, etc.)"),
        ("Bag", "Accessories (Belts, Bags, etc.)")
    ],
    "Other Important Materials": [
        ("Fluorescent Bulb", "Fluorescent Bulbs and CFLs"),
        ("CFL", "Fluorescent Bulbs and CFLs"),
        ("Mercury Thermometer", "Mercury Thermometers"),
        ("Smoke Detector", "Smoke Detectors"),
        ("Fire Extinguisher", "Fire Extinguishers"),
        ("Propane Tank", "Propane Tanks"),
        ("Mattress", "Mattresses and Box Springs"),
        ("Box Spring", "Mattresses and Box Springs"),
        ("Large Appliance", "Large Appliances (Fridges, Washers, etc.)"),
        ("Fridge", "Large Appliances (Fridges, Washers, etc.)"),
        ("Refrigerator", "Large Appliances (Fridges, Washers, etc.)"),
        ("Washer", "Large Appliances (Fridges, Washers, etc.)"),
        ("Dryer", "Large Appliances (Fridges, Washers, etc.)"),
        ("Construction Debris", "Construction Debris (Residential Quantities)")
    ]
}

# Add a function to map raw materials to categories and accepted lists
def map_materials(raw_materials_list):
    categories = set()
    accepted_details = set()

    for raw_material in raw_materials_list:
        found_match = False
        # Normalize the raw material string for better matching (lowercase, remove extra spaces)
        # Also, replacing the invisible char here for good measure
        normalized_raw = raw_material.lower().replace(' ﻿', ' ').replace('\ufeff', '').strip()

        for category, mappings in MATERIAL_MAPPING.items():
            for keyword, accepted_item in mappings:
                if keyword.lower() in normalized_raw:
                    categories.add(category)
                    accepted_details.add(accepted_item)
                    found_match = True
                    break
            if found_match:
                break

    return sorted(list(categories)), sorted(list(accepted_details))


def scrape_earth911(material, zipcode, distance):
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 30)

    results = []

    try:
        driver.get("https://search.earth911.com/")

        # Step 1: Input 'electronics' in 'what' and '10001' in 'where'
        what_input = wait.until(EC.presence_of_element_located((By.NAME, "what")))
        what_input.send_keys(material)

        where_input = wait.until(EC.presence_of_element_located((By.NAME, "where")))
        where_input.send_keys(zipcode)

        # Step 2: Click SEARCH button
        search_btn = wait.until(EC.element_to_be_clickable((By.ID, "submit-location-search")))
        search_btn.click()

        # --- NEW LOCATION FOR POP-UP HANDLING ---
        # Handle the 'JOIN 160,000+ EARTHLINGS' overlay after search button click
        try:
            overlay_wrapper = wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "_form-wrapper"))
            )
            print("Pop-up overlay wrapper detected on results page.")

            close_popup_button = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "#_form_6889DD972341B_ ._close"))
            )
            close_popup_button.click()
            print("Pop-up close button clicked successfully on results page.")

            wait.until(EC.invisibility_of_element_located((By.CLASS_NAME, "_form-wrapper")))
            print("Pop-up has disappeared on results page.")
        except Exception as e:
            print(f"No detectable pop-up or error closing it on results page (likely timed out or locator changed): {e}")
            pass


        # Step 3: Wait for the results page to load and the distance dropdown to be present
        distance_dropdown_element = wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.result-range select")
            )
        )
        distance_dropdown = Select(distance_dropdown_element)

        # Step 4: Select "within 100 miles" filter
        distance_dropdown.select_by_value(str(distance))

        # IMPORTANT: After selecting the filter, the page content will likely update dynamically.
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#all-listings-results ul.result-list")))
        time.sleep(2)

        # --- Step 5: Extract data from the results page ---
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        results_list_container = soup.find('ul', class_='result-list')

        if results_list_container:
            facility_listings = results_list_container.find_all('li', class_='result-item')

            if not facility_listings:
                print("Found results list container, but no individual facility listings found within it. Check 'result-item' class for <li>.")

            for listing in facility_listings:
                business_name = None
                phone = None
                street_address = None
                raw_materials_from_site = []
                materials_category = []
                materials_accepted_detailed = []

                # Extract Business Name
                name_h2 = listing.find('h2', class_='title')
                if name_h2 and name_h2.a:
                    business_name = name_h2.a.get_text(strip=True).replace('\ufeff', '')

                # Extract Phone
                phone_p = listing.find('p', class_='phone')
                if phone_p:
                    phone = phone_p.get_text(strip=True).replace('\ufeff', '')

                # Extract Address - Combine address1, address2, address3
                address_parts = []
                address1_p = listing.find('p', class_='address1')
                if address1_p and address1_p.get_text(strip=True):
                    address_parts.append(address1_p.get_text(strip=True).replace('\ufeff', ''))
                address2_p = listing.find('p', class_='address2')
                if address2_p and address2_p.get_text(strip=True):
                    address_parts.append(address2_p.get_text(strip=True).replace('\ufeff', ''))
                address3_p = listing.find('p', class_='address3')
                if address3_p and address3_p.get_text(strip=True):
                    address_parts.append(address3_p.get_text(strip=True).replace('\ufeff', ''))

                street_address = ', '.join(address_parts) if address_parts else None

                # Extract Raw Materials Accepted from the website
                materials_container = listing.find('p', class_='result-materials')
                if materials_container:
                    material_spans = materials_container.find_all('span', class_=lambda x: x and 'material' in x.split() and 'materials-accepted' not in x.split())
                    for span in material_spans:
                        raw_materials_from_site.append(span.get_text(strip=True).replace('\ufeff', ''))

                # Map raw materials to desired categories and accepted list
                materials_category, materials_accepted_detailed = map_materials(raw_materials_from_site)

                last_update_date = datetime.date.today().strftime("%Y-%m-%d")

                results.append({
                    "business_name": business_name,
                    "last_update_date": last_update_date,
                    "street_address": street_address,
                    "phone": phone,
                    "materials_category": materials_category,
                    "materials_accepted": materials_accepted_detailed
                })
        else:
            print("Could not find the main results list container. Please inspect the HTML of the results page to find the correct tag and class/ID for the <ol> or <ul> holding the result items.")

    finally:
        driver.quit()
    return results

In [62]:
# STEP 6: Run the scraper:
results = scrape_earth911(material='Electronics', zipcode='10001', distance=100)
with open("earth911_scraped.json", "w") as f:
     json.dump(results, f, indent=2)

Pop-up overlay wrapper detected on results page.
No detectable pop-up or error closing it on results page (likely timed out or locator changed): Message: 
Stacktrace:
	GetHandleVerifier [0x0x7ff60ecfe415+77285]
	GetHandleVerifier [0x0x7ff60ecfe470+77376]
	(No symbol) [0x0x7ff60eac9a6a]
	(No symbol) [0x0x7ff60eb20406]
	(No symbol) [0x0x7ff60eb206bc]
	(No symbol) [0x0x7ff60eb73ac7]
	(No symbol) [0x0x7ff60eb4864f]
	(No symbol) [0x0x7ff60eb7087f]
	(No symbol) [0x0x7ff60eb483e3]
	(No symbol) [0x0x7ff60eb11521]
	(No symbol) [0x0x7ff60eb122b3]
	GetHandleVerifier [0x0x7ff60efe1efd+3107021]
	GetHandleVerifier [0x0x7ff60efdc29d+3083373]
	GetHandleVerifier [0x0x7ff60effbedd+3213485]
	GetHandleVerifier [0x0x7ff60ed1884e+184862]
	GetHandleVerifier [0x0x7ff60ed2055f+216879]
	GetHandleVerifier [0x0x7ff60ed07084+113236]
	GetHandleVerifier [0x0x7ff60ed07239+113673]
	GetHandleVerifier [0x0x7ff60ecee298+11368]
	BaseThreadInitThunk [0x0x7ffb87ece8d7+23]
	RtlUserThreadStart [0x0x7ffb8873c34c+44]



In [64]:
results

[{'business_name': 'New York City Bulk Item Curbside Program',
  'last_update_date': '2025-07-30',
  'street_address': 'New York, NY 10001',
  'phone': '',
  'materials_category': ['Other Important Materials'],
  'materials_accepted': ['Large Appliances (Fridges, Washers, etc.)']},
 {'business_name': 'IMobile LLC',
  'last_update_date': '2025-07-30',
  'street_address': '370 7th Ave, New York, NY 10001',
  'phone': '(212) 967-9725',
  'materials_category': ['Electronics'],
  'materials_accepted': ['Cell Phones, Smartphones']},
 {'business_name': 'The 4th Bin',
  'last_update_date': '2025-07-30',
  'street_address': '307 7th Ave, New York, NY 10001',
  'phone': '(646) 747-5985',
  'materials_category': ['Electronics'],
  'materials_accepted': ['Cell Phones, Smartphones',
   'Computers, Laptops, Tablets',
   'Monitors, TVs (CRT & Flat Screen)',
   'Printers, Copiers, Fax Machines']},
 {'business_name': 'Sprint Store',
  'last_update_date': '2025-07-30',
  'street_address': '126 W 34th St