In [3]:
import requests
import bs4
import re
import csv

def clean_text(text):
    """Return *text* with byte-order-mark residue and outer whitespace removed.

    Args:
        text: The raw string to clean; may be ``None`` or empty.

    Returns:
        The cleaned string, or ``''`` when *text* is falsy.
    """
    if not text:
        return ''
    # A UTF-8 BOM shows up either as the mis-decoded three-character sequence
    # or as the real U+FEFF code point. ``str.strip`` does not treat U+FEFF as
    # whitespace, so both forms are removed explicitly.
    for marker in ("ï»¿", "\ufeff"):
        text = text.replace(marker, "")
    return text.strip()

def clean_business_name(name):
    """Return the bare business name extracted from a page-heading string.

    The cleaning steps, in order:
      1. remove byte-order-mark residue (mis-decoded or real U+FEFF);
      2. drop a trailing ``Back`` token together with surrounding whitespace;
      3. cut everything from an optional ``-`` followed by ``Updated`` onward;
      4. strip outer whitespace.

    Args:
        name: The raw heading text; may be ``None`` or empty.

    Returns:
        The cleaned name, or ``''`` when *name* is falsy.
    """
    if not name:
        return ''
    # ``str.strip`` does not remove U+FEFF, so handle both BOM forms up front.
    for marker in ("ï»¿", "\ufeff"):
        name = name.replace(marker, "")
    name = re.sub(r'\s*Back\s*$', '', name)
    # Keep only the part preceding the "Updated ..." suffix, if present.
    return re.split(r'-?Updated.*', name)[0].strip()

def get_paths(title_lists):
    """Collect the ``href`` of the first ``<a>`` found in each heading tag.

    Headings without an anchor, or whose anchor has a missing or empty
    ``href``, are skipped. Order of the input is preserved.
    """
    anchors = (heading.find('a') for heading in title_lists)
    return [
        anchor['href']
        for anchor in anchors
        if anchor and anchor.get('href')
    ]

def _fetch_soup(session, url, failure_prefix, timeout):
    """GET *url* and return its parsed HTML, or ``None`` if the fetch failed.

    A network error or a non-200 status is reported on stdout, prefixed with
    *failure_prefix*, and results in ``None`` so the caller can skip the page.
    """
    try:
        response = session.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"{failure_prefix}, error: {exc}")
        return None
    if response.status_code != 200:
        print(f"{failure_prefix}, status: {response.status_code}")
        return None
    return bs4.BeautifulSoup(response.text, 'lxml')


def _extract_record(soup):
    """Build one CSV row (a dict keyed by column name) from a detail page."""
    def text_of(tag):
        return tag.get_text(strip=True) if tag else ''

    name_tag = soup.find('h1', class_='back-to noprint')
    updated_tag = soup.find('span', class_='last-verified')
    address_tag = soup.find('p', class_='addr')
    material_tags = soup.find_all('span', class_='material no-link')

    return {
        "Business_name": clean_business_name(text_of(name_tag)),
        "Last_update_time": clean_text(text_of(updated_tag)),
        "Address": clean_text(text_of(address_tag)),
        "Material_accepted": "; ".join(
            clean_text(tag.get_text(strip=True)) for tag in material_tags
        ),
    }


def main(max_pages=20, output_path='earth911_recycling_data.csv', timeout=30):
    """Scrape Earth911 electronics-recycling listings into a CSV file.

    For each search-result page from 1 to *max_pages*, every listed detail
    page is fetched and one row (business name, last update, address,
    accepted materials) is appended to *output_path*. Pages that fail to
    download are reported on stdout and skipped.

    Args:
        max_pages: Number of search-result pages to request.
        output_path: CSV file to append rows to; it is created if missing.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot block the run indefinitely.
    """
    import os
    from urllib.parse import urljoin

    base_url = "https://search.earth911.com"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36'
    }
    fieldnames = ['Business_name', 'Last_update_time', 'Address', 'Material_accepted']

    with open(output_path, mode='a+', newline='', encoding='utf-8') as csvfile, \
            requests.Session() as session:
        session.headers.update(headers)
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # The file is opened for appending, so only emit the header when the
        # file is empty; otherwise every rerun would add a duplicate header.
        csvfile.seek(0, os.SEEK_END)
        if csvfile.tell() == 0:
            writer.writeheader()

        for page in range(1, max_pages + 1):
            search_url = (
                f'{base_url}/?what=Electronics&where=10001&max_distance=25&country=US'
                f'&province=NY&city=New+York&region=New+York&postal_code=10001&latitude=40.74807084035'
                f'&longitude=-73.99234262099&sponsor=&list_filter=all&page={page}'
            )
            print(f"Fetching search results page {page}...")
            soup = _fetch_soup(
                session, search_url, f"Failed to fetch search page {page}", timeout
            )
            if soup is None:
                continue

            path_list = get_paths(soup.find_all('h2', class_='title'))
            print(f"Found {len(path_list)} detail URLs on page {page}")

            for path in path_list:
                # urljoin copes with root-relative, relative and absolute hrefs.
                detail_url = urljoin(base_url, path)
                print(f"  Scraping detail page: {detail_url}")
                detail_soup = _fetch_soup(
                    session, detail_url, "    Failed to fetch detail page", timeout
                )
                if detail_soup is None:
                    continue

                field_info = _extract_record(detail_soup)
                writer.writerow(field_info)
                print(field_info)


# Entry point: start the scrape only when this code runs as the main module,
# not when it is imported from elsewhere.
if __name__ == '__main__':
    main()


Fetching search results page 1...
Found 10 detail URLs on page 1
  Scraping detail page: https://search.earth911.com/program/Q1RQNVJYWFhKXA/?what=Electronics&where=10001&max_distance=25&country=US&province=NY&city=New+York&region=New+York&postal_code=10001&latitude=40.74807084035&longitude=-73.99234262099&sponsor=&list_filter=all
{'Business_name': 'New York City Bulk Item Curbside Program', 'Last_update_time': 'Updated Feb 23, 2016', 'Address': '', 'Material_accepted': 'Air Conditioners; Barbeque Grills; Carpet; Carpet Padding; Dehumidifiers; Dishwashers; Freezers; Heaters; Household Furniture; Humidifiers; Lumber; Refrigerators; Stoves; Washer/Dryers'}
  Scraping detail page: https://search.earth911.com/location/Q1RQNVJeWFZAUw/?what=Electronics&where=10001&max_distance=25&country=US&province=NY&city=New+York&region=New+York&postal_code=10001&latitude=40.74807084035&longitude=-73.99234262099&sponsor=&list_filter=all
{'Business_name': 'IMobile LLC', 'Last_update_time': 'Updated Feb 29, 