In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [17]:
# Search configuration for the B2B directory scrape.
# The request URL is formed by joining BASE_URL and SEARCH_PATH.
BASE_URL = "https://www.yellowpages.com"
SEARCH_PATH = "/search"
# Industry/category to search for, and the location to search in; edit either as needed.
SEARCH_QUERY, LOCATION = "food Services", "Detroit, MI"

In [18]:
# Accumulator for the scraped results: starts empty and is filled in by the scraping cell.
companies_data = list()


In [19]:
# Scrape the first few result pages of the directory search and save every
# listing that could be parsed to a CSV file.
#
# Reads BASE_URL, SEARCH_PATH, SEARCH_QUERY and LOCATION from the configuration
# cell. `companies_data` is rebound to a fresh list here so that re-running this
# cell replaces the previous results instead of appending duplicate rows.

REQUEST_TIMEOUT = 10  # seconds to wait on a request before giving up on that page
PAGE_DELAY = 2  # seconds to pause between page requests to avoid getting blocked

# Column order of the output CSV. Passing it to the DataFrame explicitly keeps
# the header row present even when no listing was extracted.
OUTPUT_COLUMNS = [
    "Company Name",
    "Website URL",
    "Contact Number",
    "Location/Address",
    "Industry/Category",
    "Company Description",
    "Email Address",
]


def _text_or_na(tag):
    """Return the stripped text of `tag`, or "N/A" when `tag` is None."""
    return tag.text.strip() if tag is not None else "N/A"


def _href_or_na(tag):
    """Return the `href` attribute of `tag`, or "N/A" when the tag or the attribute is missing."""
    if tag is None:
        return "N/A"
    return tag.get("href", "N/A")


def _parse_listing(listing):
    """Extract one company's details from a single search-result element.

    Args:
        listing: a BeautifulSoup tag for one `div.result` element.

    Returns:
        A dict keyed by OUTPUT_COLUMNS, with "N/A" for every optional field
        that is absent from the markup.

    Raises:
        ValueError: if the listing has no `a.business-name` element, since a
            row without a company name is not useful.
    """
    name_tag = listing.find("a", class_="business-name")
    if name_tag is None:
        raise ValueError("listing has no business name")

    # the email link is often absent; when present, drop the mailto: scheme
    email = _href_or_na(listing.find("a", class_="email-business"))
    if email != "N/A":
        email = email.replace("mailto:", "").strip()

    return {
        "Company Name": name_tag.text.strip(),
        "Website URL": _href_or_na(listing.find("a", class_="track-visit-website")),
        # A CSS selector matches the three classes regardless of their order or
        # of extra classes; class_="phones phone primary" only matches when the
        # attribute is exactly that string.
        "Contact Number": _text_or_na(listing.select_one("div.phones.phone.primary")),
        "Location/Address": _text_or_na(listing.find("div", class_="street-address")),
        "Industry/Category": _text_or_na(listing.find("div", class_="categories")),
        "Company Description": _text_or_na(listing.find("div", class_="snippet")),
        "Email Address": email,
    }


companies_data = []

for page in range(1, 4):  # adjust the range to scrape more or fewer pages
    # Pause before every request except the first, so the delay also applies
    # after a failed page and no time is wasted after the last one.
    if page > 1:
        time.sleep(PAGE_DELAY)

    print(f"Scraping page {page}...")

    # construct the URL with query parameters
    params = {
        "search_terms": SEARCH_QUERY,
        "geo_location_terms": LOCATION,
        "page": page,
    }
    try:
        response = requests.get(
            BASE_URL + SEARCH_PATH, params=params, timeout=REQUEST_TIMEOUT
        )
    except requests.RequestException as e:
        # connection errors and timeouts should not discard the pages already scraped
        print(f"Failed to fetch page {page}: {e}")
        continue

    # check for HTTP request issues
    if response.status_code != 200:
        print(f"Failed to fetch page {page}. HTTP Status Code: {response.status_code}")
        continue

    # parse the HTML content and find all the company listings on the page
    soup = BeautifulSoup(response.content, "html.parser")
    listings = soup.find_all("div", class_="result")

    for listing in listings:
        try:
            companies_data.append(_parse_listing(listing))
        except (ValueError, AttributeError, KeyError, TypeError) as e:
            # best effort: one malformed listing must not abort the whole page
            print(f"Error extracting data for a listing: {e}")
            continue

# save the data to a CSV file
df = pd.DataFrame(companies_data, columns=OUTPUT_COLUMNS)
df.to_csv(f"{SEARCH_QUERY}_companies_in_{LOCATION}_data.csv", index=False)

if df.empty:
    print("Warning: no listings were extracted; the CSV contains only the header row.")
print("Data scraping completed. The results are saved")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Data scraping completed. The results are saved
