In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
def _first_text(block, tag, css_class, default):
    """Return the stripped text of the first matching child tag, or ``default`` if none matches."""
    node = block.find(tag, class_=css_class)
    return node.text.strip() if node is not None else default


def _extract_company(company):
    """Build one CSV row from a single listing block.

    Args:
        company: A parsed listing element exposing ``find(tag, class_=...)``.

    Returns:
        ``[name, website, phone, address, category, description, email]``, or
        ``None`` when name, website, phone or address is an empty string.
    """
    name = _first_text(company, 'a', 'business-name', 'No name available')

    website_link = company.find('a', class_='track-visit-website')
    website = website_link['href'] if website_link is not None else 'No website'

    phone = _first_text(company, 'div', 'phones phone primary', 'No phone available')

    # The address is only reported when both of its parts are present.
    street = company.find('div', class_='street-address')
    locality = company.find('div', class_='locality')
    if street is not None and locality is not None:
        address = street.text.strip() + ", " + locality.text.strip()
    else:
        address = 'No address available'

    category = "Digital Marketing Agencies"  # Static: the search itself fixes the category
    description = 'Not available'  # Static: descriptions are not consistently available

    # The placeholders above are non-empty strings, so this check only rejects
    # rows where a tag was found but its text (or href) turned out to be empty.
    if not (name and website and phone and address):
        return None
    return [name, website, phone, address, category, description, 'Email not available']


def scrape_yellow_pages(max_companies=50, output_path='digital_marketing_services.csv',
                        delay=1.5, timeout=10, max_pages=100):
    """Scrape digital marketing agency listings for Los Angeles and save them to CSV.

    Result pages are requested one after another until ``max_companies`` rows
    have been collected, a page yields no listings, a request fails, or
    ``max_pages`` pages have been fetched. Whatever was collected up to that
    point is always written to ``output_path``, so a failure part-way through
    does not discard earlier rows.

    Args:
        max_companies: Number of rows to collect before stopping.
        output_path: Destination of the CSV file.
        delay: Seconds to wait between page requests.
        timeout: Per-request timeout in seconds; without one a stalled
            connection would block forever.
        max_pages: Upper bound on the number of pages requested, so the loop
            ends even if the site keeps serving listings that never fill the quota.

    Returns:
        None. The rows are written to ``output_path`` and the row count is printed.
    """
    # Headers to simulate a request from a web browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }

    # Base URL of the search results for digital marketing agencies in Los Angeles
    base_url = 'https://www.yellowpages.com/search?search_terms=digital+marketing+agency&geo_location_terms=Los+Angeles%2C+CA'

    data = []  # One list per accepted company, in CSV column order
    page = 1  # Page counter to navigate through pagination

    # The context manager closes the session (and its pooled connections) on exit.
    with requests.Session() as session:
        session.headers.update(headers)

        while len(data) < max_companies and page <= max_pages:
            url = f'{base_url}&page={page}'
            try:
                response = session.get(url, timeout=timeout)
            except requests.RequestException as e:
                # Stop paging but keep the rows collected so far.
                print(f"Request for page {page} failed: {e}")
                break

            if response.status_code != 200:
                print(f"Failed to retrieve page with status code: {response.status_code}")
                break

            soup = BeautifulSoup(response.content, 'html.parser')
            companies = soup.find_all('div', class_='info')

            if not companies:
                print("No companies found on this page.")
                break

            for company in companies:
                try:
                    row = _extract_company(company)
                except Exception as e:
                    # Best effort: one malformed listing must not abort the page.
                    print(f'Error extracting data for a company: {e}')
                    continue
                if row is not None:
                    data.append(row)
                    if len(data) >= max_companies:
                        break

            # Quota reached: skip the politeness delay, nothing more will be fetched.
            if len(data) >= max_companies:
                break

            if delay > 0:
                time.sleep(delay)  # Delay to prevent being blocked by the server
            page += 1

    # Creating a DataFrame from the list of data and exporting to CSV
    df = pd.DataFrame(data, columns=['Name', 'Website', 'Phone', 'Address', 'Category', 'Description', 'Email'])
    df.to_csv(output_path, index=False)
    print(f"Total companies scraped: {len(data)}")

In [3]:
# Run the scraper only when this code executes as the main module
# (not when it is imported from elsewhere).
if __name__ == '__main__':
    scrape_yellow_pages()

Total companies scraped: 50
