In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time


# Search-results URL (query: "IT Services" in "New York, NY"); the driver loop
# appends "&page=N" to this string for each page it requests.
BASE_URL = (
    "https://www.yellowpages.com/search"
    "?search_terms=IT+Services"
    "&geo_location_terms=New+York%2C+NY"
)

# Request headers passed to requests.get by scrape_page.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.77 Safari/537.36"
    ),
}

# Accumulator for scraped rows: scrape_page appends one dict per listing.
data = list()

# Helper shared by the per-field lookups in scrape_page
def _child_text(listing, tag, css_class, separator=None):
    """Return the stripped text of the first ``tag`` with ``css_class`` in ``listing``.

    Args:
        listing: Parsed element to search inside (anything with a bs4-style ``find``).
        tag: Tag name to look for, e.g. ``"div"``.
        css_class: Value passed to ``find`` as ``class_``.
        separator: When given, the element's text fragments are stripped
            individually and joined with this string; when None, the raw
            concatenated text is stripped as a whole.

    Returns:
        The text as a ``str``, or None when no matching element exists.
    """
    node = listing.find(tag, class_=css_class)
    if node is None:
        return None
    if separator is None:
        return node.text.strip()
    return node.get_text(separator=separator, strip=True)


# Function to scrape a single page
def scrape_page(page_url, timeout=15):
    """Fetch one search-results page and append each listing on it to ``data``.

    Every ``<div class="result">`` on the page becomes one dict appended to the
    module-level ``data`` list. Fields whose element is missing are stored as
    None; "Email Address" is always None because it is not scraped here.

    Args:
        page_url: Full URL of the results page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the cell indefinitely.

    Returns:
        None. Results are communicated through the ``data`` list.

    Request failures (connection errors, timeouts, HTTP error statuses) are
    reported with ``print`` and leave ``data`` untouched; they are not re-raised.
    """
    try:
        response = requests.get(page_url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page: {e}")
        return

    soup = BeautifulSoup(response.text, "html.parser")

    for company in soup.find_all("div", class_="result"):
        # NOTE(review): in the saved run the Website URL and Location/Address
        # columns are empty for all previewed rows -- the "track-visit-website"
        # and "street-address" selectors may not match the current markup; confirm.
        website_link = company.find("a", class_="track-visit-website")

        data.append({
            "Company Name": _child_text(company, "a", "business-name"),
            # .get() instead of ['href'] so an anchor without href yields None
            # rather than raising KeyError and aborting the whole page.
            "Website URL": website_link.get("href") if website_link is not None else None,
            # A multi-word class_ string is matched against the exact class
            # attribute value, so the order of the three classes matters.
            "Contact Number": _child_text(company, "div", "phones phone primary"),
            "Location/Address": _child_text(company, "div", "street-address"),
            # Categories are several adjacent elements; join them with ", " so
            # they do not run together ("...InstallationComputer Sys...").
            "Industry/Category": _child_text(company, "div", "categories", separator=", "),
            "Company Description": _child_text(company, "p", "snippet"),
            # Emails are often not available in such directories and require further processing.
            "Email Address": None,
        })

# Scrape result pages 1 through 3; scrape_page appends each listing to `data`.
for page in range(1, 4):
    page_url = f"{BASE_URL}&page={page}"
    print(f"Scraping page {page}")
    scrape_page(page_url)
    time.sleep(2)  # To avoid getting blocked

# Save data to CSV
# The columns are listed explicitly (same names and order as the dicts built by
# scrape_page) so the file still gets a header row when nothing was scraped;
# a header-less empty file would make a later pd.read_csv raise EmptyDataError.
OUTPUT_COLUMNS = [
    "Company Name",
    "Website URL",
    "Contact Number",
    "Location/Address",
    "Industry/Category",
    "Company Description",
    "Email Address",
]
df = pd.DataFrame(data, columns=OUTPUT_COLUMNS)
if df.empty:
    # scrape_page only prints request errors, so an empty result would
    # otherwise go unnoticed here.
    print("Warning: no listings were scraped; the CSV will contain only the header row")
df.to_csv("companies_data.csv", index=False, encoding="utf-8")
print("Data saved to companies_data.csv")

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.yellowpages.com:443
2024-11-06 07:10:43 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): www.yellowpages.com:443


Scraping page 1


DEBUG:urllib3.connectionpool:https://www.yellowpages.com:443 "GET /search?search_terms=IT+Services&geo_location_terms=New+York%2C+NY&page=1 HTTP/11" 200 None
2024-11-06 07:10:45 [urllib3.connectionpool] DEBUG: https://www.yellowpages.com:443 "GET /search?search_terms=IT+Services&geo_location_terms=New+York%2C+NY&page=1 HTTP/11" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.yellowpages.com:443
2024-11-06 07:10:47 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): www.yellowpages.com:443


Scraping page 2


DEBUG:urllib3.connectionpool:https://www.yellowpages.com:443 "GET /search?search_terms=IT+Services&geo_location_terms=New+York%2C+NY&page=2 HTTP/11" 200 None
2024-11-06 07:10:49 [urllib3.connectionpool] DEBUG: https://www.yellowpages.com:443 "GET /search?search_terms=IT+Services&geo_location_terms=New+York%2C+NY&page=2 HTTP/11" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.yellowpages.com:443
2024-11-06 07:10:52 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): www.yellowpages.com:443


Scraping page 3


DEBUG:urllib3.connectionpool:https://www.yellowpages.com:443 "GET /search?search_terms=IT+Services&geo_location_terms=New+York%2C+NY&page=3 HTTP/11" 200 None
2024-11-06 07:10:54 [urllib3.connectionpool] DEBUG: https://www.yellowpages.com:443 "GET /search?search_terms=IT+Services&geo_location_terms=New+York%2C+NY&page=3 HTTP/11" 200 None


Data saved to companies_data.csv


In [None]:
# Read the exported file back from disk and preview its first five rows.
csv_path = 'companies_data.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,Company Name,Website URL,Contact Number,Location/Address,Industry/Category,Company Description,Email Address
0,Geeks On Site,,,,Computers & Computer Equipment-Service & Repair,,
1,AAA All Voice And Data Inc.,,,,Computer Cable & Wire InstallationComputer Sys...,,
2,My Computer Works,,(800) 262-2219,,Computer Service & Repair-BusinessComputer Tec...,,
3,Geeks On Site,,(800) 531-2349,,Computers & Computer Equipment-Service & Repair,,
4,My Computer Works,,(877) 201-0623,,Computers & Computer Equipment-Service & Repair,,


In [None]:
# Print every row of the frame. option_context lifts the row limit only for
# this print, so the global pandas display settings used by other cells are
# restored afterwards instead of being changed for the rest of the session.
with pd.option_context('display.max_rows', None):
    print(df)

                                         Company Name  \
0                                       Geeks On Site   
1                         AAA All Voice And Data Inc.   
2                                   My Computer Works   
3                                       Geeks On Site   
4                                   My Computer Works   
5                                       Geeks on Site   
6                         AAA All Voice And Data Inc.   
7                           Jean Martin Insurance Inc   
8                                        EgeekPRO INC   
9                            SourceCom Communications   
10                                 Computer Overhauls   
11                   New York Computer Help - Midtown   
12                     TecProtocol Business Solutions   
13                               ABC Computer Service   
14                                      Computer Guys   
15                              Jay Shapiro Computers   
16                  Thirty5Tech