In [23]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time  # Optional: To add delays between requests

In [24]:
# Step 2: Send an HTTP request to the website
# Send a GET request to the web page you want to scrape. This request retrieves the HTML content of the page:

url = 'https://example.com'
# Always pass a timeout: without one, requests can wait indefinitely on a
# server that stops responding, which would hang the whole notebook.
response = requests.get(url, timeout=10)
if response.status_code == 200:
    print('Request successful!')
else:
    print('Failed to retrieve the webpage')

'''
Explanation: the requests.get() function sends a request to the specified URL and
stores the response. It’s important to check the status_code of the response to ensure
that the request was successful (a status code of 200 indicates success).
'''

Request successful!


'\nExplanation: the requests.get() function sends a request to the specified URL and\nstores the response. It’s important to check the status_code of the response to ensure \nthat the request was successful (a status code of 200 indicates success).\n'

In [25]:
# Step 3: Parse the HTML content
# Once you retrieve the HTML content, use BeautifulSoup to parse it and create a navigable tree structure:

soup = BeautifulSoup(response.text, 'html.parser')
# Print the title of the webpage to verify.
# soup.title is None when the document has no <title> element, so guard the
# attribute access instead of letting it raise AttributeError.
page_title = soup.title.text if soup.title is not None else '(no <title> found)'
print("Title: " + page_title)

'''
Explanation: BeautifulSoup parses the HTML content and allows you to navigate and
search through the HTML elements easily. The soup.title.text line prints the title of
the web page to confirm that the HTML has been parsed correctly.
'''

Title: Example Domain


'\nExplanation: BeautifulSoup parses the HTML content and allows you to navigate and \nsearch through the HTML elements easily. The soup.title.text line prints the title of \nthe web page to confirm that the HTML has been parsed correctly.\n'

In [None]:
# Step 4: Extract the data you need
# Determine which HTML elements contain the data you want to extract. For this example,
# let’s assume you’re scraping a table with product information:

# Locate the table that contains the product data
table = soup.find('table', {'id': 'product-table'})  # Replace with the actual id or class name

# soup.find() returns None when nothing matches, and calling find_all() on
# None raises AttributeError. Fall back to an empty row list so the rest of
# the cell still runs and produces an (empty) DataFrame.
if table is None:
    print("No table with id 'product-table' was found on the page.")
    rows = []
else:
    # Extract the rows of the table
    rows = table.find_all('tr')

# Initialize an empty list to store the data
data = []

# Loop through each row and extract the relevant data
for row in rows[1:]:  # Skipping the header row
    cols = row.find_all('td')
    product_name = cols[0].text.strip()
    price = cols[1].text.strip()
    rating = cols[2].text.strip()
    data.append([product_name, price, rating])

# Convert the list to a pandas DataFrame
df = pd.DataFrame(data, columns=['Product Name', 'Price', 'Rating'])

'''
Explanation: this code locates the table with the product data and iterates over
each row (skipping the header). For each row, it extracts the product name, price,
and rating and appends this information to a list. Finally,
the list is converted into a pandas DataFrame for easier manipulation and export.
'''

In [27]:
# Step 5: Handle common scraping challenges

# The bare string below is the last expression in the cell, so the notebook
# echoes it back as the cell's output.
'''
Web scraping often involves dealing with various challenges, such as missing data,
dynamic content, or blocked requests. Here are a few strategies to handle these:
'''




'\nWeb scraping often involves dealing with various challenges, such as missing data, \ndynamic content, or blocked requests. Here are a few strategies to handle these:\n'

a. Handling missing data


In [None]:
# If some rows or columns might be missing data, you can add checks to handle these cases:

# Start from an empty list so that running this cell after the Step 4 loop
# (or re-running it) does not append the same rows to `data` a second time.
data = []

for row in rows[1:]:
    cols = row.find_all('td')
    if len(cols) == 3:  # Ensure all three columns are present
        # A BeautifulSoup Tag is always truthy, even when it is empty, so
        # test the extracted text rather than the tag to detect blank cells.
        product_name = cols[0].text.strip() or 'N/A'
        price = cols[1].text.strip() or 'N/A'
        rating = cols[2].text.strip() or 'N/A'
        data.append([product_name, price, rating])
    else:
        print('Skipping a row with missing data.')

# Rebuild the DataFrame so that it reflects the cleaned rows; otherwise the
# frame created in Step 4 would be the one that gets saved.
df = pd.DataFrame(data, columns=['Product Name', 'Price', 'Rating'])

b. Adding delays between requests





In [None]:
# Pausing between consecutive requests keeps the load on the server low
# and makes it less likely that the scraper gets blocked.

delay_seconds = 2
time.sleep(delay_seconds)  # wait two seconds before issuing the next request

c. Handling dynamic content



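Some websites load content dynamically using JavaScript, so the data is not present in the HTML that `requests` downloads and cannot be scraped directly with BeautifulSoup. In such cases, you might need to use Selenium, a browser-automation tool that drives a real browser and can therefore interact with JavaScript-driven content.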


d. Error handling




In [21]:
# Incorporate error handling to manage issues such as network errors or changes in the website structure:

try:
    # A timeout keeps the request from hanging forever on an unresponsive server.
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # Raises an HTTPError for bad responses
except requests.exceptions.HTTPError as err:
    print('HTTP error occurred:', err)
except requests.exceptions.RequestException as err:
    # RequestException is the base class for the errors requests raises
    # (connection failures, timeouts, invalid URLs, ...). Catching it instead
    # of Exception lets unrelated bugs, such as a NameError on `url`, surface.
    print('Other error occurred:', err)


HTTP error occurred: 403 Client Error: Forbidden for url: https://en.wikipedia.org/wiki/Cloud-computing_comparison


In [None]:
# Step 6: Save the scraped data
# Save the DataFrame to a CSV file
# NOTE(review): this cell duplicates the "Step 6" cell that follows it but
# writes to 'scraped_data.csv' instead of 'scraped_products.csv'; it looks
# like a leftover draft, so consider keeping only one of the two.
df.to_csv('scraped_data.csv', index=False)


In [None]:
# Step 6: Save the scraped data
# Write the DataFrame to disk as CSV, leaving out the index column.

# Keep the file name in one place so the write and the message cannot drift apart.
csv_path = 'scraped_products.csv'
df.to_csv(csv_path, index=False)

print('Data successfully saved to ' + csv_path)
