[Reference](https://python.plainenglish.io/10-essential-concepts-for-effective-web-scraping-in-python-4a7cd7f6155f)

# Understanding HTML and CSS

In [1]:
from bs4 import BeautifulSoup

# Example HTML code
html = """
<html>
  <head>
    <title>Example Website</title>
  </head>
  <body>
    <div class="container">
      <h1>Welcome to my website!</h1>
      <p>This is an example paragraph.</p>
      <ul>
        <li>Item 1</li>
        <li>Item 2</li>
        <li>Item 3</li>
      </ul>
    </div>
  </body>
</html>
"""

# Parse the HTML code with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Extract the text from the h1 tag
title = soup.find('h1').get_text()

# Extract the text from the first li tag
li = soup.find('li').get_text()

# Print the extracted data
print(title)
print(li)

Welcome to my website!
Item 1


# Inspecting Elements

In [2]:
import requests
from bs4 import BeautifulSoup

# Send a request to the website
response = requests.get('https://www.example.com')

# Parse the HTML code with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find the title tag
title = soup.find('title').get_text()

# Print the title tag
print(title)

Example Domain


# User Agents

In [3]:
import requests

# Define the user agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

# Send a request to the website with the defined user agent
response = requests.get('https://www.example.com', headers=headers)

# Print the response content
print(response.content)

b'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    

# Proxies

In [5]:
# import requests

# # Define the proxy
# proxies = {
#     'http': 'http://10.10.1.10:3128',
#     'https': 'https://10.10.1.10:1080',
# }

# # Send a request to the website with the proxies defined
# response = requests.get('https://www.example.com', proxies=proxies)

# # Print the response content
# print(response.content)

# Rate Limiting

In [6]:
import requests
import time

# Set the start time
start_time = time.time()

# Send a request to the website
response = requests.get('https://www.example.com')

# Set the end time
end_time = time.time()

# Calculate the time taken to make the request
time_taken = end_time - start_time

# Delay the next request if it took less than a second
if time_taken < 1:
    time.sleep(1 - time_taken)

# Print the response content
print(response.content)

b'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    

# Handling Errors

In [7]:
import requests

try:
    # Send a request to the website
    response = requests.get('https://www.example.com')

    # Check if the response status code is 200 (OK)
    if response.status_code == 200:
        # Print the response content
        print(response.content)
    else:
        # Print the response status code
        print(f"Error: {response.status_code}")
except requests.exceptions.RequestException as e:
    # Print the error message
    print(e)

b'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    

# Parsing JSON

In [11]:
# import requests
# import json

# # Send a request to the API
# response = requests.get('https://api.example.com/data')

# # Parse the JSON data
# data = json.loads(response.content)

# # Print the data
# print(data)

# Handling Errors

In [9]:
import requests

try:
    # Send a request to the website
    response = requests.get('https://www.example.com')

    # Check if the response status code is 200 (OK)
    if response.status_code == 200:
        # Print the response content
        print(response.content)
    else:
        # Print the response status code
        print(f"Error: {response.status_code}")
except requests.exceptions.RequestException as e:
    # Print the error message
    print(e)

b'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    

# Parsing JSON

In [12]:
import requests
import json

# Send a request to the API
response = requests.get('https://api.example.com/data')

# Parse the JSON data
data = json.loads(response.content)

# Print the data
print(data)

ConnectionError: ignored

# Handling Dynamic Content

In [15]:
# from selenium import webdriver

# # Create a new Chrome instance
# driver = webdriver.Chrome()

# # Navigate to the website
# driver.get('https://www.example.com')

# # Extract the text from the h1 tag
# title = driver.find_element_by_tag_name('h1').text

# # Print the title
# print(title)

# # Close the browser
# driver.quit()

# Data Cleaning

In [16]:
import re

# Define the text to clean
text = '   Hello\n\n\tWorld   '

# Remove whitespace characters
clean_text = re.sub('\s+', ' ', text).strip()

# Print the cleaned text
print(clean_text)

Hello World


# Scraping Multiple Pages

In [17]:
import requests
from bs4 import BeautifulSoup

# Define the URL pattern
url_pattern = 'https://www.example.com/page/{}'

# Define the number of pages to scrape
num_pages = 5

# Loop through the pages and extract the data
for i in range(1, num_pages + 1):
    # Send a request to the page
    url = url_pattern.format(i)
    response = requests.get(url)

    # Extract the data from the page
    soup = BeautifulSoup(response.content, 'html.parser')
    data = soup.find_all('div', {'class': 'data'})

    # Print the data
    print(data)

[]
[]
[]
[]
[]
