# WEBSCRAPING - INTRODUCTION

In [None]:
# Importing packages
from bs4 import BeautifulSoup
import requests

In [None]:
# Target page for the introductory scraping example.
url ='https://www.scrapethissite.com/pages/forms'

In [None]:
# Fetch the page. The timeout stops the cell from hanging forever on a stalled
# connection, and raise_for_status() fails loudly on a 4xx/5xx response instead
# of letting an error page be parsed as if it were the real content.
# Status code reference: 200 = OK, 204 = No Content, 400 = Bad Request.
page = requests.get(url, timeout=30)
page.raise_for_status()

In [None]:
# Name the parser explicitly: the generic 'html' feature lets bs4 pick whichever
# HTML parser happens to be installed, so the parse tree can differ by machine.
soup = BeautifulSoup(page.text, 'html.parser')

In [None]:
# Dump the whole parsed document as text (long output).
print(soup)

In [None]:
# Same document, formatted by prettify() so the nesting is easier to read.
print(soup.prettify())

# WEBSCRAPING - WIKIPEDIA (THE WORLD'S BILLIONAIRES)

In [12]:
# Importing packages
from bs4 import BeautifulSoup
import requests

In [None]:
url = 'https://en.wikipedia.org/wiki/The_World%27s_Billionaires'

# Bound the request and fail loudly on an HTTP error status, so an error page
# is never parsed as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Explicit parser: the generic 'html' feature resolves to whichever HTML parser
# happens to be installed, which makes the parse tree machine-dependent.
soup = BeautifulSoup(page.text, 'html.parser')

In [None]:
# Dump the whole parsed article as text (long output).
print(soup)

In [None]:
# First <table> element in the document.
soup.find('table')

In [None]:
# Third <table> in document order (index 2).
soup.find_all('table')[2]

In [None]:
# Alternative lookup: select a table by its CSS class instead of by position.
soup.find('table', class_ = 'wikitable sortable')

In [None]:
# Keep the third table (index 2) for the rest of this section.
# NOTE(review): a positional lookup breaks if the article's table order changes.
table_3rd = soup.find_all('table')[2]

In [None]:
# Show the raw HTML of the selected table.
print(table_3rd)

In [None]:
# Every header cell (<th>) found inside the selected table.
world_billionaires = table_3rd.find_all('th')

In [None]:
# Display the collected <th> tags.
world_billionaires

In [None]:
# Plain-text column names: the text of each header cell with the surrounding
# whitespace trimmed.
world_table_billionaires = [header.get_text().strip() for header in world_billionaires]

print(world_table_billionaires)

In [None]:
import pandas as pd

In [None]:
# Empty DataFrame whose columns are the scraped header names.
df = pd.DataFrame(columns = world_table_billionaires)

# Display it to check the column labels.
df

In [None]:
# Every <tr> of the table. Despite the name, these are table rows, not columns.
column_data = table_3rd.find_all('tr')

In [None]:
# Preview the data: for each row after the first <tr>, print the stripped text
# of its <td> cells.
for row in column_data[1:]:
    row_data = row.find_all('td')
    individual_rows = [data.text.strip() for data in row_data]
    print(individual_rows)

In [None]:
# Build the frame from every data row. A single `df.loc[...] = individual_rows`
# after the preview loop only ever stores the last row of the table.
table_rows = []
for row in column_data[1:]:
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if cells:  # rows without any <td> cell carry no data
        table_rows.append(cells)

# Constructing the frame once is idempotent on re-run and avoids growing the
# DataFrame row by row.
df = pd.DataFrame(table_rows, columns=world_table_billionaires)
df

In [None]:
import csv

# Written relative to the notebook's working directory so the cell runs on any
# machine; a hard-coded per-user absolute path only exists on one computer.
csv_file_path = 'World_Billionaires.csv'

with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Header row first, then one CSV row per table row after the first <tr>,
    # each made of the stripped text of that row's <td> cells.
    writer.writerow(world_table_billionaires)
    writer.writerows(
        [cell.text.strip() for cell in row.find_all('td')]
        for row in column_data[1:]
    )


# AMAZON - WEBSCRAPING + SELENIUM

In [1]:
# Installing packages
!pip install selenium chromedriver-autoinstaller pandas beautifulsoup4

# Importing required libraries
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import chromedriver_autoinstaller
from IPython.display import display

# Make a chromedriver binary available before any WebDriver is created.
chromedriver_autoinstaller.install()

# Chrome options shared by the WebDriver sessions created with options=chrome_options.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)



In [2]:
# Function to create a search URL for Amazon
def get_url(search_term):
    """Build the Amazon search-results URL for ``search_term``.

    The term is URL-encoded for use as a query-string value: spaces become
    ``+`` and reserved characters such as ``&`` or ``#`` are percent-escaped,
    so they cannot truncate or alter the query.

    Parameters
    ----------
    search_term : str
        Free-text search phrase, e.g. ``'gaming monitor'``.

    Returns
    -------
    str
        ``https://www.amazon.com/s?k=<encoded term>``
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import quote_plus

    return f"https://www.amazon.com/s?k={quote_plus(search_term)}"

In [3]:
# Build and show the search URL for a sample term.
search_term = 'monitor'
url = get_url(search_term)
print(url)

https://www.amazon.com/s?k=monitor


In [4]:
# Initialize Selenium WebDriver
driver = webdriver.Chrome()
driver.get(url)

# Reading The HTML website
soup = BeautifulSoup(driver.page_source, 'html.parser')
results = soup.find_all('div', {'data-component-type': 's-search-result'})

# Printing the no. of search results found
print(len(results))

22


In [5]:
import re

from bs4 import BeautifulSoup
 
def extract_record(item):
    """Extract the fields of interest from one search-result node.

    Parameters
    ----------
    item : bs4.element.Tag
        One ``div`` with ``data-component-type="s-search-result"``.

    Returns
    -------
    dict or None
        A dict with the keys ``'Description'``, ``'Price(USD)'``, ``'Rating'``,
        ``'Review Count'``, ``'Display Size'``, ``'Resolution'`` and
        ``'Url of the Product'``. A field that cannot be found is an empty
        string. ``None`` is returned when the node has no ``<h2><a>`` title
        link, so callers can skip it with a plain truthiness check.
    """
    # Title link: guard each step so a result without <h2> or without a link is
    # skipped instead of raising AttributeError.
    heading = item.h2
    link = heading.a if heading is not None else None
    if link is None:
        return None

    description = link.text.strip()

    # A link without href yields an empty URL rather than a TypeError.
    href = link.get('href')
    url = "https://www.amazon.com" + href if href else ''

    # Price is split across two spans; it is reported only when both exist.
    price = ''
    whole_tag = item.find('span', 'a-price-whole')
    fraction_tag = item.find('span', 'a-price-fraction')
    if whole_tag is not None and fraction_tag is not None:
        whole = whole_tag.text.strip().replace(',', '')
        fraction = fraction_tag.text.strip()
        # The whole part may already end with the decimal point; add one only
        # when it is missing.
        separator = '' if '.' in whole else '.'
        price = whole + separator + fraction

    rating_tag = item.find('i', {'class': 'a-icon-star-small'})
    rating = rating_tag.text.strip() if rating_tag is not None else ''

    # NOTE(review): takes the first 'a-size-base' span; confirm that this is
    # always the review count on the live page.
    review_tag = item.find('span', {'class': 'a-size-base'})
    review_count = review_tag.text.strip() if review_tag is not None else ''

    # Display size and resolution are both read from the bold feature labels,
    # so collect those once.
    bold_texts = [span.text for span in item.find_all('span', {'class': 'a-text-bold'})]

    # First label mentioning 'inches'.
    display_size = next((text.strip() for text in bold_texts if 'inches' in text), '')

    # First label containing digits followed by 'p' (e.g. 1080p).
    resolution = next(
        (text.strip() for text in bold_texts if re.search(r'\d+p', text)),
        '',
    )

    return {
        'Description': description,
        'Price(USD)': price,
        'Rating': rating,
        'Review Count': review_count,
        'Display Size': display_size,
        'Resolution': resolution,
        'Url of the Product': url,
    }


In [6]:
# Main function to search a query and save results to CSV
def search_query(search_term):
    """Scrape the first Amazon results page for ``search_term`` and save it as CSV.

    Opens Chrome with the notebook-level ``chrome_options``, loads the URL built
    by ``get_url``, and passes every ``s-search-result`` node to
    ``extract_record``. When at least one record is extracted, the records are
    written to ``<search_term with spaces replaced by underscores>.csv`` in the
    working directory and the DataFrame is displayed; otherwise a message is
    printed.

    Parameters
    ----------
    search_term : str
        Free-text search phrase.

    Returns
    -------
    None
    """
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(get_url(search_term))
        page_html = driver.page_source
    finally:
        # quit() ends the whole WebDriver session (close() only closes the
        # current window), and the finally block runs even if loading fails.
        driver.quit()

    soup = BeautifulSoup(page_html, 'html.parser')
    results = soup.find_all('div', {'data-component-type': 's-search-result'})

    records = []
    for item in results:
        # The result node must be passed in; calling extract_record() with no
        # argument raises TypeError on the first result.
        record = extract_record(item)
        if record:
            records.append(record)

    # Save data to DataFrame and CSV
    if records:
        df = pd.DataFrame(records)
        filename = f"{search_term.replace(' ', '_')}.csv"
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
        display(df)
    else:
        print("No records found.")


In [8]:
# Run the scrape for the term 'Monitor'.
search_query('Monitor')

No records found.
