In [69]:
!pip install dotenv requests bs4 pandas requests


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [70]:
import os
import requests
import pandas as pd

from bs4 import BeautifulSoup
from dotenv import load_dotenv

In [71]:
def get_title(soup):
    try:
        title = soup.find("span", attrs={"id": "productTitle"})
        title_val = title.text
        title_str = title_val.strip()
    except AttributeError:
        title_str = ""

    return title_str

def get_price(soup):
    try:
        price = soup.find("span", attrs={"class": "a-price aok-align-center apex-pricetopay-value"})
        price_val = price.find("span", attrs={"class": "a-offscreen"}).text
        price_str = price_val.strip()

    except AttributeError:
        price_str = "Not Available"

    return price_str

def get_rating(soup):
    try:
        rating = soup.find("span", attrs={"class": "a-icon-alt"})
        rating_val = rating.text
        rating_str = rating_val.strip()
    except AttributeError:
        rating_str = ""

    return rating_str

In [78]:
import time
import random

In [79]:
if __name__ == "__main__":

    load_dotenv()
    # Headers for the request.
    user_agent_string = os.getenv("USER_AGENT_STRING")

    HEADERS = ({'User-Agent' : user_agent_string, 'Accept-Language': 'en-US,en;q=0.5'})

    #The URL to scrape.
    URL = 'https://www.amazon.com/s?k=ram&crid=WYRP9WC6NWJJ&sprefix=ram%2Caps%2C473&ref=nb_sb_noss_1'

    # HTTP request.
    webpage = requests.get(URL, headers=HEADERS)

    # Soup object containing all data
    soup = BeautifulSoup(webpage.content, 'html.parser')

    # Fetch links as Lists of Tag Objects
    links = soup.find_all("a", attrs={"class": "a-link-normal s-line-clamp-2 puis-line-clamp-3-for-col-4-and-8 s-link-style a-text-normal"})

    # Store Links
    links_list = []

    print("Extracting links from the page...")

    # Loop for extracting links from tag objects
    for link in links:
        links_list.append(link.get('href'))

    print("Total number of products found: ", len(links_list), "\n")

    dict = {"title" : [], "price" : [], "rating" : []}

    # Loop for extracting product details from each link
    for link in links_list:
        new_webpage = requests.get("https://www.amazon.com" + link, headers=HEADERS)

        # Faking human behavior by sleeping for a random time between 2 and 6 seconds.
        sleep_time = random.uniform(2, 6)
        print(f"Sleeping for {sleep_time:.2f} seconds to mimic human behavior...")
        time.sleep(sleep_time)
        print("Done sleeping!")

        new_soup = BeautifulSoup(new_webpage.content, 'html.parser')

        dict["title"].append(get_title(new_soup))
        dict["price"].append(get_price(new_soup))
        dict["rating"].append(get_rating(new_soup))

    amazon_df = pd.DataFrame(dict)
    amazon_df.head()

Extracting links from the page...
Total number of products found:  22 

Sleeping for 4.29 seconds to mimic human behavior...
Done sleeping!
Sleeping for 3.24 seconds to mimic human behavior...
Done sleeping!
Sleeping for 4.90 seconds to mimic human behavior...
Done sleeping!
Sleeping for 2.20 seconds to mimic human behavior...
Done sleeping!
Sleeping for 4.04 seconds to mimic human behavior...
Done sleeping!
Sleeping for 3.89 seconds to mimic human behavior...
Done sleeping!
Sleeping for 4.52 seconds to mimic human behavior...
Done sleeping!
Sleeping for 3.19 seconds to mimic human behavior...
Done sleeping!
Sleeping for 4.00 seconds to mimic human behavior...
Done sleeping!
Sleeping for 2.47 seconds to mimic human behavior...
Done sleeping!
Sleeping for 5.13 seconds to mimic human behavior...
Done sleeping!
Sleeping for 4.26 seconds to mimic human behavior...
Done sleeping!
Sleeping for 2.49 seconds to mimic human behavior...
Done sleeping!
Sleeping for 3.55 seconds to mimic human beh

In [81]:
amazon_df.to_csv("amazon_products.csv", index=False)
print("Dataframe saved as CSV file.")
amazon_df.shape
print("Dataframe has ", amazon_df.shape[0], " rows and ", amazon_df.shape[1], " columns.")
print(f"Dataframe missing values:")
amazon_df.isnull().sum()


Dataframe saved as CSV file.
Dataframe has  22  rows and  3  columns.
Dataframe missing values:


title     0
price     0
rating    0
dtype: int64

In [76]:
amazon_df

Unnamed: 0,title,price,rating
0,NEMIX RAM 128GB (4X32GB) DDR4 2666MHZ PC4-2130...,"LKR355,497.51",4.8 out of 5 stars
1,【DDR3 RAM】 GIGASTONE 32GB Kit (4x8GB) DDR3/DDR...,"LKR27,533.51",4.7 out of 5 stars
2,"Crucial Pro DDR5 RAM 32GB Kit (2x16GB), 6400MH...","LKR113,323.94",4.6 out of 5 stars
3,CORSAIR Vengence RGB DDR5 RAM 32GB (2x16GB) 60...,"LKR127,469.71",4.8 out of 5 stars
4,Samsung 64GB (2x32GB) DDR5 4800MHz PC5-38400 S...,"LKR244,426.00",Previous page
5,CORSAIR Vengeance LPX DDR4 RAM 32GB (2x16GB) 3...,"LKR78,275.11",4.8 out of 5 stars
6,DATO 16GB DDR4 RAM 8GBx2 3200MHz DIMM with Hea...,"LKR54,760.71",4.4 out of 5 stars
7,CORSAIR VENGEANCE RGB DDR5 RAM 32GB (2x16GB) 6...,"LKR133,032.72",4.7 out of 5 stars
8,"Crucial 16GB DDR5 RAM Kit (2x8GB), 5600MHz Des...","LKR43,263.40",4.7 out of 5 stars
9,Samsung 32GB (2x16GB) DDR4 2666MHz PC4-21300 S...,"LKR80,134.60",Previous set of slides
