In [62]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Browser-like request headers sent with every Amazon search request below.
headers = {
    # Page the request claims to come from.
    "Referer": "https://www.amazon.com/",
    # Client-hint headers.
    "Sec-Ch-Ua": "Not_A Brand",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "macOS",
    # Desktop Chrome user-agent string.
    "User-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
}

def _extract_book(container):
    """Read title, author, price and rating text from one search-result container.

    Returns a dict with the keys "Title", "Author", "Price" and "Rating"
    (all stripped strings), or None when any of the four elements is missing.
    """
    title = container.find("span", {"class": "a-text-normal"})
    # NOTE(review): the saved outputs in this notebook show "Paperback" and
    # "Kindle" in the Author column, so this selector presumably matches the
    # format link on some results -- confirm against the live HTML.
    author = container.find("a", {"class": "a-size-base"})
    price = container.find("span", {"class": "a-price-whole"})
    rating = container.find("span", {"class": "a-icon-alt"})

    if not all((title, author, price, rating)):
        return None

    return {
        "Title": title.text.strip(),
        "Author": author.text.strip(),
        "Price": price.text.strip(),
        "Rating": rating.text.strip(),
    }


def get_amazon_data_books(num_books, max_pages=20, timeout=10):
    """Scrape Amazon search results for data engineering books into a DataFrame.

    Result pages are fetched one after another until ``num_books`` distinct
    titles have been collected, a page contributes no new title, a page
    responds with a non-200 status, or ``max_pages`` pages have been read.

    Parameters
    ----------
    num_books : int
        Maximum number of books to return.
    max_pages : int, optional
        Upper bound on the number of result pages requested, so the loop
        always terminates.
    timeout : float, optional
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        Columns "Title", "Author", "Price" (float), "Rating" and
        "Rating_Out_Of". The frame is empty (but has all five columns) when
        nothing could be scraped.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` (connection errors, timeouts).
    ValueError
        If a scraped price is not numeric after removing thousands separators.
    """
    # Base URL of the Amazon search results for data engineering books
    base_url = "https://www.amazon.com/s?k=data+engineering+books"
    columns = ["Title", "Author", "Price", "Rating"]

    books = []
    seen_titles = set()  # Titles already collected, used to skip repeats

    page = 1
    while len(books) < num_books and page <= max_pages:
        url = f"{base_url}&page={page}"

        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print("Failed to retrieve the page")
            break

        soup = BeautifulSoup(response.content, "html.parser")

        # Result containers (class names may need adjusting if the HTML changes)
        book_containers = soup.find_all("div", {"class": "s-result-item"})

        found_new = False
        for container in book_containers:
            book = _extract_book(container)
            if book is None or book["Title"] in seen_titles:
                continue
            seen_titles.add(book["Title"])
            books.append(book)
            found_new = True

        # A page that adds nothing means later pages will not either; stopping
        # here prevents an endless loop once the results run out.
        if not found_new:
            break

        page += 1

    # Explicit columns keep the frame well-formed even when no book was found.
    df = pd.DataFrame(books[:num_books], columns=columns)

    # Titles are already unique via seen_titles; kept as a cheap safety net.
    df = df.drop_duplicates(subset="Title")

    # Drop thousands separators (e.g. "1,299.") before converting to float.
    df["Price"] = df["Price"].str.replace(",", "", regex=False).astype(float)

    # "4.1 out of 5 stars" -> "4.1" and "5". n=1 plus reindex guarantees exactly
    # two columns, so the assignment cannot fail on empty or unexpected text.
    rating_parts = (
        df["Rating"]
        .str.replace(" stars", "", regex=False)
        .str.split(" out of ", n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    df[["Rating", "Rating_Out_Of"]] = rating_parts

    # Push the DataFrame to XCom
    # ti.xcom_push(key='book_data', value=df.to_dict('records'))
    return df

In [63]:
# Scrape up to ten books and render the resulting frame.
df = get_amazon_data_books(num_books=10)
df

Unnamed: 0,Title,Author,Price,Rating,Rating_Out_Of
0,Ultimate Data Engineering with Databricks: Dev...,Mayank Malhotra,37.0,4.1,5
1,Designing Data-Intensive Applications: The Big...,Martin Kleppmann,49.0,4.7,5
2,Python Data Engineering Resources: Forge Your ...,Vajo Lukic,0.0,4.7,5
3,Ace the Data Science Interview: 201 Real Inter...,Nick Singh,38.0,4.5,5
4,Data Quality Fundamentals: A Practitioner's Gu...,Paperback,45.0,4.1,5
5,Data Engineering 101,Kindle,0.0,3.9,5
6,Storytelling with Data: A Data Visualization G...,Cole Nussbaumer Knaflic,26.0,4.6,5
7,Data Mesh: Delivering Data-Driven Value at Scale,Zhamak Dehghani,43.0,4.5,5
8,Modern Data Engineering with Apache Spark: A H...,Scott Haines,29.0,4.1,5
9,Prompt Engineering for Generative AI: Future-P...,James Phoenix,53.0,4.5,5


In [59]:
# Work on a copy so `df` itself is left untouched by this experiment.
df1 = df.copy()
# Split "4.1 out of 5 stars" into the score and its scale. n=1 plus reindex
# always yields exactly two columns, so the cell also runs when `Rating` was
# already reduced to the bare score (Rating_OutOf is then missing/NaN).
df1[['Rating', 'Rating_OutOf']] = (
    df1['Rating']
    .str.replace(' stars', '', regex=False)
    .str.split(' out of ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
df1

Unnamed: 0,Title,Author,Price,Rating,Rating_OutOf
0,Ultimate Data Engineering with Databricks: Dev...,Mayank Malhotra,37.0,4.1,5
1,Designing Data-Intensive Applications: The Big...,Martin Kleppmann,49.0,4.7,5
2,Python Data Engineering Resources: Forge Your ...,Vajo Lukic,0.0,4.7,5
3,Ace the Data Science Interview: 201 Real Inter...,Nick Singh,38.0,4.5,5
4,Data Quality Fundamentals: A Practitioner's Gu...,Paperback,45.0,4.1,5
5,Fighting Churn with Data: The science and stra...,Carl S. Gold,56.0,4.4,5
6,Data Engineering 101,Kindle,0.0,3.9,5
7,Storytelling with Data: A Data Visualization G...,Cole Nussbaumer Knaflic,26.0,4.6,5
8,Data Mesh: Delivering Data-Driven Value at Scale,Zhamak Dehghani,43.0,4.5,5
9,Modern Data Engineering with Apache Spark: A H...,Scott Haines,30.0,4.1,5


In [18]:
# Bare last expression: renders the current contents of `df`.
df

Unnamed: 0,Title,Author,Price,Rating
0,Ultimate Data Engineering with Databricks: Dev...,Mayank Malhotra,37.0,4.1 out of 5 stars
1,Designing Data-Intensive Applications: The Big...,Martin Kleppmann,49.0,4.7 out of 5 stars
2,Python Data Engineering Resources: Forge Your ...,Vajo Lukic,0.0,4.7 out of 5 stars
3,Ace the Data Science Interview: 201 Real Inter...,Nick Singh,38.0,4.5 out of 5 stars
4,Data Quality Fundamentals: A Practitioner's Gu...,Paperback,45.0,4.1 out of 5 stars
5,Data Engineering 101,Kindle,0.0,3.9 out of 5 stars
6,Storytelling with Data: A Data Visualization G...,Cole Nussbaumer Knaflic,26.0,4.6 out of 5 stars
7,Data Mesh: Delivering Data-Driven Value at Scale,Zhamak Dehghani,43.0,4.5 out of 5 stars
8,Modern Data Engineering with Apache Spark: A H...,Scott Haines,29.0,4.1 out of 5 stars
9,Prompt Engineering for Generative AI: Future-P...,James Phoenix,53.0,4.5 out of 5 stars


In [65]:
# Fetch the first results page on its own to inspect the raw result containers.
base_url = "https://www.amazon.com/s?k=data+engineering+books"

books = []
seen_titles = set()  # To keep track of seen titles

page = 1
# Bind the name up front so the following cell still works if the request fails.
book_containers = []
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(base_url, headers=headers, timeout=10)
if response.status_code == 200:
    # Parse the content of the request with BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Find book containers (you may need to adjust the class names based on the actual HTML structure)
    book_containers = soup.find_all("div", {"class": "s-result-item"})
else:
    print("Failed to retrieve the page")

In [66]:
# Bare last expression: dumps every matched result container for inspection.
book_containers

[<div class="a-section a-spacing-none s-result-item s-flex-full-width s-border-bottom-none s-widget s-widget-spacing-large" data-asin="" data-index="0"><div cel_widget_id="MAIN-MESSAGING-0" class="s-widget-container s-spacing-mini s-widget-container-height-mini celwidget slot=MAIN template=MESSAGING widgetId=messaging-messages-results-header-builder" data-uuid="a7ea757d-658f-48f5-8dad-8cf726e4a51b">
 <span class="rush-component" data-component-type="s-messaging-widget-results-header">
 <div class="a-section a-spacing-none s-messaging-widget-results-header">
 <div class="s-no-outline">
 <h2 class="a-size-medium-plus a-spacing-none a-color-base a-text-bold">Results</h2>
 </div>
 </div>
 </span>
 </div></div>,
 <div class="sg-col-4-of-24 sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 AdHolder sg-col s-widget-spacing-small sg-col-4-of-20 gsx-ies-anchor" data-asin="8196994788" data-component-type="s-search-result" data-index="2" data-uuid="2bf4e805-4aa4-4709-81c6-b86d97aa2855"><div clas