# Get Books Data From Amazon

In [93]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Request headers (referer, client hints and a desktop Chrome user agent)
# passed to requests.get for the Amazon pages fetched in this notebook.
headers = {
    "Referer": "https://www.amazon.com.au/",
    "Sec-Ch-Ua": "Not_A Brand",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "macOS",
    "User-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/107.0.0.0 Safari/537.36"
    ),
}

def get_amazon_data_books(num_books=1000):
    """Scrape Amazon AU search results for data engineering books.

    Result pages are requested one after another until at least
    ``num_books`` unique titles have been collected, a request returns a
    non-200 status, or a page contributes no new titles. Every matching
    book on the last fetched page is kept, so the result may hold more
    than ``num_books`` rows.

    Args:
        num_books: Number of unique titles to collect before paging stops.

    Returns:
        A DataFrame with the columns ``Title``, ``Type``, ``Author``,
        ``Price`` (float), ``Rating`` and ``Rating_Out_Of`` (both strings
        taken from text such as "4.1 out of 5 stars"). The frame is empty,
        with the same columns, when nothing could be scraped.

    Raises:
        requests.RequestException: If a request fails at the network level
            or exceeds the timeout.
    """
    # Base URL of the Amazon search results for data engineering books
    base_url = "https://www.amazon.com.au/s?k=data+engineering+books"
    columns = ["Title", "Type", "Author", "Price", "Rating", "Rating_Out_Of"]

    books = []
    seen_titles = set()  # Titles already collected, to skip repeats across pages
    page = 1

    while len(books) < num_books:
        url = f"{base_url}&page={page}"

        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, headers=headers, timeout=30)

        if response.status_code != 200:
            print("Failed to retrieve the page")
            break

        soup = BeautifulSoup(response.content, "html.parser")

        # Find book containers (class names may need adjusting if the page
        # structure changes)
        book_containers = soup.find_all("div", {"class": "s-result-item"})

        collected_before = len(books)
        for book in book_containers:
            title = book.find("span", {"class": "a-text-normal"})
            # NOTE(review): "Type" and "Author" are read from the same element
            # (the first <a class="a-size-base">), so both columns always hold
            # the same text - confirm the intended selector for each.
            author = book.find("a", {"class": "a-size-base"})
            book_type = author
            price = book.find("span", {"class": "a-price-whole"})
            rating = book.find("span", {"class": "a-icon-alt"})

            # Skip containers that are missing any of the required fields.
            if not (title and author and price and rating):
                continue

            book_title = title.text.strip()
            if book_title in seen_titles:
                continue

            seen_titles.add(book_title)
            books.append({
                "Title": book_title,
                "Type": book_type.text.strip(),
                "Author": author.text.strip(),
                "Price": price.text.strip(),
                "Rating": rating.text.strip(),
            })

        # A page that adds nothing new (past the last results page, or a page
        # without result items) would otherwise keep this loop running forever.
        if len(books) == collected_before:
            break

        page += 1

    if not books:
        # Keep the schema stable for callers even when nothing was scraped.
        return pd.DataFrame(columns=columns).astype({"Price": float})

    df = pd.DataFrame(books)

    # Remove duplicates based on the 'Title' column
    df.drop_duplicates(subset="Title", inplace=True)

    # Strip thousands separators (e.g. "1,299.") before converting to float.
    df["Price"] = df["Price"].str.replace(",", "", regex=False).astype(float)

    # "4.1 out of 5 stars" -> Rating "4.1", Rating_Out_Of "5". Splitting once
    # and reindexing guarantees exactly two columns even for unexpected text.
    rating_parts = (
        df["Rating"]
        .str.replace(" stars", "", regex=False)
        .str.split(" out of ", n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    df[["Rating", "Rating_Out_Of"]] = rating_parts

    return df

# Read Existing Data From Our Database

In [94]:
import psycopg2
import pandas as pd

def read_book_data_from_postgres() -> pd.DataFrame:
    """Load every row of the ``books`` table into a DataFrame.

    Connects to the local ``SuperStore`` PostgreSQL database, runs
    ``SELECT * FROM books`` and names the DataFrame columns after the
    cursor description.

    Returns:
        A DataFrame holding all rows returned by the query.

    Raises:
        psycopg2.Error: If connecting or running the query fails. The
            cursor and connection are closed before the error propagates.
    """
    # NOTE(review): credentials are hard-coded; consider reading them from
    # the environment or a config file.
    connection = psycopg2.connect(
        dbname='SuperStore',
        user='postgres',
        password='Welcome',
        host='localhost',
        port='5432'
    )

    # Define the SQL query to retrieve data
    read_query = """
    SELECT * FROM books
    """

    # try/finally so the cursor and connection are released even when the
    # query raises.
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(read_query)
            result = cursor.fetchall()

            # Column names come first in each cursor.description entry
            column_names = [desc[0] for desc in cursor.description]
        finally:
            cursor.close()
    finally:
        connection.close()

    # Convert the result into a pandas DataFrame
    return pd.DataFrame(result, columns=column_names)

# Write Delta of Books Data to Our Database

In [95]:
def write_delta_to_db(new: pd.DataFrame, old: pd.DataFrame):
    """Return the rows of ``new`` that do not occur in ``old``.

    Each row is turned into a tuple of its values and compared as a whole,
    so a row only counts as already present when every value matches a row
    of ``old`` in the same column order.

    NOTE: despite its name, this function does not write to the database
    yet; it only computes the delta and leaves persisting it to the caller.

    Args:
        new: Freshly collected rows.
        old: Rows that are already stored.

    Returns:
        The subset of ``new`` (original index kept) whose rows are absent
        from ``old``.
    """
    # No database connection is opened here: nothing is written, and an
    # unused connection would stay open after the call returns.
    known_rows = old.apply(tuple, axis=1)
    is_new = ~new.apply(tuple, axis=1).isin(known_rows)
    return new[is_new]

# Execute the Steps

In [96]:
# Scrape fresh listings (asking for 100 books) and load what the database
# currently holds, so the two can be compared below.
fresh_data = get_amazon_data_books(100)
current_data = read_book_data_from_postgres()

In [97]:
# Report how many rows were scraped versus how many the database returned.
print(f'Fresh Data = {len(fresh_data)} and Current data = {len(current_data)}')

Fresh Data = 124 and Current data = 20


In [98]:
# Count the rows of fresh_data that write_delta_to_db reports as absent from current_data.
print(len(write_delta_to_db(fresh_data, current_data)))

124


In [None]:
# Display the DataFrame that was loaded from the database.
current_data

In [99]:
# Rows of fresh_data whose complete row tuple is not found among the row
# tuples of current_data.
fresh_data[~fresh_data.apply(tuple, 1).isin(current_data.apply(tuple,1))]

Unnamed: 0,Title,Author,Price,Rating,Rating_Out_Of
0,Ultimate Data Engineering with Databricks: Dev...,Paperback,64.0,4.1,5
1,Fundamentals of Person-Centred Healthcare Prac...,Paperback,44.0,4.6,5
2,The Production of Space,Paperback,49.0,4.6,5
3,Fundamentals of Data Engineering: Plan and Bui...,Paperback,42.0,4.7,5
4,Cracking the Data Engineering Interview: Land ...,Paperback,41.0,4.3,5
...,...,...,...,...,...
119,MATLAB: A Practical Introduction to Programmin...,Paperback,94.0,4.4,5
120,Database Development For Dummies,Paperback,42.0,4.4,5
121,Machine Learning on Kubernetes: A practical ha...,Paperback,66.0,4.6,5
122,Windows 11 For Dummies,Paperback,31.0,4.4,5


In [59]:
# Work on a copy so df keeps its raw "Rating" text.
df1 = df.copy()

# Drop the " stars" suffix, then split "<rating> out of <max>" into two columns.
df1[["Rating", "Rating_OutOf"]] = (
    df1["Rating"]
    .str.replace(" stars", "")
    .str.split(" out of ", expand=True)
)
df1

Unnamed: 0,Title,Author,Price,Rating,Rating_OutOf
0,Ultimate Data Engineering with Databricks: Dev...,Mayank Malhotra,37.0,4.1,5
1,Designing Data-Intensive Applications: The Big...,Martin Kleppmann,49.0,4.7,5
2,Python Data Engineering Resources: Forge Your ...,Vajo Lukic,0.0,4.7,5
3,Ace the Data Science Interview: 201 Real Inter...,Nick Singh,38.0,4.5,5
4,Data Quality Fundamentals: A Practitioner's Gu...,Paperback,45.0,4.1,5
5,Fighting Churn with Data: The science and stra...,Carl S. Gold,56.0,4.4,5
6,Data Engineering 101,Kindle,0.0,3.9,5
7,Storytelling with Data: A Data Visualization G...,Cole Nussbaumer Knaflic,26.0,4.6,5
8,Data Mesh: Delivering Data-Driven Value at Scale,Zhamak Dehghani,43.0,4.5,5
9,Modern Data Engineering with Apache Spark: A H...,Scott Haines,30.0,4.1,5


In [18]:
# Display the current contents of df.
df

Unnamed: 0,Title,Author,Price,Rating
0,Ultimate Data Engineering with Databricks: Dev...,Mayank Malhotra,37.0,4.1 out of 5 stars
1,Designing Data-Intensive Applications: The Big...,Martin Kleppmann,49.0,4.7 out of 5 stars
2,Python Data Engineering Resources: Forge Your ...,Vajo Lukic,0.0,4.7 out of 5 stars
3,Ace the Data Science Interview: 201 Real Inter...,Nick Singh,38.0,4.5 out of 5 stars
4,Data Quality Fundamentals: A Practitioner's Gu...,Paperback,45.0,4.1 out of 5 stars
5,Data Engineering 101,Kindle,0.0,3.9 out of 5 stars
6,Storytelling with Data: A Data Visualization G...,Cole Nussbaumer Knaflic,26.0,4.6 out of 5 stars
7,Data Mesh: Delivering Data-Driven Value at Scale,Zhamak Dehghani,43.0,4.5 out of 5 stars
8,Modern Data Engineering with Apache Spark: A H...,Scott Haines,29.0,4.1 out of 5 stars
9,Prompt Engineering for Generative AI: Future-P...,James Phoenix,53.0,4.5 out of 5 stars


In [65]:
# Scratch cell: fetch only the first results page and collect its result
# containers. Note that this targets amazon.com, while the shared headers
# carry an amazon.com.au referer.
seen_titles = set()  # Titles already collected
books = []
page = 1

base_url = "https://www.amazon.com/s?k=data+engineering+books"
response = requests.get(base_url, headers=headers)

if response.status_code == 200:
    # Parse the returned HTML
    soup = BeautifulSoup(response.content, "html.parser")

    # Every search hit sits in a <div> carrying the "s-result-item" class
    # (adjust if the page structure changes)
    book_containers = soup.find_all("div", attrs={"class": "s-result-item"})

In [1]:
import requests

# JSON endpoint on the Iowa open-data portal
url = "https://data.iowa.gov/resource/m3tr-qhgy.json"

# One plain GET, without any paging parameters
response = requests.get(url)

if response.status_code != 200:
    print(f"Failed to retrieve data: {response.status_code}")
else:
    # Decode the JSON body; `data` is only set when the request succeeded
    data = response.json()
    print(f"Retrieved data: {response.status_code}")


Retrieved data: 200


In [3]:
# Number of records decoded from the JSON response.
len(data)

1000

In [4]:
import requests
import pandas as pd

# Define the API endpoint
url = "https://data.iowa.gov/resource/m3tr-qhgy.json"

# Initialize parameters for pagination
limit = 1000  # Number of records per request (adjust based on API limits)
offset = 0    # Starting point for each request
all_data = [] # List to store all records
download_complete = False  # Set only when the API signals the end of the data

while True:
    # Page through the dataset. "$order" pins a stable row order so that
    # consecutive offsets neither skip nor repeat records; the timeout keeps
    # a stalled connection from hanging the cell.
    response = requests.get(
        url,
        params={"$limit": limit, "$offset": offset, "$order": ":id"},
        timeout=60,
    )

    # Stop on the first failed request; whatever was fetched so far is kept
    if response.status_code != 200:
        print(f"Failed to retrieve data: {response.status_code}")
        break

    # Parse the JSON data
    data = response.json()

    # An empty page means every record has been fetched
    if not data:
        download_complete = True
        break

    # Append the data to the list and move on to the next page
    all_data.extend(data)
    offset += limit

# Convert the list of records to a DataFrame
df = pd.DataFrame(all_data)

# Save the DataFrame to a CSV file (partial data is still saved after a failure)
df.to_csv("iowa_data.csv", index=False)

# Only claim success when the loop ended because the data ran out
if download_complete:
    print("Data download complete. Saved to iowa_data.csv")
else:
    print(f"Data download incomplete. Saved {len(df)} records to iowa_data.csv")

KeyboardInterrupt: 

In [5]:
# Row count of the downloaded DataFrame. df is only defined once the
# download cell has run to completion.
len(df)

NameError: name 'df' is not defined

: 

In [15]:
# Load the current contents of the books table and print them.
df = read_book_data_from_postgres()
print(df)

                  person          region
0         Anthony Jacobs         Oceania
1            Jack Lebron           North
2         Kelly Williams            East
3          Anna Andreadi         Central
4            Chuck Magee           South
5             Nora Preis    Central Asia
6      Deborah Brumfield          Africa
7        Shirley Daniels      North Asia
8   Alejandro Ballentine  Southeast Asia
9          Nicole Hansen          Canada
10      Giulietta Dortch       Caribbean
11          Larry Hughes            AMEA
12        Matt Collister            West


In [19]:
# NOTE(review): this psycopg2 connection is not used by anything below in
# this cell (to_sql goes through the SQLAlchemy engine) and is not closed
# here - confirm whether it is still needed.
connection = psycopg2.connect(
    dbname='SuperStore',
    user='postgres',
    password='Welcome',
    host='localhost',
    port='5432'
)

# SQLAlchemy engine for the same local SuperStore database.
# NOTE(review): credentials are hard-coded in the connection URL.
from sqlalchemy import create_engine
engine = create_engine('postgresql+psycopg2://postgres:Welcome@localhost:5432/SuperStore')

# Write df to the books table without its index. if_exists='replace' drops
# an existing books table before inserting.
df.to_sql('books', con=engine, index=False, if_exists='replace')

20