# Initialization

## Importing Libraries

In [379]:
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm

## Parameters

In [380]:
# Relative folder that receives the exported workbook.
out_dir = '../out'

# Short label for the scraped category; it prefixes the output file name.
name = 'Manga_Fiction'

# Amazon best-seller listing page that the notebook downloads and parses.
URL = 'https://www.amazon.com/Best-Sellers-Books-Science-Fiction-Manga/zgbs/books/13436201/ref=zg_bs_nav_books_3_4367'

# Full path of the Excel file written by the "Saving Excel" step.
out_filepath = f'{out_dir}/{name}_Amazon_Info_Top50.xlsx'

# Main

## Downloading Page

In [381]:
# Download the best-seller listing with a real browser: the page is rendered by
# JavaScript and lazy-loads more items as it is scrolled, so the whole page has
# to be scrolled through before its HTML is captured in `soup`.

# Tunables for the scroll loop (seconds / pixels).
INITIAL_LOAD_WAIT_S = 5    # time given to the page for its first data load
SCROLL_STEP_PX = 100       # distance scrolled per iteration
SCROLL_PAUSE_S = 0.1       # pause between two scroll steps
FINAL_LOAD_WAIT_S = 2      # time given to the last lazy-loaded items

browser = webdriver.Chrome()
try:
    browser.get(URL)

    # waiting for amazon to run the api to get new data
    time.sleep(INITIAL_LOAD_WAIT_S)

    # initializing values
    curr_height = 0
    total_height = browser.execute_script('return document.body.scrollHeight')

    # continuous scrolling because amazon uses lazy loading
    while curr_height < total_height:
        # scroll to an absolute pixel offset from the top of the page
        browser.execute_script(f'window.scrollTo(0,{curr_height})')
        curr_height += SCROLL_STEP_PX

        # waiting before each scroll
        time.sleep(SCROLL_PAUSE_S)

        # the page grows as new items are loaded, so re-read its height every step
        total_height = browser.execute_script('return document.body.scrollHeight')

    # waiting for amazon to run the api to get new data
    time.sleep(FINAL_LOAD_WAIT_S)

    # Parsing website's html content
    soup = BeautifulSoup(browser.page_source, "html.parser")
finally:
    # Always release the browser, even when loading or scrolling fails.
    # quit() ends the whole WebDriver session; close() would only close the window.
    browser.quit()

## Extracting Info

In [None]:
def is_audio_book(book_div):
    """Tell whether a best-seller grid item is an Audible audiobook.

    Parameters
    ----------
    book_div
        Parsed element of one grid item; only its BeautifulSoup-style
        ``find`` method is used.

    Returns
    -------
    bool
        True only when the item's book-type label reads exactly
        'Audible Audiobook'. Items that carry no such label return False
        instead of raising, so one unusual item cannot abort the whole
        extraction loop.
    """
    book_type_span = book_div.find('span', class_='a-size-small a-color-secondary a-text-normal')

    # No book-type label on this item: it cannot be identified as an audiobook.
    if book_type_span is None:
        return False

    return book_type_span.get_text(strip=True) == 'Audible Audiobook'
    

In [None]:
# Walk over every item of the best-seller grid and collect link, title, author,
# review text, rating and price. Items missing any of these fields are skipped
# so that all columns of `info` keep the same number of rows.

def _first_text(node, tag, class_name):
    """Return the stripped text of the first `tag` with `class_name` under `node`, or None if absent."""
    if node is None:
        return None
    match = node.find(tag, class_=class_name)
    return None if match is None else match.get_text(strip=True)


# Looking for book Content
book_list_div = soup.find('div', class_='p13n-gridRow _cDEzb_grid-row_3Cywl')
if book_list_div is None:
    raise RuntimeError('Best-seller grid not found in the downloaded page; the page layout or class names may have changed.')
book_divs = book_list_div.find_all('div', id="gridItemRoot")

# Only the columns filled by this cell are created here. The 'length' and
# 'rank' columns are created by the cells that scrape them; pre-declaring
# columns that stay empty makes pd.DataFrame(info) fail on unequal lengths.
info = {
    'amazon_links': [],
    'title': [],
    'author': [],
    'review': [],
    'rating': [],
    'price': [],
}

for book_div in book_divs:

    # audiobooks are not part of the dataset
    if is_audio_book(book_div):
        continue

    ############################ EXTRACTING ############################
    # book link (href is site-relative; avoid a double slash when joining)
    book_link_a = book_div.find('a', class_='a-link-normal')
    href = book_link_a.get('href') if book_link_a is not None else None
    if not href:
        book_link = None
    elif href.startswith('http'):
        book_link = href
    else:
        book_link = 'https://www.amazon.com/' + href.lstrip('/')

    # book title
    book_title = _first_text(book_div, 'div', '_cDEzb_p13n-sc-css-line-clamp-1_1Fn1y')

    # book author
    book_author = _first_text(book_div, 'div', 'a-row a-size-small')

    # book review (the small span inside the icon row)
    book_review = _first_text(book_div.find('div', class_='a-icon-row'), 'span', 'a-size-small')

    # book ratings
    book_ratings = _first_text(book_div, 'span', 'a-icon-alt')

    # book price (price span nested in the price wrapper)
    book_price = _first_text(
        book_div.find('div', class_='_cDEzb_p13n-sc-price-animation-wrapper_3PzN2'),
        'span',
        '_cDEzb_p13n-sc-price_3mJ9Z',
    )

    # Skip incomplete items, reporting which fields were missing.
    fields = {
        'link': book_link,
        'title': book_title,
        'author': book_author,
        'review': book_review,
        'rating': book_ratings,
        'price': book_price,
    }
    missing = [field for field, value in fields.items() if value is None]
    if missing:
        missing_names = ', '.join(missing)
        print(f'Skipping {book_title!r}: missing {missing_names}')
        continue

    ############################ SAVING ############################
    info['amazon_links'].append(book_link)
    info['title'].append(book_title)
    info['author'].append(book_author)
    info['review'].append(book_review)
    info['rating'].append(book_ratings)
    info['price'].append(book_price)

In [390]:
# Visit the product page of every collected book and read its page count.
# The links come from the `info` dict filled by the extraction cell.
books_urls = list(info['amazon_links'])

# Reset the column so re-running this cell does not append duplicates.
info['length'] = []

browser = webdriver.Chrome()
try:
    for book_url in tqdm(books_urls):
        try:
            browser.get(book_url)
            # A separate name keeps the listing-page `soup` intact for re-runs of the extraction cell.
            page_soup = BeautifulSoup(browser.page_source, 'html.parser')
            wrapper_div = page_soup.find('div', {'id': 'rich_product_information'})
            book_length_a = wrapper_div.find('a', class_='a-popover-trigger a-declarative')
            text = book_length_a.get_text()

            # Keep only what precedes the word 'pages'. str.find returns -1 when
            # the word is absent, which must not be used as a slice bound.
            pages_index = text.find('pages')
            if pages_index == -1:
                raise ValueError("'pages' not found in the product information text")
            book_length = text[:pages_index].strip()
            if not book_length:
                raise ValueError('empty page count')

        except Exception:
            # Best effort: a page that fails to load or lacks the expected
            # elements must not abort the whole multi-minute run.
            book_length = 'NOT FOUND'

        # Exactly one value per URL keeps this column aligned with the others.
        info['length'].append(book_length)
finally:
    # Always release the browser; quit() ends the whole WebDriver session.
    browser.quit()


100%|██████████| 48/48 [04:09<00:00,  5.19s/it]


In [391]:
# Visit the product page of every collected book and read its best-seller rank.
# The links come from the `info` dict filled by the extraction cell.
books_urls = list(info['amazon_links'])

# Reset the column so re-running this cell does not append duplicates.
info['rank'] = []

browser = webdriver.Chrome()
try:
    for book_url in tqdm(books_urls):
        try:
            browser.get(book_url)
            # A separate name keeps the listing-page `soup` intact for re-runs of the extraction cell.
            page_soup = BeautifulSoup(browser.page_source, 'html.parser')

            # The rank is read from the second detail bullet list of the page.
            bullet_lists = page_soup.find_all('ul', class_='a-unordered-list a-nostyle a-vertical a-spacing-none detail-bullet-list')
            text = bullet_lists[1].get_text()

            # The rank is the text from '#' up to the next 'in'. The search for
            # 'in' starts at '#', so an earlier 'in' cannot produce an empty or
            # reversed slice.
            start_index = text.find('#')
            end_index = text.find('in', start_index) if start_index != -1 else -1
            if end_index == -1:
                raise ValueError('rank marker not found in the detail bullets')
            rank = text[start_index:end_index].strip()

        except Exception:
            # Best effort: a page that fails to load or lacks the expected
            # elements must not abort the whole multi-minute run.
            rank = 'NOT FOUND'

        # Exactly one value per URL keeps this column aligned with the others.
        info['rank'].append(rank)
finally:
    # Always release the browser; quit() ends the whole WebDriver session.
    browser.quit()

100%|██████████| 48/48 [04:09<00:00,  5.19s/it]


## Saving Excel

In [393]:
# Export the scraped columns to the Excel file configured in the parameters cell.

# to_excel() does not create missing folders, so make sure the output
# directory exists before writing (no error if it is already there).
os.makedirs(out_dir, exist_ok=True)

# create a dataframe: one column per key of `info`, one row per scraped book
df = pd.DataFrame(info)

# index=False leaves the DataFrame index out of the sheet
df.to_excel(out_filepath, index=False)

In [7]:
import pandas as pd

# Load the training workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact folder exists.
training_path = '/Users/achaudhari/Desktop/Book Prediction New/out/Training.xlsx'
data = pd.read_excel(training_path)

# Define thresholds (both bounds are inclusive, see get_length_category)
Threshold_small = 500
Threshold_large = 1000

# Function to categorize book length
def get_length_category(length, small=None, large=None):
    """Classify a book length as 'small', 'medium' or 'large'.

    Parameters
    ----------
    length
        Length to classify. Numbers and numeric strings (surrounding
        whitespace and thousands separators are tolerated) are accepted.
    small, large
        Optional overrides for the thresholds; when omitted, the module-level
        ``Threshold_small`` / ``Threshold_large`` are read at call time.

    Returns
    -------
    str or None
        'small' when ``length <= small``, 'large' when ``length >= large``,
        'medium' otherwise. None when the length is missing (None, NaN) or
        not numeric (e.g. 'NOT FOUND'), so unknown lengths are no longer
        silently labelled 'medium'.
    """
    small = Threshold_small if small is None else small
    large = Threshold_large if large is None else large

    # Normalise the input to a float; anything that cannot be read as a number is "unknown".
    if isinstance(length, str):
        length = length.replace(',', '')
    try:
        pages = float(length)
    except (TypeError, ValueError):
        return None

    # NaN is the only float that differs from itself.
    if pages != pages:
        return None

    if pages <= small:
        return 'small'
    if pages >= large:
        return 'large'
    return 'medium'

# Classify every value of the 'Length' column (assumed to hold the book
# lengths) and store the labels in a new 'Length_Category' column.
length_labels = data['Length'].apply(get_length_category)
data['Length_Category'] = length_labels

# Write the augmented table to a separate workbook, leaving out the index.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact folder exists.
length_size_path = '/Users/achaudhari/Desktop/Book Prediction New/length_size.xlsx'
data.to_excel(length_size_path, index=False)
