In [1]:
from bs4 import BeautifulSoup, Tag
import pandas as pd

import aiohttp
import asyncio

from typing import Any, List


In [2]:


# Scrapes the required elements from the page content provided
def scrape_page(content: bytes) -> List:
    """Extract one row per book from the HTML of a catalogue page.

    Args:
        content: Raw HTML of a single catalogue page. May be empty
            (``b''``), which is what ``fetch_page`` returns when a
            download fails.

    Returns:
        A list of ``[title, price_float, star_num]`` rows, one for each
        ``<article class="product_pod">`` inside the page's first ``<ol>``.
        An empty list is returned when ``content`` is empty or the page
        has no ``<ol>`` element.
    """
    # A failed download is signalled with empty bytes; nothing to parse
    if not content:
        return []

    # Parse site's content html
    soup: BeautifulSoup = BeautifulSoup(content, 'html.parser')

    # Get the books from the list; find() yields None when the tag is absent
    ol = soup.find('ol')
    if ol is None:
        return []

    data = []

    # Extract the fields needed
    for article in ol.find_all('article', class_='product_pod'):
        # The title is read from the cover image's alt text
        title = article.find('img').attrs['alt']
        # The second CSS class of the article's first <p> is used as the
        # rating (presumably a word such as "Three" -- verify against markup)
        star_num = article.find('p').attrs['class'][1]
        price = article.find('p', class_='price_color').get_text()
        # Drop the leading one-character currency symbol before converting
        price_float = float(price[1:])

        data.append([title, price_float, star_num])

    return data


# Fetches page, called asynchronously by "parent" function
async def fetch_page(session: aiohttp.ClientSession, url: str) -> bytes:
    """Download a single page and return its raw body.

    Args:
        session: Open aiohttp client session used to issue the request.
        url: Address of the page to download.

    Returns:
        The response body as bytes, or ``b''`` when the request fails
        (connection problem, non-success HTTP status, or timeout). The
        failure is printed rather than raised so that one bad page does
        not abort a batch of concurrent downloads.
    """
    # Browser-like request headers.
    # NOTE(review): "X-Amzn-Trace-Id" looks like it was copied from an echoed
    # request and is probably unnecessary -- confirm before removing.
    headers = {'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0",
               "X-Amzn-Trace-Id": "Root=1-65043b46-31bc2efb2ba67202432972da",
               "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
               "Accept-Encoding": "gzip, deflate",
               "Accept-Language": "en-US,en;q=0.5",
               "Upgrade-Insecure-Requests": "1"
              }

    try:
        async with session.get(url, headers=headers) as response:
            # Turn 4xx/5xx statuses into aiohttp.ClientResponseError
            response.raise_for_status()
            return await response.read()
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        # A request timeout is raised as asyncio.TimeoutError, which is not
        # an aiohttp.ClientError, so it has to be caught explicitly.
        # Return empty byte to indicate error
        print(f"HTTP error occured for URL {url}: {e}")
        return b''

        
    
# Gets data by fetching and scraping the pages asynchronously
async def get_data(session: aiohttp.ClientSession, urls: List[str]) -> List:
    """Download every URL concurrently and scrape the books from each page.

    Args:
        session: Open aiohttp client session shared by all requests.
        urls: Addresses of the catalogue pages to download.

    Returns:
        A single flat list containing the rows produced by ``scrape_page``
        for every page that downloaded successfully, in the order of
        ``urls``. Pages whose download failed contribute no rows.
    """
    # Start all downloads at once; gather() returns results in input order
    tasks = [fetch_page(session, url) for url in urls]
    pages = await asyncio.gather(*tasks)

    all_data = []
    for page in pages:
        # fetch_page reports a failed request by returning empty bytes
        if not page:
            continue
        all_data.extend(scrape_page(page))

    return all_data


In [3]:
books: List[Any] = []
BASE_URL: str = "http://books.toscrape.com/catalogue/page-{}.html"
urls: List = [BASE_URL.format(i) for i in range(1,51)]
    
async with aiohttp.ClientSession() as session:
    books_data = await get_data(session, urls)
    
    for page_data in books_data:
        books.append(page_data)

Page1 - Number of books scraped: 3
Page2 - Number of books scraped: 3
Page3 - Number of books scraped: 3
Page4 - Number of books scraped: 3
Page5 - Number of books scraped: 3
Page6 - Number of books scraped: 3
Page7 - Number of books scraped: 3
Page8 - Number of books scraped: 3
Page9 - Number of books scraped: 3
Page10 - Number of books scraped: 3
Page11 - Number of books scraped: 3
Page12 - Number of books scraped: 3
Page13 - Number of books scraped: 3
Page14 - Number of books scraped: 3
Page15 - Number of books scraped: 3
Page16 - Number of books scraped: 3
Page17 - Number of books scraped: 3
Page18 - Number of books scraped: 3
Page19 - Number of books scraped: 3
Page20 - Number of books scraped: 3
Page21 - Number of books scraped: 3
Page22 - Number of books scraped: 3
Page23 - Number of books scraped: 3
Page24 - Number of books scraped: 3
Page25 - Number of books scraped: 3
Page26 - Number of books scraped: 3
Page27 - Number of books scraped: 3
Page28 - Number of books scraped: 3
P

In [4]:
# Turn the scraped rows into a table and write it to disk as CSV
df = pd.DataFrame(data=books, columns=['Title', 'Price', 'Star Rating'])
df.to_csv(path_or_buf='books.csv')
print("Done!")

Done!
            Price
count  1000.00000
mean     35.07035
std      14.44669
min      10.00000
25%      22.10750
50%      35.98000
75%      47.45750
max      59.99000
