# Scraping data from a static website


Books.toscrape.com is a demo static website for webscrapping purposes. Prices and ratings here were randomly assigned and have no real meaning.


### Objective:
Write a Python script that:

Loads the homepage of the website.
Extracts the title, price, ratings, status of all the books listed on that page.

Stores the results in a CSV file

In [1]:
# Iporting necessary 
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
page = 1            # Starting from the firdt page of the catelouge
list_of_books = []      # List to store the books data

while True:
    
    url = (f'https://books.toscrape.com/catalogue/page-{page}.html')      # URL of the desired website

    response = requests.get(url)            # Sending a GET request to the URL

    soup = BeautifulSoup(response.text, 'html.parser')      # Parsing the content of the page with BeautifulSoup

    books = soup.find_all('article', class_="product_pod" )         # Finding all the books on the page
    
    if not books:         # If no books are found, break the loop
       print('Finished Scraping')
       break


    for book in books:
        book_title = book.find('h3').find('a')['title']     # Extracting the title of the book
        book_price = book.find('p', class_='price_color').text     # Extracting the price of the book
        book_rating = book.find('p', class_='star-rating')['class'][1]    # Extracting the rating of the book
        book_availability = book.find('p', class_='instock availability').text.strip()      # Extracting the availability status of the book and striping extra space


        # Saving the extracted books into a dictionary
        book_dictionary = {'Book' : book_title,
                       'Price' : book_price,
                       'Rating' : book_rating,
                       'Status' : book_availability
                        }
        list_of_books.append(book_dictionary)
    page += 1       # Incrementing the page number, so next page can be scraped
    time.sleep(2)   # 2 seconds delay between requests to avoid overwhelming the server


Finished Scraping


In [4]:
print(list_of_books)

[{'Book': 'A Light in the Attic', 'Price': 'Â£51.77', 'Rating': 'Three', 'Status': 'In stock'}, {'Book': 'Tipping the Velvet', 'Price': 'Â£53.74', 'Rating': 'One', 'Status': 'In stock'}, {'Book': 'Soumission', 'Price': 'Â£50.10', 'Rating': 'One', 'Status': 'In stock'}, {'Book': 'Sharp Objects', 'Price': 'Â£47.82', 'Rating': 'Four', 'Status': 'In stock'}, {'Book': 'Sapiens: A Brief History of Humankind', 'Price': 'Â£54.23', 'Rating': 'Five', 'Status': 'In stock'}, {'Book': 'The Requiem Red', 'Price': 'Â£22.65', 'Rating': 'One', 'Status': 'In stock'}, {'Book': 'The Dirty Little Secrets of Getting Your Dream Job', 'Price': 'Â£33.34', 'Rating': 'Four', 'Status': 'In stock'}, {'Book': 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 'Price': 'Â£17.93', 'Rating': 'Three', 'Status': 'In stock'}, {'Book': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', 'Price': 'Â£22.60', 'Rating': 'Four', 'Status': 'In

In [6]:
# Turning the list of books into a DataFrame
data = pd.DataFrame(list_of_books)

data.head()     # View the first 5 rows

Unnamed: 0,Book,Price,Rating,Status
0,A Light in the Attic,Â£51.77,Three,In stock
1,Tipping the Velvet,Â£53.74,One,In stock
2,Soumission,Â£50.10,One,In stock
3,Sharp Objects,Â£47.82,Four,In stock
4,Sapiens: A Brief History of Humankind,Â£54.23,Five,In stock


In [7]:
# Shape of the data
data.shape

(1000, 4)

In [9]:
data.to_csv(r'C:\Users\BLESSING\Desktop\books.csv', index=False)  # Saving the data into a CSV file

## 2nd Web Scrapping Project

'https://en.wikipedia.org/wiki/List_of_highest-grossing_films' in the table, films are ranked by the revenues from theatrical exhibition at their nominal value, along with the highest positions they attained.


### Objective:
Write a Python script that:

Loads the homepage of the website.
Extracts the table of the Highest-grossing films listed on the page.

Stores the results in a CSV file

In [10]:
# Importing necessary libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [11]:
# Url of desired website
film_url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_films'

In [12]:
# The received HTML of the website
film_response = requests.get(film_url)

In [14]:
# Inserts the HTML in a structured form
film_soup = BeautifulSoup(film_response.text, 'html.parser')

In [15]:
# View the websites HTML
print(film_soup.prettify())

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of highest-grossing films - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limite

In [16]:
# Find all tables in the  html
'''Wikipedia tables usually have the <table> tag and class "wikitable".'''

all_tables = film_soup.find_all('table', class_ = 'wikitable')

In [17]:
# Number of available tables
print(len(all_tables))

85


In [26]:
# Extracting the targeted table
targeted_table = all_tables[0]

In [27]:
# Extracting table header

headers = []

for th in targeted_table.find_all('tr')[0].find_all('th'):
    headers.append(th.text.strip())

print(headers)

['Rank', 'Peak', 'Title', 'Worldwide gross', 'Year', 'Ref']


In [28]:
# Extracting data from each column

data = []

for row in targeted_table.find_all('tr')[1:]:
    columns = row.find_all(['td', 'th'])
    row_data = [column.text.strip() for column in columns]
    data.append(row_data)

In [29]:
# Inserting data in a dataframe
df = pd.DataFrame(data, columns=headers)
df.head()

Unnamed: 0,Rank,Peak,Title,Worldwide gross,Year,Ref
0,1,1,Avatar,"$2,923,706,026",2009,[# 1][# 2]
1,2,1,Avengers: Endgame,"$2,797,501,328",2019,[# 3][# 4]
2,3,3,Avatar: The Way of Water,"$2,320,250,281",2022,[# 5][# 6]
3,4,1,Titanic,"T$2,257,844,554",1997,[# 7][# 8]
4,5,5,Ne Zha 2 †,"$2,199,200,000",2025,[# 9][# 10]


In [31]:
# The shape of the data frame
df.shape

(50, 6)

In [32]:
df.to_csv(r'C:\Users\BLESSING\Desktop\grossing_film.csv', index=False)  # Saving the data into a CSV file)