In [3]:
# Sample HTML page used as the scraping input for the rest of the notebook.
# It contains three <div class="book"> entries; each entry holds <p> elements
# with the classes title, author, price, publication-year, genre and
# description, plus an <a class="info-link"> pointing to more information.
html_doc = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sample Books Data</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
            background-color: #f8f8f8;
        }
        h1 {
            text-align: center;
            color: #333;
        }
        .book {
            border: 1px solid #ccc;
            padding: 15px;
            margin: 10px;
            border-radius: 5px;
            background-color: #fff;
        }
        .title {
            font-size: 1.5em;
            color: #0056b3;
        }
        .author {
            font-style: italic;
            color: #555;
        }
        .price {
            color: #d9534f;
            font-weight: bold;
        }
        .description {
            margin-top: 10px;
        }
        a {
            color: #007bff;
            text-decoration: none;
        }
        a:hover {
            text-decoration: underline;
        }
    </style>
</head>
<body>
    <h1>Books List</h1>
    <div class="book">
        <p class="title">
            <b>The Great Gatsby</b>
        </p>
        <p class="author">Author: F. Scott Fitzgerald</p>
        <p class="price">Price: $10.99</p>
        <p class="publication-year">Publication Year: 1925</p>
        <p class="genre">Genre: Fiction</p>
        <p class="description">
            A novel about the American dream, set in the 1920s on Long Island.
        </p>
        <p>More Info: <a class="info-link" href="http://example.com/gatsby" id="link1">Learn More</a></p>
    </div>
    <div class="book">
        <p class="title">
            <b>1984</b>
        </p>
        <p class="author">Author: George Orwell</p>
        <p class="price">Price: $8.99</p>
        <p class="publication-year">Publication Year: 1949</p>
        <p class="genre">Genre: Dystopian</p>
        <p class="description">
            A story about a totalitarian regime that uses surveillance and propaganda to control its citizens.
        </p>
        <p>More Info: <a class="info-link" href="http://example.com/1984" id="link2">Learn More</a></p>
    </div>
    <div class="book">
        <p class="title">
            <b>To Kill a Mockingbird</b>
        </p>
        <p class="author">Author: Harper Lee</p>
        <p class="price">Price: $7.99</p>
        <p class="publication-year">Publication Year: 1960</p>
        <p class="genre">Genre: Fiction</p>
        <p class="description">
            A novel set in the Great Depression that addresses serious issues of race and injustice.
        </p>
        <p>More Info: <a class="info-link" href="http://example.com/mockingbird" id="link3">Learn More</a></p>
    </div>
</body>
</html>
"""

In [4]:
# Print the raw HTML string so its line breaks are rendered rather than
# shown as escaped "\n" characters in a repr.
print(html_doc)

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sample Books Data</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
            background-color: #f8f8f8;
        }
        h1 {
            text-align: center;
            color: #333;
        }
        .book {
            border: 1px solid #ccc;
            padding: 15px;
            margin: 10px;
            border-radius: 5px;
            background-color: #fff;
        }
        .title {
            font-size: 1.5em;
            color: #0056b3;
        }
        .author {
            font-style: italic;
            color: #555;
        }
        .price {
            color: #d9534f;
            font-weight: bold;
        }
        .description {
            margin-top: 10px;
        }
        a {
            color: #007bff;
            text-decoration: none;
        }
        

In [5]:
from bs4 import BeautifulSoup
import pandas as pd

In [6]:
# Parse the sample page with Python's built-in "html.parser" backend; every
# later lookup in the notebook goes through this parsed tree.
soup = BeautifulSoup(markup=html_doc, features='html.parser')

In [7]:
# Bare expression: the notebook displays the parsed document as the cell output.
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Sample Books Data</title>
<style>
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
            background-color: #f8f8f8;
        }
        h1 {
            text-align: center;
            color: #333;
        }
        .book {
            border: 1px solid #ccc;
            padding: 15px;
            margin: 10px;
            border-radius: 5px;
            background-color: #fff;
        }
        .title {
            font-size: 1.5em;
            color: #0056b3;
        }
        .author {
            font-style: italic;
            color: #555;
        }
        .price {
            color: #d9534f;
            font-weight: bold;
        }
        .description {
            margin-top: 10px;
        }
        a {
            color: #007bff;
            text-decoration: none;
        }
        a:hover {
   

In [8]:
# prettify() re-serialises the parsed tree as an indented string; printing it
# makes the nesting of the tags easy to inspect.
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Sample Books Data
  </title>
  <style>
   body {
            font-family: Arial, sans-serif;
            margin: 20px;
            background-color: #f8f8f8;
        }
        h1 {
            text-align: center;
            color: #333;
        }
        .book {
            border: 1px solid #ccc;
            padding: 15px;
            margin: 10px;
            border-radius: 5px;
            background-color: #fff;
        }
        .title {
            font-size: 1.5em;
            color: #0056b3;
        }
        .author {
            font-style: italic;
            color: #555;
        }
        .price {
            color: #d9534f;
            font-weight: bold;
        }
        .description {
            margin-top: 10px;
        }
        a {
            color: #007bff;
            text-decoration: none;
        }
        a:h

In [14]:
# Every book on the page is wrapped in <div class="book">. The keyword is
# spelled class_ because `class` is a reserved word in Python.
books = soup.find_all('div', class_='book')
# Sanity check: the sample page defines three book entries.
print("number of books found", len(books))

number of books found 3


In [15]:
# Start with an empty list; it will hold one dict of extracted fields per book.
book_data = []

In [18]:
# Output column name -> CSS class of the <p> element that carries the value.
# Insertion order defines the column order of the resulting records.
FIELD_CLASSES = {
    'Title': 'title',
    'Author': 'author',
    'Price': 'price',
    'Publication Year': 'publication-year',
    'Genre': 'genre',
    'Description': 'description',
}


def _field_text(book, css_class):
    """Return the stripped text of the first <p class=css_class> inside `book`.

    Returns None when the book entry has no such paragraph, instead of raising
    AttributeError on the None that `find` gives back for a missing element.
    """
    tag = book.find('p', class_=css_class)
    return tag.get_text(strip=True) if tag is not None else None


# Rebuild the list from scratch on every execution. Appending to a list created
# in another cell made this cell non-idempotent: running it twice duplicated
# every record.
# The values are the paragraph text as written in the page, so label prefixes
# such as "Author: " or "Price: " are kept.
book_data = [
    {column: _field_text(book, css_class) for column, css_class in FIELD_CLASSES.items()}
    for book in books
]

In [19]:
# Bare expression: show the collected records (a list with one dict per book).
book_data

[{'Title': 'The Great Gatsby',
  'Author': 'Author: F. Scott Fitzgerald',
  'Price': 'Price: $10.99',
  'Publication Year': 'Publication Year: 1925',
  'Genre': 'Genre: Fiction',
  'Description': 'A novel about the American dream, set in the 1920s on Long Island.'},
 {'Title': '1984',
  'Author': 'Author: George Orwell',
  'Price': 'Price: $8.99',
  'Publication Year': 'Publication Year: 1949',
  'Genre': 'Genre: Dystopian',
  'Description': 'A story about a totalitarian regime that uses surveillance and propaganda to control its citizens.'},
 {'Title': 'To Kill a Mockingbird',
  'Author': 'Author: Harper Lee',
  'Price': 'Price: $7.99',
  'Publication Year': 'Publication Year: 1960',
  'Genre': 'Genre: Fiction',
  'Description': 'A novel set in the Great Depression that addresses serious issues of race and injustice.'}]

In [20]:
# Turn the list of per-book dicts into a table: the dict keys become the
# columns and each book becomes one row.
df = pd.DataFrame(data=book_data)
# Bare expression so the notebook renders the frame as a rich table.
df

Unnamed: 0,Title,Author,Price,Publication Year,Genre,Description
0,The Great Gatsby,Author: F. Scott Fitzgerald,Price: $10.99,Publication Year: 1925,Genre: Fiction,"A novel about the American dream, set in the 1..."
1,1984,Author: George Orwell,Price: $8.99,Publication Year: 1949,Genre: Dystopian,A story about a totalitarian regime that uses ...
2,To Kill a Mockingbird,Author: Harper Lee,Price: $7.99,Publication Year: 1960,Genre: Fiction,A novel set in the Great Depression that addre...
