In [1]:
#TASK:
# Scrape quotes from the website http://quotes.toscrape.com: for each quote, extract the author's name, the quote text, and the tags associated with it. Save the data to a CSV file, then use the pandas library to load it into a DataFrame.

In [2]:
# Import Libraries:
# 1-Requests: To download web pages.

# 2-BeautifulSoup: For parsing HTML.

# 3-pandas: For working with DataFrames.

In [3]:
import ast

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
# Download Web Page:
# Use the requests library to download the HTML content of the page.

In [5]:
url = "http://quotes.toscrape.com"
# Without a timeout, requests waits indefinitely if the server never answers.
response = requests.get(url, timeout=10)
# Fail fast on 4xx/5xx so an error page is never parsed as if it were quotes.
response.raise_for_status()
html_content = response.content

In [6]:
response.status_code  # check whether the request was successful

# an output of 200 means the request succeeded

200

In [7]:
# Parse HTML with BeautifulSoup:
# Use BeautifulSoup to parse the HTML content and explore the structure of the page.

In [8]:
# Parse the downloaded bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html_content, features='html.parser')

In [9]:
soup.title  # the document's <title> tag

<title>Quotes to Scrape</title>

In [10]:
soup.title.text  # only the text inside the <title> tag

'Quotes to Scrape'

In [11]:
soup.a  # attribute access returns the first <a> tag in the document

<a href="/" style="text-decoration: none">Quotes to Scrape</a>

In [12]:
soup.a['href']  # a tag's attributes are read with dict-style indexing

'/'

In [13]:
soup.find_all("a")  # returns a list of every <a> tag (link) on the page

[<a href="/" style="text-decoration: none">Quotes to Scrape</a>,
 <a href="/login">Login</a>,
 <a href="/author/Albert-Einstein">(about)</a>,
 <a class="tag" href="/tag/change/page/1/">change</a>,
 <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>,
 <a class="tag" href="/tag/thinking/page/1/">thinking</a>,
 <a class="tag" href="/tag/world/page/1/">world</a>,
 <a href="/author/J-K-Rowling">(about)</a>,
 <a class="tag" href="/tag/abilities/page/1/">abilities</a>,
 <a class="tag" href="/tag/choices/page/1/">choices</a>,
 <a href="/author/Albert-Einstein">(about)</a>,
 <a class="tag" href="/tag/inspirational/page/1/">inspirational</a>,
 <a class="tag" href="/tag/life/page/1/">life</a>,
 <a class="tag" href="/tag/live/page/1/">live</a>,
 <a class="tag" href="/tag/miracle/page/1/">miracle</a>,
 <a class="tag" href="/tag/miracles/page/1/">miracles</a>,
 <a href="/author/Jane-Austen">(about)</a>,
 <a class="tag" href="/tag/aliteracy/page/1/">aliteracy</a>,
 <a class="tag" href

In [14]:
# Collect every anchor tag, then print each link target on its own line.
links = soup.find_all("a")
for link in links:
    href = link.get("href")  # .get() yields None when a tag has no href
    print(href)

/
/login
/author/Albert-Einstein
/tag/change/page/1/
/tag/deep-thoughts/page/1/
/tag/thinking/page/1/
/tag/world/page/1/
/author/J-K-Rowling
/tag/abilities/page/1/
/tag/choices/page/1/
/author/Albert-Einstein
/tag/inspirational/page/1/
/tag/life/page/1/
/tag/live/page/1/
/tag/miracle/page/1/
/tag/miracles/page/1/
/author/Jane-Austen
/tag/aliteracy/page/1/
/tag/books/page/1/
/tag/classic/page/1/
/tag/humor/page/1/
/author/Marilyn-Monroe
/tag/be-yourself/page/1/
/tag/inspirational/page/1/
/author/Albert-Einstein
/tag/adulthood/page/1/
/tag/success/page/1/
/tag/value/page/1/
/author/Andre-Gide
/tag/life/page/1/
/tag/love/page/1/
/author/Thomas-A-Edison
/tag/edison/page/1/
/tag/failure/page/1/
/tag/inspirational/page/1/
/tag/paraphrased/page/1/
/author/Eleanor-Roosevelt
/tag/misattributed-eleanor-roosevelt/page/1/
/author/Steve-Martin
/tag/humor/page/1/
/tag/obvious/page/1/
/tag/simile/page/1/
/page/2/
/tag/love/
/tag/inspirational/
/tag/life/
/tag/humor/
/tag/books/
/tag/reading/
/tag/fri

In [15]:
response.text  # the response body decoded to a str (the page's raw HTML)

'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<meta charset="UTF-8">\n\t<title>Quotes to Scrape</title>\n    <link rel="stylesheet" href="/static/bootstrap.min.css">\n    <link rel="stylesheet" href="/static/main.css">\n</head>\n<body>\n    <div class="container">\n        <div class="row header-box">\n            <div class="col-md-8">\n                <h1>\n                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>\n                </h1>\n            </div>\n            <div class="col-md-4">\n                <p>\n                \n                    <a href="/login">Login</a>\n                \n                </p>\n            </div>\n        </div>\n    \n\n<div class="row">\n    <div class="col-md-8">\n\n    <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">\n        <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>\n        <sp

In [16]:
# Extract Information:
# Identify the HTML elements that contain the data you want to scrape and create functions to extract that information.

In [17]:
def extract_quotes(soup):
    """Collect the author, text and tags of every quote block in ``soup``.

    Args:
        soup: Parsed document (e.g. a ``BeautifulSoup`` object) exposing
            ``find_all``. Every ``<div class="quote">`` is treated as one quote.

    Returns:
        list[dict]: One dict per quote block, in document order, with keys
        ``'Author'``, ``'Quote'`` and ``'Tags'``. ``'Author'`` and ``'Quote'``
        are ``None`` when the corresponding element is absent; ``'Tags'`` is a
        (possibly empty) list of tag names.
    """
    quotes = []
    for quote in soup.find_all('div', class_='quote'):
        text_node = quote.find('span', class_='text')
        author_node = quote.find('small', class_='author')
        # find() returns None for a missing element; record the gap as None
        # rather than letting `.text` raise AttributeError and abort the scrape.
        quotes.append({
            'Author': author_node.text if author_node is not None else None,
            'Quote': text_node.text if text_node is not None else None,
            'Tags': [tag.text for tag in quote.find_all('a', class_='tag')],
        })
    return quotes

quotes_data = extract_quotes(soup)  # list of dicts, one per quote found in the parsed page

In [19]:
# Save Data to CSV:
# Create a function to save the data into a CSV file.

In [20]:
def save_to_csv(data, filename):
    """Write ``data`` to ``filename`` as a CSV file without an index column.

    Args:
        data: Anything the ``pd.DataFrame`` constructor accepts, e.g. a list
            of dicts sharing the same keys (one dict per row).
        filename: Destination path of the CSV file.
    """
    pd.DataFrame(data).to_csv(filename, index=False)

save_to_csv(quotes_data, 'quotes.csv')  # relative path: the file is written to the current working directory

In [21]:
# Verify Data Using Pandas:
# Read the CSV file back using Pandas to verify the data

In [22]:
# The Tags column was written to the CSV as the string form of a Python list,
# so parse it back into real lists while loading. ast.literal_eval evaluates
# literals only and never executes arbitrary code.
df = pd.read_csv('quotes.csv', converters={'Tags': ast.literal_eval})
print(df.head())

            Author                                              Quote  \
0  Albert Einstein  “The world as we have created it is a process ...   
1     J.K. Rowling  “It is our choices, Harry, that show what we t...   
2  Albert Einstein  “There are only two ways to live your life. On...   
3      Jane Austen  “The person, be it gentleman or lady, who has ...   
4   Marilyn Monroe  “Imperfection is beauty, madness is genius and...   

                                                Tags  
0   ['change', 'deep-thoughts', 'thinking', 'world']  
1                           ['abilities', 'choices']  
2  ['inspirational', 'life', 'live', 'miracle', '...  
3         ['aliteracy', 'books', 'classic', 'humor']  
4                   ['be-yourself', 'inspirational']  
