# PROJECT 1: Web Scraping + Text Extraction

# Step 1: Install libraries

In [None]:
!pip install requests
!pip install beautifulsoup4



# Step 2: Import required libraries

- `requests` -> used to access the website
- `BeautifulSoup` -> helps us extract text from HTML

In [None]:
from bs4 import BeautifulSoup
import requests

# Step 3: Choose a website to scrape

We will scrape text from a simple demo website made for learning:

In [None]:
# Address of the page to download; the request cell reads this variable.
url = "https://quotes.toscrape.com"

# Step 4: Send a request to the website
This asks the website's server: "Hey, please give me your HTML."

In [None]:
# Always pass a timeout: without one, requests waits indefinitely when the
# server never answers, which would leave this cell hanging.
response = requests.get(url, timeout=10)

In [None]:
# Show the HTTP status code of the reply (200 means the request succeeded).
status_code = response.status_code
print("Status Code:", status_code)

Status Code: 200


# Step 5: Get the HTML content

In [None]:
# Keep the full response body as a string; the parsing cell works from it.
html_content = response.text

# Print a heading, then only the first 1000 characters as a preview.
html_preview = html_content[:1000]
print("\nRAW HTML CONTENT (very messy):\n")
print(html_preview)


RAW HTML CONTENT (very messy):

<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<title>Quotes to Scrape</title>
    <link rel="stylesheet" href="/static/bootstrap.min.css">
    <link rel="stylesheet" href="/static/main.css">
    
    
</head>
<body>
    <div class="container">
        <div class="row header-box">
            <div class="col-md-8">
                <h1>
                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>
                </h1>
            </div>
            <div class="col-md-4">
                <p>
                
                    <a href="/login">Login</a>
                
                </p>
            </div>
        </div>
    

<div class="row">
    <div class="col-md-8">

    <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
        <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
  

# Step 6: Parse HTML using BeautifulSoup

In [None]:
# Turn the HTML string into a searchable tree, naming the parser explicitly.
soup = BeautifulSoup(html_content, features="html.parser")

# Step 7: Extract the quotes from the webpage

In [None]:
# Collect every <span class="text"> element from the parsed page.
quotes = soup.find_all("span", class_="text")

# Print the text of each matched element, one per line.
print("\nExtracted Quotes (raw):\n")
for quote_tag in quotes:
    print(quote_tag.get_text())


Extracted Quotes (raw):

“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
“Try not to become a man of success. Rather become a man of value.”
“It is better to be hated for what you are than to be loved for what you are not.”
“I have not failed. I've just found 10,000 ways that won't work.”
“A woman is like a tea bag; you never know how strong it is until it's in hot water.”
“A day without sunshine is like, you know, night.”


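# Step 8: Clean the extracted text

`strip()` only removes whitespace at the start and end of each quote, so for this page the cleaned output looks the same as the raw output above.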

In [None]:
# Build a list of plain strings with leading/trailing whitespace removed.
cleaned_quotes = []
for quote_tag in quotes:
    cleaned_quotes.append(quote_tag.text.strip())

print("\nCLEANED QUOTES:\n")
for cleaned in cleaned_quotes:
    print(cleaned)


CLEANED QUOTES:

“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
“Try not to become a man of success. Rather become a man of value.”
“It is better to be hated for what you are than to be loved for what you are not.”
“I have not failed. I've just found 10,000 ways that won't work.”
“A woman is like a tea bag; you never know how strong it is until it's in hot water.”
“A day without sunshine is like, you know, night.”


# Step 9: Save the cleaned text to a file

In [12]:
# Write one quote per line; the file is opened as UTF-8 so the curly
# quotation marks in the text are encoded consistently.
with open("scraped_quotes.txt", "w", encoding="utf-8") as out_file:
    out_file.writelines(quote + "\n" for quote in cleaned_quotes)

print("\nScraped text saved as scraped_quotes.txt")


Scraped text saved as scraped_quotes.txt
