#  Scraping Business Quotes
This notebook extracts quotes using a loop-based method.

##  1. Import libraries

In [1]:
!pip install requests-html
!pip install lxml[html_clean]



In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import random

## 2. Define scraping function and quote sources

In [3]:
def scrape_goodreads_multiple_tags(tags, pages_per_tag=5):
    base_url = "https://www.goodreads.com/quotes/tag/"
    all_quotes = []

    # Fake browser header to avoid 403 errors
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }

    for tag in tags:
        print(f"\nüîé Scraping tag: {tag}")
        for page in range(1, pages_per_tag + 1):
            url = f"{base_url}{tag}?page={page}"
            print(f"  ‚Üí Fetching: {url}")
            response = requests.get(url, headers=headers)

            if response.status_code != 200:
                print(f"    ‚ùå Failed to load {url} ‚Äî Status: {response.status_code}")
                continue

            soup = BeautifulSoup(response.content, "html.parser")
            quote_blocks = soup.find_all("div", class_="quote")

            for block in quote_blocks:
                text_block = block.find("div", class_="quoteText")
                if text_block:
                    raw_text = text_block.get_text(separator="\n").strip()
                    lines = [line.strip() for line in raw_text.split("\n") if line.strip()]
                    
                    # Extract quote
                    quote_text = lines[0].strip("‚Äú‚Äù\"").strip()
                    
                    # Extract author (look for <span> or fallback)
                    author_tag = text_block.find("span", class_="authorOrTitle")
                    author = author_tag.get_text(strip=True) if author_tag else "Unknown"

                    quote_entry = {
                        "quote": quote_text,
                        "author": author,
                        "theme/tag": tag,
                        "source": f"Goodreads ‚Äì {tag}",
                        "position": "",
                        "region": "",
                        "decade": "",
                        "gender": ""
                    }
                    all_quotes.append(quote_entry)

            # Polite scraping pause
            time.sleep(random.uniform(1, 2))

    df = pd.DataFrame(all_quotes)
    return df.drop_duplicates(subset=["quote", "author"])


In [5]:
tags = ["leadership"]
df_goodreads = scrape_goodreads_multiple_tags(tags, pages_per_tag=3)
df_goodreads.to_csv("quotes_goodreads.csv", index=False)



üîé Scraping tag: leadership
  ‚Üí Fetching: https://www.goodreads.com/quotes/tag/leadership?page=1
  ‚Üí Fetching: https://www.goodreads.com/quotes/tag/leadership?page=2
  ‚Üí Fetching: https://www.goodreads.com/quotes/tag/leadership?page=3


In [6]:
df_goodreads.head()

Unnamed: 0,quote,author,theme/tag,source,position,region,decade,gender
0,It‚Äôs only after you‚Äôve stepped outside your co...,Roy T. Bennett,leadership,Goodreads ‚Äì leadership,,,,
1,"Success is not how high you have climbed, but ...","Roy T. Bennett,",leadership,Goodreads ‚Äì leadership,,,,
2,Be grateful for what you already have while yo...,"Roy T. Bennett,",leadership,Goodreads ‚Äì leadership,,,,
3,"It is a curious thing, Harry, but perhaps thos...","J.K. Rowling,",leadership,Goodreads ‚Äì leadership,,,,
4,You never change your life until you step out ...,Roy T. Bennett,leadership,Goodreads ‚Äì leadership,,,,


In [7]:
# Basic duplicate check based on both quote text and author
duplicates = df_goodreads.duplicated(subset=["quote", "author"], keep=False)

# Show how many duplicates we found
print(f"üîç Number of duplicate rows: {duplicates.sum()}")

# Optional: show the actual duplicate rows
df_duplicates = df_goodreads[duplicates]
df_duplicates.head()


üîç Number of duplicate rows: 0


Unnamed: 0,quote,author,theme/tag,source,position,region,decade,gender


## 3. Clean and filter quotes

In [11]:
def clean_quote(text):
    text = re.sub(r'\[\d+\]|\[citation needed\]', '', text)
    return text.strip("‚Äú‚Äù\"'\n ")

def is_valid_quote(text):
    return (
        len(text) >= 30 and
        any(p in text for p in [".", "!", "?"]) and
        not text.istitle()
    )

def scrape_wikiquote_person(name):
    url = f"https://en.wikiquote.org/wiki/{name}"
    print(f"üîé Scraping: {url}")
    headers = {"User-Agent": "Mozilla/5.0"}
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except Exception as e:
        print(f"‚ùå Failed to fetch {url}: {e}")
        return pd.DataFrame()

    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.find("div", class_="mw-parser-output")

    if not content:
        print("‚ö†Ô∏è No main content found.")
        return pd.DataFrame()

    quotes = []
    for elem in content.find_all("li", recursive=True):
        if elem.find("ul") or elem.find("ol"):
            continue  # skip nested lists

        text = elem.get_text(separator=" ").strip()
        quote = clean_quote(text)
        if not is_valid_quote(quote):
            continue

        quotes.append({
            "quote": quote,
            "author": name.replace("_", " "),
            "theme/tag": "",
            "source": f"Wikiquote ‚Äì {name.replace('_', ' ')}",
            "position": "",
            "region": "",
            "decade": "",
            "gender": ""
        })

    time.sleep(random.uniform(1.5, 2.5))
    return pd.DataFrame(quotes)

# üîß People to scrape
people = [
    "Elon_Musk",
    "Steve_Jobs",
    "Bill_Gates",
]

# üîÅ Scrape all people
dfs = [scrape_wikiquote_person(name) for name in people]
df_people = pd.concat(dfs, ignore_index=True)
df_people = df_people.drop_duplicates(subset=["quote"]).reset_index(drop=True)

print(f"\n‚úÖ Scraped from {len(people)} people ‚Äî Total quotes: {df_people.shape[0]}")


üîé Scraping: https://en.wikiquote.org/wiki/Elon_Musk
üîé Scraping: https://en.wikiquote.org/wiki/Steve_Jobs
üîé Scraping: https://en.wikiquote.org/wiki/Bill_Gates

‚úÖ Scraped from 3 people ‚Äî Total quotes: 221


In [12]:
df_people.head()

Unnamed: 0,quote,author,theme/tag,source,position,region,decade,gender
0,Life is too short for long-term grudges.,Elon Musk,,Wikiquote ‚Äì Elon Musk,,,,
1,I didn‚Äôt really expect to make any money. If I...,Elon Musk,,Wikiquote ‚Äì Elon Musk,,,,
2,I don‚Äôt have an issue with serving in the mili...,Elon Musk,,Wikiquote ‚Äì Elon Musk,,,,
3,I think South Africa is a great country.,Elon Musk,,Wikiquote ‚Äì Elon Musk,,,,
4,"If you wanted to be close to the cutting edge,...",Elon Musk,,Wikiquote ‚Äì Elon Musk,,,,


In [14]:
df_final = pd.concat([df_people, df_goodreads], ignore_index=True)
df_final = df_final.drop_duplicates(subset='quote').reset_index(drop=True)

In [16]:
df_final.shape

(311, 8)