In [1]:
import nest_asyncio
# Patch asyncio so an already-running event loop (as in a Jupyter kernel) can be re-entered.
# NOTE(review): the scraper cell below is fully synchronous and never touches
# asyncio — confirm whether this patch is still required.
nest_asyncio.apply()

In [19]:
import requests
from bs4 import BeautifulSoup
import csv
import os
import uuid
import re

CSV_FILE = 'worldcup_data_updated.csv'

# First run only: create the output file and write its header row.
# An existing file is left untouched so previously scraped rows are kept.
if not os.path.exists(CSV_FILE):
    with open(CSV_FILE, 'w', newline='', encoding='utf-8') as header_file:
        csv.writer(header_file).writerow(['id', 'host', 'year', 'text', 'url'])

# Headings whose content is kept; matching is exact and case-sensitive.
_MAIN_SECTIONS = frozenset(
    ["Background", "Format", "Cities and arenas", "Tournament", "Results", "Aftermath"]
)
_SIDEBAR_SECTIONS = frozenset(
    ["Participating teams", "1st, 2nd and 3rd places", "Top scorers", "Cities and stadiums"]
)


def _parse_year_and_host(url):
    """Return ``(year, host)`` read from a ``/world-cup/<year>-<host>.html`` URL.

    Each value independently falls back to ``"Unknown"`` when its pattern
    does not match. Hyphens in the host part become spaces and the result is
    title-cased (``west-germany`` -> ``West Germany``).
    """
    year_match = re.search(r'/world-cup/(\d{4})-', url)
    year = year_match.group(1) if year_match else "Unknown"

    # The dot is escaped so only a literal ".html" suffix terminates the host.
    host_match = re.search(r'/world-cup/\d{4}-(.+?)\.html', url)
    host = host_match.group(1).replace('-', ' ').title() if host_match else "Unknown"
    return year, host


def _paragraphs_until_next_heading(start_tag):
    """Return the text of each <p> sibling after *start_tag*, stopping at the next <h2>/<h3>."""
    paragraphs = []
    for sibling in start_tag.find_next_siblings():
        if sibling.name in ('h2', 'h3'):
            break
        if sibling.name == 'p':
            paragraphs.append(sibling.get_text(" ", strip=True))
    return paragraphs


def _extract_fact_box(fact_box):
    """Return text parts for the whitelisted sections of the sidebar fact box.

    Only direct <h3>, <ul> and <p> children are inspected. A <ul>/<p> is kept
    only while the most recent <h3> is whitelisted, so content belonging to an
    unlisted heading is no longer attached to the previous section. Content
    that appears before the first <h3> is kept.
    """
    parts = []
    keep_section = True
    for tag in fact_box.find_all(['h3', 'ul', 'p'], recursive=False):
        if tag.name == 'h3':
            heading = tag.get_text(strip=True)
            keep_section = heading in _SIDEBAR_SECTIONS
            if keep_section:
                parts.append(f"\n{heading}\n{'-' * len(heading)}")
        elif not keep_section:
            continue
        elif tag.name == 'ul':
            parts.extend(li.get_text(" ", strip=True) for li in tag.find_all('li'))
        else:  # <p>
            text = tag.get_text(" ", strip=True)
            if text:
                parts.append(text)
    return parts


def scrape_and_save(url, timeout=30):
    """Scrape one World Cup page and append it as a row to ``CSV_FILE``.

    The row is ``[id, host, year, text, url]`` where ``id`` is a fresh UUID4,
    ``host``/``year`` come from the URL, and ``text`` is the page title, intro
    paragraphs, whitelisted main sections and whitelisted sidebar sections
    joined by newlines (or ``"[No content extracted]"`` if nothing was found).

    Args:
        url: Page URL to fetch.
        timeout: Seconds passed to ``requests.get``; without a timeout a
            stalled server would block the caller indefinitely.

    Returns:
        None. Progress and failures are reported with ``print``; any exception
        is caught and reported so that an interactive caller can keep going.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve {url} (status code {response.status_code})")
            return

        soup = BeautifulSoup(response.content, "html.parser")
        year, host = _parse_year_and_host(url)

        content_parts = []

        # Title plus the intro paragraphs that precede the first h2/h3.
        h1 = soup.find('h1')
        if h1:
            title = h1.get_text(strip=True)
            content_parts.append(f"{title}\n{'=' * len(title)}")
            content_parts.extend(_paragraphs_until_next_heading(h1))
        else:
            print("No <h1> title found.")

        # Whitelisted main-article sections.
        for header in soup.find_all(['h2', 'h3']):
            header_text = header.get_text(strip=True)
            if header_text in _MAIN_SECTIONS:
                content_parts.append(f"\n{header_text}\n{'-' * len(header_text)}")
                content_parts.extend(_paragraphs_until_next_heading(header))

        # Whitelisted sidebar sections from <div class="fact">.
        fact_box = soup.find("div", class_="fact")
        if fact_box:
            content_parts.extend(_extract_fact_box(fact_box))

        final_text = "\n".join(content_parts).strip() or "[No content extracted]"
        record_id = str(uuid.uuid4())

        # Append to CSV
        with open(CSV_FILE, mode='a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow([record_id, host, year, final_text, url])

        print(f"✅ Scraped and saved data from: {url}")

    except Exception as e:
        # Deliberately broad: this is the boundary for the interactive loop,
        # which must survive network, parsing and file errors alike.
        print(f"❌ Error scraping {url}: {e}")

# === CLI loop ===
# Prompt for URLs one at a time and scrape each, until the user types
# 'exit'/'quit' or the input stream is closed or interrupted.
print("Enter a World Cup URL to scrape from footballhistory.org.")
print("Type 'exit' or 'quit' to stop.\n")

while True:
    try:
        url = input("URL: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C (or a kernel interrupt) ends the session cleanly
        # instead of leaving a traceback.
        print("\nExiting scraper.")
        break
    if url.lower() in ('exit', 'quit'):
        print("Exiting scraper.")
        break
    # Require a real scheme: a bare "http" prefix check would also accept
    # strings such as "httpfoo", and would reject an upper-case "HTTP://".
    if not url.lower().startswith(("http://", "https://")):
        print("⚠️ Please enter a valid URL starting with http or https.")
        continue

    scrape_and_save(url)


Enter a World Cup URL to scrape from footballhistory.org.
Type 'exit' or 'quit' to stop.



URL:  https://footballhistory.org/world-cup/1930-uruguay.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1930-uruguay.html


URL:  https://footballhistory.org/world-cup/1934-italy.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1934-italy.html


URL:  https://footballhistory.org/world-cup/1938-france.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1938-france.html


URL:  https://footballhistory.org/world-cup/1950-brazil.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1950-brazil.html


URL:  https://footballhistory.org/world-cup/1954-switzerland.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1954-switzerland.html


URL:  https://footballhistory.org/world-cup/1958-sweden.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1958-sweden.html


URL:  https://footballhistory.org/world-cup/1962-chile.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1962-chile.html


URL:  https://footballhistory.org/world-cup/1966-england.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1966-england.html


URL:  https://footballhistory.org/world-cup/1970-mexico.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1970-mexico.html


URL:  https://footballhistory.org/world-cup/1974-west-germany.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1974-west-germany.html


URL:  https://footballhistory.org/world-cup/1978-argentina.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1978-argentina.html


URL:  https://footballhistory.org/world-cup/1982-spain.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1982-spain.html


URL:  https://footballhistory.org/world-cup/1986-mexico.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1986-mexico.html


URL:  https://footballhistory.org/world-cup/1990-italy.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1990-italy.html


URL:  https://footballhistory.org/world-cup/1994-unites-states.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1994-unites-states.html


URL:  https://footballhistory.org/world-cup/1998-france.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/1998-france.html


URL:  https://footballhistory.org/world-cup/2002-korea-japan.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/2002-korea-japan.html


URL:  https://footballhistory.org/world-cup/2006-germany.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/2006-germany.html


URL:  https://footballhistory.org/world-cup/2010-south-africa.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/2010-south-africa.html


URL:  https://footballhistory.org/world-cup/2014-brazil.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/2014-brazil.html


URL:  https://footballhistory.org/world-cup/2018-russia.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/2018-russia.html


URL:  https://footballhistory.org/world-cup/2022-qatar.html


✅ Scraped and saved data from: https://footballhistory.org/world-cup/2022-qatar.html


URL:  exit


Exiting scraper.


This is a **web scraper** that extracts **World Cup data from individual tournament pages** on `footballhistory.org` (one URL at a time) and appends it to a **CSV file**.

---

### 💡 **High-Level Purpose:**

To **scrape World Cup history pages**, extract useful sections like the title, format, results, scorers, etc., and **store the cleaned data** in a CSV file for later use (e.g., in a chatbot or database).

---

### 🧩 **How it Works:**

#### ✅ 1. **Set Up the CSV File**

* The code first checks if `worldcup_data_updated.csv` exists.
* If it doesn't, it **creates the file** and writes headers:
  `['id', 'host', 'year', 'text', 'url']`.

---

#### 🌐 2. **Scrape a Given URL**

Function: `scrape_and_save(url)`

##### 🔍 a. **Get Webpage Content**

* Uses `requests.get()` to fetch the HTML from the given URL.
* Parses the page using `BeautifulSoup`.

##### 🗓️ b. **Extract Metadata from URL**

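* **Year** and **host country** are extracted from the URL using regular expressions; each falls back to `Unknown` if the URL does not match the expected `/world-cup/<year>-<host>.html` pattern. Hyphens in the host part become spaces and each word is capitalised (e.g., `west-germany` → `West Germany`).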

  * Example: From `https://.../world-cup/1998-france.html`, it gets:

    * Year = `1998`
    * Host = `France`

---

#### 📑 3. **Extract Web Page Content**

The scraper collects content from:

##### 📌 a. **Main Content Sections**

* From the main article, it looks for headings like:

  * `"Background"`, `"Format"`, `"Tournament"`, etc.
* It grabs:

  * The **title** (`<h1>`) and the introductory paragraphs that follow it, up to the first `<h2>`/`<h3>`.
  * Paragraphs under those headings until the next heading appears.

##### 📦 b. **Sidebar "Fact Box" Sections**

* From the `<div class="fact">` sidebar, it extracts headings like:

  * `"Top scorers"`, `"Participating teams"`, etc.
* Pulls out list items (`<ul><li>`), small paragraphs, and headers.

---

#### 🧾 4. **Compile and Save the Text**

* It combines all extracted content into a readable `final_text`.
* If nothing is extracted, it stores the placeholder text `[No content extracted]` instead.

#### 🆔 5. **Save to CSV**

* Creates a unique ID (`uuid4`) for the record.
* Saves a new row to the CSV with:

  * `id`, `host`, `year`, `text`, `url`

---

#### 🧑‍💻 6. **Command-Line Interface (CLI)**

* Prompts the user to **enter a URL**.
* Scrapes it and saves the data.
* Allows repeated scraping by re-entering URLs.
* You can type **'exit' or 'quit'** to stop the program.

---

### ✅ Example Output Row in CSV:

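| id | host | year | text (cleaned summary) | url |
| -- | ---- | ---- | ---------------------- | --- |
| *(random UUID4)* | France | 1998 | *(page title, section text, and sidebar facts joined by newlines)* | `https://footballhistory.org/world-cup/1998-france.html` |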

---

### ⚙️ Tech Stack:

* `requests`: for HTTP requests
* `BeautifulSoup`: for HTML parsing
* `csv`: for saving structured data
* `uuid`: for generating unique IDs
* `re`: for regular expressions

---

