# CW1 — Task 1: Web Scraping (Manning) → CSV

**Name:** Akia Hans Swin Carreon
**Student ID UoR:** *carreona@roehampton.ac.uk*
**Student ID Lithan:** *LS07432@learning.lithan.com*

**Goal:** Scrape at least **15** Data Engineering book records from Manning and save them to CSV.

**Output:** `data/processed/books.csv`

This notebook runs the same logic as:

- `python -m src.task1_scrape.manning_run`


## Install Dependencies (if needed)


In [None]:
# Skip this cell if your virtual environment already has these packages installed.
# %pip is an IPython magic, so it only works inside a notebook (e.g. VS Code or Jupyter).
%pip install -q requests beautifulsoup4 pandas

## Imports, Config, Helpers


In [None]:
import re
import time
from pathlib import Path
from typing import Any, Optional

import requests
import pandas as pd
from bs4 import BeautifulSoup

# --- Output locations ---------------------------------------------------
# Both folders are created at import/run time (including missing parents);
# an already-existing folder is not an error.
RAW_DIR = Path("data/raw")
PROCESSED_DIR = Path("data/processed")
for _folder in (RAW_DIR, PROCESSED_DIR):
    _folder.mkdir(parents=True, exist_ok=True)
del _folder  # keep the loop variable out of the module namespace

# --- Pages to scrape (add/remove if needed) -----------------------------
CATALOG_URLS = [
    "https://www.manning.com/catalog/software-development/cloud/data-engineering",
    "https://www.manning.com/catalog/software-development/cloud/data-engineering/cloud-data-platforms",
    "https://www.manning.com/catalog/software-development/cloud/data-engineering/big-data-processing",
    "https://www.manning.com/catalog/software-development/cloud/data-engineering/cloud-data-engineering",
    "https://www.manning.com/catalog/software-development/databases/database-platforms/azure-data-engineering",
    "https://www.manning.com/catalog/data-science/data-engineering/data-management-and-organization",
]

# --- HTTP request headers sent with every catalog request ---------------
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": "https://www.manning.com/catalog",
}

# --- Line classifiers ---------------------------------------------------
# A line that is nothing but a four-digit year starting with 19 or 20.
YEAR_RE = re.compile(r"^(19|20)\d{2}$")
# A line that is nothing but a currency symbol ($, £ or €), optional
# whitespace, and an amount with an optional two-digit decimal part.
# Group 1 is the symbol, group 2 the numeric amount.
PRICE_LINE_RE = re.compile(r"^([$£€])\s*([0-9]+(?:\.[0-9]{2})?)$")
# A parenthesised integer anywhere in the line, e.g. "(12)" or "( 12 )";
# searched rather than anchored so surrounding text is tolerated.
RATINGCOUNT_RE = re.compile(r"\(\s*(\d+)\s*\)")

# Navigation / breadcrumb / sort-control text that must never be treated
# as a title or an author. Compared case-insensitively via NOISE_LOWER.
NOISE = {
    "manning.com",
    "/",
    "catalog",
    "browse",
    "home",
    "cart",
    "log in",
    "sort:",
    "newest",
    "popularity",
    "Software Development",
    "Cloud",
    "Data Engineering",
    "Databases",
    "Database Platforms",
}
NOISE_LOWER = set(map(str.lower, NOISE))

# Flip to True to print a per-row diagnostic dump while parsing.
DEBUG = False


def fetch_html(url: str, timeout: int = 30) -> str:
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``HEADERS``. The HTTP status
    code is printed before it is checked, so failed requests are still
    logged.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status
            (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout, headers=HEADERS)
    print(f"[FETCH] {response.status_code} {url}")
    response.raise_for_status()
    return response.text


def clean_lines(html: str) -> list[str]:
    """Return the visible text of *html* as a list of non-empty lines.

    The document is parsed with ``html.parser``; text fragments are joined
    with newlines, then each resulting line is stripped and blank lines
    are dropped.
    """
    text = BeautifulSoup(html, "html.parser").get_text("\n", strip=True)
    return [piece for piece in map(str.strip, text.split("\n")) if piece]


def is_noise(line: str) -> bool:
    """Return True if *line* is page chrome rather than book data.

    A line counts as noise when, after stripping, it matches an entry of
    ``NOISE_LOWER`` case-insensitively or is a lone ``,`` / ``|`` separator.
    """
    token = line.strip()
    return token.lower() in NOISE_LOWER or token in {",", "|"}


def is_year(line: str) -> bool:
    """Return True if the stripped *line* is exactly a year matched by ``YEAR_RE``."""
    candidate = line.strip()
    return bool(YEAR_RE.match(candidate))


def is_price(line: str) -> bool:
    """Return True if the stripped *line* is exactly a price matched by ``PRICE_LINE_RE``."""
    candidate = line.strip()
    return bool(PRICE_LINE_RE.match(candidate))


def price_value(line: str) -> Optional[float]:
    """Return the numeric amount of a price line, or None if *line* is not a price.

    The currency symbol captured by ``PRICE_LINE_RE`` is discarded; only
    the amount (group 2) is converted to ``float``.
    """
    found = PRICE_LINE_RE.match(line.strip())
    return float(found.group(2)) if found else None


def looks_like_title(line: str) -> bool:
    """Return True if *line* could plausibly be a book title.

    A candidate must be at least 6 characters long after stripping, must
    not be page noise, and must not be a year line, a price line, or
    contain a parenthesised rating count.
    """
    text = line.strip()
    # The length test also rejects the empty string.
    if len(text) < 6 or is_noise(text):
        return False
    return not (is_year(text) or is_price(text) or RATINGCOUNT_RE.search(text))


def dbg_row(title, authors, year, prices, chosen_price, star_rating, url, year_idx, idx_after, lines):
    """Print a diagnostic dump for one parsed row.

    Shows the extracted fields followed by a window of the raw text lines
    around the year line: up to 4 lines before ``year_idx`` and up to 12
    lines from it onward, clipped to the bounds of *lines*.
    """
    heavy_rule = "=" * 72
    light_rule = "-" * 72

    report = (
        "\n" + heavy_rule,
        "[ROW DEBUG]",
        f"Title      : {title!r}",
        f"Authors    : {authors!r}",
        f"Year       : {year!r}",
        f"Prices(raw): {prices!r} -> chosen: {chosen_price!r}",
        f"Rating(cnt): {star_rating!r}",
        f"Source URL : {url}",
        f"year_idx   : {year_idx} | idx(after scan): {idx_after}",
        light_rule,
    )
    for text in report:
        print(text)

    # Context window around the year line, clipped to the list bounds.
    first = max(year_idx - 4, 0)
    last = min(year_idx + 12, len(lines))
    for context_line in lines[first:last]:
        print("  ", context_line)

    print(heavy_rule)


## Parser & Scraper


In [None]:
def parse_catalog(html: str, catalog_url: str) -> list[dict[str, Any]]:
    """Extract book records from one catalog page.

    The page is flattened to text lines and scanned top to bottom for the
    pattern *title -> author line(s) -> year -> price line(s)*.

    Args:
        html: Raw HTML of the catalog page.
        catalog_url: URL the page came from; stored as ``source_url``.

    Returns:
        One dict per accepted book with the keys ``title``, ``authors``,
        ``year``, ``star_rating``, ``price`` and ``source_url``. Titles are
        unique within the returned list. ``star_rating`` holds the number
        found in parentheses near the year (as a float) or None.
    """
    lines = clean_lines(html)
    total = len(lines)

    records = []
    seen_titles = set()
    pos = 0

    while pos < total:
        candidate = lines[pos]
        if not looks_like_title(candidate):
            pos += 1
            continue
        title = candidate.strip()

        # The year must appear among the 9 lines following the title.
        year_idx = next(
            (j for j in range(pos + 1, min(pos + 10, total)) if is_year(lines[j])),
            None,
        )
        if year_idx is None:
            pos += 1
            continue

        # Everything between title and year that is not a separator, noise,
        # price, year or rating count is treated as part of the author list.
        author_parts = []
        for raw in lines[pos + 1:year_idx]:
            part = raw.strip()
            if part in {",", "|"}:
                continue
            rejected = (
                is_noise(part)
                or is_price(part)
                or is_year(part)
                or RATINGCOUNT_RE.search(part)
            )
            if not rejected:
                author_parts.append(part)

        if author_parts:
            authors = " ".join(author_parts).strip()
        else:
            authors = None
        year = int(lines[year_idx].strip())

        # Prices and the rating count are looked for in the (up to) 14
        # lines that follow the year line.
        scan_start = year_idx + 1
        scan_end = min(year_idx + 15, total)
        window = lines[scan_start:scan_end]

        # Only the unbroken run of price lines directly after the year counts.
        prices = []
        for text in window:
            amount = price_value(text)
            if amount is None:
                break
            prices.append(amount)
        idx = scan_start + len(prices)  # first line after the price run

        # First parenthesised integer anywhere in the window.
        # NOTE(review): the window may extend past this record into the
        # next one, so the count could belong to a neighbour - confirm
        # against the live page layout.
        rating_hit = next(filter(None, map(RATINGCOUNT_RE.search, window)), None)
        star_rating = float(rating_hit.group(1)) if rating_hit else None

        # A candidate without both a price and an author is not a book;
        # retry from the very next line.
        if not (prices and authors):
            pos += 1
            continue

        price = prices[-1]  # pick the last displayed (often discounted)
        resume_at = max(pos + 1, idx)

        if title in seen_titles:
            pos = resume_at
            continue
        seen_titles.add(title)

        row = {
            "title": title,
            "authors": authors,
            "year": year,
            "star_rating": star_rating,  # count shown in parentheses if present
            "price": price,
            "source_url": catalog_url,
        }

        if DEBUG:
            dbg_row(title, authors, year, prices, price, star_rating, catalog_url, year_idx, idx, lines)

        records.append(row)
        pos = resume_at

    return records


def scrape_manning(catalog_urls: list[str]) -> pd.DataFrame:
    """Scrape every catalog page and return the de-duplicated book table.

    Each URL is fetched and parsed in turn, with a one-second pause between
    requests. A URL whose download fails (network error or HTTP error
    status) is reported and skipped so that rows already collected from
    the other catalogs are not lost.

    Args:
        catalog_urls: Catalog page URLs to scrape, in priority order.

    Returns:
        DataFrame with one row per unique title, index reset to 0..n-1.

    Raises:
        RuntimeError: if no rows were scraped from any catalog.
    """
    all_rows = []
    total = len(catalog_urls)

    for idx, url in enumerate(catalog_urls, start=1):
        print(f"\n=== CATALOG {idx}/{total} ===")
        try:
            html = fetch_html(url)
        except requests.RequestException as exc:
            # One dead or renamed catalog page must not abort the whole run.
            print(f"[SKIP] {url}: {exc}")
        else:
            rows = parse_catalog(html, url)
            print(f"[PARSE] {len(rows)} rows from catalog")
            all_rows.extend(rows)

        # Pause between requests only; no need to wait after the last one.
        if idx < total:
            time.sleep(1)

    df = pd.DataFrame(all_rows)

    if df.empty:
        raise RuntimeError("No rows scraped. Add more catalog URLs or adjust parser.")

    # Drop incomplete rows, then keep the first occurrence of each title
    # (the same book can be listed in several catalogs).
    df = (
        df.dropna(subset=["title", "authors", "year", "price"])
          .drop_duplicates(subset=["title"])
          .reset_index(drop=True)
    )

    return df


## Run & Save 15 Rows


In [None]:
# Scrape all configured catalogs into one de-duplicated table.
df = scrape_manning(CATALOG_URLS)

# The task requires at least 15 records; flag a short scrape instead of
# silently writing an undersized CSV.
if len(df) < 15:
    print(f"[WARN] only {len(df)} unique rows scraped; at least 15 are required")

# Keep the first 15 rows as the deliverable.
df_15 = df.head(15).copy()
print("[DF] shape:", df_15.shape)
display(df_15)

raw_path = RAW_DIR / "books_manning_raw.csv"
processed_path = PROCESSED_DIR / "books.csv"

# NOTE(review): both files receive the same 15-row frame - confirm the raw
# file is not meant to hold the full scrape.
df_15.to_csv(raw_path, index=False)
df_15.to_csv(processed_path, index=False)

print("Saved RAW:", raw_path)
print("Saved PROCESSED:", processed_path)
print("Non-null star_rating:", df_15["star_rating"].notna().sum(), "of", len(df_15))
