In [None]:
!pip install playwright nest_asyncio
!playwright install chromium
!apt-get install -y libatk1.0-0 libatk-bridge2.0-0 libatspi2.0-0 libxcomposite1

import asyncio, json, csv
from pathlib import Path
import nest_asyncio
nest_asyncio.apply()
from playwright.async_api import async_playwright

BASE_URL = "https://webscraper.io/test-sites/e-commerce/static/computers/laptops?page={}"


# =================================================================
# Detect total pages - NO TIMEOUTS USED
# =================================================================
async def get_total_pages(page):
    """Return the highest page number shown in the laptops pagination bar.

    Navigates ``page`` to the first listing page (``BASE_URL`` formatted with
    1), waits for the ``.pagination`` element and reads the numeric labels of
    the links inside it.

    Args:
        page: Playwright page used for navigation and DOM queries.

    Returns:
        The largest numeric pagination label, or 1 when the first page does
        not answer with HTTP 200, the pagination element never appears, or no
        numeric label is found.
    """
    print("\nüìÑ Detecting total pages...")

    response = await page.goto(BASE_URL.format(1))
    if not response or response.status != 200:
        print("‚ùå Could not load first page. Default pages = 1")
        return 1

    try:
        await page.wait_for_selector(".pagination")
    except Exception:
        # `Exception` instead of a bare `except` so Ctrl-C and task
        # cancellation still propagate; any wait failure falls back to 1 page.
        print("‚ùå Pagination not found. Default pages = 1")
        return 1

    nums = []
    for button in await page.query_selector_all(".pagination li a"):
        # text_content() can return None; treat that as an empty label.
        text = (await button.text_content() or "").strip()
        if text.isdigit():
            nums.append(int(text))

    total_pages = max(nums, default=1)
    print(f"‚úÖ Total pages found: {total_pages}")
    return total_pages


# =================================================================
# Scrape a single page - NO TIMEOUTS, NO RETRIES
# =================================================================
async def scrape_page(page, page_number):
    """Scrape every product card on one laptops listing page.

    Args:
        page: Playwright page used for navigation and DOM queries.
        page_number: Listing page number substituted into ``BASE_URL``.

    Returns:
        A list of dicts with the keys ``title``, ``price``, ``rating_stars``,
        ``image_url`` and ``page``. The list is empty when navigation yields
        no response, the status is not 200, or no ``.thumbnail`` element
        appears.
    """
    url = BASE_URL.format(page_number)
    print(f"\nüîé Scraping Page {page_number}: {url}")

    response = await page.goto(url)

    # goto() returned nothing usable.
    if not response:
        print("‚ùå No response. Skipping page.")
        return []

    if response.status != 200:
        print(f"‚ùå Bad status {response.status}. Skipping page.")
        return []

    try:
        await page.wait_for_selector(".thumbnail")
    except Exception:
        # `Exception` instead of a bare `except` so Ctrl-C and task
        # cancellation are not swallowed along with the wait failure.
        print("‚ùå No products found. Skipping page.")
        return []

    items = []
    for card in await page.query_selector_all(".thumbnail"):
        title_el = await card.query_selector(".title")
        price_el = await card.query_selector(".price")
        img_el = await card.query_selector("img")

        # A missing element yields None; an element whose text_content() is
        # None yields "" instead of crashing on .strip().
        title = (await title_el.text_content() or "").strip() if title_el else None
        price = (await price_el.text_content() or "").strip() if price_el else None
        image = await img_el.get_attribute("src") if img_el else None

        # The rating is the number of star glyphs inside the card.
        stars = await card.query_selector_all(".glyphicon-star")

        items.append({
            "title": title,
            "price": price,
            "rating_stars": len(stars),
            "image_url": image,
            "page": page_number,
        })

    return items


# =================================================================
# Scrape all pages
# =================================================================
async def scrape_all():
    """Launch headless Chromium and collect the products of every listing page.

    The page count comes from ``get_total_pages``; each page from 1 up to
    that count is handed to ``scrape_page`` and the results are concatenated
    in page order.

    Returns:
        One flat list containing the items of all scraped pages.
    """
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=True)
        tab = await chromium.new_page()

        last_page = await get_total_pages(tab)

        collected = []
        page_no = 1
        while page_no <= last_page:
            collected += await scrape_page(tab, page_no)
            page_no += 1

        await chromium.close()
        return collected


# =================================================================
# Run scraper
# =================================================================
data = asyncio.get_event_loop().run_until_complete(scrape_all())
print(f"\nüéâ Total products collected: {len(data)}")


# =================================================================
# Save results
# =================================================================
Path("output").mkdir(exist_ok=True)

csv_path = Path("output/all_products.csv")
json_path = Path("output/all_products.json")

# CSV columns come from the first row when there is one. The fallback lists
# the keys scrape_page() emits, so a run that collected nothing still writes a
# header-only file instead of raising IndexError on data[0].
fieldnames = (
    list(data[0].keys())
    if data
    else ["title", "price", "rating_stars", "image_url", "page"]
)

# Save CSV
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

# Save JSON
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("\nüìÅ Saved CSV  ‚Üí", csv_path)
print("üìÅ Saved JSON ‚Üí", json_path)


Collecting playwright
  Downloading playwright-1.57.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.57.0-py3-none-manylinux1_x86_64.whl (46.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m46.0/46.0 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.57.0 pyee-13.0.0
Downloading Chromium 143.0.7499.4 (playwright build v1200)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1200/chromium-linux.zip[22m
[1G164.7 MiB [] 0% 0.0s[0K[1G164.7 MiB [] 0% 68.2s[0K[1G164.7 MiB [] 0% 55.8s[0K[1G164.7 MiB [] 0% 66.9s[0K[1G164.7 MiB [] 0% 51.2s[0K[1G164.7 MiB [] 0% 43.5s[0K[1G164.7 MiB [] 0% 36.6s[

In [None]:
import asyncio, json, csv
from pathlib import Path
import nest_asyncio
nest_asyncio.apply()
from playwright.async_api import async_playwright

MAIN_URL = "https://webscraper.io/test-sites/e-commerce/static"


# =====================================================================
# (1) Detect all sections (NO TIMEOUT, NO RETRY)
# =====================================================================
async def detect_all_sections(page):
    """Map each top-level category name to its absolute URL.

    Loads ``MAIN_URL`` and reads every ``.category-link`` anchor on it.

    Args:
        page: Playwright page used for navigation and DOM queries.

    Returns:
        A dict of lower-cased, stripped link text to URL. Hrefs that start
        with "/" are prefixed with "https://webscraper.io".

    Raises:
        Exception: If the main page does not answer with HTTP 200, or if
            waiting for a ``.category-link`` element fails.
    """
    response = await page.goto(MAIN_URL)
    if not response or response.status != 200:
        raise Exception("‚ùå Could not load main page")

    try:
        await page.wait_for_selector(".category-link")
    except Exception as err:
        # Chain the original error so the real cause stays in the traceback;
        # `Exception` (not bare `except`) lets Ctrl-C/cancellation through.
        raise Exception("‚ùå Section links not found") from err

    sections = {}
    for link in await page.locator(".category-link").all():
        url = await link.get_attribute("href")
        if not url:
            # An anchor without an href cannot be navigated to; skip it.
            continue
        name = (await link.text_content() or "").strip().lower()
        if url.startswith("/"):
            url = "https://webscraper.io" + url
        sections[name] = url

    print("üìå Sections Found:", sections)
    return sections


# =====================================================================
# (2) Detect subsections (NO TIMEOUT, NO RETRY)
# =====================================================================
async def detect_subsections(page, section_url):
    """Map each subcategory name of a section to its absolute URL.

    Args:
        page: Playwright page used for navigation and DOM queries.
        section_url: URL of the section page to inspect.

    Returns:
        A dict of lower-cased, stripped link text to URL, built from the
        ``.subcategory-link`` anchors. Hrefs that start with "/" are prefixed
        with "https://webscraper.io". Empty when waiting for such a link
        fails.

    Raises:
        Exception: If the section page does not answer with HTTP 200.
    """
    response = await page.goto(section_url)
    if not response or response.status != 200:
        raise Exception("‚ùå Could not load section page")

    try:
        await page.wait_for_selector(".subcategory-link")
    except Exception:
        # `Exception` instead of a bare `except` so Ctrl-C and task
        # cancellation are not mistaken for "no subsections".
        print("‚ùå No subsections found")
        return {}

    subs = {}
    for sl in await page.locator(".subcategory-link").all():
        url = await sl.get_attribute("href")
        if not url:
            # An anchor without an href cannot be navigated to; skip it.
            continue
        name = (await sl.text_content() or "").strip().lower()
        if url.startswith("/"):
            url = "https://webscraper.io" + url
        subs[name] = url

    print("üìÇ Subsections Found:", subs)
    return subs


# =====================================================================
# (3) Get total page count
# =====================================================================
async def get_total_pages(page):
    """Return the highest numeric label among the current page's pagination links.

    Unlike a navigating helper, this reads ``ul.pagination li a`` from
    whatever document ``page`` currently shows.

    Args:
        page: Playwright page already positioned on a listing page.

    Returns:
        The largest numeric link label, or 1 when the links cannot be read
        or none of them is numeric.
    """
    try:
        buttons = await page.locator("ul.pagination li a").all()
    except Exception:
        # `Exception` instead of a bare `except` so Ctrl-C and task
        # cancellation still propagate.
        return 1

    # text_content() can return None; treat that as an empty label.
    labels = [(await b.text_content() or "").strip() for b in buttons]
    nums = [int(t) for t in labels if t.isdigit()]

    return max(nums, default=1)


# =====================================================================
# (4) Scrape one page (NO TIMEOUT)
# =====================================================================
async def scrape_page(page, url):
    """Scrape every product card found at ``url``.

    Args:
        page: Playwright page used for navigation and DOM queries.
        url: Listing page to open.

    Returns:
        A list of dicts with the keys ``title``, ``price``, ``rating``,
        ``image_url`` and ``product_url``. Empty when the page does not
        answer with HTTP 200 or waiting for a ``.thumbnail`` element fails.
    """
    response = await page.goto(url)
    if not response or response.status != 200:
        print(f"‚ùå Skipping (bad URL): {url}")
        return []

    try:
        await page.wait_for_selector(".thumbnail")
    except Exception:
        # `Exception` instead of a bare `except` so Ctrl-C and task
        # cancellation are not swallowed along with the wait failure.
        print(f"‚ùå No products found on page: {url}")
        return []

    products = []
    for c in await page.locator(".thumbnail").all():
        title_link = c.locator(".title")
        title = await title_link.text_content()
        price = await c.locator(".price").text_content()
        img = await c.locator("img").get_attribute("src")
        link = await title_link.get_attribute("href")
        # The rating is the number of star glyphs inside the card.
        stars = await c.locator(".glyphicon-star").count()

        products.append({
            # text_content() can return None; fall back to "" before strip().
            "title": (title or "").strip(),
            "price": (price or "").strip(),
            "rating": stars,
            "image_url": img,
            "product_url": link,
        })

    return products


# =====================================================================
# (5) Main scraper (Only Tablets)
# =====================================================================
async def scrape_tablets():
    """Discover the tablets listing and scrape all of its pages.

    Walks main page -> "computers" section -> "tablets" subsection, reads the
    page count from the tablets listing, then scrapes ``?page=1`` through
    ``?page=<count>``.

    Returns:
        One flat list with the products of every tablets page, in page order.

    Raises:
        Exception: If the computers section or tablets subsection is not
            found, or the tablets URL does not answer with HTTP 200. Errors
            raised by the detection helpers propagate unchanged.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # Several steps below raise on failure; the finally block makes sure
        # the browser is closed on those paths too, not only on success.
        try:
            page = await browser.new_page()

            # Step 1: detect "computers" section
            sections = await detect_all_sections(page)

            comp_url = sections.get("computers")
            if not comp_url:
                raise Exception("‚ùå Computers section not found")

            # Step 2: detect subsections inside computers
            subs = await detect_subsections(page, comp_url)

            if "tablets" not in subs:
                raise Exception("‚ùå Tablets section not found")

            tablets_url = subs["tablets"]
            print("üìå Tablets URL:", tablets_url)

            # Step 3: detect total pages (get_total_pages reads the page that
            # is currently loaded, so navigate to the listing first)
            resp = await page.goto(tablets_url)
            if not resp or resp.status != 200:
                raise Exception("‚ùå Cannot open tablets URL to detect pages")

            total_pages = await get_total_pages(page)
            print(f"üìÑ Total tablet pages: {total_pages}")

            # Step 4: scrape all tablet pages
            all_tablets = []

            for pnum in range(1, total_pages + 1):
                url = f"{tablets_url}?page={pnum}"
                print("üîé Scraping:", url)
                products = await scrape_page(page, url)
                all_tablets.extend(products)

            return all_tablets
        finally:
            await browser.close()


# =====================================================================
# (6) Run Script & Save output
# =====================================================================
data = asyncio.get_event_loop().run_until_complete(scrape_tablets())
print(f"\n‚úÖ Total tablet products collected: {len(data)}")

Path("output").mkdir(exist_ok=True)
csv_path = Path("output/tablets.csv")

# CSV columns come from the first row when there is one. The fallback lists
# the keys scrape_page() emits, so a run that collected nothing still writes a
# header-only file instead of raising IndexError on data[0].
fieldnames = (
    list(data[0].keys())
    if data
    else ["title", "price", "rating", "image_url", "product_url"]
)

with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

print("üìÅ File saved ‚Üí", csv_path)


üìå Sections Found: {'computers': 'https://webscraper.io/test-sites/e-commerce/static/computers', 'phones': 'https://webscraper.io/test-sites/e-commerce/static/phones'}
üìÇ Subsections Found: {'laptops': 'https://webscraper.io/test-sites/e-commerce/static/computers/laptops', 'tablets': 'https://webscraper.io/test-sites/e-commerce/static/computers/tablets'}
üìå Tablets URL: https://webscraper.io/test-sites/e-commerce/static/computers/tablets
üìÑ Total tablet pages: 4
üîé Scraping: https://webscraper.io/test-sites/e-commerce/static/computers/tablets?page=1
üîé Scraping: https://webscraper.io/test-sites/e-commerce/static/computers/tablets?page=2
üîé Scraping: https://webscraper.io/test-sites/e-commerce/static/computers/tablets?page=3
üîé Scraping: https://webscraper.io/test-sites/e-commerce/static/computers/tablets?page=4

‚úÖ Total tablet products collected: 21
üìÅ File saved ‚Üí output/tablets.csv


In [2]:
import requests
import json
import pandas as pd
from pathlib import Path
import math

# ------------------------------
# CONFIGURATION
# ------------------------------
import os

# NOTE(security): a real-looking ScraperAPI key is committed below. Prefer
# supplying the key through the SCRAPERAPI_KEY environment variable; the
# literal is kept only as a backward-compatible fallback and should be rotated
# and removed.
API_KEY = os.environ.get("SCRAPERAPI_KEY", "cef5755cba525a47429f31c618317d2a")
COUNTRY = "in"  # sent as the "country" request parameter
TLD = "com"  # sent as the "tld" request parameter

# ------------------------------
# FUNCTIONS
# ------------------------------
def fetch_search_results(keyword, page=1, timeout=70):
    """Fetch one page of Amazon search results from ScraperAPI's structured endpoint.

    Args:
        keyword: Search query sent as the ``query`` parameter.
        page: 1-based result page number.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails, the
        server answers with an error status, or the body is not valid JSON.
    """
    url = "https://api.scraperapi.com/structured/amazon/search"
    params = {
        "api_key": API_KEY,
        "query": keyword,
        "country": COUNTRY,
        "tld": TLD,
        "page": page,
    }
    try:
        r = requests.get(url, params=params, timeout=timeout)
        r.raise_for_status()
        return r.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection, timeout and HTTP-status errors;
        # ValueError covers a body that cannot be decoded as JSON.
        print(f"Error fetching page {page} for '{keyword}': {e}")
        return None

def save_json(data, filename):
    """Write ``data`` to ``filename`` as 4-space-indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped. A
    confirmation line naming the file is printed afterwards.
    """
    dump_options = {"indent": 4, "ensure_ascii": False}
    with open(filename, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, **dump_options)
    print(f"‚úî JSON saved: {filename}")

def save_data_pandas(products, csv_filename, excel_filename):
    """Export ``products`` to a CSV file and an Excel file via pandas.

    The records are loaded into one DataFrame that is written twice, both
    times without the index column. A confirmation line is printed for each
    file once both have been written.
    """
    frame = pd.DataFrame(products)

    frame.to_csv(path_or_buf=csv_filename, index=False)
    frame.to_excel(excel_writer=excel_filename, index=False)

    print(f"‚úî CSV saved: {csv_filename}")
    print(f"‚úî Excel saved: {excel_filename}")

# ------------------------------
# MAIN SCRAPER
# ------------------------------
def scrape_keyword(keyword, max_pages=None):
    """Collect search results for ``keyword`` page by page until they run out.

    Pages are requested from 1 upward with ``fetch_search_results``. Paging
    stops at the first page that fails to load or carries no results, or
    once ``max_pages`` pages have been requested. Each product dict is tagged
    in place with a ``"keyword"`` entry.

    Args:
        keyword: Search query.
        max_pages: Optional upper bound on the number of pages requested.
            ``None`` (the default) keeps paging until results run out.

    Returns:
        A tuple ``(all_products, total_products, total_pages)`` where
        ``total_pages`` is the number of pages that actually yielded
        products. ``([], 0, 0)`` when nothing could be fetched.
    """
    print(f"\nüîç Searching for: '{keyword}'")

    all_products = []
    pages_scraped = 0
    page = 1

    while max_pages is None or page <= max_pages:
        page_data = fetch_search_results(keyword, page)
        if not page_data:
            if page == 1:
                print("No data returned from API.")
            break

        page_results = page_data.get("results") or []
        if not page_results:
            break

        for product in page_results:
            product["keyword"] = keyword
        all_products.extend(page_results)
        print(f"Page {page} scraped: {len(page_results)} products")

        # Count pages directly: page sizes vary, so dividing the product
        # total by the first page's size only approximates the page count.
        pages_scraped += 1
        page += 1

    return all_products, len(all_products), pages_scraped

# ------------------------------
# USER INPUT
# ------------------------------
keyword_input = input("Enter the product keyword to search: ").strip()
if not keyword_input:
    print("No keyword entered. Exiting...")
    # exit() is a helper injected by the `site` module for interactive
    # shells and may be absent; raising SystemExit is the portable way to stop.
    raise SystemExit

# Scrape
products_data, total_products, total_pages = scrape_keyword(keyword_input)

# ------------------------------
# SAVE FILES
# ------------------------------
Path("output").mkdir(exist_ok=True)
save_json(products_data, "output/amazon_products.json")
save_data_pandas(products_data, "output/amazon_products.csv", "output/amazon_products.xlsx")

# ------------------------------
# SUMMARY
# ------------------------------
print("\n‚úÖ Scraping completed!")
print(f"Keyword: '{keyword_input}' | Total Products: {total_products} | Total Pages: {total_pages}")
print("Products data saved in 'output/' folder")


Enter the product keyword to search: iphone

üîç Searching for: 'iphone'
Page 1 scraped: 17 products
Page 2 scraped: 17 products
Page 3 scraped: 17 products
Page 4 scraped: 16 products
Page 5 scraped: 17 products
Page 6 scraped: 16 products
Page 7 scraped: 17 products
Page 8 scraped: 7 products
‚úî JSON saved: output/amazon_products.json
‚úî CSV saved: output/amazon_products.csv
‚úî Excel saved: output/amazon_products.xlsx

‚úÖ Scraping completed!
Keyword: 'iphone' | Total Products: 124 | Total Pages: 8
Products data saved in 'output/' folder


In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import re
import json

BASE_URL = "https://books.toscrape.com/catalogue/page-{}.html"

# -------------------------------
# Convert rating text to number
# -------------------------------
def rating_to_number(rating_text):
    """Translate a capitalised rating word ("One".."Five") into 1..5.

    Any other value, including differently-cased words, maps to 0.
    """
    words = ("One", "Two", "Three", "Four", "Five")
    lookup = {word: value for value, word in enumerate(words, start=1)}
    try:
        return lookup[rating_text]
    except KeyError:
        return 0

# -------------------------------
# Simple sentiment analysis
# -------------------------------
def analyze_sentiment(description):
    """Score a description by keyword presence, clamped to the range [-1, 1].

    The text is lower-cased and tested for each keyword as a substring.
    Every positive keyword present adds 0.2 and every negative keyword
    present subtracts 0.2; a keyword counts once however often it occurs.
    """
    weighted_keywords = (
        ("excellent", 0.2),
        ("amazing", 0.2),
        ("best", 0.2),
        ("masterpiece", 0.2),
        ("fantastic", 0.2),
        ("boring", -0.2),
        ("dull", -0.2),
        ("bad", -0.2),
        ("poor", -0.2),
        ("weak", -0.2),
    )

    text = description.lower()
    total = 0
    # Accumulate one keyword at a time, positives first, so the floating-point
    # sum is built in a fixed order.
    for keyword, weight in weighted_keywords:
        if keyword in text:
            total += weight

    if total > 1:
        return 1
    if total < -1:
        return -1
    return total

# -------------------------------
# Smart pricing strategy
# -------------------------------
def pricing_strategy(price, stock, rating, description):
    """Return ``price`` after rating, stock and sentiment mark-ups, rounded to 2 places.

    Mark-ups are multiplicative and applied in this order:
      * rating of exactly 5            -> x1.07
      * stock of 5 or fewer            -> x1.05
      * sentiment score above 0.3      -> x(1 + score / 10)
    The sentiment score comes from ``analyze_sentiment(description)``.
    """
    sentiment = analyze_sentiment(description)

    # Collect the multipliers that apply, keeping the order they are applied in.
    multipliers = []
    if rating == 5:
        multipliers.append(1.07)
    if stock <= 5:
        multipliers.append(1.05)
    if sentiment > 0.3:
        multipliers.append(1 + sentiment / 10)

    final_price = price
    for factor in multipliers:
        final_price *= factor

    return round(final_price, 2)

# -------------------------------
# Scrape one page
# -------------------------------
def _parse_stock(detail_soup):
    """Return the stock count read from a product page's information table.

    Returns the number inside "(N available)" on the "Availability" row, 1
    when that row exists without a number, and 0 when there is no such row.
    """
    for row in detail_soup.select("table.table.table-striped tr"):
        if row.th is None or row.td is None:
            continue  # not a header/value row
        if row.th.text.strip() != "Availability":
            continue
        match = re.search(r"\((\d+)\s+available\)", row.td.text)
        return int(match.group(1)) if match else 1
    return 0


def _fetch_book_details(detail_page, timeout):
    """Fetch a product page and return ``(stock, description)``; ``(0, "")`` on non-200."""
    detail_resp = requests.get(detail_page, timeout=timeout)
    if detail_resp.status_code != 200:
        return 0, ""

    # Parse the raw bytes so BeautifulSoup detects the charset itself;
    # `.text` would decode with requests' header-based guess instead.
    detail_soup = BeautifulSoup(detail_resp.content, "html.parser")

    desc_tag = detail_soup.select_one("#product_description ~ p")
    description = desc_tag.text.strip() if desc_tag else ""
    return _parse_stock(detail_soup), description


def scrape_page(page_number, timeout=30):
    """Scrape one catalogue page, following each book to its detail page.

    Args:
        page_number: Catalogue page number substituted into ``BASE_URL``.
        timeout: Seconds allowed for each HTTP request, so a stalled
            connection cannot hang the scrape.

    Returns:
        A list with one dict per book holding ``title``, ``original_price``,
        ``adjusted_price``, ``rating``, ``stock``, ``hot_selling``,
        ``last_copy``, ``description`` and ``url`` (the href as found on the
        listing page). Empty when the catalogue page does not answer with
        HTTP 200.
    """
    url = BASE_URL.format(page_number)
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        return []

    # Raw bytes for the same charset-detection reason as on detail pages.
    soup = BeautifulSoup(r.content, "html.parser")
    data = []

    for book in soup.select("article.product_pod"):
        title = book.h3.a["title"]

        # Price: keep only digits and the decimal point.
        price_text = book.select_one(".price_color").text.strip()
        price = float(re.sub(r"[^0-9.]", "", price_text))

        # Rating: the second CSS class of the first <p> is the rating word;
        # a missing class falls through to rating 0.
        rating_classes = book.p.get("class", []) if book.p else []
        rating_text = rating_classes[1] if len(rating_classes) > 1 else ""
        rating = rating_to_number(rating_text)

        # Detail page URL
        book_url = book.h3.a["href"]
        detail_page = "https://books.toscrape.com/catalogue/" + book_url

        # Stock and description live on the detail page only.
        stock, description = _fetch_book_details(detail_page, timeout)

        # Adjust price based on rating, stock, and description sentiment
        adjusted_price = pricing_strategy(price, stock, rating, description)

        data.append({
            "title": title,
            "original_price": price,
            "adjusted_price": adjusted_price,
            "rating": rating,
            "stock": stock,
            "hot_selling": rating >= 4 and stock <= 5,
            "last_copy": stock == 1,
            "description": description,
            "url": book_url,
        })

    return data

# -------------------------------
# Scrape all pages
# -------------------------------
all_books = []
page = 1
books = scrape_page(page)
# Keep requesting the next catalogue page until one comes back empty.
while books:
    all_books += books
    print(f"Page {page} scraped: {len(books)} books")
    page += 1
    books = scrape_page(page)

print(f"\nTotal books scraped: {len(all_books)}")

# -------------------------------
# Persist the results as CSV, Excel and JSON
# -------------------------------
Path("output").mkdir(exist_ok=True)

df = pd.DataFrame(all_books)
df.to_csv("output/books_prices.csv", index=False)
df.to_excel("output/books_prices.xlsx", index=False)

with open("output/books_prices.json", mode="w", encoding="utf-8") as f:
    json.dump(all_books, f, indent=4, ensure_ascii=False)

print("üìÅ Saved CSV, Excel, and JSON files in 'output/' folder")


Page 1 scraped: 20 books
Page 2 scraped: 20 books
Page 3 scraped: 20 books
Page 4 scraped: 20 books
Page 5 scraped: 20 books
Page 6 scraped: 20 books
Page 7 scraped: 20 books
Page 8 scraped: 20 books
Page 9 scraped: 20 books
Page 10 scraped: 20 books
Page 11 scraped: 20 books
Page 12 scraped: 20 books
Page 13 scraped: 20 books
Page 14 scraped: 20 books
Page 15 scraped: 20 books
Page 16 scraped: 20 books
Page 17 scraped: 20 books
Page 18 scraped: 20 books
Page 19 scraped: 20 books
Page 20 scraped: 20 books
Page 21 scraped: 20 books
Page 22 scraped: 20 books
Page 23 scraped: 20 books
Page 24 scraped: 20 books
Page 25 scraped: 20 books
Page 26 scraped: 20 books
Page 27 scraped: 20 books
Page 28 scraped: 20 books
Page 29 scraped: 20 books
Page 30 scraped: 20 books
Page 31 scraped: 20 books
Page 32 scraped: 20 books
Page 33 scraped: 20 books
Page 34 scraped: 20 books
Page 35 scraped: 20 books
Page 36 scraped: 20 books
Page 37 scraped: 20 books
Page 38 scraped: 20 books
Page 39 scraped: 20 b