In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

BASE_URL = "https://books.toscrape.com/catalogue/page-{}.html"


books_data = []

def get_star_rating(tag):
    ratings = {
        "One": "One",
        "Two": "Two",
        "Three": "Three",
        "Four": "Four",
        "Five": "Five"
    }
    for rating in ratings:
        if rating in tag["class"]:
            return ratings[rating]
    return None

for page in range(1, 51):
    url = BASE_URL.format(page)
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to retrieve page {page}")
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    books = soup.find_all("article", class_="product_pod")

    for book in books:
        title = book.h3.a["title"]
        price = book.find("p", class_="price_color").text.strip()
        availability = book.find("p", class_="instock availability").text.strip()
        rating = get_star_rating(book.find("p", class_="star-rating"))

        books_data.append({
            "Title": title,
            "Price": price,
            "Availability": availability,
            "Star Rating": rating
        })

df = pd.DataFrame(books_data)


df.to_csv("books.csv", index=False, encoding="utf-8")

print("Scraping completed! Data saved to books.csv")


Scraping completed! Data saved to books.csv


In [33]:
import pandas as pd
import asyncio
from playwright.async_api import async_playwright


def extract_year(title_string):
    import re
    match = re.search(r'\((\d{4})\)', title_string)
    if match:
        return match.group(1)
    return None

async def scrape_imdb():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://www.imdb.com/chart/top/")


        await page.wait_for_selector(".ipc-metadata-list", timeout=120000) # Increased timeout to 60 seconds

        movies_data = []

        movie_elements = await page.query_selector_all(".ipc-metadata-list-summary-item")

        for movie in movie_elements:

            rank_element = await movie.query_selector(".ipc-metadata-list-summary-item__t")
            rank_text = await rank_element.text_content() if rank_element else None
            rank = int(rank_text.split('.')[0].strip()) if rank_text and rank_text.split('.')[0].strip().isdigit() else None


            title_element = await movie.query_selector(".ipc-metadata-list-summary-item__t")
            title_text = await title_element.text_content() if title_element else None
            title = title_text.split('.')[1].strip() if title_text and '.' in title_text else None



            year_element = await movie.query_selector(".sc-b189961a-8.hpuOZU.cli-title-metadata-item")
            year = await year_element.text_content() if year_element else None


            rating_element = await movie.query_selector(".ipc-html-content-div > div > span")
            rating = await rating_element.text_content() if rating_element else None


            movies_data.append({
                "Rank": rank,
                "Movie Title": title,
                "Year of Release": year,
                "IMDB Rating": rating
            })

        await browser.close()
        return movies_data

movies_data = await scrape_imdb()


df_movies = pd.DataFrame(movies_data)
df_movies.to_csv("imdb_top250.csv", index=False, encoding="utf-8")

print("IMDB Top 250 Movies scraped and saved to imdb_top250.csv")
display(df_movies.head())

TimeoutError: Page.wait_for_selector: Timeout 120000ms exceeded.
Call log:
  - waiting for locator(".ipc-metadata-list") to be visible


In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.timeanddate.com/weather/"
response = requests.get(url)

if response.status_code != 200:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")
else:
    soup = BeautifulSoup(response.text, "html.parser")
    weather_data = []

    table = soup.find("table", class_="zebra")

    if table:

        for row in table.find_all("tr")[1:]:
            cells = row.find_all("td")
            if len(cells) >= 4:
                city = cells[1].text.strip()
                temperature = cells[2].text.strip()
                condition = cells[3].text.strip()

                weather_data.append({
                    "City": city,
                    "Temperature": temperature,
                    "Condition": condition
                })

        df_weather = pd.DataFrame(weather_data)


        df_weather.to_csv("weather.csv", index=False, encoding="utf-8")

        print("Weather data scraped and saved to weather.csv")
        display(df_weather.head())
    else:
        print("Could not find the weather table on the page.")

Weather data scraped and saved to weather.csv


Unnamed: 0,City,Temperature,Condition
0,Mon 6:38 am,,73 °F
1,Mon 9:38 am,,61 °F
2,Mon 4:08 pm,,59 °F
3,Mon 7:38 am,,81 °F
4,Mon 11:38 am,,72 °F


In [29]:
!playwright install

Downloading Chromium 140.0.7339.16 (playwright build v1187)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1187/chromium-linux.zip[22m
[1G173.7 MiB [] 0% 0.0s[0K[1G173.7 MiB [] 0% 21.0s[0K[1G173.7 MiB [] 0% 9.4s[0K[1G173.7 MiB [] 0% 5.9s[0K[1G173.7 MiB [] 1% 5.1s[0K[1G173.7 MiB [] 1% 4.7s[0K[1G173.7 MiB [] 2% 4.7s[0K[1G173.7 MiB [] 2% 4.9s[0K[1G173.7 MiB [] 2% 4.8s[0K[1G173.7 MiB [] 3% 4.4s[0K[1G173.7 MiB [] 3% 3.9s[0K[1G173.7 MiB [] 4% 3.7s[0K[1G173.7 MiB [] 5% 3.7s[0K[1G173.7 MiB [] 5% 3.6s[0K[1G173.7 MiB [] 5% 3.8s[0K[1G173.7 MiB [] 6% 3.7s[0K[1G173.7 MiB [] 6% 3.6s[0K[1G173.7 MiB [] 7% 3.4s[0K[1G173.7 MiB [] 8% 3.3s[0K[1G173.7 MiB [] 8% 3.2s[0K[1G173.7 MiB [] 9% 3.2s[0K[1G173.7 MiB [] 10% 3.2s[0K[1G173.7 MiB [] 11% 3.0s[0K[1G173.7 MiB [] 11% 2.9s[0K[1G173.7 MiB [] 12% 2.8s[0K[1G173.7 MiB [] 13% 2.8s[0K[1G173.7 MiB [] 14% 2.7s[0K[1G173.7 MiB [] 15% 2.6s[0K[1G173.7 MiB [] 16% 2.5s[0K[1G173.7 