####Q1. Write a Python program to scrape all available books from the website(https://books.toscrape.com/) Books to Scrape – a live site built for practicing scraping (safe,legal, no anti-bot). For each book, extract the following details:
1. Title
2. Price
3. Availability (In stock / Out of stock)
4. Star Rating (One, Two, Three, Four, Five)

*Store the scraped results into a Pandas DataFrame and export them to a CSV file named books.csv.*

(Note: Use the requests library to fetch the HTML page. Use BeautifulSoup to parse and extract
book details and handle pagination so that books from all pages are scraped)

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_books():
    base_url = "https://books.toscrape.com/"
    all_books_data = []
    page_num = 1

    while True:
        url = f"{base_url}catalogue/page-{page_num}.html"
        try:
            response = requests.get(url)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {page_num}: {e}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        books = soup.find_all('article', class_='product_pod')

        if not books:
            break
        for book in books:
            title = book.h3.a['title']
            price = book.find('p', class_='price_color').text.strip()
            availability = book.find('p', class_='instock availability').text.strip()
            star_rating = book.find('p', class_='star-rating')['class'][1]

            all_books_data.append({
                'Title': title,
                'Price': price,
                'Availability': availability,
                'Star Rating': star_rating
            })

        page_num += 1

    return all_books_data

books_data = scrape_books()

df = pd.DataFrame(books_data)

df.to_csv('books.csv', index=False)

print("Scraping complete. Data saved to books.csv")
display(df.head())

Error fetching page 51: 404 Client Error: Not Found for url: https://books.toscrape.com/catalogue/page-51.html
Scraping complete. Data saved to books.csv


Unnamed: 0,Title,Price,Availability,Star Rating
0,A Light in the Attic,Â£51.77,In stock,Three
1,Tipping the Velvet,Â£53.74,In stock,One
2,Soumission,Â£50.10,In stock,One
3,Sharp Objects,Â£47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,Â£54.23,In stock,Five


####Write a Python program to scrape the IMDB Top 250 Movies list (https://www.imdb.com/chart/top/) . For each movie, extract the following details:
1. Rank (1–250)
2. Movie Title
3. Year of Release
4. IMDB Rating
######Store the results in a Pandas DataFrame and export it to a CSV file named imdb_top250.csv.
(Note: Use Selenium/Playwright to scrape the required details from this website)

In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time

# Configure Selenium options
chrome_opts = Options()
chrome_opts.add_argument("--headless")
chrome_opts.add_argument("--no-sandbox")
chrome_opts.add_argument("--disable-dev-shm-usage")

# Spoof user-agent to avoid 403 Forbidden
chrome_opts.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/115.0.0.0 Safari/537.36"
)

# Launch browser
browser = webdriver.Chrome(options=chrome_opts)

# Open IMDb Top 250
browser.get("https://www.imdb.com/chart/top/")
time.sleep(5)  # wait for page to load

film_list = []
movie_cards = browser.find_elements(By.CSS_SELECTOR, ".ipc-metadata-list-summary-item")

# Extract movie details
for rank, card in enumerate(movie_cards, start=1):
    try:
        name = card.find_element(By.CSS_SELECTOR, "h3").text
        release_year = card.find_element(By.CSS_SELECTOR, ".cli-title-metadata-item").text
        score = card.find_element(By.CSS_SELECTOR, ".ipc-rating-star--imdb").text.split()[0]
        film_list.append([rank, name, release_year, score])
    except Exception as e:
        print(f"Skipping a card due to error: {e}")

browser.quit()

# Save as DataFrame
imdb_table = pd.DataFrame(film_list, columns=["Position", "Movie", "Release Year", "Rating"])
imdb_table.to_csv("imdb_top250.csv", index=False)
print(imdb_table.head())

   Position                        Movie Release Year Rating
0         1  1. The Shawshank Redemption         1994    9.3
1         2             2. The Godfather         1972    9.2
2         3           3. The Dark Knight         2008    9.1
3         4     4. The Godfather Part II         1974    9.0
4         5              5. 12 Angry Men         1957    9.0


####Write a Python program to scrape the weather information for top world cities from the given website (https://www.timeanddate.com/weather/) . For each city, extract the following details:
1. City Name
2. Temperature
3. Weather Condition (e.g., Clear, Cloudy, Rainy, etc.)

Store the results in a Pandas DataFrame and export it to a CSV file named weather.csv.**bold text**

gives only 4

In [47]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_weather(url, max_cities=None):
    """
    Scrape weather data from timeanddate.com/weather/

    Args:
        url (str): URL to scrape
        max_cities (int, optional): stop after this many cities. None = all.

    Returns:
        pd.DataFrame: with columns City, Temperature, Condition
    """
    resp = requests.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    data = []



    section = soup.find('section', attrs={'id': 'qlook'})

    city_links = soup.select('a[href^="/weather/"]')

    seen = set()
    for link in city_links:
        city = link.get_text().strip()
        href = link.get('href')
        if not href:
            continue

        if city in seen:
            continue
        seen.add(city)


        temp = None
        cond = None


        parent = link.parent
        if parent:
            texts = parent.stripped_strings
            for t in texts:
                if '°' in t:
                    temp = t.strip()
                    break
            img = parent.find('img')
            if img:
                cond = img.get('alt') or img.get('title')
            if not cond:
                for t in texts:
                    low = t.strip().lower()
                    if any(word in low for word in ["cloud", "sun", "rain", "clear", "storm", "overcast", "haze", "fog", "drizzle", "thunder", "snow"]):
                        if t.strip() != temp:
                            cond = t.strip()
                            break


        if temp is None or cond is None:
            continue

        data.append({'City': city, 'Temperature': temp, 'Condition': cond})

        if max_cities is not None and len(data) >= max_cities:
            break

    df = pd.DataFrame(data)
    return df

def main():
    url = 'https://www.timeanddate.com/weather/'
    df = scrape_weather(url, max_cities=50)
    print(df.head())
    df.to_csv('weather1.csv', index=False)
    print("Saved to weather.csv")

if __name__ == '__main__':
    main()


            City Temperature                 Condition
0                      67 °F            Mostly cloudy.
1  Washington DC       67 °F            Mostly cloudy.
2       New York       63 °F                    Clear.
3         London       59 °F         Scattered clouds.
4          Tokyo       77 °F  Sprinkles. Partly sunny.
Saved to weather.csv


47 alphabetically

In [63]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_timeanddate_top_cities(url="https://www.timeanddate.com/weather/", max_cities=None):
    """
    Scrape city, temperature, weather condition from timeanddate.com/weather/
    :param url: URL to scrape.
    :param max_cities: Maximum number of cities to scrape (None = all).
    :return: list of dicts with keys: City, Temperature, Condition
    """
    resp = requests.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    results = []


    table = soup.find('table', attrs={'id': 'wt-48'})
    if table is None:
        table = soup.find('table')
    if table is None:
        raise RuntimeError("Could not find the weather table on the page")

    rows = table.find_all('tr')

    for row in rows:
        header = row.find('th')
        if header:
            continue

        cols = row.find_all('td')
        if len(cols) < 3:
            continue

        city_td = cols[0]
        city_name = city_td.get_text(strip=True)


        temp_td = None
        for td in cols:
            text = td.get_text(strip=True)
            if text.endswith("°C") or text.endswith("°F"):
                temp_td = td
                break
        if not temp_td:
            continue
        temperature = temp_td.get_text(strip=True)


        condition = None

        img = row.find('img')
        if img and img.has_attr('alt'):
            condition = img['alt'].strip()
        if not condition:
            for td in cols:
                txt = td.get_text(" ", strip=True)
                if any(word in txt.lower() for word in ['clear','cloudy','rain','sunny','overcast','snow','fog','haze','storm','thunder']):
                    if not txt.endswith("°C") and not txt.endswith("°F"):
                        condition = txt
                        break
        if not condition:
            condition = ""

        results.append({
            'City': city_name,
            'Temperature': temperature,
            'Condition': condition
        })

        if max_cities is not None and len(results) >= max_cities:
            break

    return results

def main():
    data = scrape_timeanddate_top_cities(max_cities=50)

    df = pd.DataFrame(data, columns=['City', 'Temperature', 'Condition'])

    df.to_csv('weather2.csv', index=False)
    print("Saved to weather.csv with {} entries".format(len(df)))

if __name__ == "__main__":
    main()


Saved to weather.csv with 47 entries


top 15 but visiting every single city's url

In [64]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Mapping of city names to their timeanddate URLs
CITY_URLS = {
    "New York City": "https://www.timeanddate.com/weather/usa/new-york",
    "London": "https://www.timeanddate.com/weather/uk/london",
    "Tokyo": "https://www.timeanddate.com/weather/japan/tokyo",
    "Paris": "https://www.timeanddate.com/weather/france/paris",
    "Singapore": "https://www.timeanddate.com/weather/singapore/singapore",
    "Dubai": "https://www.timeanddate.com/weather/united-arab-emirates/dubai",
    "Sydney": "https://www.timeanddate.com/weather/australia/sydney",
    "Hong Kong": "https://www.timeanddate.com/weather/hong-kong/hong-kong",
    "Los Angeles": "https://www.timeanddate.com/weather/usa/los-angeles",
    "Beijing": "https://www.timeanddate.com/weather/china/beijing",
    "Moscow": "https://www.timeanddate.com/weather/russia/moscow",
    "Rome": "https://www.timeanddate.com/weather/italy/rome",
    "Chicago": "https://www.timeanddate.com/weather/usa/chicago",
    "Toronto": "https://www.timeanddate.com/weather/canada/toronto",
    "Shanghai": "https://www.timeanddate.com/weather/china/shanghai"
}

def scrape_city_weather(city, url):
    """Scrape weather info (temperature, condition) for a single city."""
    resp = requests.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    temp_div = soup.find("div", id="qlook")
    temperature = temp_div.find("div", class_="h2").get_text(strip=True) if temp_div else ""

    condition = ""
    if temp_div:
        cond_span = temp_div.find("p")
        if cond_span:
            condition = cond_span.get_text(strip=True)

    return {
        "City": city,
        "Temperature": temperature,
        "Condition": condition
    }

def main():
    results = []
    for city, url in CITY_URLS.items():
        print(f"Scraping {city}...")
        try:
            results.append(scrape_city_weather(city, url))
        except Exception as e:
            print(f"Failed to scrape {city}: {e}")
            results.append({"City": city, "Temperature": "", "Condition": ""})

    df = pd.DataFrame(results, columns=["City", "Temperature", "Condition"])

    df.to_csv("weather3.csv", index=False)
    print("Saved weather info for all 15 cities to weather3.csv")

if __name__ == "__main__":
    main()


Scraping New York City...
Scraping London...
Scraping Tokyo...
Scraping Paris...
Scraping Singapore...
Scraping Dubai...
Scraping Sydney...
Scraping Hong Kong...
Scraping Los Angeles...
Scraping Beijing...
Scraping Moscow...
Scraping Rome...
Scraping Chicago...
Scraping Toronto...
Scraping Shanghai...
Saved weather info for all 15 cities to weather3.csv
