In [1]:
!pip install requests beautifulsoup4 pandas





In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "https://books.toscrape.com/catalogue/page-{}.html"

titles, prices, availabilities, ratings = [], [], [], []
rating_map = {"One": "One","Two": "Two","Three": "Three","Four": "Four","Five": "Five"}

for page in range(1, 51):
    url = base_url.format(page)
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Page {page} not found. Stopping.")
        break

    soup = BeautifulSoup(response.text, 'html.parser')
    books = soup.find_all('article', class_='product_pod')

    for book in books:
        title = book.h3.a['title']
        price = book.find('p', class_='price_color').text.strip()
        availability = book.find('p', class_='instock availability').text.strip()
        star_class = book.find('p')['class']
        star_rating = [r for r in star_class if r in rating_map]
        star_rating = star_rating[0] if star_rating else "No Rating"

        titles.append(title)
        prices.append(price)
        availabilities.append(availability)
        ratings.append(star_rating)

df = pd.DataFrame({
    'Title': titles,
    'Price': prices,
    'Availability': availabilities,
    'Star Rating': ratings
})

df.to_csv('books.csv', index=False, encoding='utf-8')
df.head()


Unnamed: 0,Title,Price,Availability,Star Rating
0,A Light in the Attic,Â£51.77,In stock,Three
1,Tipping the Velvet,Â£53.74,In stock,One
2,Soumission,Â£50.10,In stock,One
3,Sharp Objects,Â£47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,Â£54.23,In stock,Five


In [3]:
from google.colab import files
files.download('books.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# --- Install Google Chrome and matching Chromedriver ---
!apt-get update -q
!apt-get install -y google-chrome-stable
!apt-get install -y chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin/chromedriver || true


Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:5 https://cli.github.com/packages stable InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:7 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists...
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lis

In [15]:
# Q2: IMDb Top 250 Movies Scraper
import requests
import re
import json
import pandas as pd

url = "https://www.imdb.com/chart/top/"
headers = {"Accept-Language": "en-US,en;q=0.5", "User-Agent":"Mozilla/5.0"}
r = requests.get(url, headers=headers)

# Extract the embedded JSON
pattern = re.compile(r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>')
match = pattern.search(r.text)
data_json = json.loads(match.group(1))

# Get movie details
items = data_json["props"]["pageProps"]["pageData"]["chartTitles"]["edges"]

data = []
rank = 1
for item in items:
    node = item["node"]
    title = node["titleText"]["text"]
    year = node.get("releaseYear", {}).get("year", "")
    rating = node.get("ratingsSummary", {}).get("aggregateRating", "")
    data.append([rank, title, year, rating])
    rank += 1

df_imdb = pd.DataFrame(data, columns=["Rank", "Title", "Year", "IMDB Rating"])
df_imdb.to_csv("imdb_top250.csv", index=False)
df_imdb.head(10)


Unnamed: 0,Rank,Title,Year,IMDB Rating
0,1,The Shawshank Redemption,1994,9.3
1,2,The Godfather,1972,9.2
2,3,The Dark Knight,2008,9.1
3,4,The Godfather Part II,1974,9.0
4,5,12 Angry Men,1957,9.0
5,6,The Lord of the Rings: The Return of the King,2003,9.0
6,7,Schindler's List,1993,9.0
7,8,The Lord of the Rings: The Fellowship of the Ring,2001,8.9
8,9,Pulp Fiction,1994,8.8
9,10,"The Good, the Bad and the Ugly",1966,8.8


In [18]:
import requests
import pandas as pd

# Example cities with coordinates
cities = {
    "Delhi": (28.6139, 77.2090),
    "London": (51.5074, -0.1278),
    "New York": (40.7128, -74.0060),
    "Tokyo": (35.6895, 139.6917),
    "Sydney": (-33.8688, 151.2093)
}

weather_data = []
for city, (lat, lon) in cities.items():
    url = f"https://api.open-meteo.com/v1/forecast?latitude={lat}&longitude={lon}&current_weather=true"
    r = requests.get(url)
    data = r.json()["current_weather"]
    temp = data["temperature"]
    windspeed = data["windspeed"]
    weather_data.append([city, temp, windspeed])

df_weather = pd.DataFrame(weather_data, columns=["City", "Temperature (°C)", "Wind Speed (km/h)"])
df_weather.to_csv("weather.csv", index=False)
df_weather.head()


Unnamed: 0,City,Temperature (°C),Wind Speed (km/h)
0,Delhi,29.5,2.9
1,London,17.7,15.8
2,New York,27.5,10.2
3,Tokyo,24.7,3.7
4,Sydney,9.0,3.4
