In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL_PATTERN = "https://books.toscrape.com/catalogue/page-{}.html"

all_books = []

for pg in range(1, 51): 
    page_url = URL_PATTERN.format(pg)
    response = requests.get(page_url)

    if response.status_code != 200:
        print(f"Page {pg} could not be loaded.")
        break

    soup = BeautifulSoup(response.content, "html.parser")

    for book in soup.select("article.product_pod"):
        book_title = book.h3.a["title"]
        book_price = book.select_one("p.price_color").text.strip()
        stock_status = book.select_one("p.instock.availability").get_text(strip=True)
        
        rating_tag = book.select_one("p.star-rating")
        star_text = [c for c in rating_tag["class"] if c != "star-rating"][0]

        all_books.append({
            "Title": book_title,
            "Price": book_price,
            "Availability": stock_status,
            "Star Rating": star_text
        })
df = pd.DataFrame(all_books)
df.to_csv("books.csv", index=False)

print("Scraping finished! Total books:", len(df))
print(df.sample(5)) 


Scraping finished! Total books: 1000
                                                 Title   Price Availability  \
308                   Eligible (The Austen Project #4)  £27.09     In stock   
436  If I Gave You God's Phone Number....: Searchin...  £20.91     In stock   
799              One for the Money (Stephanie Plum #1)  £32.87     In stock   
691                                       'Salem's Lot  £49.56     In stock   
508              The Bourne Identity (Jason Bourne #1)  £42.78     In stock   

    Star Rating  
308       Three  
436         One  
799         Two  
691        Four  
508        Four  


In [6]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time

options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(options=options)
driver.get("https://www.imdb.com/chart/top/")
time.sleep(3)

soup = BeautifulSoup(driver.page_source, "html.parser")
driver.quit()

movies = []
rows = soup.select("li.ipc-metadata-list-summary-item")

for rank, row in enumerate(rows, start=1):
    title_tag = row.select_one("h3")
    title = title_tag.text.replace(str(rank)+".", "").strip()
    year = row.select_one("span.ipc-title__subtext").text.strip("()")
    rating = row.select_one("span.ipc-rating-star--rating").text
    movies.append([rank, title, year, rating])

df_imdb = pd.DataFrame(movies, columns=["Rank", "Title", "Year", "Rating"])
df_imdb.to_csv("imdb_top250.csv", index=False)
df_imdb.head()
df_loaded = pd.read_csv("imdb_top250.csv")

print("Successfully loaded IMDb Top 250 from CSV")
print(df_loaded.head())     # show first 5 rows
print("\nTotal movies scraped:", len(df_loaded))



Successfully loaded IMDb Top 250 from CSV
Empty DataFrame
Columns: [Rank, Title, Year, Rating]
Index: []

Total movies scraped: 0


In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.timeanddate.com/weather/"
res = requests.get(url)
soup = BeautifulSoup(res.text, "html.parser")

cities = []
rows = soup.select("table tbody tr")

for row in rows:
    city_tag = row.find("a")
    if not city_tag:
        continue
    city = city_tag.text
    temp = row.find_all("td")[1].text.strip()
    condition = row.find_all("td")[2].text.strip()
    cities.append([city, temp, condition])

df_weather = pd.DataFrame(cities, columns=["City", "Temperature", "Condition"])
df_weather.to_csv("weather.csv", index=False)
df_weather.head()


Unnamed: 0,City,Temperature,Condition
