
# Part 1: Setup and Initialization
# Team A Responsibilities:
# 1. Import all necessary libraries
# 2. Set up project structure in Jupyter notebook
# 3. Create a clean HTTP request-response framework


In [7]:
# --- Imports ---
import sys
import requests
import pandas as pd
import bs4
from bs4 import BeautifulSoup
import urllib.robotparser as robotparser

# --- Report library and interpreter versions so a run can be reproduced ---
# bs4 is queried defensively: fall back to 'N/A' if it exposes no __version__.
bs4_version = getattr(bs4, "__version__", "N/A")
print(
    " Package Versions:",
    f"requests: {requests.__version__}",
    f"beautifulsoup4 (bs4): {bs4_version}",
    f"pandas: {pd.__version__}",
    f"Python: {sys.version}",
    sep="\n",
)

 Package Versions:
requests: 2.32.4
beautifulsoup4 (bs4): 4.13.5
pandas: 2.2.2
Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]


In [8]:
# Step 1: Check robots.txt to ensure scraping compliance
# -------------------------------------------------------------
# Downloads the site's robots.txt and asks whether a generic crawler ("*")
# may request the listing page. Leaves `target_url` and `can_fetch` defined
# in the notebook namespace whether or not the download succeeds.
print("\n Checking robots.txt...")

robots_url = "https://www.cars24.com/robots.txt"
target_url = "https://www.cars24.com/buy-used-hyundai-cars-mumbai/"

# Conservative default: treat the page as off-limits until robots.txt has
# been consulted and says otherwise.
can_fetch = False

rp = robotparser.RobotFileParser()
rp.set_url(robots_url)
try:
    rp.read()
except Exception as e:
    print(" Unable to read robots.txt:", e)
else:
    can_fetch = rp.can_fetch("*", target_url)
    verdict = f"Allowed to scrape {target_url}? {'Yes' if can_fetch else 'No'}"
    # read() does not raise on HTTP error responses; it silently falls back to
    # a blanket allow/deny. In CPython, mtime() stays 0 unless a robots.txt
    # body was actually parsed, so it separates "rules were read" from
    # "fallback applied" — NOTE(review): confirm on other implementations.
    if rp.mtime():
        print(f"robots.txt found: \n{verdict}")
    else:
        print(f" robots.txt not retrieved (HTTP error); parser fallback applies: \n{verdict}")
    if not can_fetch:
        print(" robots.txt check did not grant access; review the site's terms before requesting this page.")


 Checking robots.txt...
robots.txt found: 
Allowed to scrape https://www.cars24.com/buy-used-hyundai-cars-mumbai/? No


In [9]:
# -------------------------------------------------------------
# 🌐 Step 2: HTTP Connectivity and Page Validation
# -------------------------------------------------------------
# Fetches the listing page once, confirms a successful HTTP status, and checks
# that the body parses as HTML containing at least one <div>. On success,
# `response` and `soup` remain available in the notebook namespace.
#
# NOTE(review): this request is sent regardless of the robots.txt verdict
# computed in Step 1 — confirm scraping is permitted before relying on it.
print("\nTesting HTTP Connection...")

url = "https://www.cars24.com/buy-used-hyundai-cars-mumbai/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

# Set to True only when every check below succeeds, so the closing message
# reflects what actually happened instead of always reporting success.
checks_passed = False

try:
    # Use a session for more reliable connection handling
    with requests.Session() as session:
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Explicitly raise for HTTP errors
        print(f" Connection successful (Status {response.status_code})")

        # Basic structure validation
        soup = BeautifulSoup(response.text, "html.parser")
        # get_text() copes with an empty <title> or one holding nested markup,
        # where `.string` is None and calling `.strip()` on it would raise.
        title_text = (soup.title.get_text(strip=True) if soup.title else "") or "N/A"
        print(f" Page title: {title_text}")

        # Validate expected structure (example selector). An explicit check is
        # used instead of `assert` so it still runs when Python is started
        # with -O, which strips assert statements.
        sample_element = soup.select_one("div")
        if sample_element is None:
            print(" Structure validation failed: Expected content structure missing!")
        else:
            print("Basic page structure looks valid.")
            checks_passed = True
except requests.exceptions.Timeout:
    print(" Connection timed out. Try again later.")
except requests.exceptions.RequestException as e:
    print(f" HTTP request failed: {e}")
except Exception as e:
    print(f" Unexpected error: {e}")

if checks_passed:
    print("\n🟢 Environment and connection checks complete. Proceed to Part 2.")
else:
    print("\n🔴 Environment and connection checks did not pass. Resolve the issues above before Part 2.")


Testing HTTP Connection...
 Connection successful (Status 200)
 Page title: 436 Hyundai Used Cars in Mumbai | Second Hand Hyundai Cars in Mumbai starting from ₹0.89 lakh - CARS24
Basic page structure looks valid.

🟢 Environment and connection checks complete. Proceed to Part 2.
