# Student ID: 24216779, API: BlueMotiveCars

## Data Collection

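Let us start by importing the required libraries: `requests` to download the pages, `BeautifulSoup` to parse the HTML, `csv` to write the output file, and `re` for pattern matching.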

In [12]:
import requests
from bs4 import BeautifulSoup
import csv
import re

Helper Function to Get Total Pages

In [13]:
def get_total_pages_from_page(url, timeout=10):
    """
    Return the total number of result pages advertised by the page at `url`.

    The first <h2> element is checked for text such as "Page 1 of 20".
    If that fails, the first <nav> element is scanned for <a>/<span> tags
    whose text is purely numeric, and the largest such number is returned.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole scrape.

    Returns:
        The page count as an int, or None if the page could not be fetched
        (network error or non-200 status) or no page count could be found.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as bad status codes instead of
        # letting the exception abort the caller.
        print(f"Error fetching {url}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Error fetching {url}: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Preferred source: header text of the form "Page <number> of <total>".
    h2 = soup.find('h2')
    if h2 is not None:
        match = re.search(r'Page\s+\d+\s+of\s+(\d+)', h2.get_text())
        if match:
            return int(match.group(1))

    # Fallback: the highest numeric label in the navigation bar.
    nav = soup.find('nav')
    if nav is not None:
        labels = (tag.get_text(strip=True) for tag in nav.find_all(['a', 'span']))
        page_numbers = [int(label) for label in labels if label.isdigit()]
        if page_numbers:
            return max(page_numbers)

    # Neither the header nor the navigation bar revealed a page count.
    return None


Function to Extract Data for a Single Brand

In [14]:
def extract_brand_data(brand, first_page_url, timeout=10):
    """
    Extract all car records for a given brand.

    The total page count is read from the first page, then every page is
    downloaded and each <li> that contains car data is turned into a dict.

    Args:
        brand: Brand name stored under the 'Brand' key of every record.
        first_page_url: URL of the brand's first page. The 'page<digits>.html'
            part is rewritten with a two-digit page number for each page.
        timeout: Seconds to wait for each page request before skipping it.

    Returns:
        A list of dicts, one per car. Each has 'Brand', 'Make_Model' (when the
        make-model span is present) and one key per two-cell row of the car
        table. Returns an empty list if the page count cannot be determined.
        Pages that fail to download are reported and skipped.
    """
    print(f"\nStarting extraction for {brand}...")
    total_pages = get_total_pages_from_page(first_page_url)
    if total_pages is None:
        print(f"Could not determine total pages for {brand} using {first_page_url}")
        return []
    print(f"Found {total_pages} pages for {brand}")

    # Build each page URL by substituting the page number directly. Running
    # str.format over the whole URL would fail if it contained '{' or '}'.
    page_pattern = re.compile(r'page\d+\.html')
    if page_pattern.search(first_page_url):
        page_urls = [
            page_pattern.sub(f'page{page:02d}.html', first_page_url)
            for page in range(1, total_pages + 1)
        ]
    else:
        # Without a page number to rewrite, every iteration would fetch the
        # same URL and duplicate its records, so fetch it once only.
        print(f"No page number found in {first_page_url}; processing it as a single page")
        page_urls = [first_page_url]

    records = []
    for url in page_urls:
        print(f"Processing {url}")
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Error processing {url}: {exc}")
            continue
        if response.status_code != 200:
            print(f"Error processing {url}: {response.status_code}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')

        for li in soup.find_all('li'):
            make_model_tag = li.find('span', class_='make-model')
            table = li.find('table', class_='car')
            # Skip list items with no car data (e.g. navigation entries) so
            # they do not become rows containing only the brand.
            if make_model_tag is None and table is None:
                continue

            record = {'Brand': brand}
            if make_model_tag is not None:
                record['Make_Model'] = make_model_tag.get_text(strip=True)

            # The remaining details are label/value rows of the "car" table.
            if table is not None:
                for tr in table.find_all('tr'):
                    cells = tr.find_all('td')
                    if len(cells) == 2:
                        # Drop the ":" that follows the label text.
                        field = cells[0].get_text(strip=True).replace(":", "")
                        record[field] = cells[1].get_text(strip=True)
            records.append(record)
    return records

Function to Extract Data for All Brands

In [15]:
def extract_all_data(brands_first_page):
    """
    Collect the car sale records of every brand into one flat list.

    `brands_first_page` maps each brand name to the URL of its first page.
    Brands are processed in the mapping's iteration order and their records
    are concatenated in that same order.
    """
    return [
        car
        for name, start_url in brands_first_page.items()
        for car in extract_brand_data(name, start_url)
    ]

# You can test this with a small dictionary of brands.

Function to Write Data to a CSV File

In [16]:
def write_to_csv(records, output_file):
    """
    Write all the collected car records into a CSV file.

    The header is the union of the keys found in the records. Columns named in
    the preferred ordering come first; any other fields follow in the order in
    which they were first encountered, so the column layout is identical on
    every run. A record that lacks a column gets an empty cell for it.

    Args:
        records: List of dicts, one per car.
        output_file: Path of the CSV file to create (overwritten if present).

    Returns:
        None. If `records` is empty a message is printed and no file is written.
    """
    if not records:
        print("No records found to write.")
        return

    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # order the extra columns differently from run to run (string hashing is
    # randomised per process).
    seen_fields = dict.fromkeys(key for rec in records for key in rec)

    # Define a preferred ordering for our columns.
    ordering = ['Brand', 'Make_Model', 'Date of Sale', 'Sale Price', 'Year', 'Mileage',
                'Classification', 'Transmission', 'Fuel Type', 'Description', 'Sale Location']
    preferred = set(ordering)
    # Preferred columns that actually occur, then any extra fields found.
    fieldnames_ordered = [f for f in ordering if f in seen_fields]
    fieldnames_ordered += [f for f in seen_fields if f not in preferred]

    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames_ordered)
        writer.writeheader()
        writer.writerows(records)
    print(f"\nData has been saved to {output_file}")

Main Execution Block

In [17]:
if __name__ == '__main__':
    # Destination of the combined CSV export.
    output_file = "car_sales_data.csv"

    # Page 1 of each brand's paginated listing, keyed by brand name.
    brands_first_page = {
        "Audi": "http://mlg.ucd.ie/modules/python/assignment1/cars/Audi-page01.html",
        "BMW": "http://mlg.ucd.ie/modules/python/assignment1/cars/BMW-page01.html",
        "Mercedes-Benz": "http://mlg.ucd.ie/modules/python/assignment1/cars/Mercedes-Benz-page01.html",
        "Volkswagen": "http://mlg.ucd.ie/modules/python/assignment1/cars/Volkswagen-page01.html",
    }

    # Scrape every brand, then save the merged records to disk.
    all_records = extract_all_data(brands_first_page)
    write_to_csv(all_records, output_file)



Starting extraction for Audi...
Found 20 pages for Audi
Processing http://mlg.ucd.ie/modules/python/assignment1/cars/Audi-page01.html
Processing http://mlg.ucd.ie/modules/python/assignment1/cars/Audi-page02.html
Processing http://mlg.ucd.ie/modules/python/assignment1/cars/Audi-page03.html
Processing http://mlg.ucd.ie/modules/python/assignment1/cars/Audi-page04.html
Processing http://mlg.ucd.ie/modules/python/assignment1/cars/Audi-page05.html
Processing http://mlg.ucd.ie/modules/python/assignment1/cars/Audi-page06.html
Processing http://mlg.ucd.ie/modules/python/assignment1/cars/Audi-page07.html
Processing http://mlg.ucd.ie/modules/python/assignment1/cars/Audi-page08.html
Processing http://mlg.ucd.ie/modules/python/assignment1/cars/Audi-page09.html
Processing http://mlg.ucd.ie/modules/python/assignment1/cars/Audi-page10.html
Processing http://mlg.ucd.ie/modules/python/assignment1/cars/Audi-page11.html
Processing http://mlg.ucd.ie/modules/python/assignment1/cars/Audi-page12.html
Process