In [1]:
# Libraries to be used
import re
import time
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from requests.exceptions import Timeout
from IPython.display import clear_output

In [2]:
# Send a request to the website. The explicit timeout keeps this cell from
# hanging forever if the server accepts the connection but never answers
# (requests applies no timeout by default).
html = requests.get("https://www.ccarprice.com/au/", timeout=10)

# Checking the status of the website
if html.status_code == 200:
    print("Connection established successfully!")
else:
    print(f"Connection failed with status code {html.status_code}")
    

Connection established successfully!


In [3]:
# Parse the downloaded HTML with Python's built-in 'html.parser' so the
# page can be scraped through the BeautifulSoup API
soup = BeautifulSoup(html.content, 'html.parser')

In [4]:
class Car_Brand_Scraper:
    """Pulls the per-brand page links out of an already parsed landing page.

    The instance only stores the parsed document it is given; every lookup is
    performed on demand by the methods below.
    """

    def __init__(self, soup):
        # Parsed document that the brand menu is looked up in.
        self.soup = soup

    def extract_brand_links(self):
        """Return the non-empty ``href`` values of the brand anchors.

        The anchors are searched for under
        ``div.vertical-menu > label > div.show1 > a.brnd``. If any level of
        that chain is missing, an error message is printed and an empty list
        is returned.
        """
        try:
            menu = self.soup.find("div", class_="vertical-menu")
            brand_box = menu.find("label").find("div", class_="show1")
            anchors = brand_box.find_all("a", class_="brnd")

            collected = []
            for anchor in anchors:
                href = anchor.get("href")
                if href:
                    collected.append(href)
            return collected
        except AttributeError:
            # A lookup returned None, so the next call in the chain failed.
            print("Error: The website structure may have changed or the expected elements were not found.")
            return []

    def display_links(self):
        """Print how many brand links were found, followed by a numbered list."""
        links = self.extract_brand_links()
        if not links:
            print("No brand links found.")
            return

        print(f'The website "https://www.ccarprice.com/au/" has data of {len(links)} car brands.')
        print('\nAll Brands Links:')
        for position, url in enumerate(links, start=1):
            print(f'{position}. {url}')

# Instantiate the Car_Brand_Scraper class with the parsed landing page
scraper = Car_Brand_Scraper(soup)

# Keep the extracted brand links in `brand_links` so later cells can use them
brand_links = scraper.extract_brand_links()

# Call the display_links method (it extracts the links again and prints them)
scraper.display_links()


The website "https://www.ccarprice.com/au/" has data of 88 car brands.

All Brands Links:
1. https://www.ccarprice.com/au/honda-car-prices-in-Australia-1
2. https://www.ccarprice.com/au/bmw-car-prices-in-Australia-2
3. https://www.ccarprice.com/au/lexus-car-prices-in-Australia-3
4. https://www.ccarprice.com/au/toyota-car-prices-in-Australia-4
5. https://www.ccarprice.com/au/nissan-car-prices-in-Australia-5
6. https://www.ccarprice.com/au/audi-car-prices-in-Australia-6
7. https://www.ccarprice.com/au/kia-car-prices-in-Australia-7
8. https://www.ccarprice.com/au/mitsubishi-car-prices-in-Australia-8
9. https://www.ccarprice.com/au/hyundai-car-prices-in-Australia-9
10. https://www.ccarprice.com/au/chevrolet-car-prices-in-Australia-10
11. https://www.ccarprice.com/au/ford-car-prices-in-Australia-102
12. https://www.ccarprice.com/au/mercedes-car-prices-in-Australia-103
13. https://www.ccarprice.com/au/porsche-car-prices-in-Australia-104
14. https://www.ccarprice.com/au/suzuki-car-prices-in-A

In [5]:
# Empty dictionary to store brand -> list-of-car-links pairs
# (the scraping-loop cell re-creates this dictionary before filling it)
car_links = {}

def format_brand_name(brand_link):
    """Turn a brand page link into a display name.

    The brand slug is the text between ``https://www.ccarprice.com/au/`` and
    the last ``-car`` in the link; each hyphen-separated word of the slug is
    capitalised and the words are joined with spaces
    (``.../au/great-wall-car-prices-...`` -> ``"Great Wall"``).

    Parameters
    ----------
    brand_link : str
        Brand page URL (converted with ``str()`` before matching).

    Returns
    -------
    str
        The formatted brand name.

    Raises
    ------
    ValueError
        If the link does not follow the expected pattern. Previously this
        surfaced as an opaque ``AttributeError`` on ``None``.
    """
    # Same pattern as before, minus the unnecessary escapes of "/" and "-".
    match = re.search(r'https://www\.ccarprice\.com/au/(.+)-car', str(brand_link))
    if match is None:
        raise ValueError(f"Cannot extract a brand name from link: {brand_link!r}")

    # Split by hyphen, capitalize each word, and join back
    return ' '.join(word.capitalize() for word in match.group(1).split('-'))

def fetch_car_links_by_brand(brand_link, formatted_brand_name, timeout=10):
    """Fetch all car links for a specific brand from the given brand link.

    Parameters
    ----------
    brand_link : str
        URL of the brand page to download.
    formatted_brand_name : str
        Key used for the returned dictionary.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    dict
        ``{formatted_brand_name: [car_link, ...]}``. The list is empty when
        the page cannot be downloaded or does not have the expected layout;
        in both cases a message is printed instead of raising, so one bad
        brand page does not abort a loop over all brands.
    """
    # Request the HTML content (with a timeout so a stalled server cannot hang the run)
    try:
        html1 = requests.get(brand_link, timeout=timeout)
    except requests.exceptions.RequestException as error:
        print(f"Error: could not fetch {brand_link}: {error}")
        return {formatted_brand_name: []}

    soup1 = BeautifulSoup(html1.content, 'html.parser')

    # Find the relevant div containing all the car listings: the listings are
    # read from the LAST "price-cover" box inside div#page.
    try:
        price_covers = soup1.body.find("div", {"id": "page"}).find_all("div", {"id": "pbox", "class": "price-cover"})
        cars_by_brand = price_covers[-1].find_all("div", {"id": "pbox", "class": "listing"})
    except (AttributeError, IndexError):
        # AttributeError: a lookup returned None; IndexError: no price-cover box at all.
        print(f"Error: The expected listing structure was not found on {brand_link}.")
        return {formatted_brand_name: []}

    # Initialize a list to hold all car links
    link_of_cars_by_brand = []

    # Iterate over each car listing and extract the href
    for car in cars_by_brand:
        # Skip listings without an anchor
        if car.a is None:
            continue
        # Skip cars that are labeled as "Coming soon"
        if "Coming soon" in car.getText().strip():
            continue
        car_link = car.a.get("href")
        # An anchor without an href cannot be requested later, so it is dropped
        if car_link:
            link_of_cars_by_brand.append(car_link)

    # Return the result as a dictionary
    return {formatted_brand_name: link_of_cars_by_brand}

In [6]:
# Brand name -> list of car page links, filled one brand page at a time
car_links = {}

for each_brand_link in brand_links:
    # Derive the display name from the link, then merge that brand's
    # {name: links} dictionary into the overall mapping
    car_links.update(
        fetch_car_links_by_brand(each_brand_link, format_brand_name(each_brand_link))
    )

# Display the collected mapping
car_links

{'Honda': ['https://www.ccarprice.com/au/honda-cr-v-ex-awd-2025-price-in-australia-23534',
  'https://www.ccarprice.com/au/honda-cr-v-ex-2wd-2025-price-in-australia-23532',
  'https://www.ccarprice.com/au/honda-elevate-signature-black-edition-2025-price-in-australia-23518',
  'https://www.ccarprice.com/au/honda-elevate-black-edition-2025-price-in-australia-23517',
  'https://www.ccarprice.com/au/honda-pilot-trailsport-2026-price-in-australia-23513',
  'https://www.ccarprice.com/au/honda-cr-v-hybrid-sport-touring-awd-2025-price-in-australia-23509',
  'https://www.ccarprice.com/au/honda-civic-sedan-hybrid-sport-cvt-2025-price-in-australia-23492',
  'https://www.ccarprice.com/au/honda-cr-v-lx-awd-2025-price-in-australia-23483',
  'https://www.ccarprice.com/au/honda-cr-v-lx-2wd-2025-price-in-australia-23482',
  'https://www.ccarprice.com/au/honda-accord-sport-l-sedan-2025-price-in-australia-23459',
  'https://www.ccarprice.com/au/honda-accord-touring-sedan-2025-price-in-australia-23444',
 

In [7]:
print('The website "https://www.ccarprice.com/au/" has the following data:\n')

# One entry per brand: its name and how many car links were collected for it
car_data = {
    "Brand": [brand for brand in car_links],
    "Number of Cars": [len(car_links[brand]) for brand in car_links],
}

# Build the summary table
car_df = pd.DataFrame(car_data)

# Append a final "Total" row holding the sum over all brands
total_cars_details = car_df["Number of Cars"].sum()
car_df.loc[len(car_df)] = ["Total", total_cars_details]

# Display the summary table
car_df

The website "https://www.ccarprice.com/au/" has the following data:



Unnamed: 0,Brand,Number of Cars
0,Honda,60
1,Bmw,60
2,Lexus,60
3,Toyota,60
4,Nissan,60
...,...,...
84,Datsun,20
85,Lucid,33
86,Scout,3
87,Xiaomi,6


In [8]:
# Show the brand -> car-links mapping again
car_links

{'Honda': ['https://www.ccarprice.com/au/honda-cr-v-ex-awd-2025-price-in-australia-23534',
  'https://www.ccarprice.com/au/honda-cr-v-ex-2wd-2025-price-in-australia-23532',
  'https://www.ccarprice.com/au/honda-elevate-signature-black-edition-2025-price-in-australia-23518',
  'https://www.ccarprice.com/au/honda-elevate-black-edition-2025-price-in-australia-23517',
  'https://www.ccarprice.com/au/honda-pilot-trailsport-2026-price-in-australia-23513',
  'https://www.ccarprice.com/au/honda-cr-v-hybrid-sport-touring-awd-2025-price-in-australia-23509',
  'https://www.ccarprice.com/au/honda-civic-sedan-hybrid-sport-cvt-2025-price-in-australia-23492',
  'https://www.ccarprice.com/au/honda-cr-v-lx-awd-2025-price-in-australia-23483',
  'https://www.ccarprice.com/au/honda-cr-v-lx-2wd-2025-price-in-australia-23482',
  'https://www.ccarprice.com/au/honda-accord-sport-l-sedan-2025-price-in-australia-23459',
  'https://www.ccarprice.com/au/honda-accord-touring-sedan-2025-price-in-australia-23444',
 

In [9]:
# Scrape every car page and collect one record (a dict) per car. The DataFrame
# is built once at the end instead of being re-concatenated on every iteration.

REQUEST_TIMEOUT_SECONDS = 10   # give up on a page that does not answer in time
REQUEST_DELAY_SECONDS = 0.1    # short pause between consecutive requests


def _extract_prices(car_soup):
    """Return (price_in_aud, price_in_usd) as digit strings; 'N/A' for any missing price."""
    price_details = car_soup.select_one('div.detail-price#pbox')
    if price_details is None:
        # Always return values so a car never inherits the previous car's prices.
        print("Price details not found.")
        return 'N/A', 'N/A'

    price_text = price_details.get_text(strip=True)

    # Extract prices using regex
    aud_match = re.search(r'Price in AUD:\s*([\d,]+)', price_text)
    usd_match = re.search(r'Price in USD:\s*\$([\d,]+)', price_text)
    price_in_aud = aud_match.group(1).replace(',', '') if aud_match else 'N/A'
    price_in_usd = usd_match.group(1).replace(',', '') if usd_match else 'N/A'
    return price_in_aud, price_in_usd


def _extract_year(car_soup):
    """Return the last 4-digit run of the first detail cell, or NaN when there is none."""
    year_cell = car_soup.select_one('div#pbox.detail-cover div.tr div.td2')
    if year_cell is None:
        return np.nan
    # Raw string: '\d' in a plain string literal is an invalid escape sequence.
    year_match = re.search(r'.*(\d{4}).*', year_cell.text.strip())
    return year_match.group(1) if year_match else np.nan


def _extract_features(car_soup):
    """Return {feature_name: feature_value} for every named row of the spec table."""
    spec_block = car_soup.select_one('html body div#page.main div div#spec div#pbox.detail-cover')
    if spec_block is None:
        return {}

    features = {}
    for row in spec_block.find_all("div", {"class": "tr"}):
        cells = row.find_all("div")
        # A usable row needs a name cell and a value cell.
        if len(cells) < 2:
            continue
        feature_name = cells[0].text.strip()
        if not feature_name:
            continue
        features[feature_name.lower().replace(" ", '_')] = cells[1].text.strip()
    return features


car_records = []        # one dict per successfully downloaded car page
failed_car_links = []   # links whose request raised, kept so they can be retried
total_car_pages = sum(len(links) for links in car_links.values())

for each_brand, list_of_cars_links in car_links.items():
    for link_of_each_car in list_of_cars_links:
        try:
            html2 = requests.get(link_of_each_car, timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.exceptions.RequestException:
            # Only request failures are skipped; KeyboardInterrupt and
            # programming errors are no longer swallowed.
            failed_car_links.append(link_of_each_car)
            continue

        soup2 = BeautifulSoup(html2.content, 'html.parser')

        price_in_aud, price_in_usd = _extract_prices(soup2)
        image_tag = soup2.find("img", {"itemprop": "image"})

        car_details_grouped_by_brand = {
            'brand': each_brand,
            'year': _extract_year(soup2),
            'price_in_aud': price_in_aud,
            'price_in_usd': price_in_usd,
            'image_url': image_tag.get("src", np.nan) if image_tag is not None else np.nan,
            'car_link': link_of_each_car,
        }
        # Spec-table rows are added last, so a row with the same name as one
        # of the fields above overrides it (same precedence as before).
        car_details_grouped_by_brand.update(_extract_features(soup2))
        car_records.append(car_details_grouped_by_brand)

        time.sleep(REQUEST_DELAY_SECONDS)

        clear_output(wait=True)
        print(f"Number of scraped car data: {len(car_records)}/{total_car_pages}")

# Number of scraped rows; the resulting frame is indexed 0 .. index_number - 1
index_number = len(car_records)

# Columns are the union of all record keys; features a car lacks become NaN
all_car_details = pd.DataFrame(car_records)

if failed_car_links:
    print(f"Skipped {len(failed_car_links)} car pages because the request failed.")



Number of scraped car data: 3844/4160


In [10]:
# (rows, columns) of the scraped table
all_car_details.shape

(3843, 102)

In [11]:
# Preview the scraped table
all_car_details

Unnamed: 0,brand,year,price_in_aud,price_in_usd,image_url,car_link,model_number,made_in,warranty,available_colors,...,dvd_player,radio,smart_access_card_entry,leather_seats,voice_control,removable/convertible_top,shock_absorbers_type,assembled_in,cassette_player,introduction_date
0,Honda,2025,54499,33850,https://www.ccarprice.com/products/Honda_CR-V_...,https://www.ccarprice.com/au/honda-cr-v-ex-awd...,CR-V EX AWD 2025,japan,"Basic 3 Years / 36,000 Miles Corrosion 5 Years...","Crystal Black Pearl, Lunar Silver Metallic, Mo...",...,,,,,,,,,,
1,Honda,2025,52084,32350,https://www.ccarprice.com/products/Honda_CR-V_...,https://www.ccarprice.com/au/honda-cr-v-ex-2wd...,CR-V EX 2WD 2025,japan,"Basic 3 Years / 36,000 Miles Corrosion 5 Years...","Crystal Black Pearl, Lunar Silver Metallic, Mo...",...,,,,,,,,,,
2,Honda,2025,30429,18900,https://www.ccarprice.com/products/Honda_Eleva...,https://www.ccarprice.com/au/honda-elevate-sig...,Elevate Signature Black Edition 2025,,3 Years / Unlimited kilometers,crystal black,...,,,,,,,,,,
3,Honda,2025,28819,17900,https://www.ccarprice.com/products/Honda_Eleva...,https://www.ccarprice.com/au/honda-elevate-bla...,Elevate Black Edition 2025,,3 Years / Unlimited kilometers,crystal black,...,,,,,,,,,,
4,Honda,2026,79534,49400,https://www.ccarprice.com/products/Honda_Pilot...,https://www.ccarprice.com/au/honda-pilot-trail...,2026 Honda Pilot TrailSport,japan,"3-year / 36,000 miles","Lunar Silver Metallic, Crystal Black Pearl, Mo...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3838,Xiaomi,2025,183862,114200,https://www.ccarprice.com/products/2025_Xiaomi...,https://www.ccarprice.com/au/xiaomi-su7-ultra-...,SU7 Ultra 2025,China,,"Galaxy Gray, Yellow, Twilight Red, Mist Purple...",...,Yes,,,,,,,,,
3839,Xiaomi,2025,49749,30900,https://www.ccarprice.com/products/2025_Xiaomi...,https://www.ccarprice.com/au/xiaomi-mx11-coupe...,MX11 Coupe SUV 2025,Beijing,,,...,Yes,,,,,,,,,
3840,Xiaomi,2025,48300,30000,https://www.ccarprice.com/products/Xiaomi_SU8_...,https://www.ccarprice.com/au/xiaomi-su8-midsiz...,SU8 Midsize SUV 2025,China,,,...,Yes,,,,,,,,,
3841,Xiaomi,2024,65769,40850,https://www.ccarprice.com/products/Xiaomi_SU7_...,https://www.ccarprice.com/au/xiaomi-su7-max-aw...,SU7 Max AWD 2024,Beijing,,,...,Yes,,,,,,,,,


In [12]:
# Saving the initial dataframe to 'Car_Prices.csv'
# (index=False leaves the row index out of the file)
all_car_details.to_csv('Car_Prices.csv', index=False)

In [43]:
# Select the rows of all_car_details whose year is missing (NaN)
df=all_car_details[all_car_details.year.isna()]

In [39]:
# Rows of `all_car_details` whose 'year' is missing (NaN); these are the
# rows a second scraping pass tries to fill in
df_missing_year = all_car_details[all_car_details['year'].isna()]

# Function to scrape the car year from a given URL
def scrape_year_from_url(url, timeout=10):
    """Return the first year-like number found in the page's first ``span > strong``.

    Parameters
    ----------
    url : str
        Car page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10). Without
        it a stalled connection would block the whole back-fill loop.

    Returns
    -------
    str or None
        The first standalone 4-digit number starting with '2' in the element's
        text, or None when the response status is not 200, the element is
        missing, no such number is present, or any exception occurs (the
        exception is printed, not raised).
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url, timeout=timeout)

        # Give up if the page could not be fetched
        if response.status_code != 200:
            return None

        # Parse the page content and locate the element holding the text
        page_soup = BeautifulSoup(response.content, 'html.parser')
        element = page_soup.select_one('span > strong')
        if element is None:
            return None

        # Use regex to extract a year starting with '2'
        year_match = re.search(r'\b2\d{3}\b', element.get_text())
        return year_match.group(0) if year_match else None
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller move on
        print(f"Error while scraping {url}: {e}")
        return None

# Walk over the links of the rows with a missing year and try to scrape it
for index, url in df_missing_year['car_link'].items():
    scraped_year = scrape_year_from_url(url)

    # Write the year back into the full table only when one was found
    if scraped_year:
        all_car_details.loc[index, 'year'] = scraped_year

# Now `all_car_details` should be updated with the missing years


In [40]:
# Rows whose 'year' is still missing (NaN)
all_car_details[all_car_details['year'].isna()]

Unnamed: 0,brand,year,price_in_aud,price_in_usd,image_url,car_link,model_number,made_in,warranty,available_colors,...,halogen_headlamps,tyre_pressure_monitor,charger,tyre_type,cd_player,dvd_player,radio,smart_access_card_entry,leather_seats,voice_control


In [47]:
# dtype of the 'year' column
all_car_details.year.dtype

dtype('O')

In [None]:
# Save all_car_details to 'Car_Prices.csv' again; this writes to the same
# path as the earlier save (index=False leaves the row index out of the file)
all_car_details.to_csv('Car_Prices.csv', index=False)