<h1>Scraping car data by crawling specific URLs</h1>

In [1]:
# Imports

import csv
import logging

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# To print its version only
import bs4

In [2]:
# Report the library versions this notebook was executed with

print(f'pandas version: {pd.__version__}')
print(f'bs4 version: {bs4.__version__}')
print(f'requests version: {requests.__version__}')
print(f'csv version: {csv.__version__}')

pandas version: 1.4.2
bs4 version: 4.11.1
requests version: 2.27.1
csv version: 1.0


In [3]:
# Download a web page and parse it into a BeautifulSoup tree
def get_souped_page(url, timeout=30):
    """Fetch ``url`` and return its content parsed with ``html.parser``.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout, requests
        waits indefinitely on a server that stops responding, which would
        stall a crawl that loops over many pages.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document built from the response body.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` (connection errors, timeouts, ...).
    """
    page = requests.get(url, timeout=timeout)
    return bs(page.content, 'html.parser')


In [4]:
# Build the listing URL for a given results page
def get_page_url(page):
    """Return the carvago.com listing URL for results page ``page``.

    ``page`` is coerced with ``int()``, so numeric strings are accepted and
    floats are truncated.
    """
    page_number = int(page)
    return 'https://carvago.com/cars?page=' + str(page_number)


In [5]:
# Scrape the number of pages that we have to loop over
def get_nb_pages(url, item_index=5):
    """Read a page number from the pagination bar of the listing at ``url``.

    Parameters
    ----------
    url : str
        Listing page whose pagination bar is inspected.
    item_index : int, optional
        Position of the ``li.Pagination-item`` element holding the wanted
        page number. The default (5) is the position this notebook has
        always used.
        NOTE(review): that position depends on the page's layout -- confirm
        it still points at the last page.

    Returns
    -------
    int
        The text of the selected pagination item converted with ``int()``.

    Raises
    ------
    IndexError
        If the pagination container is missing or has no item at
        ``item_index``.
    ValueError
        If the selected item's text is not an integer.
    """
    # Reuse the shared fetch/parse helper instead of duplicating it here.
    soup = get_souped_page(url)

    container = soup.find('ul', class_='Pagination-container')
    if container is None:
        raise IndexError(
            'no <ul class="Pagination-container"> found at {}'.format(url))

    items = container.find_all('li', class_='Pagination-item')
    if not -len(items) <= item_index < len(items):
        raise IndexError(
            'pagination at {} has {} item(s); cannot read item {}'.format(
                url, len(items), item_index))

    return int(items[item_index].text)


In [6]:
# Scrape the URLs of the car articles listed on each results page
def scrape_urls(nb_pages):
    """Collect the absolute URLs of the car articles on pages 1..``nb_pages``.

    Parameters
    ----------
    nb_pages : int
        Number of listing pages to visit, starting from page 1.

    Returns
    -------
    list of str
        ``'https://carvago.com'`` + each article's ``href``, in page order.

    Notes
    -----
    Pages are requested as 1..nb_pages. Iterating ``range(nb_pages)`` would
    request page 0 and never reach the last page.
    A page without the expected listing container and an article without a
    link are skipped instead of aborting the crawl.
    """
    links = []
    for page in range(1, nb_pages + 1):
        soup = get_souped_page(get_page_url(page))

        container = soup.find('div', class_='css-1f3egm3 e1qmtrzl0')
        if container is None:
            # Layout changed or the page failed to load: nothing to collect.
            continue

        for article in container.find_all('div', class_='css-j0dexk e1oahio80'):
            try:
                href = article.a['href']
            except (TypeError, KeyError):
                # TypeError: the article has no <a>; KeyError: <a> has no href.
                continue
            links.append('https://carvago.com' + href)

    return links


In [7]:
# Scrape the details of every car page and save them to a CSV file

# CSS classes of the rows / value cells inside a car page's detail sections.
_DETAIL_ROW_CLASS = 'sc-gsnTZi gPmveL css-1su3ehq e18uvu5d8'
_DETAIL_VALUE_CLASS = 'sc-dkzDqf edLfRl css-1b7garx e18uvu5d4'

# Column order of each record and of the CSV header.
_CAR_COLUMNS = [
    'make', 'model', 'price', 'body_color', 'interior_color',
    'interior_material', 'body', 'doors', 'seats', 'vin', 'fuel',
    'transmission', 'drive_type', 'power', 'engine_capacity', 'co2_emission',
    'emission_class', 'driven_distance', 'first_registration', 'condition',
]


def _row_text(row, css_class=_DETAIL_VALUE_CLASS):
    """Return the text of the first ``div`` with ``css_class`` inside ``row``."""
    return row.find('div', class_=css_class).text


def _parse_car_page(soup):
    """Build one record tuple (ordered as ``_CAR_COLUMNS``) from a car page.

    Raises ``AttributeError`` or ``IndexError`` when an expected element is
    missing, because fields are located by fixed CSS classes and positions.
    """
    price = soup.find('div', class_='css-1mbrvie e1hgzarh2').text

    # The three detail sections: vehicle, steering/engine, condition.
    sections = soup.find('div', class_='css-hl133e e16182je4')
    sections = sections.find_all('div', class_='css-l5ry6c ebz328c2')

    # --- vehicle details ---
    rows = sections[0].find_all('div', class_=_DETAIL_ROW_CLASS)
    make = rows[0].a.text
    model = rows[1].a.text
    try:
        body_color = rows[2].find('div', class_='css-s5xdrg exn7x430').text
    except AttributeError:
        # The colour cell is optional; keep the record with a placeholder.
        body_color = 'None'
    # NOTE(review): this cell is read with a different CSS class than the
    # other values, and the notebook's recorded output shows label-like text
    # ('Interior color', 'Type of finish') in this column. The rows are also
    # addressed by position, and the recorded output looks shifted for some
    # listings. Confirm against the live page markup.
    interior_color = _row_text(rows[3], 'sc-dkzDqf edLfRl css-1k9kkyi e18uvu5d2')
    interior_material = _row_text(rows[4])
    body = _row_text(rows[5])
    doors = _row_text(rows[6])
    seats = _row_text(rows[7])
    vin = _row_text(rows[8])
    if vin == 'not published by the seller':
        vin = 'None'

    # --- steering / engine ---
    rows_s = sections[1].find_all('div', class_=_DETAIL_ROW_CLASS)
    (fuel, transmission, drive_type, power, engine_capacity,
     co2_emission, emission_class) = (_row_text(rows_s[i]) for i in range(7))

    # --- vehicle condition ---
    rows_c = sections[2].find_all('div', class_=_DETAIL_ROW_CLASS)
    driven_distance, first_registration, condition = (
        _row_text(rows_c[i]) for i in range(3))

    return (make, model, price, body_color, interior_color, interior_material,
            body, doors, seats, vin, fuel, transmission, drive_type, power,
            engine_capacity, co2_emission, emission_class, driven_distance,
            first_registration, condition)


def scrape_cars_data(urls, output_path="cars.csv"):
    """Scrape every car page in ``urls`` and write the records to a CSV file.

    Parameters
    ----------
    urls : iterable of str
        Car detail page URLs.
    output_path : str, optional
        Destination CSV file (overwritten). Defaults to ``"cars.csv"``.

    Returns
    -------
    list of tuple
        One 20-field tuple per successfully parsed page, ordered like the
        CSV header.

    Notes
    -----
    A page that cannot be downloaded or does not have the expected layout is
    skipped with a logged warning, so one bad page cannot discard the records
    already collected. Other exceptions (including ``KeyboardInterrupt``)
    propagate.
    """
    log = logging.getLogger(__name__)

    records = []
    for url in urls:
        try:
            soup = get_souped_page(url)
        except requests.RequestException as exc:
            log.warning('Skipping %s: download failed (%s)', url, exc)
            continue

        try:
            records.append(_parse_car_page(soup))
        except (AttributeError, IndexError) as exc:
            log.warning('Skipping %s: unexpected page layout (%s)', url, exc)

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(_CAR_COLUMNS)
        writer.writerows(records)

    return records


In [8]:
# Let's try what we have built
MAX_PAGES = 50  # upper bound on the number of listing pages crawled in one run

# read the page count from the pagination bar of this listing page
nb_pages = get_nb_pages('https://carvago.com/cars?page=35132')
# generate links, never asking for more pages than the site reports
links = scrape_urls(min(nb_pages, MAX_PAGES))
# scrape each car web page
records = scrape_cars_data(links)

In [10]:
# Load the CSV written by scrape_cars_data and display it
df = pd.read_csv('cars.csv')
df

Unnamed: 0,make,model,price,body_color,interior_color,interior_material,body,doors,seats,vin,fuel,transmission,drive_type,power,engine_capacity,co2_emission,emission_class,driven_distance,first_registration,condition
0,Seat,Leon,3 049 €,,Interior color,Other interior material,Compact,4/5 doors,5,1249,Petrol,Manual,4x2,92 kW,1 390 cc,149,Euro 4,158 500 km,6/2009,Used
1,Opel,Crossland X,15 899 €,Orange,Type of finish,Black interior,Cloth interior,SUV / offroad,4/5 doors,5,Diesel,Manual,4x2,88 kW,15 600 cc,103,Euro 6,69 000 km,8/2017,2
2,Kia,XCeed,24 649 €,Silver,Type of finish,Black interior,Cloth interior,SUV / offroad,4/5 doors,5,Petrol,Manual,4x2,103 kW,13 534 cc,134,Euro 6d-TEMP,19 500 km,10/2020,Used
3,Renault,Clio,15 099 €,Grey,Interior color,Other interior material,Compact,4/5 doors,1090,,Diesel,Manual,4x2,66 kW,14 641 cc,135,Euro 6,13 884 km,11/2019,Used
4,Mercedes-Benz,E 350,10 899 €,Black,Type of finish,Beige interior,Full leather interior,Station Wagon,4/5 doors,5,Petrol,Automatic,4x4,200 kW,3 498 cc,261,Euro 4,163 000 km,1/2009,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,Mercedes-Benz,E 220,32 049 €,Silver,Type of finish,Black interior,Part leather interior,Station Wagon,4/5 doors,5,Diesel,Automatic,4x2,143 kW,1 950 cc,159,Euro 6d-TEMP,73 170 km,6/2019,1
594,BMW,220,49 549 €,Black,Type of finish,Black interior,Full leather interior,Sedans / saloons,4/5 doors,5,Petrol,Automatic,4x2,135 kW,1 998 cc,144,No emission class,0 km,7/2022,New
595,Dodge,Durango,54 399 €,Grey,Type of finish,Black interior,Full leather interior,SUV / offroad,4/5 doors,6,Petrol,Automatic,4x4,268 kW,5 654 cc,387,Euro 5,21 958 km,7/2021,2
596,BMW,X3 M40,60 649 €,Black,Type of finish,Black interior,Full leather interior,SUV / offroad,4/5 doors,5,Diesel,Automatic,4x4,250 kW,2 993 cc,184,Euro 6d,29 699 km,8/2021,Used


In [11]:
# (number of rows, number of columns) of the scraped dataset
df.shape

(598, 20)