In [2]:
from bs4 import BeautifulSoup
import json
import pandas as pd
import requests
import numpy as np
from tqdm import tqdm
import sys
import time
sys.path.append('utils')
import os
import config_handling as conf
from multithread_image_ripper import download_images
from database import Database


In [3]:
# Connect to database
# Configuration is loaded through the project-local `config_handling` helper.
# NOTE(review): `config.read('config.ini')` is then called on the returned object;
# presumably this is a configparser.ConfigParser and config.ini overrides/extends
# the values from automotive.conf.ini -- confirm against utils/config_handling.py.
config = conf.read_config('automotive.conf.ini')
config.read('config.ini')
# The [settings] section names which connection profile (section) to use.
connection_type = config['settings']['connection']
connection_type
# Credentials and endpoint are read from the selected profile section.
user = config[connection_type]['user']
pw = config[connection_type]['pw']
host = config[connection_type]['host']
db = config[connection_type]['db']
port = config[connection_type].getint('port')
# NOTE: `db` is rebound here -- it holds the database name on the right-hand side
# and the project `Database` wrapper object afterwards.
db = Database(host,
              port,
              user,
              pw,
              db
              )
db.connect()
# NOTE(review): a transaction is opened here and again per model in the scraping
# loop; whether nested start_transaction() calls are harmless depends on the
# Database wrapper -- confirm.
db.start_transaction()
# Root directory under which downloaded listing images are stored.
basedir = config['settings']['image_directory']

Connection established


## 1. Brand extraction: 
Start by getting all the brand codes; these do not match the codes used by Autodoc in the previous step of the data scraping process. 

In [4]:
# Entry page of autoscout24.be (nl); it is fetched once so the brand <select>
# can be parsed from its HTML in the next cell.
start_at = "https://www.autoscout24.be/nl/"
# basedir = 'C:\imdir'
# One shared Session is used for every request in this notebook: the JSON API
# rejects requests that do not carry a session cookie.
session = requests.Session()
r = session.get(start_at)

In [5]:
def normalize_name(given_name):
    """Turn a display name into the slug used in listing URLs.

    Letters and numeric characters are kept, together with the punctuation
    marks space, '-', '/', '(', ')' and '.'; every other character is dropped.
    The result is lower-cased, '/' is percent-encoded as '%2f' and spaces
    become hyphens.
    """
    kept_punctuation = (' ', '-', '/', '(', ')', '.')
    filtered = ''.join(
        ch for ch in given_name
        if ch.isalpha() or ch.isnumeric() or ch in kept_punctuation
    )
    slug = filtered.lower()
    # Order matters only for readability here: neither replacement can
    # produce a character the other one rewrites.
    for old, new in (('/', '%2f'), (' ', '-')):
        slug = slug.replace(old, new)
    return slug

# Build the brand lookup table `l`: normalized brand name -> AutoScout make id.
# The parser is named explicitly so the result does not depend on which optional
# parsers are installed (and to match every other BeautifulSoup call in this notebook).
soup = BeautifulSoup(r.content, 'html.parser')
brands = soup.select("#make")[0].find_all('option')
l = {}
for brand in brands:
    # Placeholder <option> entries carry an empty (or no) value; skip them.
    value = brand.get('value', '')
    if value != '':
        name = normalize_name(brand.text)
        l[name] = value

## 2. Model extraction
For model extraction we can use XHR requests again to a JSON endpoint (for this we need the numerical brand id from the previous extraction step). Model hierarchy is not the same for all brands.

For instance, the BMW 1-series is a valid model designation; Autoscout has this as a parent element for 1-series cars with different engines in it (e.g. 114, 116, 120). In this case xyy can be interpreted as: x = series designation, yy = engine displacement / 100 (the 114 has a 1400 cc engine, the 116 has a 1600 cc engine, ...). Each engine displacement is considered to be a separate submodel of the 1-series by Autoscout, even if there is visually no difference between the cars.
The problem is with the X-series cars for BMW, where the X1, X3 and X5 are distinct cars but are all grouped in the same 'family' of cars.

For other brands there is no such hierarchy, e.g. Toyota, where the response is a flat list of model names that does not differentiate between body styles (e.g. Corolla hatch or Corolla SW). Here the data will return both SW and hatch body styles for the same model. The visual difference between an SW and a hatch model is quite stark and easy for a human observer to recognize. 

In spite of these problems, it would be a good idea to have the scraper collect model by model instead of collecting all data for the entire brand. The advantage of this approach is that you have an identifiable target (i.e. you know the car is of brand X and model Y); you can then use features on Autoscout such as door count, inscription year and body style (all of which are subject to human error) to correctly link it to the chassis codes extracted from Autodoc. 

One caveat with the inscription year is that this value is not the same as a production year!!! Generally speaking production starts well in advance of car sales and older stock models can still be registered while the new model is in production. This means there'll be an overlap - not much to do about it; we might consider dropping the first three months of sales for a given model. 

In [6]:
# brand = 'lexus'
# print(l[brand])

In [7]:
# headers = {
#     "Accept": "application/json", 
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"

# }
# #NOTE: API requests are rejected if you don't have a session cookie; that's why we use session.get and not the default request.get
# url = f"https://www.autoscout24.be/as24-home/api/taxonomy/cars/makes/{l[brand]}/models"
# r = session.get(url, headers= headers)

In [8]:
# #IF the modelLineId is None, than the model_id is a code on itself. (Toyota casus)
# # IF modelLineId is not None, then the model_id is to be considered a subcode of the modelLineId (BMW casus)
# models = []
# model_lines = []
# for model in r.json()['models']['model']['values']:
#     model_name = model['name']
#     model_id = model['id']
#     models.append([model_name, model_id, model['modelLineId']])
# for model_line in r.json()['models']['modelLine']['values']:
#     model_line_id = model_line['id']
#     model_line_name = model_line['name']
#     dutch_name = model_line['label']['nl_BE']
#     model_lines.append([model_line_id, model_line_name, dutch_name])

In [9]:
# model_df = pd.DataFrame(models, columns=['modelname','model_id','parent_id'])
# model_lines_df = pd.DataFrame(model_lines, columns=['line_id', 'modelline_name', 'labelnaam_requests'])

# models_merged = pd.merge(model_df, model_lines_df, left_on='parent_id', right_on='line_id', how='outer')  # You can change 'inner' to 'outer', 'left', or 'right' based on your requirement

# models_merged['request_parameter'] = np.where(
#     models_merged['labelnaam_requests'].isna(),
#     models_merged['modelname'],
#     models_merged['labelnaam_requests']
# )
# models_merged
# assert(models_merged.request_parameter.isna().sum() == 0)


In [10]:
# models_merged

Autoscout allows a maximum of 20 pages with 20 results each, so there is a hard limit of 400 listings for a given brand/model combination. We don't care much about pagination, as going beyond the first 400 results automatically shows the same message as 'all results have been viewed'. We can work around the limit by restricting the registration years to a range X to Y where X - Y = 0, i.e. by iterating over all cars one registration year at a time.

When dealing with images downloaded from Autoscout, the data arrives as compressed .webp files. YOLO does not support this format, and I don't know about other ML models, so the multithreaded ripper should have .jpg conversion built in!

In [11]:
# scraping listings per brand/model: 
def _parse_listing_json(response):
    """Return the `__NEXT_DATA__` JSON of a listing page, or None if it is unusable.

    "Unusable" means: the script tag is missing or empty, its content is not
    valid JSON, or the path props -> pageProps -> listingDetails -> vehicle
    does not exist in the decoded document.
    """
    soup = BeautifulSoup(response.content, 'html.parser')
    script_tag = soup.find('script', id="__NEXT_DATA__", type="application/json")
    if script_tag is None or not script_tag.string:
        return None
    try:
        json_data = json.loads(script_tag.string)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    node = json_data
    for key in ('props', 'pageProps', 'listingDetails', 'vehicle'):
        if not isinstance(node, dict) or key not in node:
            return None
        node = node[key]
    return json_data


def backoff_strategy(at, extract_json = False, max_attempts = 6, base_delay = 10):
    """GET `at` through the shared session, retrying with a linearly growing delay.

    Parameters
    ----------
    at : str
        URL to request.
    extract_json : bool
        When False, the first response with status 200 is returned as-is.
        When True, the response must additionally contain a parsable
        `__NEXT_DATA__` script with listing/vehicle details; the decoded JSON
        (a dict) is returned instead of the response.
    max_attempts : int
        Maximum number of requests (default 6, as before).
    base_delay : int | float
        Seconds multiplied by the attempt number to get the wait before the
        next attempt (default 10 -> 10s, 20s, 30s, ...).

    Returns
    -------
    The response (or decoded JSON when `extract_json` is True) on success.
    If every attempt fails, the LAST RESPONSE is returned, also when
    `extract_json` is True -- callers that index the result as a dict will
    then fail. `None` is returned only if `max_attempts` < 1.
    """
    r = None
    for attempt in range(1, max_attempts + 1):
        r = session.get(at)
        if r.status_code == 200:
            if not extract_json:
                return r
            # A 200 page without usable JSON is treated like any other
            # failed attempt and retried, instead of raising mid-parse.
            json_data = _parse_listing_json(r)
            if json_data is not None:
                return json_data
        # No point in waiting after the final attempt.
        if attempt < max_attempts:
            time.sleep(attempt * base_delay)
    return r

def _registration_year(raw_date, current_year):
    """Extract the year from a `data-first-registration` attribute value.

    The year is taken from the second '-'-separated field of the value.
    'new' maps to `current_year`; a missing, 'unknown' or otherwise
    unparsable value yields None.
    """
    if raw_date is None:
        return None
    value = raw_date.strip().lower()
    if value == 'new':
        return current_year
    try:
        return int(value.split('-')[1])
    except (IndexError, ValueError):
        return None


def extract_dates(brand, model):
    """Return `[first_year, last_year]` of first registration for a brand/model.

    Two result pages are requested, sorted by year ascending and descending.
    The first year comes from the first article on the ascending page that has
    a usable registration value; the last year comes from the first article on
    the descending page. Values that cannot be turned into a year ('unknown',
    missing, malformed) fall back to the current calendar year, as does 'new'.

    Returns `[False, False]` when the ascending page contains no listings.
    """
    from datetime import date  # local import: keeps this cell free of new top-level imports

    earliest_year_url = f"https://www.autoscout24.be/nl/lst/{brand}/{model}?atype=C&cy=B&damaged_listing=exclude&desc=0&sort=year"
    latest_year_url = f"https://www.autoscout24.be/nl/lst/{brand}/{model}?atype=C&cy=B&damaged_listing=exclude&desc=1&sort=year"
    # Previously hard-coded as 2024, which silently truncates the range in later years.
    current_year = date.today().year

    r_start = backoff_strategy(earliest_year_url)
    soup_start = BeautifulSoup(r_start.content, 'html.parser')
    first_articles = soup_start.find_all('article')
    if len(first_articles) == 0:    #No cars for sale
        return [False, False]
    # Skip leading listings without a usable date instead of crashing on them.
    candidate_years = (
        _registration_year(article.get('data-first-registration'), current_year)
        for article in first_articles
    )
    first_year = next((year for year in candidate_years if year is not None), current_year)

    r_stop = backoff_strategy(latest_year_url)
    soup_stop = BeautifulSoup(r_stop.content, 'html.parser')
    last_articles = soup_stop.find_all('article')
    last_year = None
    if last_articles:  # the second request can still come back empty (e.g. blocked)
        last_year = _registration_year(last_articles[0].get('data-first-registration'), current_year)
    if last_year is None:
        last_year = current_year
    return [first_year, last_year]

def get_listing_details(listing):
    """Fetch body type, door count, weight and emission-norm label of one listing.

    These fields are not part of the search-result JSON, so the detail page
    of the listing is requested (one request per listing) and read from the
    JSON document embedded in that page.

    `listing` is the listing identifier appended to the detail-page URL.
    Returns `[bodyType, numberOfDoors, weight, norm_label]`, where the label
    is None when the page has no `environmentEuDirective` entry.
    """
    page_json = backoff_strategy(f"https://www.autoscout24.be/nl/aanbod/{listing}", True)
    vehicle = page_json['props']['pageProps']['listingDetails']['vehicle']
    body_type, door_count, vehicle_weight, eu_directive = (
        vehicle[field]
        for field in ('bodyType', 'numberOfDoors', 'weight', 'environmentEuDirective')
    )
    norm_label = None if eu_directive is None else eu_directive['label']
    return [body_type, door_count, vehicle_weight, norm_label]

def get_models(brand):
    """Return a DataFrame of the models AutoScout lists for `brand`.

    `brand` must be a key of the module-level brand table `l` (normalized
    brand name -> make id). The taxonomy endpoint returns models and model
    lines; both are merged (outer join on modelLineId == model line id) into
    one frame with the columns

        modelname, model_id, parent_id, line_id, modelline_name,
        labelnaam_requests, request_parameter

    `request_parameter` is the nl_BE model-line label when the model belongs
    to a model line (BMW case) and the model's own name otherwise (Toyota case).

    Raises `requests.HTTPError` when the endpoint answers with an error status.
    """
    headers = {
        "Accept": "application/json",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    #NOTE: API requests are rejected if you don't have a session cookie; that's why we use session.get and not the default request.get
    url = f"https://www.autoscout24.be/as24-home/api/taxonomy/cars/makes/{l[brand]}/models"
    r = session.get(url, headers= headers)
    # Fail with a clear HTTP error instead of an obscure JSON/KeyError further down.
    r.raise_for_status()
    # Decode the body once; it was previously parsed separately for each loop.
    taxonomy = r.json()['models']

    # If modelLineId is None, the model_id is a code on its own (Toyota case).
    # If modelLineId is not None, the model_id is a sub-code of that model line (BMW case).
    models = [
        [model['name'], model['id'], model['modelLineId']]
        for model in taxonomy['model']['values']
    ]
    model_lines = [
        [model_line['id'], model_line['name'], model_line['label']['nl_BE']]
        for model_line in taxonomy['modelLine']['values']
    ]
    model_df = pd.DataFrame(models, columns=['modelname','model_id','parent_id'])
    model_lines_df = pd.DataFrame(model_lines, columns=['line_id', 'modelline_name', 'labelnaam_requests'])

    # Outer join keeps models without a model line as well as lines without models.
    models_merged = pd.merge(model_df, model_lines_df, left_on='parent_id', right_on='line_id', how='outer')

    models_merged['request_parameter'] = np.where(
        models_merged['labelnaam_requests'].isna(),
        models_merged['modelname'],
        models_merged['labelnaam_requests']
    )
    return models_merged

def brand_tracker(brand):
    """Return the models of `brand` that were already scraped completely.

    The names are read from `data/tracking/<brand>_completed.txt`, one per
    line, with surrounding whitespace stripped. If the tracking file (or its
    directory) does not exist yet, it is created empty and an empty list is
    returned.
    """
    file_target = f"data/tracking/{brand}_completed.txt"
    # A fresh checkout has no tracking directory; create it instead of failing.
    os.makedirs(os.path.dirname(file_target), exist_ok=True)
    if not os.path.exists(file_target):
        with open(file_target, 'w', encoding='utf8'):
            pass
        return []
    with open(file_target, 'r', encoding='utf8') as file:
        return [line.strip() for line in file]

def completed_model_of_brand(brand, model):
    """Record `model` as finished by appending it to the tracking file of `brand`.

    One model name is written per line to `data/tracking/<brand>_completed.txt`.
    """
    entry = model + "\n"
    with open(f"data/tracking/{brand}_completed.txt", 'a', encoding='utf8') as tracking_file:
        tracking_file.write(entry)
        tracking_file.flush()

def completed_brand(brand):
    """Record `brand` as finished by appending it to `data/tracking/brands_completed.txt`."""
    entry = brand + "\n"
    with open('data/tracking/brands_completed.txt', 'a', encoding='utf8') as tracking_file:
        tracking_file.write(entry)
        tracking_file.flush()
        
    


In [12]:
# get_models('kia')
# Brands that were already scraped completely (one name per line).
# On a first run the file does not exist yet, which simply means no brand is done.
if os.path.exists('data/tracking/brands_completed.txt'):
    with open('data/tracking/brands_completed.txt', 'r', encoding='utf8') as file:
        finished_brands = [brand.strip() for brand in file]
else:
    finished_brands = []

def extract_counter(html_content):
    """Return the `numberOfResults` value embedded in a search-result page.

    `html_content` is a response object whose `.content` holds the page HTML;
    the count is read from the `__NEXT_DATA__` JSON script at
    props -> pageProps -> numberOfResults.
    """
    parsed_page = BeautifulSoup(html_content.content, 'html.parser')
    next_data_tag = parsed_page.find('script', id="__NEXT_DATA__", type="application/json")
    page_props = json.loads(next_data_tag.string)['props']['pageProps']
    return page_props['numberOfResults']

      

def validate_model_name(brand, model):
    """Check whether `model` is accepted as a model filter for `brand`.

    The result count of the brand-only listing page is compared with the
    count of the brand/model page: equal counts are taken to mean the model
    segment was not parsed by the site, so False is returned; different
    counts return True.
    """
    listing_urls = (
        f"https://www.autoscout24.be/nl/lst/{brand}",
        f"https://www.autoscout24.be/nl/lst/{brand}/{model}",
    )
    # Both pages are fetched before either one is parsed.
    brand_page, brand_model_page = [backoff_strategy(listing_url) for listing_url in listing_urls]
    brand_total = extract_counter(brand_page)
    brand_model_total = extract_counter(brand_model_page)
    return brand_total != brand_model_total

def log_model_error(brand, model):
    """Append a brand/model pair that failed validation to the error log.

    Each line of `data/logging/error_brand_model.txt` has the form
    `<brand>, <model>, <normalized model>`.
    """
    log_path = 'data/logging/error_brand_model.txt'
    # Create the log directory on first use instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    # Explicit UTF-8 (like the tracking files) so model names with non-ASCII
    # characters do not depend on the platform default encoding.
    with open(log_path, 'a', encoding='utf8') as file:
        file.write(f"{brand}, {model}, {normalize_name(model)}\n")

# print(validate_model_name('volvo', 'c30'))
# print(validate_model_name('volvo', 'clio'))
# print(validate_model_name('volkswagen', 't61'))
# print(validate_model_name('volkswagen', 't6.1'))


In [None]:
brands = ['alfa-romeo', 'fiat', 'mazda', 'seat', 'skoda', 'volvo', 'hyundai',
           'lexus', 'lotus', 'porsche', 'audi', 'volkswagen', 'ford',
             'mercedes-benz', 'nissan', 'renault', 'peugeot', 'opel', 'jeep', 
             'dacia', 'mini', 'land-rover', 'toyota', 'subaru','kia',  'suzuki', 'honda', 'citroen', 'alpine']
#BMW is a bit of an annoying case!

# The INSERT statements do not change between models, so they are defined once.
query_image = """INSERT INTO 
            automotive.images (listing_id, image_path)
            VALUES(%s, %s)"""
query_listing = """INSERT INTO 
            automotive.listings 
                (autoscout_id, brand, model, `year`, 
                first_registration, make_name_autoscout, 
                model_name_autoscout, price,mileage, 
                make_id, model_id, variant_id, generation_id,
                motortype_id, trim_id, shelltype, doorcount,
                weight, normlabel)
                VALUES (%s, %s, %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

# Scrape brand by brand, model by model. Progress is tracked on disk so an
# interrupted run resumes at the first unfinished model.
for brand in brands:
    if brand in finished_brands:
        continue
    models_done = brand_tracker(brand)
    models_merged = get_models(brand)
    assert(models_merged.request_parameter.isna().sum() == 0)
    for model in tqdm(models_merged.request_parameter.unique()):
        if model.lower().strip() == 'others':
            continue
        if model in models_done:
            continue
        as_model = normalize_name(model)

        if not validate_model_name(brand, as_model):
            log_model_error(brand, model)
            continue

        # FIX: use the normalized (URL-safe) model name, exactly like the
        # validation above and the listing URLs below. The raw display name
        # was passed before, so the year range could be taken from a
        # different (unfiltered or empty) result page.
        low, high = extract_dates(brand, as_model)

        if low is False and high is False:  # no cars for sale for this model
            continue

        # listing id -> {'listing_data': [...], 'listing_images': [...]}, collected
        # first so the database transaction below stays short.
        storage = {}
        for year in range(low, high+1):
            # The site serves at most 20 result pages per query.
            for page in range(1, 21):
                url = f"https://www.autoscout24.be/nl/lst/{brand}/{as_model}/re_{year}?atype=C&cy=B&damaged_listing=exclude&desc=0&page={page}"
                r = session.get(url)
                soup = BeautifulSoup(r.content, 'html.parser')
                listings = soup.find_all('article')
                if(len(listings) == 0):
                    break

                json_string_data = soup.find('script', id="__NEXT_DATA__", type="application/json")
                json_data = json.loads(json_string_data.string)
                json_listings = json_data['props']['pageProps']['listings']
                # Replace the last path segment of every image URL with the 750x564 webp rendition.
                images_of_listings = {
                    key['id']: ['/'.join(image.split('/')[:-1]) + '/750x564.webp' for image in key['images']]
                    for key in json_listings
                }
                for listing in listings:
                    first_registration = listing.get('data-first-registration')
                    listing_id = listing.get('data-guid')
                    make_name = listing.get('data-make')
                    model_name = listing.get('data-model')
                    price = listing.get('data-price')
                    mileage = listing.get('data-mileage')
                    if mileage.lower().strip() == 'unknown':
                        mileage = -1
                    #model taxonomy MAY be useful for linking chassis codes, so parse it: 
                    model_taxonomy = listing.get('data-model-taxonomy')
                    cleaned_string = model_taxonomy.lstrip("[").rstrip("];")
                    # Split on the first ':' only, so a value containing ':' does not break unpacking.
                    pairs = [pair.split(":", 1) for pair in cleaned_string.split(", ")]
                    taxonomy_dict = {key.strip(): value.strip() for key, value in pairs}
                    make_id = taxonomy_dict['make_id']
                    model_id = taxonomy_dict['model_id']
                    # Empty taxonomy fields are stored as NULL.
                    variant_id = taxonomy_dict['variant_id'] if taxonomy_dict['variant_id'] != '' else None
                    generation_id = taxonomy_dict['generation_id'] if taxonomy_dict['generation_id'] != '' else None
                    motortype_id = taxonomy_dict['motortype_id'] if taxonomy_dict['motortype_id'] != '' else None
                    trim_id = taxonomy_dict['trim_id'] if taxonomy_dict['trim_id'] != '' else None
                    storage_dir = os.path.join(basedir, brand, model, listing_id)
                    images = images_of_listings[listing_id]
                    shell, doors, weight, norm = get_listing_details(listing_id)
                    if weight is not None:
                        # Keep only the leading token (the value before the first space).
                        weight = weight.split(' ')[0]
                    data_listing = [listing_id, brand, model, year, first_registration, make_name, model_name, price, mileage, make_id, model_id, variant_id, generation_id, motortype_id, trim_id, shell, doors, weight, norm]

                    images = download_images(images, storage_dir, 10, True)
                    storage[listing_id] = {
                        'listing_data': data_listing,
                        'listing_images': images
                    }

        #Improved transaction handle with reduced database locking!
        db.start_transaction()
        try:
            for listing in storage:
                metadata = storage[listing]['listing_data']
                insert_id = db.execute_query(query_listing, metadata, True)
                if insert_id == 0:
                    raise ValueError("Failed to retrieve listing ID.")

                image_data = storage[listing]['listing_images']
                for image in image_data:
                    #remove the basedir from the path - this way we can move data to other computers
                    # as long as we point to relative folder.
                    image = image.replace(basedir, '').lstrip('\\')
                    data_image = [insert_id, image ]
                    db.execute_query(query_image, data_image, False)
        except Exception:
            # Do not leave a half-written model behind an open transaction;
            # the model is not marked as completed, so it is retried next run.
            db.rollback_transaction()
            raise
        db.commit_transaction()
        #end of improved transaction handles!

        completed_model_of_brand(brand, model)

    completed_brand(brand)
db.close()




 82%|████████▏ | 62/76 [1:40:21<14:04, 60.29s/it]   

In [1]:
# Scratch/debug cell: displays the last listing URL built by the scraping loop.
# Fails with NameError (see the recorded output) when `url` has not been defined
# in the running kernel -- candidate for removal.
url

NameError: name 'url' is not defined

In [None]:
# Scratch/debug cell: shows the URL slug for the model the loop last worked on.
# Depends on `model` left behind by the scraping loop (hidden kernel state).
normalize_name(model)

In [None]:
# Scratch/debug cell: dumps the most recently parsed page (can be very large output).
soup

In [None]:
# Scratch/debug cell: shows the model name the scraping loop last worked on.
model

In [None]:
# Manual recovery cell: rolls back the transaction currently open on `db`.
# NOTE(review): presumably run by hand after the scraping loop was interrupted
# mid-insert -- confirm before relying on or removing it.
db.rollback_transaction()