In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from supabase import create_client, Client
from tqdm import tqdm
import json
import time

In [2]:
import os
from dotenv import load_dotenv

# Load variables from .env into the environment
load_dotenv()

# Read variables
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")

In [4]:
supabase_url

'https://cnubrmbhdqqlwzwcnshp.supabase.co'

In [5]:
# Initialize client
supabase: Client = create_client(supabase_url, supabase_key)

In [6]:
supabase

<supabase._sync.client.SyncClient at 0x1d810a2fe90>

In [7]:
table_name = "scraped_data"

In [8]:
response = supabase.table(table_name).select("*").limit(1).execute()

In [9]:
response

APIResponse[~_ReturnT](data=[{'id': 147, 'car_id': 'a835a871-370a-4ce2-9250-26b64bb928ea', 'scraped_at': '2025-08-12T09:06:18.605133', 'make': 'bmw', 'model': '116', 'fuel_type': 'b', 'first_registration': '05-2012', 'mileage': 264535, 'post_code': '8161 PG', 'listing_price': 6250, 'lat_postcode': None, 'lon_postcode': None}], count=None)

In [10]:
response.data

[{'id': 147,
  'car_id': 'a835a871-370a-4ce2-9250-26b64bb928ea',
  'scraped_at': '2025-08-12T09:06:18.605133',
  'make': 'bmw',
  'model': '116',
  'fuel_type': 'b',
  'first_registration': '05-2012',
  'mileage': 264535,
  'post_code': '8161 PG',
  'listing_price': 6250,
  'lat_postcode': None,
  'lon_postcode': None}]

## Supabase

In [28]:
url = "https://www.autoscout24.nl/lst?atype=C&cy=NL&damaged_listing=exclude&desc=1&powertype=kw&search_id=dponocfwd5&sort=age&source=homepage_search-mask&ustate=N%2CU"
html = requests.get(url).text
soup = BeautifulSoup(html, "html.parser")
car_listings = soup.find_all("article", class_="cldt-summary-full-item")

In [13]:
cars_data = []

for car in car_listings:
    car_info = {
        "car_id": car.get("id"),                     # e.g., "toyota"
        "make": car.get("data-make"),                     # e.g., "toyota"
        "model": car.get("data-model"),                   # e.g., "aygo"
        "first_registration": car.get("data-first-registration"),  # e.g., "07-2009"
        "fuel_type": car.get("data-fuel-type"),           # e.g., "b" (probably benzine/gasoline)
        "mileage": float(car.get("data-mileage")),                # e.g., "103211"
        "post_code": car.get("data-listing-zip-code"),      # e.g., "5482 VR"
        "listing_price": float(car.get("data-price")),      # e.g., "5482 VR"
    }
    
    # Extract price (if available inside a tag)
    # price_tag = car.find("span", class_="cldt-price")
    # car_info["price"] = price_tag.get_text(strip=True) if price_tag else None

    # # Extract link to car details page
    # link_tag = car.find("a", class_="cldt-summary-link")
    # car_info["link"] = link_tag["href"] if link_tag else None
    
    cars_data.append(car_info)

In [14]:
cars_data[0]

{'car_id': 'cc5faae5-ce81-43e8-ba3d-14491a63ff1e',
 'make': 'renault',
 'model': 'clio',
 'first_registration': '09-2019',
 'fuel_type': 'b',
 'mileage': 76560.0,
 'post_code': '7466 PK',
 'listing_price': 12950.0}

In [15]:
car_listings[0]

<article class="cldt-summary-full-item listing-impressions-tracking list-page-item ListItem_article__qyYw7" data-applied-tier="t10" data-applied_boost_level="t10" data-boost_level="t10" data-boosting_product="nfm" data-customer-id="17324558" data-deliverable="false" data-first-registration="09-2019" data-fuel-type="b" data-guid="cc5faae5-ce81-43e8-ba3d-14491a63ff1e" data-image-content="no-placeholder|0.14589044540556095" data-is-smyle-eligible="false" data-leads-range="zero" data-listing-country="nl" data-listing-zip-code="7466 PK" data-make="renault" data-mia-level="t10" data-mileage="76560" data-model="clio" data-model-taxonomy="[make_id:60, model_group_id:201266, variant_id:, generation_id:1126, motortype_id:3847, trim_id:4620];" data-order-bucket="0" data-otp="nfm" data-ownership-models="tr" data-position="1" data-price="12950" data-price-label="fair-price" data-relevance_adjustment="sponsored" data-seller-type="d" data-source="listpage_search-results" data-testid="list-item" data-

## Scraping

In [11]:
base_url = "https://www.autoscout24.nl/lst"

In [12]:
params = {
    "atype": "C",
    "cy": "NL",
    "damaged_listing": "exclude",
    "desc": "1",
    "powertype": "kw",
    "sort": "age",
    "source": "homepage_search-mask",
    "ustate": "N,U",
    "kmfrom":0,
    "kmto":1000,
    "pricefrom":0,
    "priceto":10000,
    "page": 1  # start page
}

In [13]:
# price_vec = np.append(np.insert(np.logspace(1, 6, 10), 0, 0, axis=0), 1e8) # Pad with 0 and 1e9
# km_vec = np.append(np.insert(np.logspace(0, 6, 10), 0, 0, axis=0), 1e9) # Pad with 0 and 1e9

In [30]:
# price_vec = np.append(np.insert(np.logspace(4, 5, 20), 0, 0, axis=0), 1e8) # Pad with 0 and 1e9
# km_vec = np.append(np.insert(np.logspace(0, 6, 20), 0, 0, axis=0), 1e9) # Pad with 0 and 1e9
price_vec = np.array([0, 500, 625, 750, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000, 3250, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 10500, 11000, 11500, 12000, 12500, 13000, 13500, 14000, 14500, 15000, 16000, 17000, 19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 27000, 28000, 29000, 30000, 31000, 32000, 33000, 34000, 35000, 36000, 37000, 38000, 39000, 40000, 41000, 42000, 43000, 44000, 45000, 46000, 47000, 48000, 49000, 50000, 52000, 54000, 56000, 58000, 60000, 62000, 64000, 66000, 68000, 70000, 75000, 80000, 85000, 90000, 95000, 100000, 150000, 1e9])
km_vec = np.array([0, 1, 2, 20, 50, 100, 200, 500, 1000, 2000, 3000, 5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000, 60000, 70000, 80000, 90000, 100000, 120000, 130000, 140000, 145000, 1500000, 155000, 160000, 180000, 190000, 200000, 210000, 220000, 230000, 240000, 260000, 280000, 300000, 350000, 400000, 1e9])

In [31]:
price_vec

array([0.00e+00, 5.00e+02, 6.25e+02, 7.50e+02, 1.00e+03, 1.25e+03,
       1.50e+03, 1.75e+03, 2.00e+03, 2.25e+03, 2.50e+03, 2.75e+03,
       3.00e+03, 3.25e+03, 3.50e+03, 4.00e+03, 4.50e+03, 5.00e+03,
       5.50e+03, 6.00e+03, 6.50e+03, 7.00e+03, 7.50e+03, 8.00e+03,
       8.50e+03, 9.00e+03, 9.50e+03, 1.00e+04, 1.05e+04, 1.10e+04,
       1.15e+04, 1.20e+04, 1.25e+04, 1.30e+04, 1.35e+04, 1.40e+04,
       1.45e+04, 1.50e+04, 1.60e+04, 1.70e+04, 1.90e+04, 2.00e+04,
       2.10e+04, 2.20e+04, 2.30e+04, 2.40e+04, 2.50e+04, 2.60e+04,
       2.70e+04, 2.80e+04, 2.90e+04, 3.00e+04, 3.10e+04, 3.20e+04,
       3.30e+04, 3.40e+04, 3.50e+04, 3.60e+04, 3.70e+04, 3.80e+04,
       3.90e+04, 4.00e+04, 4.10e+04, 4.20e+04, 4.30e+04, 4.40e+04,
       4.50e+04, 4.60e+04, 4.70e+04, 4.80e+04, 4.90e+04, 5.00e+04,
       5.20e+04, 5.40e+04, 5.60e+04, 5.80e+04, 6.00e+04, 6.20e+04,
       6.40e+04, 6.60e+04, 6.80e+04, 7.00e+04, 7.50e+04, 8.00e+04,
       8.50e+04, 9.00e+04, 9.50e+04, 1.00e+05, 1.50e+05, 1.00e

In [32]:
km_vec

array([0.00e+00, 1.00e+00, 2.00e+00, 2.00e+01, 5.00e+01, 1.00e+02,
       2.00e+02, 5.00e+02, 1.00e+03, 2.00e+03, 3.00e+03, 5.00e+03,
       1.00e+04, 1.50e+04, 2.00e+04, 2.50e+04, 3.00e+04, 3.50e+04,
       4.00e+04, 4.50e+04, 5.00e+04, 5.50e+04, 6.00e+04, 7.00e+04,
       8.00e+04, 9.00e+04, 1.00e+05, 1.20e+05, 1.30e+05, 1.40e+05,
       1.45e+05, 1.50e+06, 1.55e+05, 1.60e+05, 1.80e+05, 1.90e+05,
       2.00e+05, 2.10e+05, 2.20e+05, 2.30e+05, 2.40e+05, 2.60e+05,
       2.80e+05, 3.00e+05, 3.50e+05, 4.00e+05, 1.00e+09])

In [33]:
count_added = 0
cars_to_insert = []
batch_size = 500  # Set an appropriate batch size for your database
page_limit_autoscout = 20

# --- 1. Optimize Database Lookups: Fetch all existing car IDs once ---
print("Fetching existing car IDs from the database...")
response = supabase.table(table_name).select("car_id").execute()
# --- 2. Simplify Duplicate Checking: Use a set for near-instant lookups ---
car_ids_in_database = {d['car_id'] for d in response.data}
print(f"Found {len(car_ids_in_database)} existing car IDs.")

Fetching existing car IDs from the database...
Found 211955 existing car IDs.


In [None]:
from rich.console import Console
from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn

# Initialize a rich Console object
console = Console()

# --- Main Loops ---
# Define a custom Progress display
with Progress(
    TextColumn("[progress.description]{task.description}"),
    BarColumn(),
    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
    TimeRemainingColumn(),
    console=console
) as progress:
    # Outer loop over price range
    task_price = progress.add_task(
        "[green]Processing price ranges...", total=len(price_vec[:-1]-1)
    )

    for k, price in enumerate(price_vec[:-1]):
        params['pricefrom'] = round(price_vec[k])
        params['priceto'] = round(price_vec[k+1])

        # Inner loop over mileage
        task_mileage = progress.add_task(
            f"[cyan]  Processing mileage {round(km_vec[0])}-{round(km_vec[-1])}...",
            total=len(km_vec[:-1]-1)
        )

        for j, km in enumerate(km_vec[:-1]):
            params['kmfrom'] = round(km_vec[j])
            params['kmto'] = round(km_vec[j+1])

            # Flag to check if the page loop completes fully
            page_limit_reached = True

            # Innermost loop over pages
            for i in range(page_limit_autoscout):
                params['page'] = i + 1
                html = requests.get(base_url, params=params).text
                soup = BeautifulSoup(html, "html.parser")
                car_listings = soup.find_all("article", class_="cldt-summary-full-item")

                if not car_listings:
                    # If no listings are found, the loop breaks early.
                    page_limit_reached = False
                    break

                for car in car_listings:
                    # Your data extraction logic here
                    try:
                        data_mileage = float(car.get("data-mileage"))
                    except (ValueError, TypeError):
                        data_mileage = -1
                    
                    try:
                        listing_price = float(car.get("data-price"))
                    except (ValueError, TypeError):
                        listing_price = -1
                    
                    car_id = car.get("id")

                    if car_id not in car_ids_in_database:
                        car_info = {
                            "car_id": car_id,
                            "make": car.get("data-make"),
                            "model": car.get("data-model"),
                            "first_registration": car.get("data-first-registration"),
                            "fuel_type": car.get("data-fuel-type"),
                            "mileage": data_mileage,
                            "post_code": car.get("data-listing-zip-code"),
                            "listing_price": listing_price,
                        }
                        cars_to_insert.append(car_info)
                        
                        if len(cars_to_insert) >= batch_size:
                            console.log(f"Inserting {len(cars_to_insert)} cars to the database...")
                            supabase.table(table_name).insert(cars_to_insert).execute()
                            count_added += len(cars_to_insert)
                            cars_to_insert = []
                
                time.sleep(0.01)

            # Check and log if the page limit was reached for this mileage-price combination
            if page_limit_reached:
                console.log(f"Reached page limit for price: {params['pricefrom']}-{params['priceto']} and mileage: {params['kmfrom']}-{params['kmto']}")

            # Update the mileage task for each mileage range
            progress.update(task_mileage, advance=1)
        
        # Mark the mileage task as complete and remove it
        progress.remove_task(task_mileage)

        # Update the price task for each price range
        progress.update(task_price, advance=1)

    # Mark the price task as complete and remove it
    # progress.remove_task(task_price)

# --- Final Batch Insert ---
# Insert any remaining cars after all loops have finished
if cars_to_insert:
    console.log(f"Inserting final {len(cars_to_insert)} cars to the database...")
    supabase.table(table_name).insert(cars_to_insert).execute()
    count_added += len(cars_to_insert)

console.log(f"\nTotal cars added to the database: {count_added}")

Output()

In [None]:
count_added

In [None]:
response = supabase.table(table_name).select("car_id").execute()
car_ids_in_database = response.data

In [None]:
df = pd.DataFrame([d['car_id'] for d in car_ids_in_database])

In [None]:
len(df[0].unique()) - len(df)

## Remove duplicates in database

In [None]:
response = supabase.table(table_name).select("id, car_id, make, listing_price").execute()
car_ids_in_database = response.data

In [None]:
df_full = pd.DataFrame(car_ids_in_database)

In [None]:
id_to_remove = df_full.loc[(df_full.duplicated(subset=['car_id'],keep="first")), 'id'].values

In [None]:
id_to_remove

In [None]:
chunk_size = 1000

for i in tqdm(range(0, len(id_to_remove), chunk_size)):
    chunk = id_to_remove[i:min(i + chunk_size, len(id_to_remove))]
    response = (
        supabase.table(table_name)
        .delete()
        .in_("id", chunk)
        .execute()
    )
