In [165]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from supabase import create_client, Client
from tqdm import tqdm
import json
import time

In [2]:
import os
from dotenv import load_dotenv

# Load variables from .env into the environment
load_dotenv()

# Read variables
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")

# Fail here with a clear message instead of passing None on to the client constructor.
missing_env_vars = [
    name
    for name, value in (("SUPABASE_URL", supabase_url), ("SUPABASE_KEY", supabase_key))
    if not value
]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

In [3]:
# Confirm the key was loaded without echoing the secret into the saved notebook output.
# NOTE(review): an earlier revision of this cell printed the key itself; consider rotating it.
bool(supabase_key)

True

In [4]:
supabase_url

'https://cnubrmbhdqqlwzwcnshp.supabase.co'

In [5]:
# Build the Supabase client from the URL and key read from the environment above
supabase: Client = create_client(supabase_url, supabase_key)

In [6]:
supabase

<supabase._sync.client.SyncClient at 0x1e26efc3d50>

In [7]:
# Name of the Supabase table that every select/insert in this notebook targets
table_name = "scraped_data"

In [8]:
# Fetch at most one row (all columns) from the table
response = supabase.table(table_name).select("*").limit(1).execute()

In [9]:
response

APIResponse[~_ReturnT](data=[], count=None)

In [10]:
response.data

[]

## Supabase

In [17]:
# One-off exploratory request: download a single search-results page and collect
# its listing <article> elements.
url = "https://www.autoscout24.nl/lst?atype=C&cy=NL&damaged_listing=exclude&desc=1&powertype=kw&search_id=dponocfwd5&sort=age&source=homepage_search-mask&ustate=N%2CU"
page_response = requests.get(url, timeout=30)  # bounded wait instead of hanging forever
page_response.raise_for_status()  # stop here rather than parse an error page into zero listings
html = page_response.text
soup = BeautifulSoup(html, "html.parser")
car_listings = soup.find_all("article", class_="cldt-summary-full-item")

In [41]:
def attr_as_float(tag, attr, default=-1):
    """Return the tag's `attr` attribute as a float.

    Falls back to `default` when the attribute is missing or not numeric, the
    same sentinel the scraping loop further down uses.
    """
    try:
        return float(tag.get(attr))
    except (ValueError, TypeError):
        return default


# Flatten each listing <article> into a plain dict built from its HTML attributes.
cars_data = []

for car in car_listings:
    car_info = {
        "car_id": car.get("id"),                                    # e.g., "c56c2b7f-09fc-4dc9-b5c5-29f7350ea258"
        "make": car.get("data-make"),                               # e.g., "toyota"
        "model": car.get("data-model"),                             # e.g., "aygo"
        "first_registration": car.get("data-first-registration"),   # e.g., "07-2009"
        "fuel_type": car.get("data-fuel-type"),                     # e.g., "b"
        "mileage": attr_as_float(car, "data-mileage"),              # e.g., 103211.0
        "post_code": car.get("data-listing-zip-code"),              # e.g., "5482 VR"
        "listing_price": attr_as_float(car, "data-price"),          # e.g., 3495.0
    }
    cars_data.append(car_info)

In [42]:
cars_data[0]

{'car_id': 'c56c2b7f-09fc-4dc9-b5c5-29f7350ea258',
 'make': 'toyota',
 'model': 'aygo',
 'first_registration': '07-2009',
 'fuel_type': 'b',
 'mileage': 103211.0,
 'post_code': '5482 VR',
 'listing_price': 3495.0}

In [98]:
# Select only the car_id column from the table
response = supabase.table(table_name).select("car_id").execute()

## Scraping

In [50]:
# Search-results endpoint; the query string is supplied separately through `params`
base_url = "https://www.autoscout24.nl/lst"

In [148]:
# Default query-string parameters for the search endpoint. The price/mileage
# bounds and the page number are placeholders that the scraping loop overrides.
params = dict(
    atype="C",
    cy="NL",
    damaged_listing="exclude",
    desc="1",
    powertype="kw",
    sort="age",
    source="homepage_search-mask",
    ustate="N,U",
    kmfrom=0,
    kmto=1000,
    pricefrom=0,
    priceto=10000,
    page=1,  # first results page
)

In [185]:
# price_vec = np.append(np.insert(np.logspace(1, 6, 10), 0, 0, axis=0), 1e8) # Pad with 0 and 1e9
# km_vec = np.append(np.insert(np.logspace(0, 6, 10), 0, 0, axis=0), 1e9) # Pad with 0 and 1e9

In [227]:
# Hand-picked bucket edges. Each pair of neighbouring values becomes one
# (from, to) filter range in the scraping loop; the final 1e9 edge serves as
# the upper bound of the last bucket.
price_vec = np.array(
    [
        0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000,
        10000, 12000, 15000, 17000, 19000,
        20000, 25000, 30000, 35000, 40000, 45000,
        50000, 60000, 70000, 80000,
        100000, 150000, 200000, 300000,
        1e9,
    ],
    dtype=float,
)
km_vec = np.array(
    [
        0, 1, 2, 50, 100, 200, 500,
        1000, 2000, 3000, 5000,
        10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000,
        50000, 60000, 70000, 80000,
        100000, 150000, 200000, 300000,
        1e9,
    ],
    dtype=float,
)

In [228]:
price_vec

array([0.0e+00, 1.0e+03, 2.0e+03, 3.0e+03, 4.0e+03, 5.0e+03, 6.0e+03,
       7.0e+03, 8.0e+03, 9.0e+03, 1.0e+04, 1.2e+04, 1.5e+04, 1.7e+04,
       1.9e+04, 2.0e+04, 2.5e+04, 3.0e+04, 3.5e+04, 4.0e+04, 4.5e+04,
       5.0e+04, 6.0e+04, 7.0e+04, 8.0e+04, 1.0e+05, 1.5e+05, 2.0e+05,
       3.0e+05, 1.0e+09])

In [229]:
km_vec

array([0.0e+00, 1.0e+00, 2.0e+00, 5.0e+01, 1.0e+02, 2.0e+02, 5.0e+02,
       1.0e+03, 2.0e+03, 3.0e+03, 5.0e+03, 1.0e+04, 1.5e+04, 2.0e+04,
       2.5e+04, 3.0e+04, 3.5e+04, 4.0e+04, 4.5e+04, 5.0e+04, 6.0e+04,
       7.0e+04, 8.0e+04, 1.0e+05, 1.5e+05, 2.0e+05, 3.0e+05, 1.0e+09])

In [234]:
# State shared with the scraping loop: the pending insert buffer, the running
# total of inserted rows, and the buffer size that triggers a flush.
cars_to_insert = []
count_added = 0
batch_size = 500  # rows sent per insert request

# Load the stored car_id values with one query and keep them in a set so the
# scraping loop can test membership cheaply.
# NOTE(review): a single select may be capped by the API's row limit; confirm
# that every row is actually returned.
print("Fetching existing car IDs from the database...")
response = supabase.table(table_name).select("car_id").execute()
car_ids_in_database = set(row['car_id'] for row in response.data)
print(f"Found {len(car_ids_in_database)} existing car IDs.")

Fetching existing car IDs from the database...
Found 21461 existing car IDs.


In [None]:
# --- Main Loops ---
# Walk every (price bucket x mileage bucket) combination, page through the
# search results for that combination, and queue listings whose car_id is not
# yet known. The queue is flushed to the database in batches of `batch_size`.

# This name was never assigned anywhere in the notebook, so the cell raised a
# NameError on a fresh kernel. The page loop also stops at the first empty page.
page_limit_autoscout = 20  # assumed cap on result pages per query — TODO confirm against the site
request_timeout_s = 30     # seconds to wait for one page before giving up on it

# A later cell rebinds this name to a list of row dicts; membership tests against
# that list would never match and every listing would be re-inserted.
if not isinstance(car_ids_in_database, set):
    raise TypeError(
        "car_ids_in_database must be a set of car_id strings; "
        "re-run the cell that fetches the existing car IDs first"
    )

# Consecutive edges form the (from, to) range of one bucket.
price_buckets = list(zip(price_vec[:-1], price_vec[1:]))
km_buckets = list(zip(km_vec[:-1], km_vec[1:]))

# Loop over price range
for price_from, price_to in tqdm(price_buckets):
    # Loop over mileage
    for km_from, km_to in km_buckets:
        # Loop over pages
        for page in range(1, page_limit_autoscout + 1):
            # Build a fresh query per request so the shared `params` defaults stay untouched.
            query = {
                **params,
                "pricefrom": round(price_from),
                "priceto": round(price_to),
                "kmfrom": round(km_from),
                "kmto": round(km_to),
                "page": page,
            }

            try:
                page_response = requests.get(base_url, params=query, timeout=request_timeout_s)
            except requests.RequestException as exc:
                print(f"Request failed ({exc}); skipping the rest of this bucket")
                break
            finally:
                # Out of respect for the website: pause after every request,
                # including the ones that end this bucket early.
                time.sleep(0.1)

            if not page_response.ok:
                print(f"HTTP {page_response.status_code} for {page_response.url}; skipping the rest of this bucket")
                break

            soup = BeautifulSoup(page_response.text, "html.parser")
            car_listings = soup.find_all("article", class_="cldt-summary-full-item")

            if not car_listings:
                # Break out of the page loop if no listings are found
                break

            for car in car_listings:
                car_id = car.get("id")

                # Skip listings without a usable key and listings already stored or queued.
                if car_id is None or car_id in car_ids_in_database:
                    continue

                # Missing or non-numeric attributes are stored as the sentinel -1.
                try:
                    data_mileage = float(car.get("data-mileage"))
                except (ValueError, TypeError):
                    data_mileage = -1

                try:
                    listing_price = float(car.get("data-price"))
                except (ValueError, TypeError):
                    listing_price = -1

                # Record the id right away so the same car showing up again in a
                # neighbouring bucket or on a later page is not queued twice.
                car_ids_in_database.add(car_id)
                cars_to_insert.append({
                    "car_id": car_id,
                    "make": car.get("data-make"),
                    "model": car.get("data-model"),
                    "first_registration": car.get("data-first-registration"),
                    "fuel_type": car.get("data-fuel-type"),
                    "mileage": data_mileage,
                    "post_code": car.get("data-listing-zip-code"),
                    "listing_price": listing_price,
                })

                # --- Batch Inserts: flush once the batch is full ---
                if len(cars_to_insert) >= batch_size:
                    print(f"Inserting {len(cars_to_insert)} cars to the database...")
                    supabase.table(table_name).insert(cars_to_insert).execute()
                    count_added += len(cars_to_insert)
                    cars_to_insert = []  # Reset the batch list

# --- Final Batch Insert ---
# Insert any remaining cars after all loops have finished
if cars_to_insert:
    print(f"Inserting final {len(cars_to_insert)} cars to the database...")
    supabase.table(table_name).insert(cars_to_insert).execute()
    count_added += len(cars_to_insert)
    cars_to_insert = []  # nothing pending, so re-running this cell cannot insert the batch again

print(f"\nTotal cars added to the database: {count_added}")

  0%|                                                                                                                                                           | 0/29 [00:00<?, ?it/s]

In [None]:
count_added

In [None]:
# Re-fetch the car_id column after the scrape.
# NOTE(review): this rebinds `car_ids_in_database` from the set built before the
# scraping loop to the raw list of row dicts. Re-running the scraping loop after
# this cell would make its `car_id not in car_ids_in_database` check compare
# strings against dicts — confirm, and consider a separate name for the raw rows.
response = supabase.table(table_name).select("car_id").execute()
car_ids_in_database = response.data

In [None]:
# Single-column frame (column label 0) holding the car_id of every fetched row
df = pd.DataFrame([d['car_id'] for d in car_ids_in_database])

In [None]:
# Distinct ids minus total rows: 0 means every car_id is unique, a negative
# value is minus the number of surplus duplicate rows.
len(df[0].unique()) - len(df)

## Remove duplicates in database