In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import requests
from tqdm import tqdm
import time
import string
import itertools
import subprocess
from dotenv import load_dotenv
import logging
import os
import numpy as np
from bs4 import BeautifulSoup
import requests
from supabase import create_client, Client
import json
from dotenv import load_dotenv
from rich.console import Console
from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn
from datetime import datetime
import re
import random

In [2]:
# Load variables from .env into the environment
load_dotenv()

# Read variables
supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_KEY")

In [35]:
# Initialize client
supabase: Client = create_client(supabase_url, supabase_key)

In [36]:
# # Example: Check status
# command = r'cd "C:\Program Files\NordVPN" && nordvpn -c -g "Netherlands"'
# result = subprocess.run(command, shell=True, capture_output=True, text=True)
# time.sleep(5)

In [37]:
BASE_URL = "https://openpostcode.nl/api/address"

params = {
    "postcode": "5056TG",
    "huisnummer": '23'
}

response = requests.get(BASE_URL, params=params)

In [38]:
response.status_code

200

In [39]:
car_adverts_table = 'autoscout_car_adverts'
response = supabase.table(car_adverts_table).select("car_id, post_code").not_.is_("post_code","null").execute()
df_full = pd.DataFrame(response.data)
postcodes_in_car_database = set(df_full['post_code'])

In [40]:
def fetch_all_rows_in_batches(
    supabase,
    table_name: str,
    columns: str = "*",
    batch_size: int = 5000,
    max_batches: int | None = None
):
    """
    Fetch all rows from a Supabase table in batches to avoid timeouts.

    Args:
        supabase: Supabase client instance
        table_name: Name of the table to query
        columns: Comma-separated column names or "*" for all
        batch_size: Number of rows per batch
        max_batches: Optional limit (for testing or large tables)

    Returns:
        List of dicts containing all rows fetched.
    """
    all_rows = []
    offset = 0
    batch_count = 0

    while True:
        try:
            response = (
                supabase.table(table_name)
                .select(columns)
                .range(offset, offset + batch_size - 1)
                .execute()
            )

            data = response.data

            # Stop when no more rows
            if not data:
                logging.info(f"No more rows after offset {offset}.")
                break

            all_rows.extend(data)
            offset += batch_size
            batch_count += 1

            logging.info(f"Fetched {len(data)} rows (total {len(all_rows)}).")

            # Optional: stop early if max_batches is set
            if max_batches and batch_count >= max_batches:
                logging.info(f"Reached max_batches ({max_batches}), stopping early.")
                break

        except Exception as e:
            logging.error(f"Error fetching batch starting at {offset}: {e}")
            time.sleep(2)
            break

    return all_rows


In [43]:
tmp = fetch_all_rows_in_batches(supabase, car_adverts_table, "car_id, post_code", 50000)

In [45]:
pd.DataFrame(tmp)

Unnamed: 0,car_id,post_code
0,54160968-7f61-40e3-aa01-d8a144e43778,7447JK
1,83fc4c33-72c9-41a0-933f-b481bfb52dc3,4815PN
2,ed521553-6242-4a23-bdc0-be20ab80b189,1327GE
3,f03e55e9-9ff1-49f5-8dc9-99c56f53b1f2,5141PB
4,3ff5f15f-c976-440a-b617-22bb1dd6832d,5161PA
...,...,...
431410,859fa688-a00f-4c39-8a71-5dddfd8b2d95,5741SX
431411,4b15e529-82db-4870-9adf-326bc2e315d2,5741SX
431412,2ad04bdd-4f4c-4c04-bc03-bfe89186c06a,5469NM
431413,c14feb83-0843-4c0f-85fd-00f01b049670,7333NS


In [27]:
postcodes_table = 'postcode_info_nl'
response = supabase.table(postcodes_table).select("post_code", "latitude").not_.is_("latitude","null").execute()
df_full = pd.DataFrame(response.data)
postcodes_in_database = set(df_full['post_code'])

In [28]:
len(postcodes_in_database)

42663

In [29]:
len(postcodes_in_car_database)

42143

In [20]:
postcodes_not_in_database = postcodes_in_car_database.difference(postcodes_in_database)

In [21]:
len(postcodes_not_in_database)

660

In [47]:
import multiprocessing

num_cores = multiprocessing.cpu_count()
print(f"CPU cores available: {num_cores}")


CPU cores available: 8


In [13]:
# Initialize a rich Console object
console = Console()

batch_size = 100
postcodes_to_insert = []
count_added = 0

# Loop through all combinations (generator approach)
try:
    for code in tqdm(postcodes_in_car_database):
        if code in postcodes_in_database:
            continue
        
        # print(code)  # or process it
        params = {
            "postcode": code,
            "huisnummer": 1
        }
        response = requests.get(BASE_URL, params=params)
        if response.status_code == 500  or response.status_code == 429:
            # command = r'cd "C:\Program Files\NordVPN" && nordvpn -c -g "Netherlands"'
            # result = subprocess.run(command, shell=True, capture_output=True, text=True)
            # time.sleep(10)
            # response = requests.get(BASE_URL, params=params)
            # console.log(f"Reconnecting VPN...")
            # raise Exception("Too many requests.")
            continue
        
        elif 'latitude' and 'longitude' in response.json().keys():
            lat = response.json()['latitude']
            lon = response.json()['longitude']
            straat = response.json()['straat']
            buurt = response.json()['buurt']
            wijk = response.json()['wijk']
            woonplaats = response.json()['woonplaats']
            gemeente = response.json()['gemeente']
            provincie = response.json()['provincie']
            huisnummer = response.json()['huisnummer']
            
        elif response.json()['error'] == 'Huisnummer not found':
            params = {
                "postcode": code,
                "huisnummer": response.json()['suggestions'][0]
            }
            response = requests.get(BASE_URL, params=params)
            if response.status_code == 500  or response.status_code == 429:
                # command = r'cd "C:\Program Files\NordVPN" && nordvpn -c -g "Netherlands"'
                # result = subprocess.run(command, shell=True, capture_output=True, text=True)
                # time.sleep(10)
                # response = requests.get(BASE_URL, params=params)
                # console.log(f"Reconnecting VPN...")
                # raise Exception("Too many requests.")
                continue
            lat = response.json()['latitude']
            lon = response.json()['longitude']
            straat = response.json()['straat']
            buurt = response.json()['buurt']
            wijk = response.json()['wijk']
            woonplaats = response.json()['woonplaats']
            gemeente = response.json()['gemeente']
            provincie = response.json()['provincie']
            huisnummer = response.json()['huisnummer']
        # elif response.status_code == 429:
        #     # subprocess.run(["nordvpn", "disconnect", "Netherlands"])
        #     # time.sleep(1)
        #     # subprocess.run(["nordvpn", "connect", "Netherlands"])
        #     break
        else:
            lat = None
            lon = None
            straat = None
            buurt = None
            wijk = None
            woonplaats = None
            gemeente = None
            provincie = None
            huisnummer = None
            
        postcode_info = {
            "post_code": code,
            "huisnummer": huisnummer,
            "straat": straat,
            "buurt": buurt,
            "wijk": wijk,
            "woonplaats": woonplaats,
            "gemeente": gemeente,
            "provincie": provincie,
            "longitude": lon,
            "latitude": lat,
        }
        postcodes_to_insert.append(postcode_info)
        postcodes_in_database.add(code)
        time.sleep(random.uniform(0.01, 0.05))
        if len(postcodes_to_insert) >= batch_size:
            console.log(f"Inserting {len(postcodes_to_insert)} postcodes to the database...")
            supabase.table(postcodes_table).upsert(postcodes_to_insert).execute()
            count_added += len(postcodes_to_insert)
            postcodes_to_insert = []

    if postcodes_to_insert:
        console.log(f"Inserting final {len(postcodes_to_insert)} cars to the database...")
        supabase.table(postcodes_table).upsert(postcodes_to_insert).execute()
        count_added += len(postcodes_to_insert)
    
except requests.exceptions.RequestException as e:
    
    if postcodes_to_insert:
        console.log(f"Inserting final {len(postcodes_to_insert)} cars to the database...")
        supabase.table(postcodes_table).upsert(postcodes_to_insert).execute()
        count_added += len(postcodes_to_insert)
        raise SystemExit(e)

 14%|████████████████████████████                                                                                                                                                                              | 5807/41770 [00:23<01:32, 389.25it/s]

 29%|█████████████████████████████████████████████████████████▍                                                                                                                                               | 11946/41770 [00:37<00:38, 768.68it/s]

 44%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                                                | 18333/41770 [00:54<01:01, 380.53it/s]

 59%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 24772/41770 [01:07<00:39, 431.85it/s]

 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 31473/41770 [01:20<00:10, 937.93it/s]

 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 38290/41770 [01:33<00:05, 583.08it/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41770/41770 [01:38<00:00, 423.08it/s]


In [23]:
response

APIResponse[~_ReturnT](data=[{'post_code': '1011AB', 'latitude': 52.378666}, {'post_code': '1011AC', 'latitude': 52.377644}, {'post_code': '1011AD', 'latitude': 52.37682}, {'post_code': '1011AE', 'latitude': 52.3757}, {'post_code': '1011AG', 'latitude': 52.375946}, {'post_code': '1011AH', 'latitude': 52.37576}, {'post_code': '1011AJ', 'latitude': 52.375534}, {'post_code': '1011AK', 'latitude': 52.374645}, {'post_code': '1011AL', 'latitude': 52.36883}, {'post_code': '1011AM', 'latitude': 52.374218}, {'post_code': '1011AN', 'latitude': 52.374073}, {'post_code': '1011AP', 'latitude': 52.373943}, {'post_code': '1011AR', 'latitude': 52.3738}, {'post_code': '1011AS', 'latitude': 52.373497}, {'post_code': '1011AT', 'latitude': 52.373352}, {'post_code': '1011AV', 'latitude': 52.373158}, {'post_code': '1011AW', 'latitude': 52.373005}, {'post_code': '1011AX', 'latitude': 52.374126}, {'post_code': '1011AZ', 'latitude': 52.373497}, {'post_code': '1011BA', 'latitude': 52.373558}, {'post_code': '101

In [227]:
response.data

[{'post_code': '1011AB', 'latitude': 52.378666},
 {'post_code': '1011AC', 'latitude': 52.377644},
 {'post_code': '1011AD', 'latitude': 52.37682},
 {'post_code': '1011AE', 'latitude': 52.3757},
 {'post_code': '1011AG', 'latitude': 52.375946},
 {'post_code': '1011AH', 'latitude': 52.37576},
 {'post_code': '1011AJ', 'latitude': 52.375534},
 {'post_code': '1011AK', 'latitude': 52.374645},
 {'post_code': '1011AL', 'latitude': 52.36883},
 {'post_code': '1011AM', 'latitude': 52.374218},
 {'post_code': '1011AN', 'latitude': 52.374073},
 {'post_code': '1011AP', 'latitude': 52.373943},
 {'post_code': '1011AR', 'latitude': 52.3738},
 {'post_code': '1011AS', 'latitude': 52.373497},
 {'post_code': '1011AT', 'latitude': 52.373352},
 {'post_code': '1011AV', 'latitude': 52.373158},
 {'post_code': '1011AW', 'latitude': 52.373005},
 {'post_code': '1011AX', 'latitude': 52.374126},
 {'post_code': '1011AZ', 'latitude': 52.373497},
 {'post_code': '1011BA', 'latitude': 52.373558},
 {'post_code': '1011BB', 'l