In [41]:
# HTTP headers attached to every outgoing request in this notebook.
headers = {
    "Content-Type": "application/json",
    "Connection": "keep-alive",
    # Browser-like User-Agent string.
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
    # NOTE(review): Access-Control-Allow-Origin is normally a CORS *response*
    # header - confirm it is actually needed on outgoing requests.
    "Access-Control-Allow-Origin": "*",
}

import re

def remove_whitespace(s: str, keep_spaces: bool = True) -> str:
    """
    Clean whitespace while keeping natural spaces between words.

    - Remove every backslash character
    - Replace each run of whitespace (spaces, tabs, newlines, etc.) with one space
    - Trim leading/trailing whitespace

    Args:
        s: Text to clean.
        keep_spaces: Accepted for backward compatibility; it is not read, so
            single spaces between words are always preserved.

    Returns:
        The cleaned string.
    """
    # Drop backslashes *before* collapsing: a backslash sitting between two
    # spaces would otherwise leave a double space behind once it is removed.
    s = s.replace("\\", "")

    # Any run of whitespace (tabs, newlines, repeated spaces, ...) -> one space
    s = re.sub(r"\s+", " ", s)

    # Trim
    return s.strip()


def get_specific_element(regex: str, text: str):
    """
    Return the first substring of ``text`` matched by ``regex``.

    The pattern is searched anywhere in ``text`` (``re.search``); the whole
    match (group 0) is returned, or ``None`` when nothing matches.
    """
    found = re.search(regex, text)
    if found is None:
        return None
    return found.group(0)

def safe_text(element, keep_spaces=False):
    """
    Return ``element`` cleaned by ``remove_whitespace``, or "" if it is falsy.

    Falsy inputs (``None``, empty string, ...) short-circuit to an empty
    string; ``keep_spaces`` is forwarded unchanged to ``remove_whitespace``.
    """
    return remove_whitespace(element, keep_spaces) if element else ""




In [None]:
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Base URL: first page of the paruvendu.fr search results
url = "https://www.paruvendu.fr/immobilier/vente/ile-de-france/?rechpv=1&tt=1&tbApp=1&tbDup=1&tbChb=1&tbLof=1&tbAtl=1&tbPla=1&tbMai=1&tbVil=1&tbCha=1&tbPro=1&tbHot=1&tbMou=1&tbFer=1&nbp0=99&pa=FR&lo=75,77,78,91,92,93,94,95&lol=0&ray=50"

# Crawl parameters
LAST_PAGE = 500         # result pages 1..LAST_PAGE are requested
REQUEST_TIMEOUT_S = 10  # per-request timeout, in seconds
REQUEST_DELAY_S = 2     # pause after every page request, in seconds

# Prepare page URLs: page 1 is the bare URL, later pages append "&p=<n>"
pages = [url] + [f"{url}&p={i}" for i in range(2, LAST_PAGE + 1)]

# hrefs collected from the first link of every "blocAnnonce" div
links = []

# A single Session lets the underlying connection be reused across pages
with requests.Session() as session:
    session.headers.update(headers)

    for idx, p in enumerate(pages, start=1):
        try:
            print(f"[{idx}/{len(pages)}] Fetching: {p}")
            response = session.get(p, timeout=REQUEST_TIMEOUT_S)

            if response.status_code != 200:
                print(f"  ⚠️ Failed to fetch page {idx}, status code: {response.status_code}")
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            blocs = soup.find_all("div", class_="blocAnnonce")
            print(f"  Found {len(blocs)} annonces on this page")

            for bloc in blocs:
                a_tag = bloc.find("a", href=True)
                if a_tag:
                    links.append(a_tag["href"])

        except Exception as e:
            # Best-effort crawl: report the failure and move on to the next page
            print(f"  ❌ Error on page {idx}: {e}")

        finally:
            # Pause after *every* attempt - including failed ones - so error
            # pages are not requested back-to-back without any delay.
            sleep(REQUEST_DELAY_S)

# Convert to full URLs; urljoin leaves already-absolute hrefs untouched and
# inserts the "/" separator when a relative href lacks a leading slash.
base_url = "https://www.paruvendu.fr"
full_links = [urljoin(base_url, href) for href in links]

print(f"✅ Total links collected: {len(full_links)}")


[1/500] Fetching: https://www.paruvendu.fr/immobilier/vente/ile-de-france/?rechpv=1&tt=1&tbApp=1&tbDup=1&tbChb=1&tbLof=1&tbAtl=1&tbPla=1&tbMai=1&tbVil=1&tbCha=1&tbPro=1&tbHot=1&tbMou=1&tbFer=1&nbp0=99&pa=FR&lo=75,77,78,91,92,93,94,95&lol=0&ray=50
  Found 30 annonces on this page
[2/500] Fetching: https://www.paruvendu.fr/immobilier/vente/ile-de-france/?rechpv=1&tt=1&tbApp=1&tbDup=1&tbChb=1&tbLof=1&tbAtl=1&tbPla=1&tbMai=1&tbVil=1&tbCha=1&tbPro=1&tbHot=1&tbMou=1&tbFer=1&nbp0=99&pa=FR&lo=75,77,78,91,92,93,94,95&lol=0&ray=50&p=2
  Found 30 annonces on this page
[3/500] Fetching: https://www.paruvendu.fr/immobilier/vente/ile-de-france/?rechpv=1&tt=1&tbApp=1&tbDup=1&tbChb=1&tbLof=1&tbAtl=1&tbPla=1&tbMai=1&tbVil=1&tbCha=1&tbPro=1&tbHot=1&tbMou=1&tbFer=1&nbp0=99&pa=FR&lo=75,77,78,91,92,93,94,95&lol=0&ray=50&p=3
  Found 30 annonces on this page
[4/500] Fetching: https://www.paruvendu.fr/immobilier/vente/ile-de-france/?rechpv=1&tt=1&tbApp=1&tbDup=1&tbChb=1&tbLof=1&tbAtl=1&tbPla=1&tbMai=1&tbVil=1