In [1]:
import random
import time
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# HTTP headers passed to every requests.get() call in this notebook.
# They describe a desktop Chrome browser on Windows that prefers Spanish content
# and arrives from the autocasion.com home page.
# NOTE(review): presumably needed so the site serves the regular HTML listing
# to the scraper — confirm whether all four headers are actually required.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://www.autocasion.com",
    "Accept-Language": "es-ES,es;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
}

def extraer_marcas_urls(url_base="https://www.autocasion.com/coches-ocasion", timeout=15):
    """Download the brand index page and map each brand to its listing URL.

    Args:
        url_base: URL of the page that contains the brand accordion.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        dict: upper-cased brand name -> absolute URL of that brand's listing.
        Empty if the brand section is not present in the page.

    Raises:
        requests.HTTPError: If the response status is 4xx/5xx.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url_base, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    marcas = {}
    seccion_marcas = soup.find("div", class_="acordeon brand-acordeon")
    if not seccion_marcas:
        print("No se encontró la sección de marcas.")
        return marcas

    for a in seccion_marcas.find_all("a", href=True, class_="icon"):
        href = a["href"]
        # Only links that point at a brand listing are of interest.
        if "/coches-segunda-mano/" not in href:
            continue
        nombre = a.get_text(strip=True).upper()
        # urljoin keeps an already-absolute href intact instead of prefixing
        # the host a second time, and resolves root-relative hrefs as before.
        marcas[nombre] = urljoin("https://www.autocasion.com", href)
    return marcas

def extraer_num_paginas(soup):
    """Return the number of result pages advertised by a listing page.

    The pagination is looked up inside ``<div class="num">``:

    1. If it contains an ``<a class="total_pages">`` link, its text is the
       page count (1 when that text is not an integer).
    2. Otherwise the largest integer found among the link texts is used.

    Args:
        soup: Parsed page (a BeautifulSoup-like object exposing ``find``).

    Returns:
        int: Page count; 1 when no pagination information can be read.
    """
    div_num = soup.find("div", class_="num")
    if not div_num:
        return 1

    total_pages_a = div_num.find("a", class_="total_pages")
    if total_pages_a:
        try:
            return int(total_pages_a.get_text(strip=True))
        except ValueError:
            # Only a non-numeric label is expected here; anything else
            # (including KeyboardInterrupt) must not be silenced.
            return 1

    paginas = []
    for a in div_num.find_all("a", href=True):
        try:
            paginas.append(int(a.get_text(strip=True)))
        except ValueError:
            # Non-numeric links such as "next"/"previous" are skipped.
            continue
    return max(paginas, default=1)

def _parsear_anuncios(soup, marca, urls_vistas):
    """Build the ad records found in one parsed listing page.

    Every ad URL that is returned is also added to ``urls_vistas`` (mutated in
    place) so the same ad is not emitted again on a later page.
    """
    nuevos = []
    for anuncio in soup.find_all("a", href=True):
        # Links that wrap a <p class="relacionados"> are not counted as ads.
        if anuncio.find("p", class_="relacionados"):
            continue

        # An ad link is recognised by the <h2 itemprop="name"> title inside it.
        titulo_tag = anuncio.find("h2", itemprop="name")
        if not titulo_tag:
            continue

        # urljoin leaves an already-absolute href untouched.
        url_anuncio = urljoin("https://www.autocasion.com", anuncio["href"])
        if url_anuncio in urls_vistas:
            continue
        urls_vistas.add(url_anuncio)

        ul = anuncio.find("ul")
        tags = [li.get_text(strip=True) for li in ul.find_all("li")] if ul else []

        nuevos.append({
            "id_extraccion": url_anuncio,
            "timestamp_extraccion": datetime.now().isoformat(),
            "marca": marca,
            "titulo": titulo_tag.get_text(strip=True),
            "url": url_anuncio,
            "tags": tags
        })
    return nuevos


def extraer_anuncios_de_marca(marca, url_base, timeout=15):
    """Collect every ad listed for one brand, walking through all result pages.

    The first page is downloaded once and used both to read the page count and
    to extract its ads. Following pages are requested as ``?page=N``. Scraping
    of the brand stops early on a non-200 status, on a network error, or when
    a page yields no new ads; whatever was collected so far is returned.

    Args:
        marca: Brand name stored in each record.
        url_base: URL of the brand's first listing page.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        list[dict]: One record per ad with keys ``id_extraccion``,
        ``timestamp_extraccion``, ``marca``, ``titulo``, ``url`` and ``tags``.
    """
    anuncios = []
    urls_vistas = set()

    print(f"\nExtrayendo anuncios para {marca}...")

    try:
        response = requests.get(url_base, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error de red al cargar la primera página de {marca}: {exc}")
        return anuncios
    if response.status_code != 200:
        print(f"Error HTTP {response.status_code} al cargar la primera página de {marca}")
        return anuncios
    soup = BeautifulSoup(response.text, "html.parser")
    num_paginas = extraer_num_paginas(soup)
    print(f"  Número total de páginas: {num_paginas}")

    for pagina in range(1, num_paginas + 1):
        url_pagina = url_base if pagina == 1 else f"{url_base}?page={pagina}"
        print(f"  Página {pagina}: {url_pagina}")

        # Page 1 was already downloaded above; only later pages need a request.
        if pagina > 1:
            try:
                response = requests.get(url_pagina, headers=HEADERS, timeout=timeout)
            except requests.RequestException as exc:
                print(f"  Error de red en la página {pagina}: {exc}")
                break
            if response.status_code != 200:
                print(f"  Error HTTP {response.status_code} en la página {pagina}")
                break
            soup = BeautifulSoup(response.text, "html.parser")

        nuevos = _parsear_anuncios(soup, marca, urls_vistas)
        if not nuevos:
            print("  No se encontraron más anuncios en esta página.")
            break
        anuncios.extend(nuevos)

        # Random pause between pages to avoid hammering the server.
        time.sleep(random.uniform(0.5, 1.2))

    return anuncios

if __name__ == "__main__":
    # Discover every brand and its listing URL.
    marcas = extraer_marcas_urls()
    print(f"\nMarcas encontradas: {len(marcas)}")

    anuncios_totales = []
    # Ad ids already stored, so an ad reachable from several brands is kept once.
    urls_vistas_global = set()

    for marca, url in marcas.items():
        try:
            anuncios_marca = extraer_anuncios_de_marca(marca, url)
        except requests.RequestException as exc:
            # A network failure on one brand must not discard the ads already
            # collected for all the previous ones.
            print(f"Error de red con {marca}, se omite la marca: {exc}")
            anuncios_marca = []
        for anuncio in anuncios_marca:
            if anuncio["id_extraccion"] not in urls_vistas_global:
                anuncios_totales.append(anuncio)
                urls_vistas_global.add(anuncio["id_extraccion"])
        # Random pause between brands to keep the request rate low.
        time.sleep(random.uniform(1, 2))

    # Build the frame once from the accumulated records.
    df = pd.DataFrame(anuncios_totales)
    print(f"\nTotal anuncios extraídos: {len(df)}")
    print(df.head())




Marcas encontradas: 106

Extrayendo anuncios para ABARTH...
  Número total de páginas: 13
  Página 1: https://www.autocasion.com/coches-segunda-mano/abarth-ocasion
  Página 2: https://www.autocasion.com/coches-segunda-mano/abarth-ocasion?page=2
  Página 3: https://www.autocasion.com/coches-segunda-mano/abarth-ocasion?page=3
  Página 4: https://www.autocasion.com/coches-segunda-mano/abarth-ocasion?page=4
  Página 5: https://www.autocasion.com/coches-segunda-mano/abarth-ocasion?page=5
  Página 6: https://www.autocasion.com/coches-segunda-mano/abarth-ocasion?page=6
  Página 7: https://www.autocasion.com/coches-segunda-mano/abarth-ocasion?page=7
  Página 8: https://www.autocasion.com/coches-segunda-mano/abarth-ocasion?page=8
  Página 9: https://www.autocasion.com/coches-segunda-mano/abarth-ocasion?page=9
  Página 10: https://www.autocasion.com/coches-segunda-mano/abarth-ocasion?page=10
  Página 11: https://www.autocasion.com/coches-segunda-mano/abarth-ocasion?page=11
  Página 12: https://

In [2]:
# Display the scraped ads built by the previous cell (the rendered output
# shows only the first and last rows of the frame).
df

Unnamed: 0,id_extraccion,timestamp_extraccion,marca,titulo,url,tags
0,https://www.autocasion.com/coches-segunda-mano...,2025-06-20T21:10:20.086439,ABARTH,ABARTH 500 C 695C 1.4 16v T-Jet 132kW (180 CV),https://www.autocasion.com/coches-segunda-mano...,"[2022, Gasolina, 23.213 km, Madrid]"
1,https://www.autocasion.com/coches-segunda-mano...,2025-06-20T21:10:20.086694,ABARTH,ABARTH 124 SPIDER Spider Turbo Multiair 125kW...,https://www.autocasion.com/coches-segunda-mano...,"[2018, Gasolina, 97.814 km, Barcelona]"
2,https://www.autocasion.com/coches-segunda-mano...,2025-06-20T21:10:20.086996,ABARTH,ABARTH 595 1.4T JET 121KW,https://www.autocasion.com/coches-segunda-mano...,"[2022, Gasolina, 33.602 km, Málaga]"
3,https://www.autocasion.com/coches-segunda-mano...,2025-06-20T21:10:20.087274,ABARTH,ABARTH 695 Pequeño Manual de 3 Puertas,https://www.autocasion.com/coches-segunda-mano...,"[2018, Gasolina, 48.500 km, Islas Baleares]"
4,https://www.autocasion.com/coches-segunda-mano...,2025-06-20T21:10:20.087595,ABARTH,ABARTH 500 1.4 Turbo 595,https://www.autocasion.com/coches-segunda-mano...,"[2020, Gasolina, 32.922 km, Barcelona]"
...,...,...,...,...,...,...
99918,https://www.autocasion.com/coches-km0/km-0/yud...,2025-06-21T00:33:12.156646,YOOUDOOO,YOOUDOOO Y2 1.5L Comfort,https://www.autocasion.com/coches-km0/km-0/yud...,"[2025, Gasolina, 2 km, La Rioja]"
99919,https://www.autocasion.com/coches-km0/km-0/yud...,2025-06-21T00:33:12.156777,YOOUDOOO,YOOUDOOO K3 YUDO Pequeño Automático de 5 Puertas,https://www.autocasion.com/coches-km0/km-0/yud...,"[2024, Eléctrico, 10 km, Barcelona]"
99920,https://www.autocasion.com/coches-km0/km-0/yud...,2025-06-21T00:33:12.156928,YOOUDOOO,YOOUDOOO K3 70kW,https://www.autocasion.com/coches-km0/km-0/yud...,"[2024, Eléctrico, 3.000 km, Sevilla]"
99921,https://www.autocasion.com/coches-km0/km-0/yud...,2025-06-21T00:33:12.157050,YOOUDOOO,YOOUDOOO K3 70kW,https://www.autocasion.com/coches-km0/km-0/yud...,"[2025, Eléctrico, 4.585 km, La Rioja]"


In [3]:
# Save the scraped ads as a UTF-8 CSV file, without writing the row index.
# NOTE(review): the "tags" column holds Python lists; they are presumably
# written as their string representation — confirm how readers parse it back.
df.to_csv("anuncios_unificados1.csv", index=False, encoding="utf-8")
