In [None]:
# Scraping the posts:
#  - worker threads (ThreadPoolExecutor) run the fetching in parallel
#  - tenacity retry decorators re-attempt requests that fail with an HTTP error
#    status such as 429 (Too Many Requests), waiting longer between attempts

import requests
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import random
import time
import os
from google.colab import files
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

Ce dictionnaire (`brands`) contient les marques de véhicules qui ont été extraites dynamiquement du site web source. Chaque marque est associée à l'identifiant numérique que le site utilise dans le paramètre `brand` de l'URL de recherche. Ces marques ont été collectées à l'aide d'un script de web scraping utilisant Selenium, en interagissant avec des listes déroulantes ou d'autres éléments interactifs de la page.

In [None]:
brands = {
    'Mercedes-Benz': 41, 'Renault': 49,
    'Peugeot': 46, 'Toyota': 56, 'Ford': 18, 'Volkswagen': 58, 'Fiat': 17,
    'Hyundai': 24, 'Nissan': 44, 'Chevrolet': 10, 'Kia': 30, 'Citroen': 12,
    'Mazda': 40, 'Opel': 45, 'Suzuki': 55, 'BMW': 5, 'Honda': 22,
    'Audi': 3, 'Alfa Romeo': 1, 'Seat': 50, 'Lada': 72, 'Cadillac': 7,
    'Volvo': 59, 'DFSK': 67, 'Jeep': 29, 'Chery': 9, 'Porsche': 48,
    'Skoda': 51, 'Daewoo': 61, 'Jaguar': 28, 'Rover': 62, 'Ssangyong': 53,
    'Geely': 20, 'Mitsubishi': 43, 'Dacia': 13
}


Rotation des User-Agents : il s'agit d'une technique pour simuler des connexions depuis différents navigateurs réels, afin de réduire le risque d'être détecté et bloqué par le site web source.

In [None]:
# Pool of browser User-Agent strings; one is picked at random when building
# request headers so that requests do not all advertise the same client.
user_agents = [
    # Chrome 117 on Windows 10
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/117.0.0.0 Safari/537.36',
    # Safari 16 on macOS
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
    'AppleWebKit/605.1.15 (KHTML, like Gecko) '
    'Version/16.0 Safari/605.1.15',
    # Chrome 117 on Linux
    'Mozilla/5.0 (X11; Linux x86_64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/117.0.0.0 Safari/537.36',
]


In [None]:
def _log_listing_retry(retry_state):
    """Print which HTTP error triggered a retry and how long the wait will be."""
    print(
        f"HTTP error in collect_links ({retry_state.outcome.exception()}), "
        f"retrying in {retry_state.next_action.sleep} seconds..."
    )


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=5, min=5, max=20),
    retry=retry_if_exception_type(requests.exceptions.HTTPError),
    before_sleep=_log_listing_retry,
    # Surface the real HTTPError (not tenacity's RetryError) once attempts run out.
    reraise=True,
)
def _fetch_listing_page(url, headers):
    """Download one search-results page and return its HTML text.

    The retry lives here, around the single request, because an exception has
    to escape the decorated function for tenacity to see it. An HTTP error
    status (raised by ``raise_for_status``) is retried up to 3 attempts with
    exponential back-off; the last error is re-raised to the caller.
    """
    response = requests.get(url, headers=headers, timeout=15)
    response.raise_for_status()
    return response.text


def collect_links(brand_id, max_pages=30):
    """Collect post links for a given brand using requests.

    Walks the paginated search results for ``brand_id`` and gathers the
    ``href`` of every listing anchor found.

    Args:
        brand_id: Numeric brand id placed in the ``brand`` query parameter.
        max_pages: Highest page number to request (pages start at 1).

    Returns:
        A list of unique link strings (order not guaranteed). Collection is
        best-effort: on any failure the links gathered so far are returned
        instead of raising.
    """
    base_url = f"https://www.avito.ma/fr/maroc/voitures_d_occasion-%C3%A0_vendre?has_price=true&sp=1&brand={brand_id}"
    links = set()
    # One User-Agent is chosen per brand and reused for all of its pages.
    headers = {'User-Agent': random.choice(user_agents)}

    for page in range(1, max_pages + 1):
        url = f"{base_url}&o={page}"
        try:
            print(f"Fetching {url}")
            html = _fetch_listing_page(url, headers)
            soup = BeautifulSoup(html, 'html.parser')
            listings = soup.select('div.sc-1nre5ec-1.crKvIr.listing > a')
            page_links = [listing['href'] for listing in listings if listing.get('href')]

            if not page_links:
                print(f"No links found on page {page} for brand {brand_id}, stopping.")
                break
            elif len(page_links) < 5 and page > 1:
                # A nearly empty page after the first one is treated as the last page.
                print(f"Only {len(page_links)} links found on page {page} for brand {brand_id}, stopping early.")
                links.update(page_links)
                break

            links.update(page_links)
            print(f"Collected {len(page_links)} links from page {page} for brand {brand_id}")
            time.sleep(random.uniform(5, 10))  # Polite delay between result pages
        except Exception as e:
            # Reached only after the fetch helper has exhausted its retries
            # (or on a non-HTTP failure); keep what was collected so far.
            print(f"Failed to fetch {url}: {e}")
            break

    return list(links)

In [None]:
@retry(
    retry=retry_if_exception_type(requests.exceptions.HTTPError),
    wait=wait_exponential(multiplier=5, min=5, max=20),
    stop=stop_after_attempt(3),
    before_sleep=lambda retry_state: print(f"429 error, retrying in {retry_state.next_action.sleep} seconds...")
)
def scrape_detail_page(url, brand_id, brand_name):
    """Fetch one listing page and extract its fields into a flat dict.

    Args:
        url: Address of the listing's detail page.
        brand_id: Not used by this function; kept for the caller's signature.
        brand_name: Stored as-is under the ``'brand'`` key.

    Returns:
        A dict with ``url``, ``title``, ``price``, ``location`` and ``brand``,
        any recognised specification fields found on the page, and one boolean
        per known equipment item (``False`` unless the item is listed).
        Text fields whose element is missing hold ``'N/A'``.

    Raises:
        An HTTP error status makes ``raise_for_status`` raise ``HTTPError``,
        which the ``@retry`` decorator re-attempts (3 attempts in total).
    """
    page = requests.get(url, headers={'User-Agent': random.choice(user_agents)}, timeout=15)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    def get_text(selector):
        # Stripped text of the first match, or the 'N/A' placeholder.
        node = soup.select_one(selector)
        if not node:
            return 'N/A'
        return node.text.strip()

    def column_for(name):
        # Equipment label -> column name (lower-case, spaces become underscores).
        return name.lower().replace(' ', '_')

    data = {
        'url': url,
        'title': get_text('div.sc-1veij0r-3.cKDbCG > h1'),
        'price': get_text('div.sc-1g3sn3w-1.iIzVCI > div.sc-1gfa0w0-0.gQtXSS > div:nth-child(2) > div.sc-1lz4h6h-0.jRkCWq > div.sc-1g3sn3w-0.cgzdaK > div.sc-1g3sn3w-4.etbZjx > div.sc-1veij0r-0.bDmIpE > div > div.sc-1veij0r-7.jjDooz > div > p'),
        'location': get_text('div.sc-1veij0r-3.cKDbCG > div > span:nth-child(2)'),
        'brand': brand_name
    }

    # Specification labels as shown on the page -> output field names.
    features = {
        'Année-Modèle': 'year',
        'Boite de vitesses': 'transmission',
        'Type de carburant': 'fuel_type',
        'Kilométrage': 'kilometrage',
        'Marque': 'brand_check',
        'Modèle': 'submodel',
        'Nombre de portes': 'door_number',
        'Origine': 'origin',
        'Première main': 'first_owner',
        'État': 'condition',
        'Puissance fiscale': 'puissance_fiscale'
    }

    spec_block = soup.select_one('div.sc-1g3sn3w-4.etbZjx > div:nth-child(5) > div')
    if spec_block:
        for row in spec_block.select('div.sc-19cngu6-2.kuofIS'):
            label_node = row.select_one('span.sc-1x0vz2r-0.bXFCIH')
            value_node = row.select_one('span.sc-1x0vz2r-0.fjZBup')
            if not (label_node and value_node):
                continue
            label = label_node.text.strip()
            value = value_node.text.strip()
            if label in features:
                data[features[label]] = value

    # Known equipment items; each becomes a boolean column.
    extra_features = [
        'ABS', 'Airbags', 'Vitres électriques', 'Verrouillage centralisé à distance',
        'Radar de recul', 'Système de navigation/GPS', 'Caméra de recul',
        'Limiteur de vitesse', 'Jantes aluminium', 'ESP', 'Climatisation',
        'CD/MP3/Bluetooth', 'Sièges cuir', 'Ordinateur de bord', 'Toit ouvrant',
        'Régulateur de vitesse'
    ]
    # Every equipment column starts at False so all rows share the same keys.
    data.update((column_for(name), False) for name in extra_features)

    equipment_block = soup.select_one('div.sc-1g3sn3w-4.etbZjx > div:nth-child(8) > div')
    if equipment_block:
        for span in equipment_block.select('div > div > div > span'):
            listed = span.text.strip()
            if listed in extra_features:
                data[column_for(listed)] = True

    time.sleep(random.uniform(5, 10))  # Pause after a successful scrape
    return data


In [None]:
def scrape_brand(brand_id, brand_name, max_pages=30):
    """Scrape all posts for a given brand with parallel detail scraping.

    Collects the brand's listing links, scrapes each detail page with two
    worker threads, writes the rows to ``avito_brand_<brand_id>_temp.csv`` and
    triggers a Colab download of that file.

    Args:
        brand_id: Numeric brand id used to collect links and name the file.
        brand_name: Brand label used in log lines and stored in each row.
        max_pages: Maximum number of result pages to walk for this brand.

    Returns:
        A list of row dicts, one per successfully scraped post. Posts whose
        detail page failed are logged and skipped.
    """
    print(f"Scraping brand {brand_name} (ID: {brand_id})")
    links = collect_links(brand_id, max_pages)
    total = len(links)
    print(f"Collected {total} links for brand {brand_name}")

    def scrape_link(position, link):
        # The position is passed in rather than looked up with links.index(),
        # which rescanned the list on every call (quadratic overall).
        try:
            print(f"[{brand_name}] Scraping detail page {position}/{total}: {link}")
            return scrape_detail_page(link, brand_id, brand_name)
        except Exception as e:
            print(f"Failed to scrape {link}: {e}")
            return None

    with ThreadPoolExecutor(max_workers=2) as executor:  # Reduced to 2 detail workers
        results = executor.map(scrape_link, range(1, total + 1), links)
        data = [post_data for post_data in results if post_data]

    if data:
        df = pd.DataFrame(data)
        temp_file = f'avito_brand_{brand_id}_temp.csv'
        df.to_csv(temp_file, index=False)
        print(f"[{brand_name}] Incremental save: {len(data)} posts to {temp_file}")
        if os.path.exists(temp_file):
            print(f"Downloading {temp_file}")
            try:
                files.download(temp_file)
            except Exception as e:
                # The download is only a convenience copy; a failure here must
                # not discard the rows that were already scraped and saved.
                print(f"Could not download {temp_file}: {e}")

    return data

In [None]:
def main(
    output_path="/content/avito_car_listings_reduced_workers.csv",
    target_posts=70000,
    max_pages=30,
    checkpoint_min_posts=1000,
):
    """Scrape every brand, merge the rows, and write/download the final CSV.

    Args:
        output_path: Destination of the checkpoint and final CSV.
        target_posts: Stop scheduling further brands once this many unique
            posts have been collected.
        max_pages: Result pages to walk per brand.
        checkpoint_min_posts: Once at least this many posts are collected, the
            CSV is rewritten after each completed brand.

    Brands are scraped by two worker threads and their results are consumed in
    submission order. A brand whose scrape raises is logged and skipped so that
    data from the other brands is kept.
    """
    all_data = []
    visited_links = set()

    print("Starting parallel scraping for brands...")
    with ThreadPoolExecutor(max_workers=2) as executor:
        # Futures (rather than executor.map) allow not-yet-started brands to be
        # cancelled, and let one brand fail without aborting the rest.
        futures = [
            (brand_name, executor.submit(scrape_brand, brand_id, brand_name, max_pages=max_pages))
            for brand_name, brand_id in brands.items()
        ]
        for brand_name, future in futures:
            try:
                brand_data = future.result()
            except Exception as e:
                print(f"Scraping failed for brand {brand_name}: {e}")
                continue

            for post in brand_data or []:
                if post and post['url'] not in visited_links:
                    all_data.append(post)
                    visited_links.add(post['url'])
                    print(f"Global progress: {len(all_data)}/{target_posts} posts collected")

            if len(all_data) >= checkpoint_min_posts:
                df = pd.DataFrame(all_data)
                df.to_csv(output_path, index=False, encoding='utf-8')
                print(f"Saved {len(df)} posts to {output_path}")

            if len(all_data) >= target_posts:
                print(f"Reached target of ~{target_posts} posts")
                # Leaving the `with` block waits for every queued task, so a
                # bare `break` would still scrape all remaining brands. Cancel
                # the ones that have not started; brands already running
                # finish, but their results are not consumed.
                for _, pending in futures:
                    pending.cancel()
                break

    columns = [
        'url', 'title', 'price', 'location', 'brand', 'submodel', 'year',
        'transmission', 'fuel_type', 'kilometrage', 'door_number', 'origin',
        'first_owner', 'condition', 'puissance_fiscale', 'abs', 'airbags',
        'vitres_électriques', 'verrouillage_centralisé_à_distance', 'radar_de_recul',
        'système_de_navigation/gps', 'caméra_de_recul', 'limiteur_de_vitesse',
        'jantes_aluminium', 'esp', 'climatisation', 'cd/mp3/bluetooth',
        'sièges_cuir', 'ordinateur_de_bord', 'toit_ouvrant', 'régulateur_de_vitesse'
    ]
    if not all_data:
        print("No posts were collected; the output file will contain only the header row.")

    # Reindex before de-duplicating: with zero rows the frame has no columns
    # and drop_duplicates(subset=...) would raise KeyError. For non-empty data
    # the resulting rows are the same in either order.
    df = pd.DataFrame(all_data).reindex(columns=columns, fill_value='N/A')
    df = df.drop_duplicates(subset=['url', 'title', 'price', 'location'])
    df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"Final save: {len(df)} unique posts to {output_path}")
    files.download(output_path)

if __name__ == "__main__":
    main()