In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from datetime import datetime
import re

In [2]:
def setup_driver():
    """
    Build and return a headless Chrome WebDriver configured for scraping.

    The chromedriver binary is resolved through ``ChromeDriverManager().install()``.
    Chrome is started headless with a fixed desktop user agent, a 1920x1080
    window and several switches intended to make automation less conspicuous.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()`` on it.

    Raises:
        Exception: whatever is raised while installing the driver, starting
            Chrome or registering the stealth script. When the failure happens
            after Chrome has started, the browser is shut down first so no
            orphan process is left behind.
    """
    chrome_options = Options()
    # Comment out the next line to watch the browser (useful when debugging)
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--disable-gpu')

    # Options meant to avoid automation detection
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # Hide the navigator.webdriver flag. Registering the script through CDP
        # makes Chrome evaluate it at the start of every new document, whereas
        # execute_script() would only patch the page loaded right now and the
        # override would disappear on the first driver.get().
        driver.execute_cdp_cmd(
            'Page.addScriptToEvaluateOnNewDocument',
            {'source': "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )
    except Exception:
        # The caller never receives the handle on failure, so close Chrome here.
        driver.quit()
        raise

    return driver

In [3]:
def extract_occasion_details(driver, occasion_url):
    """
    Scrape a single used-car listing page into a flat dictionary.

    Loads ``occasion_url`` in ``driver``, sleeps three seconds, then reads the
    title, the price and the entries of the ``.main-specs`` list. Each of the
    three extraction steps has its own error handling, so a failure in one of
    them does not prevent the others from running.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        occasion_url: URL of the listing to scrape.

    Returns:
        dict | None: a dictionary that always holds 'URL', 'Mod√®le',
        'Description' and 'Prix' ('N/A' when a value could not be read), plus
        one entry per recognised specification found on the page. ``None`` is
        returned when an error escapes the per-step handlers (for instance
        when loading the page raises).
    """
    # Ordered (output key, label predicate) pairs; the first predicate that
    # accepts a specification label decides which key receives its value.
    spec_rules = (
        ('Kilom√©trage', lambda label: 'Kilom√©trage' in label or 'kilometrage' in label.lower()),
        ('Mise en circulation', lambda label: 'Mise en circulation' in label),
        ('√ânergie', lambda label: '√ânergie' in label or 'Energie' in label),
        ('Bo√Æte de vitesse', lambda label: 'Boite vitesse' in label or 'Bo√Æte' in label),
        ('Puissance fiscale', lambda label: 'Puissance fiscale' in label),
        ('Transmission', lambda label: 'Transmission' in label),
        ('Carrosserie', lambda label: 'Carrosserie' in label),
        ('√âtat g√©n√©ral', lambda label: '√âtat g√©n√©ral' in label or 'Etat' in label),
        ('Anciens propri√©taires', lambda label: 'Anciens propri√©taires' in label or 'propri√©taire' in label.lower()),
        ('Date annonce', lambda label: "Date de l'annonce" in label or 'annonce' in label.lower()),
    )

    try:
        print("  üìÑ Chargement de l'annonce...")
        driver.get(occasion_url)
        time.sleep(3)

        car_data = {'URL': occasion_url}

        # --- Title: model name and version description ---
        try:
            # The name is assembled in the browser from the heading's direct
            # text nodes and plain spans, stopping at the first <br>; spans
            # carrying the h5/h6 class are left out of the name.
            title_script = """
                var h1 = document.querySelector('h1.occasion-title div');
                if (!h1) return 'N/A';
                
                // Extraire le nom (texte direct + span)
                var name = '';
                for (var i = 0; i < h1.childNodes.length; i++) {
                    var node = h1.childNodes[i];
                    if (node.nodeType === 3) { // Text node
                        name += node.textContent.trim() + ' ';
                    } else if (node.tagName === 'SPAN' && !node.classList.contains('h6') && !node.classList.contains('h5')) {
                        name += node.textContent.trim();
                    } else if (node.tagName === 'BR') {
                        break; // Arr√™ter avant la description
                    }
                }
                return name.trim();
            """
            model_name = driver.execute_script(title_script)

            # The description is the text of the first h6/h5 span in the heading.
            description_script = """
                var span = document.querySelector('h1.occasion-title div span.h6, h1.occasion-title div span.h5');
                return span ? span.textContent.trim() : 'N/A';
            """
            description = driver.execute_script(description_script)

            car_data['Mod√®le'] = model_name or 'N/A'
            car_data['Description'] = description or 'N/A'

            print(f"  üöó {car_data['Mod√®le']}")
            print(f"     {car_data['Description']}")
        except Exception as e:
            car_data['Mod√®le'] = 'N/A'
            car_data['Description'] = 'N/A'
            print(f"  ‚ö†Ô∏è Erreur extraction titre: {e}")

        # --- Price ---
        try:
            # Only the direct text nodes of the price element are concatenated,
            # so text inside child elements is not part of the result.
            price_script = """
                var priceDiv = document.querySelector('.price-box div.price');
                if (!priceDiv) return null;
                
                // Extraire le texte du nombre (avant le small)
                var priceText = '';
                for (var i = 0; i < priceDiv.childNodes.length; i++) {
                    var node = priceDiv.childNodes[i];
                    if (node.nodeType === 3) { // Text node
                        priceText += node.textContent.trim();
                    }
                }
                return priceText.trim();
            """
            prix = driver.execute_script(price_script)

            if not prix:
                car_data['Prix'] = 'N/A'
                print("  ‚ö†Ô∏è Prix non disponible")
            else:
                car_data['Prix'] = prix + ' DT'
                print(f"  üí∞ Prix: {car_data['Prix']}")
        except Exception as e:
            car_data['Prix'] = 'N/A'
            print(f"  ‚ö†Ô∏è Prix non disponible: {e}")

        # --- Main specifications ---
        try:
            spec_items = driver.find_elements(By.CSS_SELECTOR, '.main-specs ul li')

            for item in spec_items:
                try:
                    raw_label = item.find_element(By.CSS_SELECTOR, '.spec-name').text.strip()
                    value = item.find_element(By.CSS_SELECTOR, '.spec-value').text.strip()

                    # Flatten line breaks left in the label text
                    label = raw_label.replace('\n', ' ').strip()

                    for column, matches in spec_rules:
                        if matches(label):
                            car_data[column] = value
                            print(f"    ‚úì {column}: {value}")
                            break
                except Exception:
                    # Best effort: an entry that cannot be read is skipped
                    continue

            print("  ‚úÖ D√©tails extraits")
        except Exception as e:
            print(f"  ‚ö†Ô∏è Erreur extraction sp√©cifications: {e}")

        return car_data

    except Exception as e:
        print(f"  ‚ùå Erreur g√©n√©rale: {e}")
        import traceback
        traceback.print_exc()
        return None

In [4]:
def get_occasion_links(driver, search_url, start_page=1, max_pages=1, max_cars=None):
    """
    Collect the URLs of used-car listings from paginated search results.

    Pages ``start_page`` .. ``start_page + max_pages - 1`` are visited in order.
    Page 1 is ``search_url`` itself; later pages get a ``page=<n>`` query
    parameter appended. On each page the ``a.occasion-link-overlay`` link of
    every ``.articles div[data-key]`` element is read.

    Args:
        driver: Selenium WebDriver used to load the result pages.
        search_url: URL of the first result page.
        start_page: Number of the first page to visit.
        max_pages: How many consecutive pages to visit.
        max_cars: Maximum number of links to return; a falsy value
            (``None`` or 0) means no limit.

    Returns:
        list[str]: unique listing URLs in discovery order. Errors are printed,
        never raised: a failing page is skipped, and an unexpected error
        returns whatever was collected up to that point.
    """
    all_occasion_links = []
    # Mirror of the result list giving O(1) duplicate detection
    seen_links = set()

    try:
        end_page = start_page + max_pages - 1
        print(f"üîç R√©cup√©ration des liens depuis les pages {start_page} √† {end_page}...")
        if max_cars:
            print(f"   Limite: {max_cars} voitures maximum")

        # Use '&' when the search URL already carries a query string, otherwise
        # appending '?page=N' would produce an invalid URL with two '?'.
        page_param_sep = '&' if '?' in search_url else '?'

        for page_num in range(start_page, end_page + 1):
            # Stop as soon as the requested number of links has been reached
            if max_cars and len(all_occasion_links) >= max_cars:
                print(f"\n‚úÖ Limite de {max_cars} voitures atteinte")
                break

            # Build the URL of the page
            if page_num == 1:
                page_url = search_url
            else:
                page_url = f"{search_url}{page_param_sep}page={page_num}"

            print(f"\nüìÑ Page {page_num}: {page_url}")

            try:
                driver.get(page_url)
                time.sleep(4)

                # Scroll to the bottom so every listing gets loaded
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                # Every listing card is a div carrying a data-key attribute
                articles = driver.find_elements(By.CSS_SELECTOR, '.articles div[data-key]')
                print(f"  üìä {len(articles)} annonce(s) trouv√©e(s) sur cette page")

                new_on_page = 0
                for article in articles:
                    if max_cars and len(all_occasion_links) >= max_cars:
                        break

                    try:
                        link_elem = article.find_element(By.CSS_SELECTOR, 'a.occasion-link-overlay')
                        href = link_elem.get_attribute('href')
                    except Exception:
                        # Card without a usable link: skip it. Exception (not a
                        # bare except) keeps Ctrl+C / kernel interrupt working.
                        continue

                    if not href:
                        continue

                    # Make the URL absolute BEFORE the duplicate check so a
                    # relative and an absolute form of the same link match.
                    if href.startswith('/'):
                        href = 'https://www.automobile.tn' + href

                    if href in seen_links:
                        continue

                    seen_links.add(href)
                    all_occasion_links.append(href)
                    new_on_page += 1
                    print(f"    ‚úì Annonce trouv√©e")

                print(f"  ‚úì {new_on_page} nouvelle(s) annonce(s) sur cette page")
                print(f"  üìä Total actuel: {len(all_occasion_links)} annonces")

            except Exception as e:
                print(f"  ‚ùå Erreur sur la page {page_num}: {e}")
                continue

        print(f"\n‚úÖ Total: {len(all_occasion_links)} annonce(s) √† traiter")
        return all_occasion_links

    except Exception as e:
        print(f"‚ùå Erreur g√©n√©rale: {e}")
        import traceback
        traceback.print_exc()
        return all_occasion_links

In [5]:
def scrape_automobile_tn_occasion(start_page=1, max_pages=1, max_cars=None):
    """
    Entry point: scrape used-car listings from automobile.tn into a DataFrame.

    Starts a WebDriver, gathers the listing links of the requested result
    pages, extracts the details of each listing with a three-second pause
    between two listings, and always closes the WebDriver at the end.

    Args:
        start_page: Number of the first result page to visit.
        max_pages: How many consecutive result pages to visit.
        max_cars: Maximum number of listings to process; falsy means no limit.

    Returns:
        pandas.DataFrame: one row per successfully extracted listing, or an
        empty DataFrame when nothing could be extracted.
    """
    banner = '=' * 60
    started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    print(banner)
    print(f"üöÄ SCRAPING AUTOMOBILE.TN OCCASION - {started_at}")
    print(banner)
    print(f"Pages: {start_page} √† {start_page + max_pages - 1}")
    if max_cars:
        print(f"Limite: {max_cars} voiture(s)")
    print()

    driver = None
    all_cars_data = []

    try:
        # Start the browser
        print("üîß Configuration du WebDriver...")
        driver = setup_driver()
        print("‚úì WebDriver configur√©\n")

        search_url = "https://www.automobile.tn/fr/occasion"

        # Collect the listing links
        occasion_links = get_occasion_links(driver, search_url, start_page, max_pages, max_cars)

        if not occasion_links:
            print("‚ùå Aucune annonce trouv√©e")
            return pd.DataFrame()

        total = len(occasion_links)

        # Process the listings one by one
        for i, occasion_url in enumerate(occasion_links, 1):
            print(f"\n{banner}")
            print(f"[{i}/{total}] TRAITEMENT: {occasion_url}")
            print(banner)

            car_data = extract_occasion_details(driver, occasion_url)

            if not car_data:
                print("\n  ‚ùå √âchec extraction")
            else:
                all_cars_data.append(car_data)
                print("\n  ‚úÖ Voiture ajout√©e")

            # Pause between two listings (not after the last one)
            if i < total:
                print("\n  ‚è∏Ô∏è Pause de 3 secondes...")
                time.sleep(3)

    except Exception as e:
        print(f"\n‚ùå Erreur g√©n√©rale: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Close the browser whatever happened above
        if driver:
            driver.quit()
            print("\nüîí WebDriver ferm√©")

    # Nothing extracted: hand back an empty frame
    if not all_cars_data:
        print("\n‚ùå Aucune donn√©e extraite")
        return pd.DataFrame()

    # Build the DataFrame from the collected records
    df = pd.DataFrame(all_cars_data)
    print(f"\n{banner}")
    print("‚úÖ SCRAPING TERMIN√â!")
    print(banner)
    print(f"Total: {len(all_cars_data)} voiture(s) extraite(s)")
    print(f"Colonnes: {len(df.columns)}")
    return df

In [6]:
# Run the scraper: first result page only, capped at 12 listings
df_occasions = scrape_automobile_tn_occasion(
    start_page=1,
    max_pages=1,
    max_cars=12,
)

üöÄ SCRAPING AUTOMOBILE.TN OCCASION - 2025-11-30 11:34:43
Pages: 1 √† 1
Limite: 12 voiture(s)

üîß Configuration du WebDriver...
‚úì WebDriver configur√©

üîç R√©cup√©ration des liens depuis les pages 1 √† 1...
   Limite: 12 voitures maximum

üìÑ Page 1: https://www.automobile.tn/fr/occasion
‚úì WebDriver configur√©

üîç R√©cup√©ration des liens depuis les pages 1 √† 1...
   Limite: 12 voitures maximum

üìÑ Page 1: https://www.automobile.tn/fr/occasion
  üìä 12 annonce(s) trouv√©e(s) sur cette page
    ‚úì Annonce trouv√©e
    ‚úì Annonce trouv√©e
    ‚úì Annonce trouv√©e
    ‚úì Annonce trouv√©e
    ‚úì Annonce trouv√©e
    ‚úì Annonce trouv√©e
    ‚úì Annonce trouv√©e
    ‚úì Annonce trouv√©e
    ‚úì Annonce trouv√©e
    ‚úì Annonce trouv√©e
    ‚úì Annonce trouv√©e
    ‚úì Annonce trouv√©e
  ‚úì 12 nouvelle(s) annonce(s) sur cette page
  üìä Total actuel: 12 annonces

‚úÖ Total: 12 annonce(s) √† traiter

[1/12] TRAITEMENT: https://www.automobile.tn/fr/occasion/peugeot/partne

In [8]:
# Show the scraped table, then a short summary of the dataset
if df_occasions.empty:
    print("\n‚ùå Aucune donn√©e √† afficher")
else:
    print("\n" + "=" * 100, "R√âSULTATS DU SCRAPING", "=" * 100, sep="\n")
    display(df_occasions)

    print("\n" + "=" * 100, "INFORMATIONS SUR LE DATASET", "=" * 100, sep="\n")
    print("Nombre de voitures: {}".format(len(df_occasions)))
    print("Nombre de colonnes: {}".format(len(df_occasions.columns)))
    print("\nColonnes extraites:")
    # One bullet per extracted column
    for col in df_occasions.columns:
        print("  ‚Ä¢ {}".format(col))


R√âSULTATS DU SCRAPING


Unnamed: 0,URL,Mod√®le,Description,Prix,Kilom√©trage,Mise en circulation,√ânergie,Bo√Æte de vitesse,Puissance fiscale,Transmission,Carrosserie,√âtat g√©n√©ral,Anciens propri√©taires,Date annonce
0,https://www.automobile.tn/fr/occasion/peugeot/...,Peugeot Partner,L1 1.5 BlueHDi 16V FAP 100 cv,47 500 DT,150 000 KM,5.2021,Diesel,Manuelle,5 CV,Traction,Utilitaire,Tr√®s bon,1√®re main,30.11.2025
1,https://www.automobile.tn/fr/occasion/renault/...,Renault Megane Sedan,GrandCoupe 1.2 TCe 16V 130 cv,53 000 DT,56 000 KM,7.2019,Essence,Manuelle,7 CV,Traction,Berline,Tr√®s bon,1√®re main,30.11.2025
2,https://www.automobile.tn/fr/occasion/ford/eco...,Ford Ecosport Titanium,1.0 SCTi EcoBoost 12V S&S 125 cv,74 000 DT,25 000 KM,11.2022,Essence,Manuelle,7 CV,Traction,SUV,Tr√®s bon,1√®re main,30.11.2025
3,https://www.automobile.tn/fr/occasion/mercedes...,Mercedes-Benz Classe E AMG,200 2.0 CGI 16V 7G-TRONIC+ 184 cv Bo√Æte auto,123 000 DT,150 000 KM,2.2017,Essence,Automatique,10 CV,Propulsion,Berline,Tr√®s bon,1√®re main,30.11.2025
4,https://www.automobile.tn/fr/occasion/volkswag...,Volkswagen Golf 6 Team,1.2 TSi 105 cv,39 900 DT,182 000 KM,4.201,Essence,Manuelle,7 CV,Traction,Compacte,Tr√®s bon,1√®re main,30.11.2025
5,https://www.automobile.tn/fr/occasion/kia/spor...,KIA Sportage,2.0 CRDi 4x4 177 cv Bo√Æte auto,44 000 DT,276 000 KM,6.2013,Diesel,Automatique,11 CV,Int√©grale,SUV,Tr√®s bon,2√®me main,30.11.2025
6,https://www.automobile.tn/fr/occasion/chery/ti...,Chery Tiggo 8 Pro,1.6 T-GDI 7-DCT 197 cv Bo√Æte auto,82 000 DT,102 000 KM,11.2022,Essence,Automatique,11 CV,Traction,SUV,Tr√®s bon,1√®re main,30.11.2025
7,https://www.automobile.tn/fr/occasion/hyundai/...,Hyundai i20 High Grade,1.0 L T-GDI DCT-7 100 cv Bo√Æte auto,77 500 DT,49 100 KM,5.2024,Essence,Automatique,5 CV,Traction,Citadine,Tr√®s bon,1√®re main,30.11.2025
8,https://www.automobile.tn/fr/occasion/opel/cam...,Opel Campo 2001,,45 000 DT,570 000 KM,6.1996,Diesel,Manuelle,10 CV,Traction,Pick up,Normal,1√®re main,30.11.2025
9,https://www.automobile.tn/fr/occasion/mazda/cx...,Mazda CX-5 High Grade,Skyactiv-G 2.0 i 16V AWD S&S 160 cv Bo√Æte auto,78 000 DT,210 000 KM,1.2018,Essence,Automatique,9 CV,Int√©grale,SUV,Tr√®s bon,2√®me main,30.11.2025



INFORMATIONS SUR LE DATASET
Nombre de voitures: 12
Nombre de colonnes: 14

Colonnes extraites:
  ‚Ä¢ URL
  ‚Ä¢ Mod√®le
  ‚Ä¢ Description
  ‚Ä¢ Prix
  ‚Ä¢ Kilom√©trage
  ‚Ä¢ Mise en circulation
  ‚Ä¢ √ânergie
  ‚Ä¢ Bo√Æte de vitesse
  ‚Ä¢ Puissance fiscale
  ‚Ä¢ Transmission
  ‚Ä¢ Carrosserie
  ‚Ä¢ √âtat g√©n√©ral
  ‚Ä¢ Anciens propri√©taires
  ‚Ä¢ Date annonce


In [9]:
# Print every field of the first scraped car, one "column: value" line each
if not df_occasions.empty:
    print("\n" + "=" * 100, "D√âTAILS D'UNE VOITURE", "=" * 100, sep="\n")
    for col, value in df_occasions.iloc[0].items():
        # Column name padded to 30 characters so the values line up
        print("{:30s}: {}".format(col, value))


D√âTAILS D'UNE VOITURE
URL                           : https://www.automobile.tn/fr/occasion/peugeot/partner/118587
Mod√®le                        : Peugeot Partner
Description                   : L1 1.5 BlueHDi 16V FAP 100 cv
Prix                          : 47 500 DT
Kilom√©trage                   : 150 000 KM
Mise en circulation           : 05.2021
√ânergie                       : Diesel
Bo√Æte de vitesse              : Manuelle
Puissance fiscale             : 5 CV
Transmission                  : Traction
Carrosserie                   : Utilitaire
√âtat g√©n√©ral                  : Tr√®s bon
Anciens propri√©taires         : 1√®re main
Date annonce                  : 30.11.2025


### Sauvegarde des données

In [10]:
# Save the scraped data as CSV and Excel, both named with the current timestamp
if not df_occasions.empty:
    import os

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Create the Data/row folder if it does not exist yet
    # NOTE(review): 'row' may be a typo for 'raw' - left unchanged because
    # other notebooks may read from this path; confirm before renaming.
    output_dir = '../Data/row'
    os.makedirs(output_dir, exist_ok=True)

    # CSV - 'utf-8-sig' prepends a BOM (presumably so Excel shows accents correctly)
    csv_filename = os.path.join(output_dir, f"automobile_tn_occasion_{timestamp}.csv")
    df_occasions.to_csv(csv_filename, index=False, encoding='utf-8-sig')
    print(f"\nüíæ Donn√©es sauvegard√©es dans: {csv_filename}")

    # Excel - optional: needs the openpyxl package
    excel_filename = os.path.join(output_dir, f"automobile_tn_occasion_{timestamp}.xlsx")
    try:
        df_occasions.to_excel(excel_filename, index=False, engine='openpyxl')
        print(f"üíæ Donn√©es sauvegard√©es dans: {excel_filename}")
    except ImportError:
        # Raised when the openpyxl engine is not installed
        print("‚ö†Ô∏è Installation d'openpyxl n√©cessaire pour Excel")
    except Exception as e:
        # Any other failure (permissions, file open elsewhere, ...) is reported
        # as such instead of being mislabelled as a missing dependency.
        print(f"‚ö†Ô∏è Erreur lors de la sauvegarde Excel: {e}")
else:
    print("\n‚ùå Aucune donn√©e √† sauvegarder")


üíæ Donn√©es sauvegard√©es dans: ../Data/row\automobile_tn_occasion_20251130_115220.csv
üíæ Donn√©es sauvegard√©es dans: ../Data/row\automobile_tn_occasion_20251130_115220.xlsx
üíæ Donn√©es sauvegard√©es dans: ../Data/row\automobile_tn_occasion_20251130_115220.xlsx
