In [2]:
import logging
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Logging configuration: emit INFO and above, one record per line formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Substrings used to classify the free-text metadata items of a job card.
# 'par mois' is included so monthly salaries (which clean_salary knows how to
# convert) are recognised even when the item carries no '€' sign.
_SALARY_MARKERS = ('€', 'par an', 'par mois', 'par jour')
_CONTRACT_MARKERS = ('CDI', 'Freelance', 'Stage')
_SCHEDULE_MARKERS = ('Temps plein', 'Du lundi au vendredi')


def _optional_text(job_element, css_selector):
    """Return the text of the first descendant matching css_selector, or None.

    Any Selenium error (most commonly the element not being present on the
    card) is treated as "field absent". Non-Selenium exceptions such as
    KeyboardInterrupt are deliberately not caught here.
    """
    try:
        return job_element.find_element(By.CSS_SELECTOR, css_selector).text
    except WebDriverException:
        return None


def _all_texts(job_element, css_selector):
    """Return the texts of every descendant matching css_selector.

    Returns an empty list when nothing matches or when Selenium raises.
    """
    try:
        matches = job_element.find_elements(By.CSS_SELECTOR, css_selector)
        return [match.text for match in matches]
    except WebDriverException:
        return []


def _contains_any(text, markers):
    """Return True when at least one of the marker substrings occurs in text."""
    return any(marker in text for marker in markers)


def extract_job_details(driver, job_element):
    """Extract the details of one job posting from its result-card WebElement.

    Args:
        driver: The WebDriver in use. Not read by this function; kept so the
            call signature stays unchanged for existing callers.
        job_element: WebElement of a single job card.

    Returns:
        A dict with the keys 'title', 'link', 'company', 'location', 'salary',
        'contract_type', 'schedule', 'date_posted' and 'description'. Optional
        fields that are missing are set to None ('description' to []). If the
        title/link cannot be read, or an unexpected error occurs, the error is
        logged and the dict is returned with whatever was filled in so far
        (possibly empty).
    """
    job_details = {}
    try:
        # Title and link are mandatory: a failure here aborts the extraction.
        title_element = job_element.find_element(By.CSS_SELECTOR, 'h2.jobTitle a')
        job_details['title'] = title_element.text
        job_details['link'] = title_element.get_attribute('href')

        job_details['company'] = _optional_text(
            job_element, 'span[data-testid="company-name"]')
        job_details['location'] = _optional_text(
            job_element, 'div[data-testid="text-location"]')

        # Salary, contract type and schedule all live in the same metadata list
        # and are told apart by the substrings they contain.
        metadata_text = _all_texts(job_element, 'ul.metadataContainer li')
        schedule = [text for text in metadata_text
                    if _contains_any(text, _SCHEDULE_MARKERS)]
        job_details['salary'] = next(
            (text for text in metadata_text if _contains_any(text, _SALARY_MARKERS)),
            None)
        job_details['contract_type'] = next(
            (text for text in metadata_text if _contains_any(text, _CONTRACT_MARKERS)),
            None)
        job_details['schedule'] = ', '.join(schedule) if schedule else None

        job_details['date_posted'] = _optional_text(
            job_element, 'span[data-testid="myJobsStateDate"]')

        # Short description: one list entry per bullet point of the snippet.
        job_details['description'] = _all_texts(
            job_element, 'ul[style="list-style-type:circle;"] li')

    except Exception as e:
        # Boundary handler: one broken card must not stop the whole scrape.
        logging.error(f"Erreur lors de l'extraction des détails : {e}")
    return job_details

def scrape_jobs_with_selenium(driver, query, location, max_pages=5):
    """Run a search on fr.indeed.com and collect the job cards of the results.

    The function loads the home page, types the query and the location into
    the search form, submits it, then walks through the result pages.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        query: Text typed into the "what" field (job title / keywords).
        location: Text typed into the "where" field (city).
        max_pages: Maximum number of result pages to scrape.

    Returns:
        A list of dicts as produced by extract_job_details. An empty list is
        returned when the search form could not be filled in.
    """
    driver.get("https://fr.indeed.com/")

    try:
        # "What" field: focus it, empty it, then type the searched job.
        what_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "text-input-what"))
        )
        what_input.click()
        what_input.clear()
        time.sleep(1)
        what_input.send_keys(query)

        # "Where" field: it is emptied with select-all + delete rather than
        # clear() before the city is typed.
        where_input = driver.find_element(By.ID, "text-input-where")
        where_input.click()
        time.sleep(1)  # give the field time to become active
        where_input.send_keys(Keys.CONTROL + "a")
        where_input.send_keys(Keys.DELETE)
        time.sleep(1)
        where_input.send_keys(location)
        where_input.send_keys(Keys.RETURN)  # submit the search

        logging.info(f"Recherche effectuée pour le poste '{query}' à '{location}'.")
        time.sleep(3)  # wait for the results to load
    except Exception as e:
        logging.error(f"Erreur lors de la saisie des informations : {e}")
        return []

    all_jobs = []
    page_count = 0

    while page_count < max_pages:
        try:
            # Wait until at least one job card is present on the page.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'job_seen_beacon'))
            )
        except TimeoutException:
            logging.error("Les annonces n'ont pas pu être chargées.")
            break

        job_elements = driver.find_elements(By.CLASS_NAME, 'job_seen_beacon')
        logging.info(f"Extraction des annonces sur la page {page_count + 1}.")

        for job_element in job_elements:
            job_details = extract_job_details(driver, job_element)
            if job_details:
                all_jobs.append(job_details)

        page_count += 1
        if page_count >= max_pages:
            # The requested number of pages has been scraped: do not load (and
            # wait for) a further page that would never be read.
            break

        # Move on to the next results page.
        try:
            next_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "a[data-testid='pagination-page-next']"))
            )
            driver.execute_script("arguments[0].scrollIntoView();", next_button)
            time.sleep(1)
            try:
                next_button.click()
            except ElementClickInterceptedException:
                # Another element (banner, popup, sticky header) sits on top of
                # the link; click through JavaScript instead of aborting and
                # losing the jobs collected so far.
                driver.execute_script("arguments[0].click();", next_button)
        except TimeoutException:
            logging.info("Fin des pages ou bouton 'Suivant' introuvable.")
            break
        time.sleep(5)

    return all_jobs

def clean_salary(salary):
    """Normalise a raw salary label into a yearly "min - max € par an" string.

    The label must mention its period ('par an', 'par mois' or 'par jour').
    Monthly amounts are multiplied by 12 and daily amounts by 250 to obtain a
    yearly figure. One amount gives min == max; two amounts give a range.

    Args:
        salary: Raw salary text, or a missing value (None / NaN).

    Returns:
        The formatted yearly salary string, or None when the value is missing,
        is not a string, contains 'France', has no recognised period, or does
        not contain exactly one or two amounts.
    """
    # Missing values (None, NaN) and any other non-string input are rejected
    # up front; this also keeps the substring tests below from raising.
    if not isinstance(salary, str) or 'France' in salary:
        return None

    # Turn every whitespace run into one plain space. French amounts are
    # usually grouped with no-break spaces (U+00A0 / U+202F); str-pattern \s
    # matches those as well as the ASCII space.
    normalized = re.sub(r"\s+", " ", salary)

    if 'par an' in normalized:
        yearly_factor = 1
    elif 'par mois' in normalized:
        yearly_factor = 12
    elif 'par jour' in normalized:
        yearly_factor = 250
    else:
        return None

    compact = normalized.replace(" ", "")
    # A comma followed by one or two digits is a decimal separator
    # ("2500,50"); any remaining comma is a thousands separator ("35,000").
    compact = re.sub(r",(\d{1,2})(?!\d)", r".\1", compact)
    compact = compact.replace(",", "")

    amounts = [float(amount) for amount in re.findall(r"\d+(?:\.\d+)?", compact)]

    if len(amounts) == 2:
        min_salary, max_salary = amounts
    elif len(amounts) == 1:
        min_salary = max_salary = amounts[0]
    else:
        return None

    min_salary *= yearly_factor
    max_salary *= yearly_factor

    return f"{int(min_salary)} - {int(max_salary)} € par an"

def start_scraping(query="developpeur web", locations=("Paris", "Lyon", "Marseille"),
                   max_pages=5, output_path="indeed_jobs_cleaned.csv"):
    """Scrape the job postings for several cities and export them to a CSV file.

    A Chrome browser is started, one search is run per location, the salary
    column is normalised with clean_salary, rows without a usable salary are
    dropped, and the result is written to output_path.

    Args:
        query: Job title / keywords to search for.
        locations: Iterable of city names; one search is run for each.
        max_pages: Maximum number of result pages scraped per location.
        output_path: Destination of the exported CSV file.
    """
    driver = webdriver.Chrome()
    all_jobs = []

    try:
        for location in locations:
            logging.info(f"Lancement du scraping pour '{query}' à '{location}'.")
            jobs = scrape_jobs_with_selenium(driver, query, location, max_pages=max_pages)
            all_jobs.extend(jobs)
            logging.info(f"{len(jobs)} annonces collectées pour '{location}'.")
    finally:
        # Always close the browser, even when a search raises.
        driver.quit()

    df = pd.DataFrame(all_jobs)
    if 'salary' in df.columns:
        df['salary'] = df['salary'].apply(clean_salary)
        # Only filter when the column exists: with no collected job the frame
        # has no columns and dropna(subset=['salary']) would raise KeyError.
        df = df.dropna(subset=['salary'], how='all')
    else:
        logging.warning("Aucune colonne 'salary' : aucune annonce exploitable n'a été collectée.")

    df.to_csv(output_path, index=False)
    logging.info(f"Les données nettoyées ont été exportées vers '{output_path}'.")

# Run the scraper only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    start_scraping()


2024-11-18 13:25:47,025 - INFO - Lancement du scraping pour 'developpeur web' à 'Paris'.
2024-11-18 13:25:53,204 - INFO - Recherche effectuée pour le poste 'developpeur web' à 'Paris'.
2024-11-18 13:25:56,234 - INFO - Extraction des annonces sur la page 1.
2024-11-18 13:26:04,800 - INFO - Extraction des annonces sur la page 2.
2024-11-18 13:26:13,193 - INFO - Extraction des annonces sur la page 3.
2024-11-18 13:26:21,622 - INFO - Extraction des annonces sur la page 4.
2024-11-18 13:26:30,075 - INFO - Extraction des annonces sur la page 5.
2024-11-18 13:26:38,569 - INFO - 75 annonces collectées pour 'Paris'.
2024-11-18 13:26:38,572 - INFO - Lancement du scraping pour 'developpeur web' à 'Lyon'.
2024-11-18 13:26:44,451 - INFO - Recherche effectuée pour le poste 'developpeur web' à 'Lyon'.
2024-11-18 13:26:47,482 - INFO - Extraction des annonces sur la page 1.
2024-11-18 13:26:56,077 - INFO - Extraction des annonces sur la page 2.
2024-11-18 13:27:04,782 - INFO - Extraction des annonces s