In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
import re
import time

# EXERCICE 0
Perform a GET request to https://www.leboncoin.fr to retrieve the HTML homepage. You must use an user agent to do so, as basic web security prevent HTTP requests from unknown web browser. You can give this one a try : Mozilla/5.0 (Windows NT 10.0;Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43

In [3]:
def make_request(url, headers, params):
    response = requests.get(url,headers=headers,params=params)
    if response.status_code != 200:
        return None
    return response.content


def exponential_backoff_with_jitter(
        url,
        headers,
        params,
        make_request_func,
        max_retries=5,
        base_delay=1,
        max_delay=10,
        jitter=True
):
    retries = 0
    while retries < max_retries:
        result = make_request_func(url, headers, params)
        if result:
            return result

        delay = base_delay * (2 ** retries)
        if jitter:
            delay += random.uniform(0, delay / 2)

        delay = min(delay, max_delay)

        time.sleep(delay)
        retries += 1

def get_source_page(url):
    """Cette fonction récupère le code source d'un site web grâce au package requests
    Paramètres
    -----------
    url : adresse ou lien du site
    Returns
    ---------
    soup : Document HTML du code source du site"""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43'}
    
    params={'api_key': 'b9f96203-33b9-4317-9130-0198f8ec07e8',
        'url': url,'num_results': '100'}
    page_source = exponential_backoff_with_jitter('https://proxy.scrapeops.io/v1/', headers=headers,params=params, make_request_func=make_request)
    # page_source = requests.get('https://proxy.scrapeops.io/v1/',headers=headers,params=params)
    
    if page_source:
        soup = BeautifulSoup(page_source,"html.parser")
        return soup
    else :
        # print(f"La requête a échoué avec le statut code :{page_source.status_code}")
        print('La requête a échoué malgré le backoff')

In [4]:
soup = get_source_page('https://www.leboncoin.fr/')
soup

<!DOCTYPE html>
<html lang="fr"><head><meta charset="utf-8"/><link href="/opensearch.xml" rel="search" type="application/opensearchdescription+xml"/><link href="/favicons/lbc.png" rel="icon" type="image/png"/><meta content="#ffffff" name="theme-color"/><meta content="fr" property="og:locale"/><meta content="leboncoin" property="og:site_name"/><meta content="leboncoin" name="twitter:site"/><meta content='CP="This is not a P3P policy"' http-equiv="P3P"/><meta content="initial-scale=1.0, width=device-width, maximum-scale=1.0, user-scalable=0" name="viewport"/><meta content="uyio912RR__InwCKsO90G9r8UeP6ia3EeZwY6KaBRrk" name="google-site-verification"/><meta content="hbda5pvr14cpxa63ojjexw311iyf7c" name="facebook-domain-verification"/><link href="https://www.leboncoin.fr" rel="canonical"/><link data-test-id="no-unread-msgs" href="/_next/static/media/favicon-16.fe104e12.png" rel="icon" sizes="16x16" type="image/png"/><link href="/_next/static/media/favicon-32.e6c3ce2a.png" rel="icon" sizes="

# EXERCICE 1 (10 PT)
With the help of the Exercice0, and the User-Agent: Create a function get_ps5_prices() that returns data from Playstation 5 game console sold on the website using BeautifulSoup python library. Route : /recherche?category=43&text=ps5 You must retrieve for each ads - the title - the price of the article (sellers may not have set a price for the article, put 0 instead) - the date of when it was posted as ISO8601 format - the city - the postal code
Store all the data into a pandas dataframe.
TIPS The HTML you receive from your HTTP request is a basically a snapshot of the website as if the search was done from a web browser such as Firefox. You should do this search from your web browser at the same time and use the web inspector to identify which HTML tags are relevant to get the data correctly.

In [6]:
def get_articles():
    list_articles = []
    p = 1

    while True:
        url_type = f"https://www.leboncoin.fr/recherche?category=43&text=ps5&page={p}"
        soup = get_source_page(url_type)
        ps5 = soup.find_all("div", class_="styles_adCard_HQRFN styles_classified_rnsg4")

        if not ps5:
            break  # Arrêter la boucle si aucune nouvelle donnée n'est récupérée

        list_articles.extend(ps5)
        p += 1

    return list_articles


In [7]:
def scrape_individual_page(link):
    try:
        # Utilisez votre fonction get_source_page ou requests pour récupérer le contenu de la page
        soup = get_source_page(link)

        # Trouver la balise qui contient à la fois la ville et le code
        container_tag = soup.find('h2', class_='flex items-center text-headline-2')

        # Extraire la ville (premier élément dans la balise container)
        ville = container_tag.text.strip()

        # Extraire le code postal (deuxième élément dans la balise container)
        code_tag = container_tag.find('span', class_='ml-md')
        code = code_tag.text.strip() if code_tag else None

        # Supprimer les parenthèses autour du code postal
        code = code.replace('(', '').replace(')', '') if code else None

        return ville, code

    except Exception as e:
        print(f"Erreur lors du scraping de la page {link}: {str(e)}")
        return None, None


In [8]:
def get_ps5_data(url):
    soup = get_source_page(url)

    # Trouver toutes les annonces
    annonces = soup.find_all('a', class_='group/adcard flex h-[inherit] flex-col')

    # Initialiser des listes pour stocker les données
    titres = []
    prices = []
    dates = []
    villes = []
    codes = []

    iso8601_converter = lambda date_str: (datetime.strptime(date_str, '%d %B %Y') if date_str.lower() != "aujourd'hui" else datetime.now()).isoformat() if date_str else None
    # Fonction lambda pour extraire la ville et le code postal
    extract_city_and_postal_code = lambda address: re.match(r'([^\d]+) (\d+)', address).groups() if re.match(r'([^\d]+) (\d+)', address) else (None, None)

    # Parcourir chaque annonce
    for annonce in annonces:
        try:
            # Modifier la classe pour trouver le titre
            titre = annonce.find('p')['title']
            titres.append(titre)
        except AttributeError:
            titre = None

        try:
            # Modifier la classe pour trouver le prix
            prix_tag = annonce.find('p', class_='flex flex-wrap items-center text-callout font-bold !leading-[--font-size-body-2-line-height] text-on-surface').text
            prix = prix_tag.text if prix_tag else 0
            prices.append(prix)
        except AttributeError:
            prix = 0

        try:
            # Trouver la balise qui contient la ville
            adresse = annonce.find('span', class_='mr-[1.2rem] last:mr-none').text
            ville, code = extract_city_and_postal_code(adresse)
            villes.append(ville)  # Ajouter la ville à la liste
            codes.append(code)    # Ajouter le code postal à la liste
        except AttributeError:
            ville = None

        try:
            # Trouver la balise qui contient la date
            date_tag = annonce.find('span', class_='relative inline-block w-full before:absolute before:right-full before:top-none before:hidden before:w-[1.2rem] before:text-center before:font-bold before:content-dot tiny:w-auto tiny:before:inline-block').text
            date_str = date_tag.text.strip() if date_tag else None
            dates.append(iso8601_converter(date_str))
        except AttributeError:
            date = None

    return titres, prices, dates, villes, codes


In [9]:
titres, prices, dates, villes, codes = get_ps5_data('https://www.leboncoin.fr/recherche?category=43&text=ps5&page=1')

# Afficher les résultats
print("Titres:", titres)
print("Prix:", prices)
print("Date:", dates)

Titres: ['Ps5 édition standard + diablo 4', 'PS5 Slim Édition Standard Neuve', 'Ps5 Slim (Neuve)', 'Ps5', 'Ps5', 'Ps5', 'Ps5', 'Ps5', 'Ps5', 'Ps5', 'Ps5', 'Ps5', 'Ps5 digital', 'Ps5 (prix négociable)', 'PS5 Slim Édition Standard Neuve', 'Ps5 + FC 24', 'PS5 Digital Edition', 'Ps5', 'Ps5 a vendre', 'PS5 digital', 'Ps5 très bon état', 'Ps5 slim 1Tera', 'PS5 en très bon état', 'Ps5 Spiderman édition +manette', 'PS5 Neuve+2jeux(sous blister)', 'Ps5+2 manette ps5', 'Ps5 standard + jeux ps5', 'Ps5 digital + 1 manette', 'Ps5 a vendre', 'PS5 avec une manette garantie 1 an', 'Ps5 standard cd', 'Ps5', 'Manette PS5', 'Ps5', 'Ps5', 'Ps5', 'Ps5']
Prix: []
Date: []
