In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import re
import time
import pickle
import random

# EXERCICE 0
Perform a GET request to https://www.leboncoin.fr to retrieve the HTML homepage. You must use an user agent to do so, as basic web security prevent HTTP requests from unknown web browser. You can give this one a try : Mozilla/5.0 (Windows NT 10.0;Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43

In [2]:
def make_request(url, headers, params):
    response = requests.get(url,headers=headers,params=params)
    if response.status_code != 200:
        return None
    return response.content

In [3]:
def exponential_backoff_with_jitter(
        url,
        headers,
        params,
        make_request_func,
        max_retries=5,
        base_delay=1,
        max_delay=10,
        jitter=True
):
    retries = 0
    while retries < max_retries:
        result = make_request_func(url, headers, params)
        if result:
            return result

        delay = base_delay * (2 ** retries)
        if jitter:
            delay += random.uniform(0, delay / 2)

        delay = min(delay, max_delay)

        time.sleep(delay)
        retries += 1


In [4]:
def get_source_page(url):
    """Cette fonction récupère le code source d'un site web grâce au package requests
    Paramètres
    -----------
    url : adresse ou lien du site
    Returns
    ---------
    soup : Document HTML du code source du site"""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43'}
    
    params={'api_key': 'e61a5e7b-68d4-4d79-b7df-68fab6beb59a',
        'url': url,'num_results': '100'}
    page_source = exponential_backoff_with_jitter('https://proxy.scrapeops.io/v1/', headers=headers,params=params, make_request_func=make_request)
    # page_source = requests.get('https://proxy.scrapeops.io/v1/',headers=headers,params=params)
    
    if page_source:
        soup = BeautifulSoup(page_source,"html.parser")
        return soup
    else :
        # print(f"La requête a échoué avec le statut code :{page_source.status_code}")
        print('La requête a échoué malgré le backoff')

In [5]:
leboncoin_soup = get_source_page('https://www.leboncoin.fr/')
print(leboncoin_soup)

<!DOCTYPE html>
<html lang="fr"><head><meta charset="utf-8"/><link href="/opensearch.xml" rel="search" type="application/opensearchdescription+xml"/><link href="/favicons/lbc.png" rel="icon" type="image/png"/><meta content="#ffffff" name="theme-color"/><meta content="fr" property="og:locale"/><meta content="leboncoin" property="og:site_name"/><meta content="leboncoin" name="twitter:site"/><meta content='CP="This is not a P3P policy"' http-equiv="P3P"/><meta content="initial-scale=1.0, width=device-width, maximum-scale=1.0, user-scalable=0" name="viewport"/><meta content="uyio912RR__InwCKsO90G9r8UeP6ia3EeZwY6KaBRrk" name="google-site-verification"/><meta content="hbda5pvr14cpxa63ojjexw311iyf7c" name="facebook-domain-verification"/><link href="https://www.leboncoin.fr" rel="canonical"/><link data-test-id="no-unread-msgs" href="/_next/static/media/favicon-16.fe104e12.png" rel="icon" sizes="16x16" type="image/png"/><link href="/_next/static/media/favicon-32.e6c3ce2a.png" rel="icon" sizes="

# EXERCICE 1 (10 PT)
With the help of the Exercice0, and the User-Agent: Create a function get_ps5_prices() that returns data from Playstation 5 game console sold on the website using BeautifulSoup python library. Route : /recherche?category=43&text=ps5 You must retrieve for each ads - the title - the price of the article (sellers may not have set a price for the article, put 0 instead) - the date of when it was posted as ISO8601 format - the city - the postal code
Store all the data into a pandas dataframe.
TIPS The HTML you receive from your HTTP request is a basically a snapshot of the website as if the search was done from a web browser such as Firefox. You should do this search from your web browser at the same time and use the web inspector to identify which HTML tags are relevant to get the data correctly.

In [6]:
def iso8601_converter(date_str):
    if date_str:
        if date_str.lower() == "hier":
            return (datetime.now() - timedelta(days=1)).isoformat()
        elif date_str.lower() == "aujourd'hui":
            return datetime.now().isoformat()
        else:
            return datetime.strptime(date_str, '%d %B %Y').isoformat()
    else:
        return None

In [7]:
def convertir_prix_en_numeric(prix_str):
    try:
        # Essaye de convertir la chaîne en un nombre décimal
        prix_numeric = float(prix_str.replace(',', '.'))  # Remplace la virgule par le point si nécessaire
        return prix_numeric
    except ValueError:
        print("Erreur de conversion : la chaîne n'est pas un prix valide.")
        return None


In [12]:
def get_ps5_data(url):
    soup = get_source_page(url)

    # Trouver toutes les annonces
    annonces = soup.find_all('a', class_='group/adcard flex h-[inherit] flex-col')

    # Initialiser des listes pour stocker les données
    titres = []
    prices = []
    dates = []
    villes = []
    codes = []

    # Fonction lambda pour extraire la ville et le code postal
    extract_city_and_postal_code = lambda address: re.match(r'([^\d]+) (\d+)', address).groups() if re.match(r'([^\d]+) (\d+)', address) else (None, None)

    # Parcourir chaque annonce
    for annonce in annonces:
        try:
            # Modifier la classe pour trouver le titre
            titre = annonce.find('p')['title']
            titres.append(titre)
        except AttributeError:
            titre = None

        try:
            # Modifier la classe pour trouver le prix
            prix_tag = annonce.find('p', class_='flex flex-wrap items-center text-callout font-bold !leading-[--font-size-body-2-line-height] text-on-surface')
            prix_text = prix_tag.text.strip() if prix_tag else None
            prix = int(re.sub(r'\D', '', prix_text)) if prix_text else None
            prices.append(prix)
        except AttributeError:
            prix = 0

        try:
            # Trouver la balise qui contient la ville
            adresse = annonce.find('span', class_='mr-[1.2rem] last:mr-none').text
            ville, code = extract_city_and_postal_code(adresse)
            villes.append(ville)  # Ajouter la ville à la liste
            codes.append(code)    # Ajouter le code postal à la liste
        except AttributeError:
            ville = None

        try:
            # Trouver la balise qui contient la date
            date_tag = annonce.find('span', class_='relative inline-block w-full before:absolute before:right-full before:top-none before:hidden before:w-[1.2rem] before:text-center before:font-bold before:content-dot tiny:w-auto tiny:before:inline-block')
            date_str = date_tag.text.strip() if date_tag else None
            dates.append(iso8601_converter(date_str))
        except AttributeError:
            date_str = None
            
    data = pd.DataFrame({"title": titres, "price": prices, "date": dates, "city": villes, "postal_code": codes})


    return data


In [13]:
data = get_ps5_data('https://www.leboncoin.fr/recherche?category=43&text=ps5&page=1')
data

La requête a échoué malgré le backoff


AttributeError: 'NoneType' object has no attribute 'find_all'