In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from io import StringIO
import os


-------------------------------------------------------------------
**Variables Globales**
---------------------------------------------------------------------

In [2]:
# Output directory for the CSV files written by this notebook

output_csv ='./output_csv'


------------------------------------------------

**FONCTION**

-----------------------------------------------

In [3]:
# Helper that downloads a URL and parses it with BeautifulSoup

def scrap_an_url(url, timeout=10):
    """Download ``url`` and return its content parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The parsed ``BeautifulSoup`` object, or ``None`` when the request
        fails (connection error, timeout, or non-2xx HTTP status). The error
        is printed rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception handled below
        response.raise_for_status()

        return BeautifulSoup(response.content, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Erreur : {e}")
        return None

In [4]:
# Helper that turns the <table> with a given id into a DataFrame

def chems_to_df(soupResult,idname):
    """Convert the HTML table whose ``id`` is ``idname`` into a DataFrame.

    Args:
        soupResult: Parsed page exposing a BeautifulSoup-style ``find``.
        idname: Value of the ``id`` attribute of the wanted ``<table>``.

    Returns:
        The first DataFrame that ``pandas.read_html`` extracts from the table.

    Raises:
        ValueError: If the page contains no ``<table>`` with that id.
    """
    table = soupResult.find('table', {'id': idname})
    if table is None:
        # Without this guard str(None) would be handed to read_html and fail
        # with an error that does not mention the missing id.
        raise ValueError(f"No <table> with id '{idname}' found in the page")

    return pd.read_html(StringIO(str(table)))[0]

In [43]:
def fetch_page(url, timeout=10):
    """Send a GET request and return the raw page content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as bytes, or ``None`` when the request raises a
        network error or the status code is not 200. Callers use ``None`` as
        the signal to stop paginating.
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Report network failures the same way as a bad status: print + None
        print(f"Failed to retrieve page: {e}")
        return None
    if resp.status_code != 200:
        print(f"Failed to retrieve page. Status code: {resp.status_code}")
        return None
    return resp.content

In [44]:
def parse_product(card):
    """Extract the product fields from a single product card element.

    Args:
        card: Element exposing a BeautifulSoup-style ``find`` method.

    Returns:
        A dict with the keys 'Product Name', 'Price', 'Descriptions',
        'Ratings' and 'Reviews'. A field is ``None`` when its element is
        not found in the card.
    """
    def _stripped_text(node):
        # Missing elements yield None instead of raising on ``.text``
        return node.text.strip() if node else None

    price_tag = card.find('h4', class_='price')
    title_tag = card.find('a', class_='title')
    description_tag = card.find('p', {'class': 'description'})
    review_tag = card.find('p', {'class': 'review-count'})

    # The rating is stored in the data-rating attribute, not in the text
    rating = None
    ratings_box = card.find('div', class_='ratings')
    if ratings_box:
        rated = ratings_box.find('p', attrs={'data-rating': True})
        if rated:
            rating = rated['data-rating']

    return {
        'Product Name': _stripped_text(title_tag),
        'Price': _stripped_text(price_tag),
        'Descriptions': _stripped_text(description_tag),
        'Ratings': rating,
        'Reviews': _stripped_text(review_tag)
    }

In [45]:
def scrape_products(base_url, max_pages=20):
    """Scrape every product card from the paginated listing at ``base_url``.

    Pages are requested as ``{base_url}?page=1`` up to ``max_pages``. The
    loop stops early when a page cannot be fetched or contains no product.

    Args:
        base_url: Listing URL without the ``page`` query parameter.
        max_pages: Upper bound on the number of pages to request.

    Returns:
        A DataFrame with one row per product, built from the dicts returned
        by ``parse_product``.
    """
    products_lists = []

    for i in range(1, max_pages + 1):
        url = f"{base_url}?page={i}"
        print(f"Scraping page: {i} - URL: {url}")

        page_content = fetch_page(url)
        if page_content is None:
            break  # Stop when the page could not be retrieved

        soup_commerce = BeautifulSoup(page_content, 'html.parser')
        # find_all is the supported name; findAll is a deprecated alias
        products_on_page = soup_commerce.find_all('div', attrs={'class': 'product-wrapper'})

        if not products_on_page:
            print("No more products found. Exiting.")
            break

        print(f"Found {len(products_on_page)} products on page {i}")

        products_lists.extend(parse_product(card) for card in products_on_page)

    return pd.DataFrame(products_lists)

In [47]:
def clean_price_column(df):
    """Strip the '$' symbol from the 'Price' column and convert it to float.

    The DataFrame is modified in place and also returned. Calling the
    function again on an already cleaned frame is a no-op, so the cell that
    uses it can be re-run safely.

    Args:
        df: DataFrame with a 'Price' column holding strings such as
            '$416.99', or numbers if it was cleaned before.

    Returns:
        The same DataFrame object, with 'Price' stored as float.
    """
    prices = df['Price']
    # A numeric column has no .str accessor; only strip text columns
    if not pd.api.types.is_numeric_dtype(prices):
        prices = prices.str.replace('$', '', regex=False)
    df['Price'] = prices.astype(float)
    return df

----------------------------------------------------
**FIN PARTIE FONCTION**
----------------------------------------------------

Cas pratique 1 :
Web Scraping à partir du Site:
https://www.basketball-reference.com/leagues/NBA_2022_per_game.html
Le but est de récupérer les informations contenues dans le tableau comportant les statistiques
des joueurs.

In [5]:


# Download and parse the NBA 2022 per-game page (None if the request failed)
soup_basket = scrap_an_url('https://www.basketball-reference.com/leagues/NBA_2022_per_game.html')

In [6]:
# Preview only the beginning of the page; the full HTML would flood the output
print(soup_basket.prettify()[:1500])

<!DOCTYPE html>
<html class="no-js" data-root="/home/bbr/build" data-version="klecko-" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="ie=edge" http-equiv="x-ua-compatible"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">
   <link href="https://cdn.ssref.net/req/202410171" rel="dns-prefetch"/>
   <script>
    /* https://docs.osano.com/hc/en-us/articles/22469433444372-Google-Consent-Mode-v2  */
  window.dataLayer = window.dataLayer ||[];
      function gtag(){dataLayer.push(arguments);}
      gtag('consent','default',{
        'ad_storage':'denied',
        'analytics_storage':'denied',
        'ad_user_data':'denied',
        'ad_personalization':'denied',
        'personalization_storage':'denied',
        'functionality_storage':'granted',
        'security_storage':'granted',
        'wait_for_update': 500
      });
      gtag("set", "ads_data_redaction", true);
   </script>
   <script src="https://cmp.osano.com/16CGnCU8UtNh

In [7]:
# Reuse the notebook's helper instead of repeating find / str / read_html by hand
df_baskett = chems_to_df(soup_basket, 'per_game_stats')

In [10]:
# Display the scraped table (the notebook view elides the middle rows and columns)
df_baskett

Unnamed: 0,Rk,Player,Age,Team,Pos,G,GS,MP,FG,FGA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards
0,1.0,Joel Embiid,27.0,PHI,C,68.0,68.0,33.8,9.8,19.6,...,2.1,9.6,11.7,4.2,1.1,1.5,3.1,2.7,30.6,"MVP-2,AS,NBA2"
1,2.0,LeBron James,37.0,LAL,C,56.0,56.0,37.2,11.4,21.8,...,1.1,7.1,8.2,6.2,1.3,1.1,3.5,2.2,30.3,"MVP-10,AS,NBA3"
2,3.0,Giannis Antetokounmpo,27.0,MIL,PF,67.0,67.0,32.9,10.3,18.6,...,2.0,9.6,11.6,5.8,1.1,1.4,3.3,3.2,29.9,"MVP-3,DPOY-6,AS,NBA1"
3,4.0,Kevin Durant,33.0,BRK,PF,55.0,55.0,37.2,10.5,20.3,...,0.5,6.9,7.4,6.4,0.9,0.9,3.5,2.1,29.9,"MVP-10,AS,NBA2"
4,5.0,Luka Dončić,22.0,DAL,PG,65.0,65.0,35.4,9.9,21.6,...,0.9,8.3,9.1,8.7,1.2,0.6,4.5,2.2,28.4,"MVP-5,AS,NBA1"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
808,602.0,Trayvon Palmer,27.0,DET,SG,1.0,0.0,17.0,0.0,1.0,...,0.0,2.0,2.0,0.0,0.0,0.0,1.0,2.0,0.0,
809,603.0,Emanuel Terry,25.0,PHO,PF,3.0,0.0,6.0,0.0,1.7,...,2.7,2.3,5.0,0.7,0.3,0.0,1.7,1.3,0.0,
810,604.0,Jon Teske,24.0,MEM,C,3.0,0.0,2.7,0.0,0.3,...,0.0,0.7,0.7,0.3,0.3,0.0,0.0,0.3,0.0,
811,605.0,M.J. Walker,23.0,PHO,SG,2.0,0.0,4.0,0.0,2.0,...,0.0,0.5,0.5,0.5,1.0,0.0,0.0,0.5,0.0,


In [11]:
# Build the destination path inside the configured output directory
basket_csv_path = os.path.join(output_csv, 'basketdata.csv')

In [12]:
# Create the output folder if needed so the export works on a fresh checkout
os.makedirs(output_csv, exist_ok=True)
df_baskett.to_csv(basket_csv_path, index=False)

In [13]:
# Confirm where the CSV was written
print(f"DataFrame enregistré dans {basket_csv_path}")

DataFrame enregistré dans ./output_csv\basketdata.csv


Cas pratique 2 :
Web Scraping à partir du Site:
https://content.codecademy.com/courses/beautifulsoup/cacao/index.html
Le but est de récupérer les informations contenues dans les deux colonnes « Cocoa percent »
et « Rating » du tableau, comportant le pourcentage du chocolat en cacao et son évaluation
respectivement. Insérer les informations extraites dans un Dataframe puis l’exporter dans un
fichier.csv et un fichier.json. Pour le format json, choisir la valeur index puis records et values
pour le paramètre orient et constater la différence. Voici un aperçu du Dataframe souhaité:


In [14]:
# Download and parse the cacao ratings page (None if the request failed)
soup_codecademy = scrap_an_url('https://content.codecademy.com/courses/beautifulsoup/cacao/index.html')

In [15]:
# Preview only the beginning of the page; the full HTML would flood the output
print(soup_codecademy.prettify()[:1500])

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <style>
   #cacaoTable {
         border-collapse: collapse;
         border: 2px black solid;
         font: 12px sans-serif;
         }
         #cacaoTable td {
         border: 1px black solid;
         padding: 5px;
         }
         #infoContainer {
         margin-bottom: 10px;
         display: inline-block;
         margin-right: 10px;
         }
         #chocolate {
         width: 300px;
         }
         #picContainer {
         display: inline;
         }
  </style>
 </head>
 <body>
  <!-- <script src="http://d3js.org/d3.v3.min.js"></script> -->
  <div id="banner">
   <h1>
    Cacao Ratings
   </h1>
  </div>
  <div id="infoContainer">
   <table class="infoTable">
    <tr>
     <td>
      Compiled ratings of over 1700 Chocolate bars
     </td>
    </tr>
    <tr>
     <td>
      Ratings are from 1-5
     </td>
    </tr>
   </table>
  </div>
  <div id="picContainer">
   <img id="chocolate" src="chocolat

In [16]:
# Reuse the notebook's helper instead of repeating find / str / read_html by hand.
# The header text is still stored in row 0 at this point.
df_codeca = chems_to_df(soup_codecademy, 'cacaoTable')

In [20]:
# Inspect the raw table: columns are numbered and row 0 holds the header text
df_codeca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Company (Maker-if known),Specific Bean Origin or Bar Name,REF,Review Date,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
1,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
2,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
3,A. Morin,Atsane,1676,2015,70%,France,3,,Togo
4,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo


In [21]:
# Promote the first row to column labels.
# NOTE(review): not idempotent — re-running this after row 0 has been dropped
# would promote a data row instead; confirm cells are only run top to bottom.
df_codeca.columns = df_codeca.iloc[0]


In [22]:
# Check the new labels; the header text is still duplicated in row 0
df_codeca.head()

Unnamed: 0,Company (Maker-if known),Specific Bean Origin or Bar Name,REF,Review Date,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
0,Company (Maker-if known),Specific Bean Origin or Bar Name,REF,Review Date,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
1,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
2,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
3,A. Morin,Atsane,1676,2015,70%,France,3,,Togo
4,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo


In [23]:
# Drop the first row only while it still duplicates the column labels, so that
# re-running this cell does not silently discard a real data row.
if df_codeca.iloc[0].tolist() == df_codeca.columns.tolist():
    df_codeca = df_codeca.drop(index=0).reset_index(drop=True)


In [24]:
# Verify that the duplicated header row is gone and the index restarts at 0
df_codeca.head()

Unnamed: 0,Company (Maker-if known),Specific Bean Origin or Bar Name,REF,Review Date,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.5,,Peru


In [25]:
# Print the exact labels: several contain a double space (e.g. 'Cocoa  Percent')
print(df_codeca.columns)


Index(['Company (Maker-if known)', 'Specific Bean Origin  or Bar Name', 'REF',
       'Review  Date', 'Cocoa  Percent', 'Company  Location', 'Rating',
       'Bean  Type', 'Broad Bean  Origin'],
      dtype='object', name=0)


In [26]:


# Keep the two requested columns; 'Cocoa  Percent' is spelled with two spaces
# because that is how the label appears in the scraped header.
df_cacao = df_codeca[['Rating', 'Cocoa  Percent']]


In [27]:
# Display the resulting two-column frame
df_cacao

Unnamed: 0,Rating,Cocoa Percent
0,3.75,63%
1,2.75,70%
2,3,70%
3,3.5,70%
4,3.5,70%
...,...,...
1790,3.75,70%
1791,3,65%
1792,3.5,65%
1793,3.25,62%


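**TODO :** enregistrer `df_cacao` dans un fichier .csv et un fichier .json (tester `orient` = `index`, `records` et `values`).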

**Cas pratique 3:**
Scraping à partir du site :
https://www.webscraper.io/test-sites/e-commerce/static/computers/laptops
Travail demandé :
• Récupérer le nom des articles, la description ainsi que le prix pour tous les laptops dans
la page.
• Insérer les données dans un dataframe.
• Récupérer les données des tablets.
Suite :
• Récupérer les informations (nom, description, prix, rating) des laptops de toutes les
pages en utilisant BeautifulSoup.
• Insérer les données récupérées dans un dataframe.
• En utilisant le dataframe obtenu, récupérer les informations suivantes (afficher chaque
résultat dans un dataframe) :
1. Les 3 laptops les mieux notés.
2. Les 3 laptops les moins bien notés.
3. Les 3 laptops les plus chers.
4. Les 3 laptops les moins chers.
5. Refaire le même travail pour les tablets.

In [28]:
# Fetch the first laptops listing page; fail fast on a stalled connection or an HTTP error
resp = requests.get('https://webscraper.io/test-sites/e-commerce/static/computers/laptops', timeout=10)
resp.raise_for_status()

In [29]:
# Parse the downloaded listing page with the built-in HTML parser
soup_commerce = BeautifulSoup(resp.content, 'html.parser')

(C'est un échec total…)

Je vais vérifier les sélecteurs avec un seul produit.

In [31]:
# Take the first product card only, to validate each selector in isolation
single_product = soup_commerce.find('div', attrs={'class': 'product-wrapper'})


In [32]:
# Collects the fields extracted from single_product by the cells below
product_info = {}

**VERIFIER CHAQUE SELECTEUR**

In [33]:
# Check the title selector: record the product name, or None when absent or blank
product_name = single_product.find('a', class_='title')
if not product_name or not product_name.text.strip():
    product_info['Product Name'] = None
    print("Product Name not found")
else:
    product_info['Product Name'] = product_name.text.strip()
    print(f"Product Name: {product_info['Product Name']}")

Product Name: Packard 255 G2


In [34]:
# Check the price selector: record the price text, or None when absent or blank
price = single_product.find('h4', class_='price')
if not price or not price.text.strip():
    product_info['Price'] = None
    print("Price not found")
else:
    product_info['Price'] = price.text.strip()
    print(f"Price: {product_info['Price']}")

Price: $416.99


In [35]:
# Check the description selector (a <p> carrying either listed class)
description = single_product.find('p', class_={'description', 'card-text'})

if not description or not description.text.strip():
    product_info['description'] = None
    print("description not found")
else:
    product_info['description'] = description.text.strip()
    print(f"description: {product_info['description']}")


description: 15.6", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows 8.1


In [36]:
# Check the review-count selector (a <p> carrying either listed class)
review = single_product.find('p', class_={'review-count', 'card-float-end'})

if not review or not review.text.strip():
    product_info['review'] = None
    print("review not found")
else:
    product_info['review'] = review.text.strip()
    print(f"review: {product_info['review']}")

review: 2 reviews


In [37]:
# Retrieve the rating (data-rating attribute) and record it like the other fields
rating = None  # stays None when the ratings section or the attribute is missing
ratings_div = single_product.find('div', class_='ratings')
if ratings_div:
    rating_paragraph = ratings_div.find('p', attrs={'data-rating': True})
    if rating_paragraph:
        rating = rating_paragraph['data-rating']  # value of the data-rating attribute
        print(f"Rating récupéré : {rating}")
    else:
        print("Aucun rating trouvé")
else:
    print("Section des ratings non trouvée")
product_info['rating'] = rating

Rating récupéré : 2


--------------------------------------------------------------
**TRANSFORMER LES DONNÉES SCRAPÉES EN 2 DF : LAPTOPS ET TABLETS**
---------------------------------------------------------------

In [51]:
# Save the CSV
# NOTE(review): df_laptops is not defined in the visible notebook — TODO build it (presumably via scrape_products) before enabling the line below
#df_laptops.to_csv('output_csv/laptops.csv', index=False)
