In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import mysql.connector
from datetime import datetime

# Name of the scraped site; stored alongside every product row.
nom_site = "maparatunisie.tn"

# Listing URL template. Both placeholders are filled with the page number.
base_url = "https://www.maparatunisie.tn/categorie-produit/visages/page/{}/?_paged={}"

# Pagination starts on the first listing page.
page_number = 1

# Column layout of the scraped data, and the (initially empty) frame that
# collects one row per product.
columns = [
    'product_name',
    'image_link',
    'product_price',
    'product_link',
]
df_products = pd.DataFrame(columns=columns)

# Walk the paginated product listing until a page is missing (404), a page
# contains no product cards, or a request fails.
#
# Rows are buffered in a plain list and converted to a DataFrame once at the
# end: calling pd.concat for every product copies the whole frame each time.
scraped_rows = []

while True:
    # Both placeholders of base_url receive the current page number.
    url = base_url.format(page_number, page_number)

    try:
        # Without a timeout a stalled server would hang the script forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Keep whatever was scraped so far instead of crashing.
        print("La requête a échoué :", exc)
        break

    if response.status_code == 404:
        print("Page not found. Exiting.")
        break
    if response.status_code != 200:
        print("La requête a échoué. Code d'état :", response.status_code)
        break

    soup = BeautifulSoup(response.text, 'html.parser')

    # Every product card on the listing page.
    products = soup.find_all('div', class_='product-small')
    if not products:
        # An empty page means we ran past the last page of results.
        break

    for product in products:
        name_tag = product.find('p', class_='name')
        link_tag = product.find('a', class_='woocommerce-LoopProduct-link')
        product_link = link_tag.get('href') if link_tag is not None else None
        if name_tag is None or not product_link:
            # The link is the de-duplication key used later and the name is
            # stored in the database; a card without either is skipped
            # rather than aborting the whole run.
            print("Skipping product without name or link.")
            continue
        product_name = name_tag.text.strip()

        # The image URL is read from 'data-src'; fall back to 'src' when
        # that attribute is absent, and to None when there is no <img>.
        img_tag = product.find('img')
        image_link = None
        if img_tag is not None:
            image_link = img_tag.get('data-src') or img_tag.get('src')

        # A card may show several <bdi> amounts; keep the lowest one.
        # Amounts that cannot be parsed as a number are ignored.
        prices = []
        for bdi_tag in product.find_all('bdi'):
            raw_price = bdi_tag.get_text(strip=True).replace("TND", "").replace(",", ".")
            try:
                prices.append(float(raw_price))
            except ValueError:
                continue
        min_price = min(prices) if prices else None

        scraped_rows.append({
            'product_name': product_name,
            'image_link': image_link,
            'product_price': min_price,
            'product_link': product_link,
        })

    # Move on to the next listing page.
    page_number += 1

# dtype=object keeps a missing price as None instead of coercing it to NaN.
df_products = pd.DataFrame(scraped_rows, columns=columns, dtype=object)

# Keep only the first occurrence of each product link.
df_products = df_products.drop_duplicates(subset=['product_link'], keep='first')

# MySQL connection settings.
# NOTE(review): no password is supplied for the 'root' account here —
# confirm this is only ever run against a local development database.
db_config = {
    'host': 'localhost',
    'user': 'root',
    'database': 'scrapping_db',
    'port': 3306
}

# Open the connection and a cursor used for all statements below.
connexion = mysql.connector.connect(**db_config)
curseur = connexion.cursor()

try:
    # Every price row of this run carries the same date stamp.
    date_prix = datetime.today().strftime('%Y-%m-%d')

    # Store each scraped product, then its price for today.
    for _, row in df_products.iterrows():
        curseur.execute("""
            INSERT INTO produits (nom, nom_site, image, lien)
            VALUES (%s, %s, %s, %s)
        """, (row['product_name'], nom_site, row['image_link'], row['product_link']))

        # Use the id generated by the INSERT above. Re-selecting by 'lien'
        # matches several rows once the script has been run more than once,
        # which leaves unread results on the cursor (the next execute then
        # fails) and may return the id of an older row.
        # NOTE(review): assumes produits.id_produit is AUTO_INCREMENT — confirm
        # against the table definition.
        product_id = curseur.lastrowid

        # A product without a parsable price is stored with a NULL price;
        # NaN is not a value the database driver can send.
        price = row['product_price']
        price = None if pd.isna(price) else float(price)

        curseur.execute("""
            INSERT INTO prix (id_produit, prix, date_prix)
            VALUES (%s, %s, %s)
        """, (product_id, price, date_prix))

    # Persist all inserts in one transaction.
    connexion.commit()

    print("Données insérées dans la base de données MySQL.")
except Exception as e:
    print(f"Une erreur s'est produite : {e}")
    # Discard the partially inserted rows of the failed run.
    connexion.rollback()
finally:
    # Always release the cursor and the connection.
    curseur.close()
    connexion.close()


La requête a échoué. Code d'état : 502
Données insérées dans la base de données MySQL.
