In [4]:
# Titre: Projet de Web Scraping - H3 Hitema
# Auteur: Baptiste RINGLER

# Ce projet vise à extraire les titres, les formations, etc. du site H3 Hitema en utilisant Selenium.


In [6]:
!pip install selenium webdriver-manager pandas sqlalchemy pymongo




In [50]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def highlight(element):
    """Draw a 3px red border around a Selenium element in the live page.

    The script is run through ``element.parent`` (the WebDriver session the
    element was located with) rather than through a module-level ``driver``
    global, so the helper does not depend on which notebook cell last
    assigned ``driver``.

    Args:
        element: The Selenium ``WebElement`` to outline.
    """
    element.parent.execute_script("arguments[0].style.border='3px solid red'", element)

# Start a Chrome session with the chromedriver binary resolved by webdriver-manager.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)

# Open the listing page and pause for a fixed two seconds before interacting.
driver.get("https://www.h3hitema.fr/formations-informatiques/")
time.sleep(2)

# Wait up to 10 seconds for the first button of the cookie banner container
# to become clickable, then click it.
cookie_locator = (By.XPATH, '//*[@id="cmplz-cookiebanner-container"]/div/div[6]/button[1]')
cookie_wait = WebDriverWait(driver, 10)
cookie_button = cookie_wait.until(EC.element_to_be_clickable(cookie_locator))
cookie_button.click()

def incremental_scroll(driver, increments=10, delay=1):
    """Scroll down the page in ``increments`` steps, sleeping ``delay`` seconds after each.

    The page height (``document.body.scrollHeight``) is re-read before every
    step, and the window is scrolled to ``height * step / increments``.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
        increments: Number of scroll steps to perform.
        delay: Seconds to sleep after each step.
    """
    for step in range(1, increments + 1):
        page_height = driver.execute_script("return document.body.scrollHeight")
        scroll_position = page_height * step / increments
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(delay)

# All browser work runs inside try/finally so the Chrome session is always
# shut down, even when a wait times out or an element lookup fails.
try:
    # Scroll through the whole page before reading it.
    incremental_scroll(driver, increments=10, delay=1)

    main_div = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, "list__formations"))
    )

    categories = main_div.find_elements(By.TAG_NAME, "h3")

    # enumerate(start=1) gives the 1-based position required by :nth-of-type
    # without a linear categories.index() search for every heading.
    for formation_index, category in enumerate(categories, start=1):
        highlight(category)
        print(f"Catégorie: {category.text}\n{'-'*20}")

        formation_cards = driver.find_elements(
            By.CSS_SELECTOR,
            f"div.slider--formations:nth-of-type({formation_index}) a.card-formation--small",
        )

        for card in formation_cards:
            title_element = card.find_element(By.CLASS_NAME, "card-formation__title")
            highlight(title_element)
            title = title_element.text
            print("Titre de la formation:", title)

            badge_texts = []
            badge_list_elements = card.find_elements(By.CLASS_NAME, "list-badge")
            if badge_list_elements:
                badges = badge_list_elements[0].find_elements(By.TAG_NAME, "span")
                # Some spans report an empty text; drop them so the joined
                # output has no dangling ", " separators.
                stripped_texts = (badge.text.strip() for badge in badges)
                badge_texts = [text for text in stripped_texts if text]
            print("Badges:", ", ".join(badge_texts) if badge_texts else "Aucun")

            # find_elements returns an empty list when there is no <p>, which
            # avoids a bare except that would also hide unrelated failures.
            paragraph_elements = card.find_elements(By.TAG_NAME, "p")
            if paragraph_elements:
                paragraph_element = paragraph_elements[0]
                highlight(paragraph_element)
                paragraph = paragraph_element.text
            else:
                paragraph = "Pas de description disponible"
            print("Description:", paragraph)

            link = card.get_attribute('href')
            print("Lien de la formation:", link)
            print("-" * 30)

        print("\n")
finally:
    driver.quit()


Catégorie: BTS INFORMATIQUE (BAC+2)
--------------------
Titre de la formation: BTS CIEL Cybersécurité
Badges: Aucun
Description: option IR
Lien de la formation: https://www.h3hitema.fr/formation/bts-cybersecurite-option-ir/
------------------------------
Titre de la formation: BTS SIO
Badges: ALTERNANCE, FORMATION INITIALE
Description: option SLAM
Lien de la formation: https://www.h3hitema.fr/formation/bts-sio-option-slam/
------------------------------
Titre de la formation: BTS SIO
Badges: ALTERNANCE, FORMATION INITIALE
Description: option SISR
Lien de la formation: https://www.h3hitema.fr/formation/bts-sio-option-sisr/
------------------------------


Catégorie: FORMATION INFORMATIQUE NIVEAU 6 (BAC+3)
--------------------
Titre de la formation: Bachelor informatique en 1 an - Développement Web, logiciel et mobile
Badges: ALTERNANCE, APRES BAC+2, BAC+3, BACHELOR, , 
Description: Pas de description disponible
Lien de la formation: https://www.h3hitema.fr/formation/bachelor-en-1-an-de

In [38]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def highlight(element):
    """Outline a Selenium element on the page with a 3px red border.

    The JavaScript is executed via ``element.parent``, i.e. the WebDriver
    session that located the element, so the function no longer reads a
    module-level ``driver`` variable that other cells reassign.

    Args:
        element: The Selenium ``WebElement`` to outline.
    """
    element.parent.execute_script("arguments[0].style.border='3px solid red'", element)

def accept_cookies(driver):
    """Click the first button of the cookie banner once it becomes clickable.

    Waits up to 10 seconds for the button. Any exception raised while waiting
    or clicking is caught and reported with ``print``; nothing is re-raised.

    Args:
        driver: The Selenium WebDriver showing the page.
    """
    try:
        banner_button_locator = (
            By.XPATH,
            '//*[@id="cmplz-cookiebanner-container"]/div/div[6]/button[1]',
        )
        wait = WebDriverWait(driver, 10)
        wait.until(EC.element_to_be_clickable(banner_button_locator)).click()
    except Exception as e:
        print("Cookie button not found or not clickable.", e)

def incremental_scroll(driver, increments=10, delay=1):
    """Move down the page step by step, pausing between steps.

    For each of the ``increments`` steps the current
    ``document.body.scrollHeight`` is queried again and the window is scrolled
    to the matching fraction of it, followed by a ``delay``-second sleep.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
        increments: Number of scroll steps.
        delay: Seconds to sleep after each step.
    """
    for step_number in range(1, increments + 1):
        current_height = driver.execute_script("return document.body.scrollHeight")
        target_y = current_height * step_number / increments
        driver.execute_script(f"window.scrollTo(0, {target_y});")
        time.sleep(delay)

def scroll_to_top(driver):
    """Scroll the browser window back to position (0, 0).

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
    """
    reset_script = "window.scrollTo(0, 0);"
    driver.execute_script(reset_script)

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# The session is closed in a finally block so a timeout or a missing element
# does not leave a Chrome process running.
try:
    driver.get("https://www.h3hitema.fr/formation/developpement-web/")
    time.sleep(2)

    accept_cookies(driver)
    incremental_scroll(driver, increments=10, delay=1)
    scroll_to_top(driver)

    # Click the second <li> of the list in the third section, then pause for
    # a fixed two seconds before reading the page.
    click_target_xpath = "/html/body/div[2]/section[3]/div/ul/li[2]"
    click_target_element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, click_target_xpath))
    )
    click_target_element.click()
    time.sleep(2)

    # The seventh paragraph of the info container marks where printing stops.
    target_xpath = "/html/body/div[2]/section[3]/div/div/div[2]/div/p[7]"
    target_element = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.XPATH, target_xpath))
    )
    highlight(target_element)

    info_div = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.XPATH, '/html/body/div[2]/section[3]/div/div/div[2]/div'))
    )
    paragraphs = info_div.find_elements(By.TAG_NAME, "p")
    for paragraph in paragraphs:
        highlight(paragraph)
        print("Information:", paragraph.text)
        # Stop once the target paragraph has been printed.
        if paragraph == target_element:
            break
finally:
    driver.quit()


Information: Frameworks : React/React Natif – Symfony – Django / Flask – Spring / JEE
Information: Developpement : PHP – NodeJS – Linux / Shell – Structures de données et complexité – Cloud Computing – Machine Learning / IA – Web Scrapping
Information: Conception : Docker – Ergonomie, IHM – Sécurité Applicative – UML / Design Patterns – Performance / Selenium – Droits Web
Information: Pilotage de projet : Gestion de projet – Anglais – Mémoire – Projet Technique
Information: Analyse du SI : ASI – Cloud – Kubernetes / Terraform
Information: Evolution du SI : Micro Services – Serverless – PCA / PRA – Redis – Déploiement Continu / Intégration continue
Information: Pilotage du SI : DevOps / DevSecOps – ITIL – Gestion de projet – Anglais (TOEIC) – Projet Annuel


In [42]:
!pip install mysql-connector-python


Collecting mysql-connector-python
  Obtaining dependency information for mysql-connector-python from https://files.pythonhosted.org/packages/d9/91/007a0d60fee8db4f7385075dc50bf62d2d359b417b374ec06b06ce6c2d64/mysql_connector_python-8.3.0-cp311-cp311-win_amd64.whl.metadata
  Downloading mysql_connector_python-8.3.0-cp311-cp311-win_amd64.whl.metadata (2.0 kB)
Downloading mysql_connector_python-8.3.0-cp311-cp311-win_amd64.whl (15.4 MB)
   ---------------------------------------- 0.0/15.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/15.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/15.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/15.4 MB 217.9 kB/s eta 0:01:11
   ---------------------------------------- 0.1/15.4 MB 326.1 kB/s eta 0:00:48
   ---------------------------------------- 0.1/15.4 MB 403.5 kB/s eta 0:00:38
    --------------------------------------- 0.3/15.4 MB 1.0 MB/s eta 0:00:15
   - -----------------------------------

In [6]:
import mysql.connector
from mysql.connector import Error
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
from sqlalchemy import create_engine

def create_mysql_connection():
    """Open a connection to the MySQL database.

    Connection settings are read from the environment variables
    ``MYSQL_HOST``, ``MYSQL_USER``, ``MYSQL_PASSWORD`` and ``MYSQL_DATABASE``
    so credentials do not have to be written in the notebook. When a variable
    is unset, the previous hardcoded value is used
    (``localhost`` / ``root`` / empty password / ``test``).

    Returns:
        The connection returned by ``mysql.connector.connect``, or ``None``
        when a ``mysql.connector.Error`` is raised (the error is printed).
    """
    import os

    settings = {
        "host": os.environ.get("MYSQL_HOST", "localhost"),
        "user": os.environ.get("MYSQL_USER", "root"),
        "password": os.environ.get("MYSQL_PASSWORD", ""),
        "database": os.environ.get("MYSQL_DATABASE", "test"),
    }
    try:
        return mysql.connector.connect(**settings)
    except Error as e:
        print(f"Erreur lors de la connexion à MySQL: {e}")
        return None

def create_table(connection):
    """Create the ``formations`` table if it does not exist yet, then commit.

    The cursor is closed in a ``finally`` block so it is released even when
    the statement or the commit fails.

    Args:
        connection: An open DB-API connection (``cursor()`` and ``commit()``).
    """
    cursor = connection.cursor()
    try:
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS formations (
            id INT AUTO_INCREMENT PRIMARY KEY,
            categorie TEXT NOT NULL,
            titre TEXT NOT NULL,
            badges TEXT,
            description TEXT,
            lien TEXT NOT NULL
        );
    ''')
        connection.commit()
    finally:
        cursor.close()

def insert_formation(connection, formation):
    """Insert one row into the ``formations`` table and commit.

    The cursor is closed in a ``finally`` block so it is released even when
    the insert or the commit fails.

    Args:
        connection: An open DB-API connection (``cursor()`` and ``commit()``).
        formation: Sequence of five values bound, in order, to
            ``categorie, titre, badges, description, lien``.
    """
    cursor = connection.cursor()
    try:
        cursor.execute('''
        INSERT INTO formations (categorie, titre, badges, description, lien)
        VALUES (%s, %s, %s, %s, %s)
    ''', formation)
        connection.commit()
    finally:
        cursor.close()

def highlight(element):
    """Put a 3px red border on a web element so it stands out in the browser.

    Uses ``element.parent`` (the WebDriver session that located the element)
    to run the script, instead of a module-level ``driver`` variable that is
    assigned later in the cell and reassigned by other cells.

    Args:
        element: The Selenium ``WebElement`` to outline.
    """
    element.parent.execute_script("arguments[0].style.border='3px solid red'", element)

# Launch Chrome with the chromedriver binary resolved by webdriver-manager.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)

# Load the listing page and pause for a fixed two seconds.
driver.get("https://www.h3hitema.fr/formations-informatiques/")
time.sleep(2)

# Click the first button of the cookie banner container once it is clickable
# (10-second timeout).
cookie_condition = EC.element_to_be_clickable(
    (By.XPATH, '//*[@id="cmplz-cookiebanner-container"]/div/div[6]/button[1]')
)
cookie_button = WebDriverWait(driver, 10).until(cookie_condition)
cookie_button.click()

def incremental_scroll(driver, increments=10, delay=1):
    """Scroll the page downwards in equal fractions of its current height.

    Each of the ``increments`` steps re-reads ``document.body.scrollHeight``,
    scrolls to ``height * step / increments`` and then sleeps ``delay`` seconds.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
        increments: Number of scroll steps.
        delay: Seconds to sleep after each step.
    """
    height_query = "return document.body.scrollHeight"
    for step in range(1, increments + 1):
        scroll_position = driver.execute_script(height_query) * step / increments
        scroll_command = f"window.scrollTo(0, {scroll_position});"
        driver.execute_script(scroll_command)
        time.sleep(delay)

# The browser session and the database connection are both released in the
# finally block, so an exception during scraping or inserting leaks neither.
connection = None
try:
    incremental_scroll(driver, increments=10, delay=1)

    connection = create_mysql_connection()
    if connection is not None:
        create_table(connection)

    main_div = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, "list__formations")))
    categories = main_div.find_elements(By.TAG_NAME, "h3")

    # enumerate(start=1) gives the 1-based position required by :nth-of-type
    # without a linear categories.index() search for every heading.
    for formation_index, category in enumerate(categories, start=1):
        highlight(category)
        # Read the heading text once per category instead of once per card.
        category_name = category.text

        formation_cards = driver.find_elements(
            By.CSS_SELECTOR,
            f"div.slider--formations:nth-of-type({formation_index}) a.card-formation--small",
        )

        for card in formation_cards:
            title_element = card.find_element(By.CLASS_NAME, "card-formation__title")
            highlight(title_element)
            title = title_element.text

            badge_texts = []
            badge_list_elements = card.find_elements(By.CLASS_NAME, "list-badge")
            if badge_list_elements:
                spans = badge_list_elements[0].find_elements(By.TAG_NAME, "span")
                # Drop spans whose text is empty so the stored value has no
                # dangling ", " separators.
                stripped_texts = (span.text.strip() for span in spans)
                badge_texts = [text for text in stripped_texts if text]
            badges = ", ".join(badge_texts) if badge_texts else "Aucun"

            # Look the <p> up once; an empty list means the card has no description.
            paragraph_elements = card.find_elements(By.TAG_NAME, "p")
            paragraph = paragraph_elements[0].text if paragraph_elements else "Pas de description disponible"

            link = card.get_attribute('href')

            if connection is not None:
                insert_formation(connection, (category_name, title, badges, paragraph, link))
finally:
    try:
        driver.quit()
    finally:
        if connection is not None:
            connection.close()

# Read the table back through SQLAlchemy and print it.
engine = create_engine("mysql+mysqlconnector://root:@localhost/test")
df = pd.read_sql_table('formations', engine)
print(df)


    id                                  categorie  \
0   25                   BTS INFORMATIQUE (BAC+2)   
1   26                   BTS INFORMATIQUE (BAC+2)   
2   27                   BTS INFORMATIQUE (BAC+2)   
3   28    FORMATION INFORMATIQUE NIVEAU 6 (BAC+3)   
4   29    FORMATION INFORMATIQUE NIVEAU 6 (BAC+3)   
5   30  FORMATION INFORMATIQUE NIVEAU 7 (BAC+4/5)   
6   31  FORMATION INFORMATIQUE NIVEAU 7 (BAC+4/5)   
7   32  FORMATION INFORMATIQUE NIVEAU 7 (BAC+4/5)   
8   33  FORMATION INFORMATIQUE NIVEAU 7 (BAC+4/5)   
9   34                                     AUTRES   
10  35                                     AUTRES   
11  36                                     AUTRES   

                                                titre  \
0                              BTS CIEL Cybersécurité   
1                                             BTS SIO   
2                                             BTS SIO   
3   Bachelor informatique en 1 an - Développement ...   
4   Bachelor informatique

In [52]:
import mysql.connector
from mysql.connector import Error
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def create_mysql_connection():
    """Open a connection to the MySQL database and report success.

    Connection settings come from the environment variables ``MYSQL_HOST``,
    ``MYSQL_USER``, ``MYSQL_PASSWORD`` and ``MYSQL_DATABASE`` so credentials
    are not written in the notebook. Unset variables fall back to the
    previous hardcoded values (``localhost`` / ``root`` / empty password /
    ``test``).

    Returns:
        The connection returned by ``mysql.connector.connect``, or ``None``
        when a ``mysql.connector.Error`` is raised (the error is printed).
    """
    import os

    settings = {
        "host": os.environ.get("MYSQL_HOST", "localhost"),
        "user": os.environ.get("MYSQL_USER", "root"),
        "password": os.environ.get("MYSQL_PASSWORD", ""),
        "database": os.environ.get("MYSQL_DATABASE", "test"),
    }
    try:
        connection = mysql.connector.connect(**settings)
        print("Connexion à MySQL DB réussie")
        return connection
    except Error as e:
        print(f"Erreur lors de la connexion à MySQL: {e}")
        return None

def create_table(connection):
    """Create the ``info_formation`` table if it does not exist yet, then commit.

    The cursor is closed in a ``finally`` block so it is released even when
    the statement or the commit fails.

    Args:
        connection: An open DB-API connection (``cursor()`` and ``commit()``).
    """
    cursor = connection.cursor()
    try:
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS info_formation (
            id INT AUTO_INCREMENT PRIMARY KEY,
            information TEXT NOT NULL
        );
    ''')
        connection.commit()
    finally:
        cursor.close()

def insert_information(connection, information):
    """Insert one text value into the ``info_formation`` table and commit.

    The cursor is closed in a ``finally`` block so it is released even when
    the insert or the commit fails.

    Args:
        connection: An open DB-API connection (``cursor()`` and ``commit()``).
        information: The value bound to the ``information`` column.
    """
    query = "INSERT INTO info_formation (information) VALUES (%s)"
    cursor = connection.cursor()
    try:
        cursor.execute(query, (information,))
        connection.commit()
    finally:
        cursor.close()

def highlight(element):
    """Outline a Selenium element with a 3px red border.

    The script runs through ``element.parent`` — the WebDriver session that
    located the element — so the helper does not read the module-level
    ``driver`` variable, which is only assigned further down the cell.

    Args:
        element: The Selenium ``WebElement`` to outline.
    """
    element.parent.execute_script("arguments[0].style.border='3px solid red'", element)

def accept_cookies(driver):
    """Wait up to 10 seconds for the cookie banner's first button and click it.

    Every exception raised while waiting or clicking is caught and printed;
    the function never re-raises.

    Args:
        driver: The Selenium WebDriver showing the page.
    """
    try:
        button_is_clickable = EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="cmplz-cookiebanner-container"]/div/div[6]/button[1]')
        )
        WebDriverWait(driver, 10).until(button_is_clickable).click()
    except Exception as e:
        print("Cookie button not found or not clickable.", e)

def incremental_scroll(driver, increments=10, delay=1):
    """Scroll towards the bottom of the page in ``increments`` steps.

    Before each step ``document.body.scrollHeight`` is queried again; the
    window is then scrolled to ``height * step / increments`` and the function
    sleeps for ``delay`` seconds.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
        increments: Number of scroll steps.
        delay: Seconds to sleep after each step.
    """
    for nth_step in range(1, increments + 1):
        total_height = driver.execute_script("return document.body.scrollHeight")
        destination = total_height * nth_step / increments
        driver.execute_script(f"window.scrollTo(0, {destination});")
        time.sleep(delay)

def scroll_to_top(driver):
    """Return the browser window to scroll position (0, 0).

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
    """
    top_script = "window.scrollTo(0, 0);"
    driver.execute_script(top_script)

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# The browser session and the database connection are both released in the
# finally block, so a timeout or a failed insert leaks neither.
connection = None
try:
    driver.get("https://www.h3hitema.fr/formation/developpement-web/")
    time.sleep(2)

    accept_cookies(driver)
    incremental_scroll(driver, increments=10, delay=1)
    scroll_to_top(driver)

    # Click the second <li> of the list in the third section, then pause for
    # a fixed two seconds before reading the page.
    click_target_xpath = "/html/body/div[2]/section[3]/div/ul/li[2]"
    click_target_element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, click_target_xpath))
    )
    click_target_element.click()
    time.sleep(2)

    # The seventh paragraph of the info container marks where processing stops.
    target_xpath = "/html/body/div[2]/section[3]/div/div/div[2]/div/p[7]"
    target_element = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.XPATH, target_xpath))
    )
    highlight(target_element)

    info_div = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.XPATH, '/html/body/div[2]/section[3]/div/div/div[2]/div'))
    )
    paragraphs = info_div.find_elements(By.TAG_NAME, "p")

    connection = create_mysql_connection()
    if connection:
        create_table(connection)

    for paragraph in paragraphs:
        highlight(paragraph)
        # Fetch the text once and reuse it for both printing and storing.
        information = paragraph.text
        print("Information:", information)
        if connection:
            insert_information(connection, information)
        # Stop once the target paragraph has been handled.
        if paragraph == target_element:
            break
finally:
    try:
        driver.quit()
    finally:
        if connection:
            connection.close()


Connexion à MySQL DB réussie
Information: Frameworks : React/React Natif – Symfony – Django / Flask – Spring / JEE
Information: Developpement : PHP – NodeJS – Linux / Shell – Structures de données et complexité – Cloud Computing – Machine Learning / IA – Web Scrapping
Information: Conception : Docker – Ergonomie, IHM – Sécurité Applicative – UML / Design Patterns – Performance / Selenium – Droits Web
Information: Pilotage de projet : Gestion de projet – Anglais – Mémoire – Projet Technique
Information: Analyse du SI : ASI – Cloud – Kubernetes / Terraform
Information: Evolution du SI : Micro Services – Serverless – PCA / PRA – Redis – Déploiement Continu / Intégration continue
Information: Pilotage du SI : DevOps / DevSecOps – ITIL – Gestion de projet – Anglais (TOEIC) – Projet Annuel
