In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
from bs4 import BeautifulSoup
from unidecode import unidecode
from neo4j import GraphDatabase
import os

In [2]:
def get_adc_nom_prenom():
    # query = "MATCH (h:HOMME)-[:EXERCE]->(a:ACTIVITÉ {nom:'Agent de change'}) RETURN h"
    query = """
    MATCH (h:HOMME)-[e:EXERCE]->(a:ACTIVITÉ {nom:'Agent de change'})
    WHERE e.date_début.year >= 1815
    AND e.date_fin.year <= 1852
    AND NOT EXISTS {
        MATCH (h)-[m:MARIAGE]->(f:FEMME)
        WHERE m.date_mariage IS NOT NULL
    }
    MATCH (h)-[ha:HABITE]->(l:LIEU)
    WHERE ha.date_début >= e.date_début AND ha.date_début <= e.date_fin
    OR ha.date_fin >= e.date_début AND ha.date_fin <= e.date_fin
    RETURN h
    """

    URI = AURA_CONNECTION_URI
    USER = AURA_USERNAME
    PASS = AURA_PASSWORD

    resultats = []

    # Driver instantiation
    driver = GraphDatabase.driver(URI, auth=(USER, PASS))

    # Create a driver session
    with driver.session() as session:
        # Use .data() to access the results array
        results = session.run(query)
        results_data = results.data()

    for record in results_data:
        fullname = record["h"]["nom"]
        nom = fullname.split(",")[0].strip()
        if "," in fullname:
            prenom = fullname.split(",")[1].strip()
        else:
            prenom = ""
        
        if "date_naissance" in record["h"]:
            date_naissance = record["h"]["date_naissance"]
            date_naissance = date_naissance.year
            nom_prenom_date = [nom, prenom, date_naissance]
            resultats.append(nom_prenom_date)
        
        else:
            nom_prenom = [nom, prenom]
            resultats.append(nom_prenom) 

    driver.close()

    resultats = sorted(resultats, key=lambda x: x[0])
    return resultats

In [3]:
def make_search(nom, date):
    driver.get("https://www.fondsenligne.archives-lyon.fr/v2/ac69/mariage.html")
    recherche_avancee = driver.find_element(by=By.ID, value="rech_avancee")
    recherche_avancee.click()

    input_nom_epoux = driver.find_element(by=By.ID, value="inputnom_epoux")
    input_nom_epoux.send_keys(nom)

    input_annee_debut = driver.find_element(by=By.ID, value="inputannee_mariage_1")
    input_annee_debut.send_keys(f"{date}")

    input_annee_fin = driver.find_element(by=By.ID, value="inputannee_mariage_2")
    input_annee_fin.send_keys("1860")

    btn_submit = driver.find_element(by=By.ID, value="btn_btn_submit")
    btn_submit.click()

    driver.switch_to.window(driver.window_handles[-1])

def simplify_name(nom):
    nom = nom.strip()
    nom = unidecode(nom)
    nom = nom.lower()
    return(nom)

In [4]:
def extract_data(nom, prenom, mode, intitule):
    html = driver.page_source
    soup = BeautifulSoup(html)

    result = ""
    for result in soup.find_all('li', class_='element-list'):
        # Récupérer le nom et le prénom de l'Epoux
        epoux_nom_element = result.find('div', class_='epoux-se-content').find('h4', string='Nom')
        epoux_nom_value = epoux_nom_element.find_next('p').text.strip()

        epoux_prenom_element = result.find('div', class_='epoux-se-content').find('h4', string='Prénom')
        epoux_prenom_value = epoux_prenom_element.find_next('p').text.strip()

        # Récupérer le nom et le prénom de l'Epouse
        epouse_nom_element = result.find_all('div', class_='epoux-se-content')[1].find('h4', string='Nom')
        epouse_nom_value = epouse_nom_element.find_next('p').text.strip()

        epouse_prenom_element = result.find_all('div', class_='epoux-se-content')[1].find('h4', string='Prénom')
        epouse_prenom_value = epouse_prenom_element.find_next('p').text.strip()

        # Récupérer la date
        date_element = result.find('h3', string='Date')
        date_value = date_element.find_next('span').text.strip()

        # Récupérer la cote
        cote_element = result.find('h3', string='Cote')
        cote_value = cote_element.find_next('p').text.strip()

        # Récupérer l'attribut href de l'élément <a> pour le lien
        a_tag = result.find('a')
        link = f"https://www.fondsenligne.archives-lyon.fr/v2/ac69/{a_tag['href']}"
        
        epoux = f"{epoux_nom_value}, {epoux_prenom_value}"
        epouse = f"{epouse_nom_value}, {epouse_prenom_value}"

        simple_epoux_nom_value = simplify_name(epoux_nom_value)
        epoux_prenoms_values = epoux_prenom_value.split(" ")
        simple_epoux_prenoms_values = [simplify_name(prenom) for prenom in epoux_prenoms_values]

        simple_epouse_nom_value = simplify_name(epouse_nom_value)
        epouse_prenoms_values = epouse_prenom_value.split(" ")
        simple_epouse_prenoms_values = [simplify_name(prenom) for prenom in epouse_prenoms_values]

        log = f"date : {date_value}\ncote : {cote_value}\népoux : {epoux}\népouse : {epouse}\nlien : {link}\n\n"

        match mode:
            case "_BY_PRENOM":
                prenoms = prenom.split(" ")
                simple_prenoms = [simplify_name(prenom) for prenom in prenoms]

                for simple_prenom in simple_prenoms:
                    if simple_prenom in simple_epoux_prenoms_values:
                        print(f"{simple_prenom} in {simple_epoux_prenoms_values}")
                        with open(f"results/{intitule}{mode}.txt", "a") as f:
                            f.write(log)
            
            case "_ALL":
                with open(f"results/{intitule}{mode}.txt", "a") as f:
                    f.write(log)
            
            case "_ALL_FORCED":
                with open(f"results/{intitule}{mode}.txt", "a") as f:
                    f.write(log)

In [5]:
def next_page(nom, prenom, mode, intitule):
    extract_data(nom, prenom, mode, intitule)

    driver.implicitly_wait(3) # seconds
    # suivant = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//a[text()="Suivant"]')))
    suivant = driver.find_elements(by=By.XPATH, value='//a[text()="Suivant"]')
    if len(suivant) == 0:
        return False
    else:
        suivant = suivant[0]
        suivant.click()

        time.sleep(2)
        try:
            next_page(nom, prenom, mode, intitule)
        except:
            pass

In [6]:
def search(nom, prenom, date, mode, intitule):
    if unidecode(nom) == nom:
        make_search(nom, date)
        next_page(nom, prenom, mode, intitule)
    else:
        make_search(nom, date)
        next_page(nom, prenom, mode, intitule)
        make_search(unidecode(nom), date)
        next_page(nom, prenom, mode, intitule)

In [7]:
driver = webdriver.Chrome()
driver.set_window_size(1440, 900)

adcs = get_adc_nom_prenom()
# adcs = adcs[8:]
# mode = "_ALL"
mode = "_BY_PRENOM"

for record in adcs:
    nom = record[0]
    prenom = record[1]
    if len(record) == 3:
        date = record[2] + 10
    else:
        date = 1780

    # nom, prenom
    intitule = f"{nom}_{prenom}"
    print(intitule)

    # Nettoyage du nom
    nom = nom.replace("dit)", "").replace("(dit", "").replace("(", "").replace(")", "").replace("  ", " ")
    # Nettoyage du prenom
    prenom = prenom.replace("-", " ")
    
    noms = []
    if " " in nom:
        noms = nom.split(" ")
    elif "-" in nom:
        noms = nom.split("-")
    else:
        noms = [nom]

    if "." not in prenom and len(prenom) != 0:
        with open(f"results/{intitule}{mode}.txt", "w") as f:
            f.write("")
        
        for nom in noms:
            search(nom, prenom, date, mode, intitule)
    
    else:
        with open(f"results/{intitule}_ALL_FORCED.txt", "w") as f:
            f.write("")
        
        for nom in noms:
            search(nom, prenom, date, "_ALL_FORCED", intitule)
    
    # Vérifier si le fichier existe
    if os.path.isfile(f"results/{intitule}{mode}.txt"):
        # Vérifier si le fichier est vide
        if os.stat(f"results/{intitule}{mode}.txt").st_size == 0:
            # Supprimer le fichier
            os.remove(f"results/{intitule}{mode}.txt")
            print(f"results/{intitule}{mode}.txt -> a été supprimé.")

Andrieux_Louis Martin Joseph Antoine
antoine in ['jean', 'antoine']
joseph in ['joseph', 'clement']
louis in ['louis']
Arnaud_Félix Christophe
results/Arnaud_Félix Christophe_BY_PRENOM.txt -> a été supprimé.
Arnaud_Félix Christophe
results/Arnaud_Félix Christophe_BY_PRENOM.txt -> a été supprimé.
Arnaud_Félix Christophe
results/Arnaud_Félix Christophe_BY_PRENOM.txt -> a été supprimé.
Beaugelin_Jean
jean in ['jean', 'francois']
Bender_Jean Auguste Joseph
results/Bender_Jean Auguste Joseph_BY_PRENOM.txt -> a été supprimé.
Berlioz_Auguste Prosper
results/Berlioz_Auguste Prosper_BY_PRENOM.txt -> a été supprimé.
Berlioz_Auguste Prosper
results/Berlioz_Auguste Prosper_BY_PRENOM.txt -> a été supprimé.
Berlioz_Auguste Prosper
results/Berlioz_Auguste Prosper_BY_PRENOM.txt -> a été supprimé.
Blanc_Antoine
antoine in ['antoine']
antoine in ['joseph', 'antoine']
antoine in ['jean', 'antoine']
antoine in ['antoine']
antoine in ['antoine']
antoine in ['antoine', 'michel']
antoine in ['antoine']
antoi

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=116.0.5845.140)
Stacktrace:
0   chromedriver                        0x000000010517e65c chromedriver + 4318812
1   chromedriver                        0x0000000105176d00 chromedriver + 4287744
2   chromedriver                        0x0000000104da87ec chromedriver + 296940
3   chromedriver                        0x0000000104d806e4 chromedriver + 132836
4   chromedriver                        0x0000000104e0bde4 chromedriver + 703972
5   chromedriver                        0x0000000104e1e5b8 chromedriver + 779704
6   chromedriver                        0x0000000104dda178 chromedriver + 500088
7   chromedriver                        0x0000000104ddafc0 chromedriver + 503744
8   chromedriver                        0x000000010513ec40 chromedriver + 4058176
9   chromedriver                        0x0000000105143160 chromedriver + 4075872
10  chromedriver                        0x0000000105106e68 chromedriver + 3829352
11  chromedriver                        0x0000000105143c4c chromedriver + 4078668
12  chromedriver                        0x000000010511bf08 chromedriver + 3915528
13  chromedriver                        0x0000000105160140 chromedriver + 4194624
14  chromedriver                        0x00000001051602c4 chromedriver + 4195012
15  chromedriver                        0x00000001051704d0 chromedriver + 4261072
16  libsystem_pthread.dylib             0x0000000185767fa8 _pthread_start + 148
17  libsystem_pthread.dylib             0x0000000185762da0 thread_start + 8


In [None]:
print(type(adcs[10][2]))