# Intro

- Step 1: try to go from a class to a sub-class
- Step 2: On a page without sub-class, go though all pages from the last to the first, and get the urls of all each page

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

wd = webdriver.Chrome()
wd.implicitly_wait(10)


## Step 1
Try to go from a class to a sub-class

In [20]:
wd.get(r'https://www.universalis.fr/classification/')

cookie_button = wd.find_element(By.XPATH, '//*[@id="qc-cmp2-ui"]/div[1]/button')
cookie_button.click()

In [21]:
example = wd.find_element(By.XPATH, '//*[@id="main-content"]/section/section/div/a[1]')
example.click()

## Step 2
On a page without sub-class, go though all pages from the last to the first, and get the urls of all each page

In [32]:
wd.get(r'https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/21/')

In [35]:
articles = wd.find_elements(By.XPATH, '//*[@id="main-content"]/section/section[1]/ul/li/h3/a[contains(@href, "")]')

for article in articles:
    url = article.get_attribute('href')
    print(f'{article.text}\n{url}\n')

WHITE GEORGE M. (1920-2011)
https://www.universalis.fr/encyclopedie/george-m-white/

WISSA WASSEF RAMSÈS (1911-1974)
https://www.universalis.fr/encyclopedie/ramses-wissa-wassef/

WOGENSCKY ANDRÉ (1916-2004)
https://www.universalis.fr/encyclopedie/andre-wogenscky/

YAMASAKI MINORU (1912-1986)
https://www.universalis.fr/encyclopedie/yamasaki/

ZEHRFUSS BERNARD (1911-1996)
https://www.universalis.fr/encyclopedie/bernard-zehrfuss/

ZUMTHOR PETER (1943- )
https://www.universalis.fr/encyclopedie/peter-zumthor/



In [50]:
previous_page = wd.find_element(By.XPATH, '//*[@id="main-content"]/section/section[1]/div/div/ul/li[@class="page-item active"]//preceding-sibling::li[1]')
previous_page.click()

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="main-content"]/section/section[1]/div/div/ul/li[@class="page-item active"]//preceding-sibling::li[1]"}
  (Session info: chrome=124.0.6367.207)


### Step 2 finished

## Random tests

In [2]:
wd.get(r'https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/')
#last_page = wd.find_element(By.XPATH, '//*[@id="main-content"]/section/section[1]/div/div/ul/li[last()]/a')

last_page = wd.find_element(By.XPATH, '//*[@id="main-content"]/section/section[1]/div/div/ul/li[last()]/a')

In [3]:
href = last_page.get_attribute('href')
print(type(href))
print(href)

<class 'str'>
https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/21/


In [4]:
another_page = wd.get(href)

### Function test

In [4]:
from typing import List
from selenium.common.exceptions import NoSuchElementException

In [5]:
def get_urls(page_url: str, wd) -> List[str]:
    """Get all articles' url of a page"""
    print(f'Scrapping urls from {page_url}')
    wd.get(page_url)
    articles = wd.find_elements(By.XPATH, '//*[@id="main-content"]/section/section[1]/ul/li/h3/a[contains(@href, "")]')
    url_list = []
    for article in articles:
        url = article.get_attribute('href')
        url_list.append(url)

    return url_list

In [6]:
def get_subclass_pages(page_url: str, wd) -> List[str]:
    """A recursive funct that navigates through all pages of a subclass and get urls"""
    url_list = get_urls(page_url, wd)

    # Try to go to the next page
    try:
        next_page = wd.find_element(By.XPATH, '//*[@class="page-item active"]//following-sibling::li[1]/a')
        next_page_url = next_page.get_attribute('href')
        print(f'Navigating to {next_page_url}')
        url_list.extend(get_subclass_pages(next_page_url, wd))
    # If at the last page
    except NoSuchElementException:
        print("Reached the last page, no further pages to navigate.")
    except Exception as e:  # Catch any other exception that might occur
        print(f"An error occurred: {e}")
        
    return url_list

In [36]:
test_page_url = 'https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/'
url_list = get_subclass_pages(test_page_url, wd)

Scrapping urls from https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/
Navigating to https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/2/
Scrapping urls from https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/2/
Navigating to https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/3/
Scrapping urls from https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/3/
Navigating to https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/4/
Scrapping urls from https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/4/
Navigating to https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/5/
Scrapping 

In [24]:
wd.get('https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/')

next_page = wd.find_element(By.XPATH, '//*[@class="page-item active"]//following-sibling::li[1]/a')
next_page_url = next_page.get_attribute('href')
print(next_page.text)
print(next_page_url)

2
https://www.universalis.fr/classification/arts/architecture/architectes/architectes-de-1945-a-nos-jours/2/


In [37]:
for url in url_list[:5]:
    print(url)

https://www.universalis.fr/encyclopedie/alvar-aalto/
https://www.universalis.fr/encyclopedie/emile-aillaud/
https://www.universalis.fr/encyclopedie/ando/
https://www.universalis.fr/encyclopedie/andrault-parat/
https://www.universalis.fr/encyclopedie/paul-andreu/


In [38]:
print(len(url_list))

206


## Navigate through classes and subclasses

In [40]:
wd.get('https://www.universalis.fr/classification/arts/architecture/architectes/')


first_page = wd.find_element(By.XPATH, '//*[@id="main-content"]/section/section/div/a[1]')
print(first_page.text)
first_page.click()

Architectes, XIXe s.
96 articles
3 médias


In [45]:
wd.get('https://www.universalis.fr/classification/arts/architecture/architectes/architectes-xixe-s/')

try:
    first_page = wd.find_element(By.XPATH, '//*[@id="main-content"]/section/section/div/a[1]')
    first_page.click()
except NoSuchElementException:
    wd.back()
    url_list = wd.find_elements(By.XPATH, '//*[@id="main-content"]/section/section/div/a')

In [47]:
for url in url_list:
    print(url.text)

Architectes, XIXe s.
96 articles
3 médias
Architectes, XVIIe s.
41 articles
Architectes, XVIIIe s.
118 articles
Architectes, Antiquité
12 articles
Architectes, Moyen Âge
26 articles
Architectes, Renaissance
76 articles
2 médias
Architectes, de 1900 à 1945
128 articles
5 médias
Architectes, de 1945 à nos jours
206 articles
3 médias


In [3]:
wd.get('https://www.universalis.fr/classification/arts/architecture/architectes/')

In [None]:
try:
    pages = wd.find_element(By.XPATH, '//*[@id="main-content"]/section/section/div/a')
    current_page = 
except NoSuchElementException:
    wd.back()
    url_list = wd.find_elements(By.XPATH, '//*[@id="main-content"]/section/section/div/a')

In [11]:
def navigate_classes(url, wd) -> List[str]:
    wd.get(url)
    
    try:
        classes = wd.find_elements(By.XPATH, '//*[@id="main-content"]/section/section/div/a')
        classes_url = []
        for cls in classes:
            cls_url = cls.get_attribute('href')
            classes_url.append(cls_url)
        url_list = []
        for cls_url in classes_url:
            url_list.extend(navigate_classes(cls_url, wd))
    except NoSuchElementException:
        return get_subclass_pages(url, wd)
        
    return url_list

In [14]:
url_list = navigate_classes('https://www.universalis.fr/classification/arts/architecture/architectes/architectes-antiquite/', wd)

In [16]:
wd.get(r'https://www.universalis.fr/classification/arts/architecture/architectes/architectes-antiquite/')
classes = wd.find_elements(By.XPATH, '//*[@id="main-content"]/section/section/div/a')

In [32]:
def navigate_classes(url: str, limit: int, wd) -> List[str]:
    wd.get(url)
    classes = wd.find_elements(By.XPATH, '//*[@id="main-content"]/section/section/div/a')

    if not classes:  # Basic case, where on a page of articles
        return get_subclass_pages(url, wd)

    # Where on a page of classes
    classes_url = []
    for cls in classes:
        cls_url = cls.get_attribute('href')
        classes_url.append(cls_url)
    url_list = []
    count = 0
    for cls_url in classes_url:
        url_list.extend(navigate_classes(cls_url, limit, wd))
        count += len(url_list)
        if count >= limit:
            print(f'Scrapping finished. Scrapped {count} urls.')
            break
    return url_list

In [33]:
url_list = navigate_classes('https://www.universalis.fr/classification/arts/architecture/architectes/', 100, wd)

Scrapping urls from https://www.universalis.fr/classification/arts/architecture/architectes/architectes-xixe-s/
Navigating to https://www.universalis.fr/classification/arts/architecture/architectes/architectes-xixe-s/2/
Scrapping urls from https://www.universalis.fr/classification/arts/architecture/architectes/architectes-xixe-s/2/
Navigating to https://www.universalis.fr/classification/arts/architecture/architectes/architectes-xixe-s/3/
Scrapping urls from https://www.universalis.fr/classification/arts/architecture/architectes/architectes-xixe-s/3/
Navigating to https://www.universalis.fr/classification/arts/architecture/architectes/architectes-xixe-s/4/
Scrapping urls from https://www.universalis.fr/classification/arts/architecture/architectes/architectes-xixe-s/4/
Navigating to https://www.universalis.fr/classification/arts/architecture/architectes/architectes-xixe-s/5/
Scrapping urls from https://www.universalis.fr/classification/arts/architecture/architectes/architectes-xixe-s/5/


In [28]:
print(len(url_list))
for url in url_list[:5]:
    print(url)

696
https://www.universalis.fr/encyclopedie/paul-abadie/
https://www.universalis.fr/encyclopedie/jean-antoine-alavoine/
https://www.universalis.fr/encyclopedie/charles-robert-ashbee/
https://www.universalis.fr/encyclopedie/albert-ballu/
https://www.universalis.fr/encyclopedie/theodore-ballu/


In [39]:
def get_urls(current_page_url: str, wd) -> List[str]:
    """Get all articles' urls of a page"""
    print(f'Scrapping urls at {current_page_url}')
    wd.get(current_page_url)
    xpath = '//*[@id="main-content"]/section/section[position()=1 or position()=2]/ul/li/h3/a[contains(@href, "")]'
    articles = wd.find_elements(By.XPATH, xpath)
    url_list = []
    for article in articles:
        url = article.get_attribute('href')
        url_list.append(url)

    return url_list

In [40]:
def navigate_classes(url: str, limit:int, wd) -> List[str]:
    """A recursive function that gets all the pages of a specific class"""
    wd.get(url)
    classes = wd.find_elements(By.XPATH, '//*[@id="main-content"]/section/section/div/a')

    if not classes:  # Basic case, where on a page of articles
        return []

    # Where on a page of classes
    classes_url = []
    for cls in classes:
        cls_url = cls.get_attribute('href')
        classes_url.append(cls_url)
    url_list = get_urls(url, wd)  # In case where this class page also have articles
    count = len(url_list)
    for cls_url in classes_url:
        print(f'Navigating deeper into {cls_url}')
        url_list.extend(navigate_classes(cls_url, limit, wd))
        count += len(url_list)
        if count >= limit:
            break

    return url_list

In [41]:
url_list = navigate_classes('https://www.universalis.fr/classification/arts/arts-generalites/marche-de-l-art/', 1000, wd)

Scrapping urls at https://www.universalis.fr/classification/arts/arts-generalites/marche-de-l-art/
Navigating deeper into https://www.universalis.fr/classification/arts/arts-generalites/marche-de-l-art/galeries-d-art/
Navigating deeper into https://www.universalis.fr/classification/arts/arts-generalites/marche-de-l-art/marchands-d-art/


In [42]:
print(url_list)

['https://www.universalis.fr/encyclopedie/art-l-art-et-son-objet-le-faux-en-art/', 'https://www.universalis.fr/encyclopedie/art-aspects-culturels-le-marche-de-l-art/', 'https://www.universalis.fr/encyclopedie/art-contemporain/', 'https://www.universalis.fr/encyclopedie/christie-s-et-sotheby-s/', 'https://www.universalis.fr/encyclopedie/trans-avant-garde/']
