In [None]:
import json
import os

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# --- Selectors used to locate elements in the browsed pages ---
# Element id of the cookie-consent button.
ID_COOKIES_BUTTON = "onetrust-accept-btn-handler"
# CSS class of the product anchors on a result page.
CLASS_ITEM_SALE = "new-item-box__overlay"
# CSS class of the pagination "next" control.
CLASS_NEXT_PAGE = "web_ui__Pagination__next"
# CSS class awaited on a product page before its HTML is parsed.
CLASS_ITEM_DETAILS = "box--item-details"
# data-testid of the <div> wrapping the location, and CSS class of its value.
DATA_TESTID_LOCATION_PARENT = "item-details-location"
CLASS_LOCATION_VALUE = "details-list__item-value"
# itemprop of the <div> wrapping the brand link.
ITEMPROP_BRAND = "brand"

# --- Keyword used to filter the collected locations ---
KEY_WORD_LOCATION = "Sverige"

# --- Files read and written by the notebook ---
# Stores the last search URL between sessions.
FILENAME_CONFIG = "config.txt"
DIRECTORY_OUTPUT = "output"
FILENAME_OUTPUT_LOCATION = "url_by_location.json"
FILENAME_OUTPUT_BRANDS = "brands_by_location.json"

# Full paths of the two JSON result files inside the output directory.
PATH_OUTPUT_LOCATION = os.path.join(DIRECTORY_OUTPUT, FILENAME_OUTPUT_LOCATION)
PATH_OUTPUT_BRANDS = os.path.join(DIRECTORY_OUTPUT, FILENAME_OUTPUT_BRANDS)

## Initialisation

In [None]:
# Start a Chrome session. `driver` is the browser handle shared by all the
# cells and functions below.
driver = webdriver.Chrome()

Dans cette partie, on va accéder au site de Vinted. Si l'utilisateur a déjà effectué une recherche auparavant, nous souhaitons charger les filtres qu'il avait précédemment sélectionnés. Ces filtres sont stockés dans l'URL de la recherche précédente. S'il n'avait pas effectué de recherche, l'URL de Vinted par défaut est utilisée.

In [None]:
# Default landing page, used when no previous search URL has been saved.
url_vinted = "https://www.vinted.se/"

# A previous session may have stored its search URL (filters included) in the
# config file; reuse it when available.
if os.path.isfile(FILENAME_CONFIG):
    with open(FILENAME_CONFIG, "r") as file:
        # Strip surrounding whitespace (e.g. a trailing newline added by an
        # editor) so the browser receives a clean URL.
        saved_url = file.read().strip()
    # A blank config file must not replace the default with an empty URL.
    if saved_url:
        url_vinted = saved_url
        print("URL récupérée :", url_vinted)

In [None]:
# Open the selected URL (saved search or default home page) in the browser.
driver.get(url_vinted)

Accepter les cookies si nécessaire

In [None]:
# Dismiss the cookie-consent popup when it is displayed.
try:
    cookies_button = driver.find_element(By.ID, ID_COOKIES_BUTTON)
    cookies_button.click()
except WebDriverException:
    # Best effort: the popup is absent or its button cannot be clicked.
    # Only Selenium errors are ignored, so unrelated mistakes (for example
    # an undefined name) are still reported.
    pass

L'utilisateur doit d'abord choisir les filtres par défaut manuellement sur l'interface du navigateur. 

Une fois cette étape accomplie, il doit exécuter la cellule suivante pour enregistrer les filtres de Vinted afin de les réutiliser lors de ses prochaines visites.

In [None]:
# Persist the URL currently shown in the browser (it carries the filters the
# user selected) so that a later session can reload it from the config file.
with open(FILENAME_CONFIG, "w") as file:
    file.write(driver.current_url)

## Fonctions génériques de navigation

On cherche maintenant à récupérer les URLs de tous les produits.

In [None]:
def extract_urls_page():
    """
    Description:
    Extract all URLs of products for sale on the opened page.

    The HTML currently loaded in the global `driver` is parsed, and the `href`
    of every `<a>` element carrying the CLASS_ITEM_SALE class is collected.
    Anchors without an `href` attribute are skipped instead of raising a
    KeyError.

    Returns:
    All product urls of the current opened page. This is a list of urls (strings),
    in document order.
    """
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # href=True restricts the match to anchors that actually have an href.
    items = soup.find_all("a", class_=CLASS_ITEM_SALE, href=True)
    return [item['href'] for item in items]

Parcourir toutes les pages

In [None]:
def go_to_next_page():
    """
    Description:
    This function checks if a next page is available in a pagination system.
    If a next page is available, it navigates to that page and returns True.
    If no next page is available, it returns False.

    The navigation is considered complete once a product element of the
    current page has become stale (detached from the DOM), with a timeout of
    10 seconds.

    Returns:
    - True if a next page is available and the navigation is successful.
    - False if no next page is available or if the navigation fails
      (any Selenium error, including the wait timing out).
    """
    try:
        next_button = driver.find_element(By.CLASS_NAME, CLASS_NEXT_PAGE)
        # Grab the reference element BEFORE clicking. If it were looked up
        # after the click it could already belong to the new page, in which
        # case it would never go stale and the wait would time out even
        # though the navigation succeeded.
        current_item = driver.find_element(By.CLASS_NAME, CLASS_ITEM_SALE)
        next_button.click()
        WebDriverWait(driver, 10).until(EC.staleness_of(current_item))
        return True
    except WebDriverException:
        # Missing button/item, click failure or timeout: report "no next page".
        return False

In [None]:
def remove_url_already_visited(urls_to_check, old_urls):
    """
    Description:
    This function keeps the leading URLs of `urls_to_check` up to (and excluding)
    the first one that has already been processed. Everything from that first
    known URL onwards is dropped, even URLs that are not in `old_urls`.

    Parameters:
    - urls_to_check: A list of new URLs (strings) that need to be filtered based on whether they have been processed or not. The list is ordered by publication date.
    - old_urls: A collection of URLs (strings) that have already been processed and should be used for comparison.

    Returns:
    - A boolean indicating whether an already-processed URL was encountered (i.e. whether the list was cut).
    - A list containing only the URLs that precede the first already-processed URL.

    Example:
    Consider the following example:
    - urls_to_check = ["a", "b", "c"]
    - old_urls = ["b", "c", "e", "d"]
    The function will return True and ["a"]: "b" is the first known URL, so it and everything after it are discarded. Because the URLs are ordered by time, only the latest products are kept.
    """
    # Build the lookup set once: membership tests on a list would make the
    # loop quadratic in the number of URLs.
    already_seen = set(old_urls)
    res = []

    for url in urls_to_check:
        if url in already_seen:
            return True, res
        res.append(url)
    return False, res

In [None]:
def navigate_between_pages(urls_already_visited, sort_urls, max_nb_pages=-1):
    """
    Description:
    This function walks through the result pages, starting from the page currently opened in the browser. For each page it extracts the product URLs, keeps those that precede the first already-visited URL and hands them to `sort_urls`.

    The traversal stops as soon as one of these conditions is met:
    - an already visited URL was found on the page,
    - `max_nb_pages` pages have been processed,
    - there is no next page (or navigating to it failed).

    Parameters:
    - urls_already_visited: A list of URLs (strings) that have already been visited.
    - sort_urls: A callback function that is called once per page with the list of URLs (strings) not yet visited on that page. The list may be empty.
    - max_nb_pages : Maximum number of pages that the program must visit. By default the value is -1 which means that it will go through all the pages until the end.

    Returns:
    None. Results are produced through the side effects of `sort_urls`.
    """
    page_number = 0
    while True:
        page_number += 1
        print(f"Page n°{page_number}")

        urls_items = extract_urls_page()
        has_removed_url, urls_items_not_visited = remove_url_already_visited(urls_items, urls_already_visited)
        sort_urls(urls_items_not_visited)

        # Decide whether to stop BEFORE moving on, so that no useless
        # navigation (and wait) happens once the page limit is reached.
        if has_removed_url or page_number == max_nb_pages:
            break
        if not go_to_next_page():
            break

## Trier produits en fonction des pays

Nous définissons un dictionnaire qui va stocker les différentes URLs des produits triées par localisation de vente.


In [None]:
# Maps a location string to the list of product URLs found for that location.
urls_location = {}

Nous récupérons les données collectées lors des sessions précédentes.

In [None]:
# Reload the URLs collected by earlier sessions, when a result file exists.
if not os.path.exists(PATH_OUTPUT_LOCATION):
    print("Aucune recherche antérieure.")
else:
    with open(PATH_OUTPUT_LOCATION, "r") as file:
        urls_location = json.load(file)
    print("Données des recherches antérieures chargées")

Nous définissons maintenant une fonction qui trie les nouvelles URLs des produits en fonction de leur localisation.

In [None]:
def sort_urls_by_location(urls_items):
    """
    Description:
    This function sorts the URLs of products based on their location. It navigates to each URL, extracts the location information, and organizes the URLs in the global dictionary urls_location, with the location text as the key and a list of URLs belonging to that location as the value.

    Parameters:
    - urls_items: A list of URLs (strings) representing the products to be sorted based on location.

    Operation:
    For each URL the product page is opened and the function waits up to 10 seconds for the item-details element to be visible. A page that does not load in time, or that lacks location information, is reported with a message and skipped; the remaining URLs are still processed. A URL is never added twice to the same location. Once all URLs are processed, the browser is sent back to the page that was open when the function was called.

    Note: The global variable urls_location is mutated in place (it is never rebound, so no `global` statement is required).
    """
    # Remember the result page so the search can resume from it afterwards.
    url_product_list = driver.current_url

    for url in urls_items:
        driver.get(url)
        try:
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CLASS_NAME, CLASS_ITEM_DETAILS))
            )
        except TimeoutException:
            # One slow or broken product page must not abort the whole crawl.
            print("Délai dépassé lors du chargement de :", url)
            continue

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        div_location = soup.find("div", {"data-testid": DATA_TESTID_LOCATION_PARENT})
        if not div_location:
            # Report the offending URL rather than dumping the full page HTML.
            print("Description localisation manquante :", url)
            continue

        location = div_location.find("div", class_=CLASS_LOCATION_VALUE)
        if not location:
            print("Localisation non mentionnée")
            continue

        urls_for_location = urls_location.setdefault(location.text, [])
        if url not in urls_for_location:
            urls_for_location.append(url)

    # Go back to the result page to continue the search.
    driver.get(url_product_list)

Récupère toutes les URLs déjà visitées à partir du dictionnaire des URLs des produits par pays, puis les fusionne dans une liste unique.

In [None]:
# Flatten the per-location URL lists into one list of already visited URLs.
urls_already_visited = [
    url
    for urls_of_location in urls_location.values()
    for url in urls_of_location
]

Nous lançons la navigation entre les pages avec les URLs des produits déjà visités par pays, intégrant chaque nouveau produit dans le dictionnaire des produits par pays correspondant.

In [None]:
# Crawl the result pages without a page limit; the crawl stops at the first
# product URL already recorded, or when no next page can be reached.
navigate_between_pages(urls_already_visited, sort_urls_by_location)

Sauvegarde des résultats dans le fichier de sortie

In [None]:
# Create the output directory when missing. exist_ok=True makes the call a
# no-op if it already exists, without a separate (racy) existence check.
os.makedirs(DIRECTORY_OUTPUT, exist_ok=True)

In [None]:
# Write the location -> URLs dictionary as JSON, replacing any previous file.
with open(PATH_OUTPUT_LOCATION, "w") as file:
    json.dump(urls_location, file)

## Produits pour un pays donné
Filtrer les produits en fonction d'une localisation précise.
Nécessite l'exécution des cellules de la section précédente.

In [None]:
def filter_urls_by_location(urls_location, location):
    """
    Description:
    Select the entries of `urls_location` whose key contains `location` as a substring. The input dictionary is left untouched; the URL lists in the result are the same list objects as in the input (they are not copied).

    Parameters:
    - urls_location: A dictionary whose keys are location names and whose values are lists of URLs associated with those locations.
    - location: The text to look for inside the location names (case-sensitive substring match).

    Returns:
    A new dictionary containing only the key/value pairs whose key contains `location`, in the same order as in `urls_location`.
    """
    return {
        place: urls
        for place, urls in urls_location.items()
        if location in place
    }
    

In [None]:
# Last expression of the cell: the filtered dictionary is shown as the cell
# output (it is not stored in a variable).
filter_urls_by_location(urls_location, KEY_WORD_LOCATION)

## Marques les plus répandues par pays

Nous définissons un dictionnaire qui va stocker un compteur pour chaque marque par localisation de vente.

In [None]:
# Maps a location to a dictionary {brand name: number of products counted}.
brands_location = {}

Nous récupérons les données collectées lors des sessions précédentes.

In [None]:
# Reload the brand counters saved by earlier sessions, when a result file exists.
if not os.path.exists(PATH_OUTPUT_BRANDS):
    print("Aucune recherche antérieure.")
else:
    with open(PATH_OUTPUT_BRANDS, "r") as file:
        brands_location = json.load(file)
    print("Données des recherches antérieures chargées")

Nous introduisons désormais une fonction qui incrémente le compteur associé à la marque des produits, en tenant compte de leurs localisations.

In [None]:
def sort_brands_by_location(urls_items):
    """
    This function iterates through a list of product URLs, extracts the location and brand for each product,
    and counts the number of products per brand in each location.
    The results are stored in the global variable 'brands_location', as
    {location: {brand: count}}.

    For each URL the product page is opened and the function waits up to 10 seconds for the
    item-details element to be visible. A page that does not load in time, or that lacks the
    location or the brand, is reported with a message and skipped; the remaining URLs are still
    processed. Once all URLs are processed, the browser is sent back to the page that was open
    when the function was called.

    The location key is the LAST comma-separated component of the location text, stripped of
    surrounding whitespace (presumably the country, e.g. "City, Country" -> "Country" - to be
    confirmed against the site's location format).

    Parameters:
    - urls_items (list): List of URLs of the products to process.

    Returns:
    No explicit return; 'brands_location' is mutated in place (it is never rebound, so no
    `global` statement is required).
    """
    # Remember the result page so the search can resume from it afterwards.
    url_product_list = driver.current_url

    for url in urls_items:
        driver.get(url)

        try:
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CLASS_NAME, CLASS_ITEM_DETAILS))
            )
        except TimeoutException:
            # One slow or broken product page must not abort the whole crawl.
            print("Délai dépassé lors du chargement de :", url)
            continue

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        div_location = soup.find("div", {"data-testid": DATA_TESTID_LOCATION_PARENT})
        div_brand = soup.find("div", {"itemprop": ITEMPROP_BRAND})
        if not (div_location and div_brand):
            print("Description localisation et/ou marque manquante")
            continue

        location = div_location.find("div", class_=CLASS_LOCATION_VALUE)
        brand = div_brand.a
        if not (location and brand):
            print("Localisation ou marque non mentionnée")
            continue

        # Take the last component rather than the second one, so that a
        # location with more than two parts still yields its final part;
        # also strips the text when it contains no comma at all.
        location_key = location.text.split(',')[-1].strip()
        brand_name = brand.text.strip()

        brand_counts = brands_location.setdefault(location_key, {})
        brand_counts[brand_name] = brand_counts.get(brand_name, 0) + 1

    # Go back to the result page to continue the search.
    driver.get(url_product_list)

Nous lançons la navigation entre les pages sans fournir d'URLs déjà visitées, car on ne stocke pas ici les URLs des produits : on compte uniquement le nombre de produits par marque et par pays.

In [None]:
# No visited-URL list is passed, so every product of each page is processed.
# The crawl is capped at 30 pages (third argument, max_nb_pages).
navigate_between_pages([], sort_brands_by_location, 30)

Sauvegarde des résultats dans le fichier de sortie

In [None]:
# Create the output directory when missing. exist_ok=True makes the call a
# no-op if it already exists, without a separate (racy) existence check.
os.makedirs(DIRECTORY_OUTPUT, exist_ok=True)

In [None]:
# Write the location -> {brand: count} dictionary as JSON, replacing any previous file.
with open(PATH_OUTPUT_BRANDS, "w") as file:
    json.dump(brands_location, file)

## Fermer le navigateur

In [None]:
# End the browser session; `driver` cannot be used after this call.
driver.quit()