In [86]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import os
import urllib
import pandas as pd

In [230]:
class Client:
    SCROLL_PAUSE_TIME = 1
    CHROME_PATH = 'chromedriver.exe'
    
    def __init__(self):
        self.driver = self.create_driver()
           
    def run(self, url: str, scroll: bool, quit: bool) -> BeautifulSoup:
        self.driver.get(url)
        time.sleep(1)
        if scroll:
            self.scroll_down()
        soup = BeautifulSoup(self.driver.page_source, 'lxml')
        if quit:
            self.driver.quit()
        return soup
    
    def create_driver(self):
        self.profile = webdriver.ChromeOptions()
        #profile.add_argument('headless')
        driver = webdriver.Chrome(
            options=self.profile, 
            executable_path=self.CHROME_PATH)
        return driver
    
    def scroll_down(self):
        last_height = self.driver.execute_script("return document.body.scrollHeight")

        while True:
            # Scroll down to bottom
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)

            # Calculate new scroll height and compare with last scroll height
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

class Interencheres:
    BASE_URL = 'https://www.interencheres.com'
    SEARCH_URL = '/recherche/lots?search='
    
    def __init__(self, item: str):
        self.client = Client()
        self.url = self.BASE_URL + self.SEARCH_URL + urllib.parse.quote(item)
        self.soup = self.client.run(self.url, True)
    
    def transform(self) -> pd.DataFrame:
        items = self.soup.find('div', class_='results') \
            .find('div', class_='row--dense').find_all('div', class_='col-sm-6')
        lst = []
        for div in items:
            link = BASE_URL + div.find('a')['href'] 
            lot = div.find('span').text
            description = div.find('div', class_='description').text.replace('  ','').replace('\n', '')
            estimation = div.find('div', class_='estimates')
            if estimation:
                estimation = estimation.text.replace('  ','')
            lst += [(description, lot, estimation, link)]
        return pd.DataFrame(lst, columns=['item', 'number', 'estimation', 'link'])
    
class Drouot:
    BASE_URL = 'https://www.drouotonline.com'
    SEARCH_URL = '/recherche/lots?query='
    
    def __init__(self, item: str):
        self.client = Client()
        self.url = self.BASE_URL + self.SEARCH_URL + urllib.parse.quote_plus(item)
        self.soup = self.client.run(self.url, False, False)
        
        # They have pages and not infinite scroll down
        self.nb_items = self.soup.find('div', class_='toolbar').find('h4').text.replace('\n', '').replace('  ', '')
        self.nb_items = int(self.nb_items[:self.nb_items.find('résultats')])
        
        # Iterate trought pages
        self.soups = []
        for i in range(0, nb_items, 50):
            url = self.url + f"&max=50&offset={i}"
            if i == max(range(0, nb_items, 50)):
                self.soups.append(self.client.run(url, False, True))
            else:
                self.soups.append(self.client.run(url, False, False))

    def transform(self) -> pd.DataFrame:
        dfs = []
        for soup in self.soups:
            items = soup.find('div', id='list-lots').find_all('div', class_='lot vsListe')
            lst = []
            for div in items:
                link = str(div.find('div', class_='blog-page').find_all('a')[1])
                link = link[6 + link.find('href="'):link.find('?')]
                link = BASE_URL + link
                dt = div.find('div', class_='infoDateListe').text.replace("\n", "").replace('  ', '')
                description = div.find('div', class_='product-cell').find('h5').text.replace("\n", "").replace('  ', '')
                estimation = div.find('h5', class_='Estimation')
                if estimation:
                    estimation = estimation.text.replace("\n", "").replace('  ', '')
                lst += [(description, description, dt, estimation, link)]
            dfs.append(pd.DataFrame(lst, columns=['item', 'description', 'date', 'estimation', 'link']))
        return pd.concat(dfs)

In [107]:
interencheres = Interencheres('appareil photo')
df = interencheres.transform()

In [237]:
drouot = Drouot('appareil photo')
df = drouot.transform()

In [238]:
df

Unnamed: 0,item,description,date,estimation,link
0,Collection d’appareils photo,Collection d’appareils photo,jeudi 04 févr.à- 14:30,Estimation 30 - 50 EUR,https://www.drouotonline.com/lots/14256294
1,un appareil photo Praktica,un appareil photo Praktica,mardi 09 févr.à- 09:30,Aucune estimation,https://www.drouotonline.com/lots/14239018
2,OLYMPUSAppareil de photo IS - 3000,OLYMPUSAppareil de photo IS - 3000,Jusqu'audimanche 31 janv.01j03h08m26s,Estimation 30 - 50 EUR,https://www.drouotonline.com/lots/14234000
3,KODAK - Appareil photo avec écrin en cuir,KODAK - Appareil photo avec écrin en cuir,samedi 06 févr.à- 14:00,Estimation 10 - 20 EUR,https://www.drouotonline.com/lots/14259746
4,LOT comprenant : un appareil photo AGFA BOX et...,LOT comprenant : un appareil photo AGFA BOX et...,lundi 08 févr.à- 13:00,Estimation 10 - 20 EUR,https://www.drouotonline.com/lots/14243642
...,...,...,...,...,...
47,MINOLTA : appareil photo SR T303 avec quatre o...,MINOLTA : appareil photo SR T303 avec quatre o...,lundi 08 févr.à- 13:00,Estimation 40 - 50 EUR,https://www.drouotonline.com/lots/14243907
48,Artiste – Jean Loup SIEFF (1933 2000) – « La f...,Artiste – Jean Loup SIEFF (1933 2000) – « La f...,lundi 08 févr.à- 14:00,Estimation 30 - 40 EUR,https://www.drouotonline.com/lots/14151975
49,Scratch book et carnets de vol du pilote franc...,Scratch book et carnets de vol du pilote franc...,Jusqu'aumardi 09 févr.10j01h33m54s,Estimation 140 - 220 EUR,https://www.drouotonline.com/lots/14171536
0,"Nungesser, L'oiseau Blanc. Buste en bronze, pa...","Nungesser, L'oiseau Blanc. Buste en bronze, pa...",Jusqu'aumardi 09 févr.10j00h50m22s,Estimation 800 - 1 000 EUR,https://www.drouotonline.com/lots/14171449


In [229]:
df.drop_duplicates()

Unnamed: 0,item,description,date,estimation,link
0,Collection d’appareils photo,Collection d’appareils photo,jeudi 04 févr.à- 14:30,Estimation 30 - 50 EUR,https://www.drouotonline.com/lots/14256294
1,un appareil photo Praktica,un appareil photo Praktica,mardi 09 févr.à- 09:30,Aucune estimation,https://www.drouotonline.com/lots/14239018
2,OLYMPUSAppareil de photo IS - 3000,OLYMPUSAppareil de photo IS - 3000,Jusqu'audimanche 31 janv.01j03h25m51s,Estimation 30 - 50 EUR,https://www.drouotonline.com/lots/14234000
3,KODAK - Appareil photo avec écrin en cuir,KODAK - Appareil photo avec écrin en cuir,samedi 06 févr.à- 14:00,Estimation 10 - 20 EUR,https://www.drouotonline.com/lots/14259746
4,LOT comprenant : un appareil photo AGFA BOX et...,LOT comprenant : un appareil photo AGFA BOX et...,lundi 08 févr.à- 13:00,Estimation 10 - 20 EUR,https://www.drouotonline.com/lots/14243642
...,...,...,...,...,...
47,MINOLTA : appareil photo SR T303 avec quatre o...,MINOLTA : appareil photo SR T303 avec quatre o...,lundi 08 févr.à- 13:00,Estimation 40 - 50 EUR,https://www.drouotonline.com/lots/14243907
48,Artiste – Jean Loup SIEFF (1933 2000) – « La f...,Artiste – Jean Loup SIEFF (1933 2000) – « La f...,lundi 08 févr.à- 14:00,Estimation 30 - 40 EUR,https://www.drouotonline.com/lots/14151975
49,Scratch book et carnets de vol du pilote franc...,Scratch book et carnets de vol du pilote franc...,Jusqu'aumardi 09 févr.10j01h51m19s,Estimation 140 - 220 EUR,https://www.drouotonline.com/lots/14171536
0,"Nungesser, L'oiseau Blanc. Buste en bronze, pa...","Nungesser, L'oiseau Blanc. Buste en bronze, pa...",Jusqu'aumardi 09 févr.10j01h07m47s,Estimation 800 - 1 000 EUR,https://www.drouotonline.com/lots/14171449


In [109]:
chrome = webdriver.Chrome('chromedriver.exe')

In [110]:
chrome.get('https://www.drouotonline.com/recherche/lots?query=appareil+photo')
soup = BeautifulSoup(chrome.page_source, 'lxml')

In [126]:
soup

<html class="" lang="fr"><!--<![endif]--><head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="telephone=no" name="format-detection"/>
<title>Rechercher des lots</title>
<meta content="enchères, tableaux, design, bijoux, montres, brocante" name="keywords"/>
<meta content="drouotonline.com" name="author"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="true" name="HandheldFriendly"/>
<meta content="DrouotDigital" name="copyright"/>
<meta content="no-cache" http-equiv="cache-Control"/>
<meta content="no-cache" http-equiv="pragma"/>
<script async="" src="https://www.google-analytics.com/analytics.js" type="text/javascript"></script><script async="" src="https://www.googletagmanager.com/gtm.js?id=GTM-PN8H4ZT"></script><script>dataLayer = [{"userStatus":"Internaute","lang":"fr","cid":"","contentGroup1":"Recherche","contentGroup2":"","contentGroup3":"","contentGroup4":""}]</scrip

In [127]:
urllib.parse.quote_plus('appareil photo à caca')

'appareil+photo+%C3%A0+caca'

In [239]:
BASE_URL = 'https://www.drouotonline.com/'
SCROLL_PAUSE_TIME = 0.5
url = BASE_URL + '/recherche/lots?search=appareil%20photo&area=france'
r = requests.get("https://www.interencheres.com/recherche/lots?search=appareil%20photo&area=france")
soup = BeautifulSoup(r.text, 'lxml')

In [241]:
 items = soup.find('div', class_='results') \
            .find('div', class_='row--dense') \
            .find_all('div', class_='col-sm-6')

In [243]:
for div in items:
    print(div.find('div', ''))

<div class="col-sm-6 col-md-4 col-lg-3 col-12" id="page-1"><a class="wrapper ma-2 v-card v-card--link v-sheet v-sheet--outlined theme--light elevation-0 wrapper--gallery" data-v-4255d580="" href="/meubles-objets-art/vente-courante-et-bon-fonds-de-maison-et-vehicules-vp-et-utilitairesapres-lj-tres-recent-electrportatifs-bosch-inscription-par-mail-obligatoire-pour-lexposition-et-la-vente-281295/lot-26491510.html" tabindex="0"><div class="v-card__text text--primary pa-0" data-v-4255d580=""><div class="container pa-0 container--fluid fill-height" data-v-4255d580=""><div class="image col col-12 pa-0" data-v-4255d580=""><div data-v-4255d580=""><div class="v-image v-responsive theme--light" data-v-4255d580=""><div class="v-image__image v-image__image--preload v-image__image--cover" style="background-image:;background-position:center center;"></div><div class="v-responsive__content"></div></div> <div class="more-media-wrapper" data-v-4255d580=""><!-- --></div></div></div> <div class="body px-4

In [188]:
BASE_URL = 'https://www.drouotonline.com'
soup.find

<bound method Tag.find of <html class="" lang="fr"><!--<![endif]--><head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="telephone=no" name="format-detection"/>
<title>Rechercher des lots</title>
<meta content="enchères, tableaux, design, bijoux, montres, brocante" name="keywords"/>
<meta content="drouotonline.com" name="author"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="true" name="HandheldFriendly"/>
<meta content="DrouotDigital" name="copyright"/>
<meta content="no-cache" http-equiv="cache-Control"/>
<meta content="no-cache" http-equiv="pragma"/>
<script async="" src="https://www.google-analytics.com/analytics.js" type="text/javascript"></script><script async="" src="https://www.googletagmanager.com/gtm.js?id=GTM-PN8H4ZT"></script><script>dataLayer = [{"userStatus":"Internaute","lang":"fr","cid":"","contentGroup1":"Recherche","contentGroup2":"","contentGroup3":"","

In [204]:
for div in soup.find('div', id='list-lots').find_all('div', class_='lot vsListe'):
    link = str(div.find('div', class_='blog-page').find_all('a')[1])
    link = link[6 + link.find('href="'):link.find('?')]
    link = BASE_URL + link
    dt = div.find('div', class_='infoDateListe').text.replace("\n", "").replace('  ', '')
    description = div.find('div', class_='product-cell').find('h5').text.replace("\n", "").replace('  ', '')
    estimation = div.find('h5', class_='Estimation').text.replace("\n", "").replace('  ', '')

In [184]:
str(div.find('div', class_='blog-page').find_all('a')[1])

'<a href="/lots/14256294?actionParam=recherche&amp;controllerParam=lot&amp;fromId=&amp;query=appareil+photo&amp;offset=0&amp;max=50">\n<div class="bgStyle">\n<div class="bgStyle jsLazyLoad loaded" data-ll-status="loaded" style=\'transform: rotate(0deg); background-image: url("https://cdn.drouot.com/d/image/lot?size=ftall&amp;path=266/111778/350a2b7214e49350f9a77224b8758a42");\'>\n</div>\n</div>\n</a>'

In [236]:
class Drouot:
    BASE_URL = 'https://www.drouotonline.com'
    SEARCH_URL = '/recherche/lots?query='

    def __init__(self, item: str):
        """Get results for a searches

        Args:
            item (str): Item to search
        """
        self.client = Client()
        self.url = self.BASE_URL + self.SEARCH_URL
        self.url += urllib.parse.quote_plus(item)
        self.soup = self.client.run(self.url, False, False)

        # They have pages and not infinite scroll down
        self.nb_items = self.soup.find('div', class_='toolbar') \
            .find('h4').text.replace('\n', '').replace('  ', '')
        self.nb_items = int(self.nb_items[:self.nb_items.find('résultats')])

        # Iterate trought pages
        self.soups = []
        for i in range(0, self.nb_items, 50):
            url = self.url + f"&max=50&offset={i}"
            if i == max(range(0, self.nb_items, 50)):
                self.soups.append(self.client.run(url, False, True))
            else:
                self.soups.append(self.client.run(url, False, False))

    def transform(self) -> pd.DataFrame:
        """Transform the soup response into a panda dataframe

        Returns:
            pd.DataFrame: Pandas dataframe containing :
                - item description
                - item date
                - item number
                - item price estimation
                - link to the item
        """
        dfs = []
        for soup in self.soups:
            items = soup.find('div', id='list-lots') \
                .find_all('div', class_='lot vsListe')
            lst = []
            for div in items:
                link = str(
                    div.find(
                        'div',
                        class_='blog-page').find_all('a')[1])
                link = link[6 + link.find('href="'):link.find('?')]
                link = self.BASE_URL + link
                dt = div.find(
                    'div',
                    class_='infoDateListe').text.replace(
                        "\n",
                        "").replace('  ', '')
                description = div.find(
                    'div',
                    class_='product-cell').find('h5').text.replace(
                        "\n",
                        "").replace('  ', '')
                estimation = div.find('h5', class_='Estimation')
                if estimation:
                    estimation = estimation.text.replace(
                        "\n",
                        "").replace('  ', '')
                lst += [(description, description, dt, estimation, link)]
            dfs.append(
                pd.DataFrame(
                    lst,
                    columns=[
                        'item',
                        'description',
                        'date',
                        'estimation',
                        'link']))
        return pd.concat(dfs)
