In [1]:
import codecs
import datetime
import json
import os
import random
import re
import shutil
import subprocess
import time

import numpy as np
import pandas as pd
import PIL
import requests
from bs4 import BeautifulSoup
from PIL import Image

pd.set_option('display.max_columns', None)

# Headers

In [2]:
# HTTP request headers passed to every requests.get() call on paruvendu.fr in this
# notebook. They imitate a desktop Chrome 90 browser on macOS with French locale.
headers = {
    'authority': 'www.paruvendu.fr',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7'
}

# Save pages x15

In [3]:
def save_page_list(req, page=1):
    """Write a downloaded listing page to the ``pages/`` folder as HTML.

    The file is named ``paruvendu-<YYYY-MM-DD_HHhMM>-<page>.html`` from the
    current local time and the page number.

    Parameters
    ----------
    req : object exposing a ``text`` attribute (the notebook passes the
        result of ``requests.get``).
    page : page number appended to the file name (converted with ``str``).
    """
    datetime_1 = datetime.datetime.now().strftime("%Y-%m-%d_%Hh%M")
    page_list_name = "paruvendu" + "-" + datetime_1 + "-" + str(page)
    # Create the target folder on first use instead of failing with FileNotFoundError.
    os.makedirs("pages", exist_ok=True)
    # Explicit UTF-8 so accented characters never depend on the platform default.
    # The ``with`` block already closes the file; no manual close() is needed.
    with open("pages/" + page_list_name + ".html", "w", encoding="utf-8") as file:
        file.write(req.text)

In [4]:
# Listing page numbers to download; the stop value is exclusive, so this is pages 1 and 2.
pages = np.arange(start=1, stop=3)
pages

array([1, 2])

In [5]:
# Download every listing page in `pages` and store the HTML with save_page_list().
for page_ in pages:
    # The timeout keeps a stalled connection from hanging the notebook forever.
    req_x15 = requests.get(f"https://www.paruvendu.fr/auto-moto/listefo/default/default?moto-typeRech=&r=VMOMO000&px1=ex:%2050000&r2=&codeINSEE=&lo=&pa=&ray=100&cy=&nrj=&km1=&a0=&fulltext=&p={page_}",
                           headers = headers, timeout=30)
    # Only keep successful responses: a saved error page would later be parsed
    # as if it were a real listing.
    if req_x15.status_code == 200:
        save_page_list(req_x15, page=page_)
        print("x15 saved: " + str(page_))
    else:
        print("x15 NOT saved: " + str(page_) + " (HTTP " + str(req_x15.status_code) + ")")
    # Random pause between requests, also after a failure.
    time.sleep(random.randint(4, 5))

x15 saved: 1
x15 saved: 2


# Extract data from saved x15

## Test file

In [9]:
# os.listdir() returns entries in arbitrary order; sorting makes the sample file
# picked here the same on every run.
file_test_name = sorted(file for file in os.listdir("pages") if file.endswith("html"))[1]
file_test_name

'paruvendu-2021-06-02_21h20-30.html'

In [10]:
# Relative path of the sample listing page used by the test cells below.
file_check_x15_good = "pages/" + file_test_name
file_check_x15_good

'pages/paruvendu-2021-06-02_21h20-30.html'

## Get prices from saved x15

In [11]:
def get_prices_from_saved_x15(r):
    """Extract one price per announce block from a saved listing page.

    Parameters
    ----------
    r : HTML content of a listing page.

    Returns
    -------
    list or None
        One entry per ``div[class*="lazyload_bloc"]`` element, in document
        order: the digits of the first ``div.ergov3-priceannonce`` joined and
        converted to float, or ``np.nan`` when the block has no price div or
        the div contains no digit. ``None`` when the page has no announce
        block at all.
    """
    soup = BeautifulSoup(r, 'html.parser')

    blocks = soup.select('div[class*="lazyload_bloc"]')
    if len(blocks) == 0:
        return None

    price_list = []
    for block in blocks:
        price_divs = block.find_all("div", class_="ergov3-priceannonce")
        # A block without price used to abort the whole page (return None), which
        # then broke the zip() with the url / id lists. Keep the list aligned with
        # the blocks by recording NaN, as the sibling extractors do.
        digits = re.findall('[0-9]', price_divs[0].text) if len(price_divs) > 0 else []
        if len(digits) > 0:
            price_list.append(float("".join(digits)))
        else:
            price_list.append(np.nan)

    return price_list

In [12]:
# Read the sample listing page, then run the price extractor once the file is closed.
with open(file_check_x15_good, 'r') as f:
    readable_html = f.read()
prices = get_prices_from_saved_x15(readable_html)
prices

[5000.0,
 6200.0,
 3200.0,
 17500.0,
 3790.0,
 2000.0,
 4500.0,
 4500.0,
 13500.0,
 3500.0,
 16500.0,
 6499.0,
 6200.0,
 8500.0,
 10250.0]

## Get references from saved x15

On this website the x15 listing pages do not show the announce reference, but each announce block carries a unique ID (different from the announce reference) that we can extract instead.

In [13]:
def get_unique_ID_from_saved_x15(r):
    """Collect the ``data-id`` attribute of every announce block of a listing page.

    Returns a list with one entry per ``div[class*="lazyload_bloc"]`` element
    (``np.nan`` where the attribute is absent), or ``None`` when the page
    contains no such block.
    """
    announce_blocks = BeautifulSoup(r, 'html.parser').select('div[class*="lazyload_bloc"]')
    if not announce_blocks:
        return None

    return [
        block.attrs["data-id"] if "data-id" in block.attrs else np.nan
        for block in announce_blocks
    ]

In [14]:
# Read the sample listing page, then extract the unique IDs once the file is closed.
with open(file_check_x15_good, 'r') as f:
    readable_html = f.read()
unique_id = get_unique_ID_from_saved_x15(readable_html)
unique_id

['1253979582',
 '1253978698',
 '1253977270',
 '1253977145',
 '1253976914',
 '1253889260',
 '1253917685',
 '1253962187',
 '1253870175',
 '1253973918',
 '1253971839',
 '1246648333',
 '1253894110',
 '1253963695',
 '1250913038']

## Get urls from saved x15

In [15]:
def get_urls_from_saved_x15(r):
    """Collect the announce link of every announce block of a listing page.

    For each ``div[class*="lazyload_bloc"]`` element the ``href`` of its
    second ``<a>`` tag is taken (``np.nan`` when that tag has no ``href``).

    Returns ``None`` when the page has no announce block, or as soon as one
    block holds two anchors or fewer; otherwise the list of links.
    """
    blocks = BeautifulSoup(r, 'html.parser').select('div[class*="lazyload_bloc"]')
    if not blocks:
        return None

    found_urls = []
    for block in blocks:
        anchors = block.find_all("a")
        # a block needs at least three anchors to be accepted
        if len(anchors) <= 2:
            return None
        second_anchor = anchors[1]
        found_urls.append(second_anchor.attrs["href"] if "href" in second_anchor.attrs else np.nan)

    return found_urls

In [16]:
# Read the sample listing page, then extract the announce URLs once the file is closed.
with open(file_check_x15_good, 'r') as f:
    readable_html = f.read()
urls_list = get_urls_from_saved_x15(readable_html)
urls_list

['https://www.paruvendu.fr/a/moto-scooter/moto/bmw/1200-cm3/1253979582A1KVMOMOBM',
 'https://www.paruvendu.fr/a/moto-scooter/moto/yamaha/689-cm3/1253978698A1KVMOMOYA',
 'https://www.paruvendu.fr/a/moto-scooter/moto/suzuki/650-cm3/1253977270A1KVMOMOSU',
 'https://www.paruvendu.fr/a/moto-scooter/moto/harley-davidson/1680-cm3/1253977145A1KVMOMOHD',
 'https://www.paruvendu.fr/a/moto-scooter/moto/yamaha/900-cm3/1253976914A1KVMOMOYA',
 'https://www.paruvendu.fr/a/moto-scooter/moto/kawasaki/1000-cm3/1253889260A1KVMOMOKA',
 'https://www.paruvendu.fr/a/moto-scooter/moto/royal-enfield/535-cm3/1253917685A1KVMOMORE',
 'https://www.paruvendu.fr/a/moto-scooter/moto/yamaha/1300-cm3/1253962187A1KVMOMOYA',
 'https://www.paruvendu.fr/a/moto-scooter/moto/harley-davidson/1690-cm3/1253870175A1KVMOMOHD',
 'https://www.paruvendu.fr/a/moto-scooter/moto/honda/900-cm3/1253973918A1KVMOMOHO',
 'https://www.paruvendu.fr/a/moto-scooter/moto/harley-davidson/1690-cm3/1253971839A1KVMOMOHD',
 'https://www.paruvendu.fr/

## Get title from saved x15

In [38]:
def get_titles_from_saved_x15(r):
    """Collect the title of every announce block of a listing page.

    The title is the text of the first ``<h3>`` of each
    ``div[class*="lazyload_bloc"]`` element with newlines, tabs and the
    substring ``"Moto "`` removed, then stripped. Blocks without ``<h3>``
    yield ``np.nan``. Returns ``None`` when the page has no announce block.
    """
    blocks = BeautifulSoup(r, 'html.parser').select('div[class*="lazyload_bloc"]')
    if not blocks:
        return None

    titles = []
    for block in blocks:
        headings = block.find_all("h3")
        if not headings:
            titles.append(np.nan)
            continue
        cleaned = headings[0].text
        # same removals, in the same order, as a chained .replace()
        for unwanted in ("\n", "\t", "Moto "):
            cleaned = cleaned.replace(unwanted, "")
        titles.append(cleaned.strip())

    return titles

In [39]:
# Read the sample listing page, then extract the titles once the file is closed.
with open(file_check_x15_good, 'r') as f:
    readable_html = f.read()
title_list = get_titles_from_saved_x15(readable_html)
title_list

['BMW',
 'YAMAHA',
 'SUZUKI',
 'HARLEY-DAVIDSON',
 'YAMAHA',
 'KAWASAKI',
 'ROYAL ENFIELD',
 'YAMAHA',
 'HARLEY-DAVIDSON',
 'HONDA',
 'HARLEY-DAVIDSON',
 'TRIUMPH',
 'KAWASAKI',
 'SUZUKI',
 'DUCATI 1198 s']

## temp df template

In [40]:
# One-row, all-NaN frame fixing the column names and order of the listing-level CSV.
temp_template = pd.DataFrame(
    {column: [np.nan] for column in ["url", "unique id", "price", "title"]}
)
temp_template

Unnamed: 0,url,unique id,price,title
0,,,,


In [43]:
def save_temporary_data(url_, uniq_id_, price_, title_,
                        csv_path="../../../tresboncoin/data/paruvendu_ad.csv"):
    """Append listing-level announce data to a CSV file, creating it if needed.

    Parameters
    ----------
    url_, uniq_id_, price_, title_ : scalar or list-like
        Values for the ``url``, ``unique id``, ``price`` and ``title``
        columns. Pass scalars to add one row, or equally long lists to add
        several rows at once.
    csv_path : str, optional
        Target CSV file. Defaults to the location the notebook has always used.

    Returns
    -------
    None
    """
    columns = {"url": url_, "unique id": uniq_id_, "price": price_, "title": title_}

    # pandas needs an explicit index to build a frame from scalars only.
    if all(np.ndim(value) == 0 for value in columns.values()):
        new_rows = pd.DataFrame(columns, index=[0])
    else:
        new_rows = pd.DataFrame(columns)

    # Explicit branch instead of a bare ``except``: previously ANY failure while
    # appending fell back to rewriting the CSV with only the new rows, silently
    # discarding the rows already stored.
    if os.path.isfile(csv_path):
        new_rows = pd.concat([pd.read_csv(csv_path), new_rows], axis=0)
    new_rows.to_csv(path_or_buf=csv_path, index=False)

    return

# Download single announce pages

##### check if an announce with different price but same unique id exists in dataframe.

In [44]:
# Relative path of the CSV holding the already-processed single announces.
data_paruvendu = "../../../tresboncoin/data/paruvendu.csv"

In [45]:
# Load the processed announces; add_the_announce() reads the "unique id" and "price" columns.
data_price = pd.read_csv(data_paruvendu)

In [46]:
data_price.shape

(1283, 20)

In [63]:
def add_the_announce(df, uniq_id, price):
    """Tell whether an announce seen on a listing page must be downloaded.

    Parameters
    ----------
    df : DataFrame with ``"unique id"`` and ``"price"`` columns (announces
        already stored).
    uniq_id : announce identifier; converted with ``int`` before comparison.
    price : price shown on the listing page (may be NaN).

    Returns
    -------
    bool
        ``True`` when the id is unknown, or known only with different prices
        (new announce or price change); ``False`` when a row with the same id
        and the same price is already stored.
    """
    # Label-based selection: the previous label-then-iloc lookup only worked on a
    # default RangeIndex. Every stored row of this id is compared, not just the first.
    known_prices = df.loc[df["unique id"] == int(uniq_id), "price"]

    if known_prices.empty:
        return True

    # NaN never compares equal to itself; without this branch an announce with
    # no price would be downloaded again on every run.
    if pd.isna(price):
        return not known_prices.isna().any()

    return not (known_prices == price).any()

##### Save announce function

In [55]:
def save_page_uniq(req, uniq_id):
    """Write a downloaded announce page to the ``annonces/`` folder as HTML.

    The file is named ``paruvendu-<uniq_id>-<YYYY-MM-DD_HHhMM>.html`` from the
    identifier and the current local time.

    Parameters
    ----------
    req : object exposing a ``text`` attribute (the notebook passes the
        result of ``requests.get``).
    uniq_id : announce identifier; converted with ``str`` so numeric ids are
        accepted as well.
    """
    datetime_1 = datetime.datetime.now().strftime("%Y-%m-%d_%Hh%M")
    page_name = "paruvendu" + "-" + str(uniq_id) + "-" + datetime_1
    # Create the target folder on first use instead of failing with FileNotFoundError.
    os.makedirs("annonces", exist_ok=True)
    # Explicit UTF-8 so accented characters never depend on the platform default.
    # The ``with`` block already closes the file; no manual close() is needed.
    with open("annonces/" + page_name + ".html", "w", encoding="utf-8") as file:
        file.write(req.text)

##### check if id exists or price changed and save

In [None]:
# Walk through every saved listing page and download the announces that are
# new or whose price changed (as decided by add_the_announce).
page_count = 1
for html_file in [file for file in os.listdir("pages") if file.endswith("html")]:
    with open(f"pages/{html_file}", 'r') as f:
        readable_html = f.read()
    urls_list = get_urls_from_saved_x15(readable_html)
    unique_id = get_unique_ID_from_saved_x15(readable_html)
    prices_list = get_prices_from_saved_x15(readable_html)

    # Each extractor can return None; zip() over None would raise TypeError.
    if urls_list is None or unique_id is None or prices_list is None:
        continue

    for url_single, uniq_id, price_ in zip(urls_list, unique_id, prices_list):
        # The extractors use NaN placeholders for a missing href / data-id;
        # such entries can neither be requested nor looked up.
        if not isinstance(url_single, str) or not isinstance(uniq_id, str):
            continue
        if add_the_announce(data_price, uniq_id, price_):
            # The timeout keeps a stalled connection from hanging the notebook.
            req_uniq = requests.get(url_single, headers = headers, timeout=30)
            if req_uniq.status_code == 200:
                save_page_uniq(req_uniq, uniq_id)
                print(str(page_count) + ". saved page id: " + uniq_id)
                page_count += 1
            else:
                # Do not store error pages as if they were announces.
                print("HTTP " + str(req_uniq.status_code) + " " + url_single)
            time.sleep(1)
        else:
            print("NO! " + url_single)

In [79]:
1253977270 in list(data_price["unique id"])

True

In [78]:
data_price.iloc[1158]["url"]

'https://www.paruvendu.fr/a/moto-scooter/moto/honda/600-cm3/1254313672A1KVMOMOHO'

# Process single announces
Saving scaled images x3 and adding announce to DataFrame

## DataFrame template

In [121]:
# One-row, all-NaN frame fixing the column names and order of the announce-level CSV.
announce_template = pd.DataFrame({
    column: [np.nan]
    for column in [
        "url",
        "reference",
        "unique id",
        "date_scrapped",
        "announce_publication_date",
        "vehicle brand",
        "vehicle type",
        "moto scoot",
        "color",
        "vehicle condition",
        "price",
        "city",
        "postal code",
        "vehicle release date",
        "mileage",
        "Fiscal power [HP]",
        "engine capacity [CC]",
        "comments",
        "seller",
        "seller_name",
    ]
})
announce_template

Unnamed: 0,url,reference,unique id,date_scrapped,announce_publication_date,vehicle brand,vehicle type,moto scoot,color,vehicle condition,price,city,postal code,vehicle release date,mileage,Fiscal power [HP],engine capacity [CC],comments,seller,seller_name
0,,,,,,,,,,,,,,,,,,,,


## test announce

In [80]:
# Sample announce (file name without extension) used by the test cells below.
file_check_name_single = "paruvendu-1248033282-2021-05-30_19h04"
# Relative path of that sample inside the announces folder.
file_check_single = "annonces/" + file_check_name_single + ".html"

## Functions

### Get url

In [81]:
def get_url(r):
    """Return the announce URL declared in the page's ``og:url`` meta tag.

    Parameters
    ----------
    r : HTML content of a saved announce page.

    Returns
    -------
    str or None
        The ``content`` attribute of the first ``meta[property*="og:url"]``
        element, cut at the first ``?``; ``None`` when the tag is missing or
        carries no ``content`` attribute.
    """
    soup = BeautifulSoup(r, 'html.parser')

    step1 = soup.select('meta[property*="og:url"]')
    if len(step1) == 0:
        return None

    # .get() instead of [] so a meta tag without content does not raise KeyError
    content = step1[0].attrs.get("content")
    if content is None:
        return None
    return content.split("?")[0]

In [82]:
# Read the sample announce, then extract its URL once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_url = get_url(readable_html)
g_url

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### Get reference

In [83]:
def get_reference(r):
    """Return the announce reference read from the reference/date block.

    Takes the text of the first ``div[class*="vvdetails14_refdate"]`` element,
    removes newlines and tabs, keeps what follows the last ``"ParuVendu"`` and
    returns the stripped part before the first ``"-"``. Returns ``None`` when
    the block is missing.
    """
    ref_blocks = BeautifulSoup(r, 'html.parser').select('div[class*="vvdetails14_refdate"]')
    if not ref_blocks:
        return None

    flat_text = ref_blocks[0].text.replace("\n", "").replace("\t", "")
    after_site_name = flat_text.split("ParuVendu")[-1]
    return after_site_name.split("-")[0].strip()

In [84]:
# Read the sample announce, then extract its reference once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_ref = get_reference(readable_html)
g_ref

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### Get unique ID

In [85]:
def get_uniq_id(url_):
    """Extract the unique announce id from an announce URL.

    The id is the part of the last path segment that precedes the first
    ``"A1"`` (e.g. ``.../1253979582A1KVMOMOBM`` -> ``"1253979582"``).

    Returns ``None`` when ``url_`` is not a string, which happens when
    ``get_url`` found no URL in the page.
    """
    if not isinstance(url_, str):
        return None
    return url_.split("/")[-1].split("A1")[0]

In [86]:
# Read the sample announce, extract its URL, then derive the unique id from it.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_url = get_url(readable_html)
get_uniq_id(g_url)

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### get publication date

In [87]:
def get_publication_date(r):
    """Return the publication date read from the reference/date block.

    Takes the text of the first ``div[class*="vvdetails14_refdate"]`` element,
    removes newlines and tabs, keeps what follows the last ``"ParuVendu"``,
    takes the part after the first ``"-"`` and returns its second
    space-separated word.

    Returns
    -------
    str, float or None
        The date string; ``np.nan`` when the block exists but does not have
        the expected ``<ref> - <word> <date>`` layout; ``None`` when the block
        is missing.
    """
    soup = BeautifulSoup(r, 'html.parser')

    step1 = soup.select('div[class*="vvdetails14_refdate"]')
    if len(step1) == 0:
        return None

    flat_text = step1[0].text.replace("\n", "").replace("\t", "")
    try:
        return flat_text.split("ParuVendu")[-1].split("-")[1].strip().split(" ")[1]
    except IndexError:
        # Only a missing "-" part or missing second word is expected here; a bare
        # ``except`` would also hide unrelated errors and KeyboardInterrupt.
        return np.nan

In [88]:
# Read the sample announce, then extract its publication date once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_pub = get_publication_date(readable_html)
g_pub

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### Get vehicle brand

In [89]:
def get_brand(r):
    """Return the last word of the announce headline.

    The headline is the first ``<h1>`` inside the first
    ``div[id*="blcheader"]`` element; non-breaking spaces are turned into
    plain spaces before the text is stripped and split on ``" "``.
    Returns ``None`` when the header block is missing.
    """
    header_blocks = BeautifulSoup(r, 'html.parser').select('div[id*="blcheader"]')
    if not header_blocks:
        return None

    headline = header_blocks[0].select("h1")[0].text
    words = headline.replace(u'\xa0', u' ').strip().split(" ")
    return words[-1]

In [90]:
# Read the sample announce, then extract the brand once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_brand = get_brand(readable_html)
g_brand

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### Get vehicle type

In [91]:
def get_type(r):
    """Return the vehicle type shown in the announce details.

    Reads the first ``<span>`` of the first ``li[class*="nologo"]`` inside the
    first ``div[class*="im12_txt_ann"]`` element and removes newlines from its
    text. Returns ``None`` when any of these elements is missing.
    """
    details = BeautifulSoup(r, 'html.parser').select('div[class*="im12_txt_ann"]')
    if not details:
        return None

    type_items = details[0].select('li[class*="nologo"]')
    if not type_items:
        return None

    spans = type_items[0].select("span")
    if not spans:
        return None

    return spans[0].text.replace("\n", "")

In [92]:
# Read the sample announce, then extract the vehicle type once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_type = get_type(readable_html)
g_type

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### get moto scoot

In [93]:
def get_moto(r):
    """Return the first word of the announce headline.

    The headline is the first ``<h1>`` inside the first
    ``div[id*="blcheader"]`` element; non-breaking spaces are turned into
    plain spaces before the text is stripped and split on ``" "``.
    Returns ``None`` when the header block is missing.
    """
    header_blocks = BeautifulSoup(r, 'html.parser').select('div[id*="blcheader"]')
    if not header_blocks:
        return None

    headline = header_blocks[0].select("h1")[0].text
    words = headline.replace(u'\xa0', u' ').strip().split(" ")
    return words[0]

In [94]:
# Read the sample announce, then extract the moto/scooter label once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_moto = get_moto(readable_html)
g_moto

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### get color

In [95]:
def get_color(r):
    """Return the vehicle color shown in the announce details.

    Scans the ``li[class*="puiss"]`` items of the first
    ``div[class*="im12_txt_ann"]`` element and returns the stripped text of
    the first ``<span>`` of the first item whose text contains ``"Couleur"``.
    Returns ``None`` when no such item (with a span) exists.
    """
    soup = BeautifulSoup(r, 'html.parser')

    step1 = soup.select('div[class*="im12_txt_ann"]')
    if len(step1) == 0:
        return None

    for item in step1[0].select('li[class*="puiss"]'):
        # ``in`` instead of ``find(...) > 0``: find() returns 0 when the text
        # STARTS with the label, which the old test treated as "not found".
        if "Couleur" in item.text:
            spans = item.select("span")
            if len(spans) > 0:
                return spans[0].text.strip()
    return None

In [96]:
# Read the sample announce, then extract the color once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_color = get_color(readable_html)
g_color

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### get vehicle condition

In [97]:
def get_cond(r):
    """Return the vehicle condition shown in the announce details.

    Scans the ``li[class*="puiss"]`` items of the first
    ``div[class*="im12_txt_ann"]`` element and returns the stripped text of
    the first ``<span>`` of the first item whose text contains ``"Etat"``.
    Returns ``None`` when no such item (with a span) exists.
    """
    soup = BeautifulSoup(r, 'html.parser')

    step1 = soup.select('div[class*="im12_txt_ann"]')
    if len(step1) == 0:
        return None

    for item in step1[0].select('li[class*="puiss"]'):
        # ``in`` instead of ``find(...) > 0``: find() returns 0 when the text
        # STARTS with the label, which the old test treated as "not found".
        if "Etat" in item.text:
            spans = item.select("span")
            if len(spans) > 0:
                return spans[0].text.strip()
    return None

In [98]:
# Read the sample announce, then extract the vehicle condition once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_cond = get_cond(readable_html)
g_cond

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### get fiscal power

In [99]:
def get_power(r):
    """Return the fiscal power shown in the announce details, as an int.

    Scans the ``li[class*="puiss"]`` items of the first
    ``div[class*="im12_txt_ann"]`` element, picks the first item whose text
    contains ``"fiscale"`` and converts the digits of its first ``<span>``.
    Returns ``None`` when no such item exists or its span holds no digit.
    """
    soup = BeautifulSoup(r, 'html.parser')

    step1 = soup.select('div[class*="im12_txt_ann"]')
    if len(step1) == 0:
        return None

    for item in step1[0].select('li[class*="puiss"]'):
        # ``in`` instead of ``find(...) > 0`` so a label at position 0 is found too
        if "fiscale" in item.text:
            spans = item.select("span")
            if len(spans) == 0:
                continue
            # Keep digits only, like the other numeric extractors: int() on the
            # raw text raised ValueError for values such as "7 CV".
            digits = re.findall("[0-9]", spans[0].text)
            if len(digits) > 0:
                return int("".join(digits))
    return None

In [100]:
# Read the sample announce, then extract the fiscal power once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_power = get_power(readable_html)
g_power

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### get price

In [101]:
def get_price(r):
    """Return the announce price as a float.

    Joins every digit found in the text of the first ``div[id*="autoprix"]``
    element.

    Returns
    -------
    float or None
        The price; ``np.nan`` when the element contains no digit; ``None``
        when the element is missing.
    """
    soup = BeautifulSoup(r, 'html.parser')

    step1 = soup.select('div[id*="autoprix"]')
    if len(step1) == 0:
        return None

    digits = re.findall("[0-9]", step1[0].text)
    # An explicit emptiness test replaces the bare ``except`` that used to catch
    # float("") and, with it, any unrelated error.
    if len(digits) == 0:
        return np.nan
    return float("".join(digits))

In [102]:
# Read the sample announce, then extract the price once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_price = get_price(readable_html)
g_price

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### get city

In [103]:
def get_city(r):
    """Return the last space-separated word of the header's ``<h2>``.

    The ``<h2>`` is the first one inside the first ``div[id*="blcheader"]``
    element; newlines are removed from the returned word. Returns ``None``
    when the header block is missing.
    """
    header_blocks = BeautifulSoup(r, 'html.parser').select('div[id*="blcheader"]')
    if not header_blocks:
        return None

    location_words = header_blocks[0].select("h2")[0].text.split(" ")
    return location_words[-1].replace("\n", "")

In [104]:
# Read the sample announce, then extract the city once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_city = get_city(readable_html)
g_city

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### get postal code

In [105]:
def get_postalcode(r):
    """Return the first space-separated word of the header's ``<h2>``.

    The ``<h2>`` is the first one inside the first ``div[id*="blcheader"]``
    element; newlines are removed from the returned word. Returns ``None``
    when the header block is missing.
    """
    header_blocks = BeautifulSoup(r, 'html.parser').select('div[id*="blcheader"]')
    if not header_blocks:
        return None

    location_words = header_blocks[0].select("h2")[0].text.split(" ")
    return location_words[0].replace("\n", "")

In [106]:
# Read the sample announce, then extract the postal code once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_post = get_postalcode(readable_html)
g_post

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### get vehicle release date

In [107]:
def get_releasedate(r):
    """Return the digits of the vehicle release date as a string.

    Reads the first ``<span>`` of the first ``li[class*="ann"]`` inside the
    first ``div[class*="im12_txt_ann"]`` element and joins every digit of its
    text (empty string when there is none). Returns ``None`` when any of
    these elements is missing.
    """
    details = BeautifulSoup(r, 'html.parser').select('div[class*="im12_txt_ann"]')
    if not details:
        return None

    year_items = details[0].select('li[class*="ann"]')
    if not year_items:
        return None

    spans = year_items[0].select("span")
    if not spans:
        return None

    return "".join(re.findall("[0-9]", spans[0].text))

In [108]:
# Read the sample announce, then extract the release date once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_release = get_releasedate(readable_html)
g_release

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### get vehicle mileage

In [109]:
def get_mileage(r):
    """Return the digits of the vehicle mileage as a string.

    Reads the first ``<span>`` of the first ``li[class*="kil"]`` inside the
    first ``div[class*="im12_txt_ann"]`` element and joins every digit of its
    text (empty string when there is none). Returns ``None`` when any of
    these elements is missing.
    """
    details = BeautifulSoup(r, 'html.parser').select('div[class*="im12_txt_ann"]')
    if not details:
        return None

    mileage_items = details[0].select('li[class*="kil"]')
    if not mileage_items:
        return None

    spans = mileage_items[0].select("span")
    if not spans:
        return None

    return "".join(re.findall("[0-9]", spans[0].text))

In [110]:
# Read the sample announce, then extract the mileage once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_mileage = get_mileage(readable_html)
g_mileage

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### get engine capacity

In [111]:
def get_capa(r):
    """Return the digits of the engine capacity as a string.

    Reads the first ``<span>`` of the first ``li[class*="cyl"]`` inside the
    first ``div[class*="im12_txt_ann"]`` element and joins every digit of its
    text (empty string when there is none). Returns ``None`` when any of
    these elements is missing.
    """
    details = BeautifulSoup(r, 'html.parser').select('div[class*="im12_txt_ann"]')
    if not details:
        return None

    capacity_items = details[0].select('li[class*="cyl"]')
    if not capacity_items:
        return None

    spans = capacity_items[0].select("span")
    if not spans:
        return None

    return "".join(re.findall("[0-9]", spans[0].text))

In [112]:
# Read the sample announce, then extract the engine capacity once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_capa = get_capa(readable_html)
g_capa

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### get comment

In [113]:
def get_comment(r):
    """Return the free-text description of the announce on a single line.

    Takes the text of the first ``div[class*="txt_annonceauto"]`` inside the
    first ``div[class*="im12_txt_ann"]`` element, keeps what precedes the
    first ``"Prix"``, replaces ``\\x80`` characters with spaces, strips the
    result and turns newlines and tabs into spaces. Returns ``None`` when
    either element is missing.
    """
    containers = BeautifulSoup(r, 'html.parser').select('div[class*="im12_txt_ann"]')
    if not containers:
        return None

    text_blocks = containers[0].select('div[class*="txt_annonceauto"]')
    if not text_blocks:
        return None

    before_price = text_blocks[0].text.split("Prix")[0].strip()
    cleaned = before_price.replace(u'\x80', u' ').strip()
    return cleaned.replace("\n", " ").replace("\t", " ")

In [114]:
# Read the sample announce as UTF-8 (undecodable bytes are dropped), then
# extract the comment once the file is closed.
with codecs.open(file_check_single, 'r', encoding='utf-8', errors='ignore') as f:
    readable_html = f.read()
g_comment = get_comment(readable_html)
g_comment

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### get seller

In [115]:
def get_seller(r):
    """Return ``[seller kind, seller name]`` from the seller presentation.

    Uses the text of the first ``p[class*="txtpresentation-vendeur"]``
    element. When ``"particulier"`` is found after position 0 the kind is
    ``"Particulier"`` and the name is the first line of what follows the last
    ``":"``; otherwise the kind is ``"Professionnel"`` and the name is the
    first line of the stripped text. Returns ``None`` when the element is
    missing.
    """
    presentation = BeautifulSoup(r, 'html.parser').select('p[class*="txtpresentation-vendeur"]')
    if not presentation:
        return None

    seller_text = presentation[0].text
    if seller_text.find("particulier") > 0:
        private_name = seller_text.split(":")[-1].strip().split("\n")[0].strip()
        return ["Particulier", private_name]

    return ["Professionnel", seller_text.strip().split("\n")[0]]

In [116]:
# Read the sample announce, then extract the seller info once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_seller = get_seller(readable_html)
g_seller

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

### Save image functions

In [117]:
def get_images(r, uniq_id):
    """Download and downscale up to three photos of an announce.

    Photo URLs are derived from the ``src`` of the first ``<img>`` found in
    ``div#listePhotos`` by replacing ``"_1.jpeg"`` with ``"_<n>.jpeg"``. Each
    photo is stored as ``images/<uniq_id>-<n>.jpg``, resized to a width of
    300 px (aspect ratio kept) and re-saved as JPEG with quality 50. Files
    that already exist are not downloaded again.

    Parameters
    ----------
    r : HTML content of a saved announce page.
    uniq_id : prefix of the image file names (the notebook passes the
        announce reference).

    Returns
    -------
    None
    """
    image_list = []

    soup_ = BeautifulSoup(r, 'html.parser')
    step1 = soup_.find('div', id="listePhotos")

    if step1 is not None:
        step2 = step1.select("img")
        if len(step2) > 0 and "src" in step2[0].attrs:
            first_src = step2[0].attrs["src"]
            # NOTE(review): one URL fewer than the number of <img> tags is built,
            # as in the original code - presumably the last tag is not a photo; confirm.
            for k in range(len(step2) - 1):
                image_list.append(first_src.replace("_1.jpeg", "_" + str(k + 1) + ".jpeg"))

    for k, image_url in enumerate(image_list[0:3], start=1):
        image_name = f'images/{uniq_id}-{k}.jpg'
        if os.path.isfile(image_name):
            continue

        # The timeout keeps a stalled download from hanging the whole batch.
        response = requests.get(image_url, timeout=30)
        # Never store an error page under a .jpg name: the isfile() check above
        # would then prevent any later retry.
        if response.status_code != 200:
            print("image not downloaded (HTTP " + str(response.status_code) + "): " + image_url)
            continue

        with open(image_name, 'wb') as handler:
            handler.write(response.content)

        try:
            with Image.open(image_name) as image:
                ratio = image.size[0] / image.size[1]
                # max(1, ...) avoids a zero height for extremely wide pictures
                resized = image.resize((300, max(1, int(300 / ratio))))
            # JPEG cannot store palette / alpha modes
            if resized.mode not in ("RGB", "L"):
                resized = resized.convert("RGB")
            resized.save(image_name, optimize=True, quality=50)
        except (OSError, ValueError):
            # Not a readable image: drop the file so it can be fetched again later
            # instead of silently keeping unusable bytes.
            print("invalid image removed: " + image_name)
            os.remove(image_name)

    return

In [122]:
# Read the sample announce, then download its images (named after the announce
# reference) once the file is closed.
with open(file_check_single, 'r') as f:
    readable_html = f.read()
g_ref = get_reference(readable_html)
get_images(readable_html, g_ref)

FileNotFoundError: [Errno 2] No such file or directory: 'pages/paruvendu-1248033282-2021-05-30_19h04.html'

<div style="color:orangered; font-weight:700; font-size:20pt">TEST 100 ANNONCES</div>

In [None]:
# Image-download smoke test on up to 100 saved announces.
# Announce pages are stored in "annonces/" (see save_page_uniq); "pages/" only
# holds listing pages, on which get_reference() finds nothing.
list_test = []
for single_ in [file for file in sorted(os.listdir("annonces")) if file.endswith("html")][0:100]:
    with open(f"annonces/{single_}", 'r') as f:
        readable_html = f.read()
    g_ref = get_reference(readable_html)
    get_images(readable_html, g_ref)
    print(single_)

In [576]:
#list_test

## Extract 1 announce

In [119]:
def process_announce(file_name):
    """Add the content of one saved paruvendu.fr announce to the CSV dataset.

    The HTML file ``annonces/<file_name>`` is parsed with the ``get_*``
    extractors, its images are downloaded with ``get_images``, one row is
    appended to ``../../../tresboncoin/data/paruvendu.csv`` (created if
    missing) and the HTML file is finally moved to the ``vault`` folder.

    A file in which no announce URL can be found is reported and left in
    place, without touching the CSV.

    Parameters
    ----------
    file_name : name of the HTML file inside ``annonces/``.

    Returns
    -------
    None
    """
    data_paruvendu = "../../../tresboncoin/data/paruvendu.csv"

    # Decode as UTF-8 and drop undecodable bytes, as the get_comment test cell
    # does, so one odd character cannot abort the batch.
    with open(f"annonces/{file_name}", 'r', encoding="utf-8", errors="ignore") as f:
        readable_html = f.read()

    # Extract once the values that are needed several times.
    url = get_url(readable_html)
    if url is None:
        print("skipped (no announce url found): " + file_name)
        return
    reference = get_reference(readable_html)
    seller = get_seller(readable_html)
    # get_seller() returns None when the seller block is missing
    seller_kind, seller_name = seller if seller is not None else (np.nan, np.nan)

    # fill a one-row frame with the defined functions
    df = announce_template.copy()
    df["url"] = url
    df["reference"] = reference
    df["unique id"] = get_uniq_id(url)
    df["date_scrapped"] = datetime.datetime.now().strftime("%Y/%m/%d - %Hh%M")
    df["announce_publication_date"] = get_publication_date(readable_html)
    df["vehicle brand"] = get_brand(readable_html)
    df["vehicle type"] = get_type(readable_html)
    df["moto scoot"] = get_moto(readable_html)
    df["color"] = get_color(readable_html)
    df["vehicle condition"] = get_cond(readable_html)
    df["price"] = get_price(readable_html)
    df["city"] = get_city(readable_html)
    df["postal code"] = get_postalcode(readable_html)
    df["vehicle release date"] = get_releasedate(readable_html)
    df["mileage"] = get_mileage(readable_html)
    df["Fiscal power [HP]"] = get_power(readable_html)
    df["engine capacity [CC]"] = get_capa(readable_html)
    df["comments"] = get_comment(readable_html)
    df["seller"] = seller_kind
    df["seller_name"] = seller_name

    # save images (named after the announce reference)
    get_images(readable_html, reference)

    # Append to the existing CSV or start a new one. An explicit branch replaces
    # the bare ``except`` that rewrote the CSV with only the new row whenever
    # anything failed while appending.
    if os.path.isfile(data_paruvendu):
        data = pd.concat([pd.read_csv(data_paruvendu), df], axis=0)
    else:
        data = df
    data.to_csv(path_or_buf = data_paruvendu, index=False)

    # Move the processed HTML file into the vault. The folder is created first:
    # with "mv", a missing folder turned the file itself into a file named "vault".
    os.makedirs("vault", exist_ok=True)
    shutil.move(f"annonces/{file_name}", os.path.join("vault", file_name))
    return

In [124]:
# Process every saved announce. Only HTML files are handled, so stray entries
# (e.g. ".DS_Store" or sub-folders) do not break the run; sorting makes the
# processing order reproducible.
for single_ in sorted(os.listdir("annonces")):
    if single_.endswith(".html"):
        process_announce(single_)

##### test

In [182]:
# Reload the CSV written by process_announce() and display its last three rows.
data_paruvendu = "../../../tresboncoin/data/paruvendu.csv"
data = pd.read_csv(data_paruvendu)
data.tail(3)

Unnamed: 0,url,reference,unique id,date_scrapped,announce_publication_date,vehicle brand,vehicle type,moto scoot,color,vehicle condition,price,city,postal code,vehicle release date,mileage,Fiscal power [HP],engine capacity [CC],comments,seller,seller_name
1217,https://www.paruvendu.fr/a/moto-scooter/moto/y...,WV167674084,1254591302,2021/06/01 - 23h55,01/06/2021,YAMAHA,Routière,MOTO,,,,Meulan,78250,1982.0,70785.0,7.0,650.0,Faire offre Yamaha 650 XJ.Kilométrage : 70 785...,Particulier,Yann M
1218,https://www.paruvendu.fr/a/moto-scooter/moto/y...,WV167540801,1254144727,2021/06/01 - 23h55,30/05/2021,YAMAHA,Routière,MOTO,,,12000.0,Fontaine,38600,2015.0,38000.0,12.0,1300.0,"YAMAHA, 1300 Cm3 , Routière, Essence 4 temps, ...",Particulier,Guy D
1219,https://www.paruvendu.fr/a/moto-scooter/moto/d...,WV167690694,1254636763,2021/06/02 - 00h07,01/06/2021,DUCATI,Routière,MOTO,,,9500.0,Pont-à-Mousson,54700,2013.0,12500.0,11.0,1200.0,"DUCATI, 1200 Cm3 , Routière, Essence 4 temps, ...",Particulier,Vincent f


In [183]:
data.shape

(1220, 20)

In [184]:
100 * data.isnull().sum().sort_values(ascending=False)/len(data)

color                        88.442623
vehicle condition            88.196721
Fiscal power [HP]            20.901639
vehicle type                 17.213115
mileage                       2.377049
engine capacity [CC]          2.131148
vehicle release date          1.229508
price                         0.655738
comments                      0.163934
announce_publication_date     0.163934
city                          0.081967
postal code                   0.081967
seller                        0.000000
url                           0.000000
reference                     0.000000
moto scoot                    0.000000
vehicle brand                 0.000000
date_scrapped                 0.000000
unique id                     0.000000
seller_name                   0.000000
dtype: float64

In [185]:
data.iloc[-1]["url"]

'https://www.paruvendu.fr/a/moto-scooter/moto/ducati/1200-cm3/1254636763A1KVMOMODU'

In [186]:
1251175264 in list(data["unique id"])

True