In [47]:
from bs4 import BeautifulSoup
import requests
import lxml
import datetime as dt 
import pandas as pd

# Auxiliary functions

## Standard TimeStamp

In [48]:
def sistema_data_ora(caricato_il):
    """Convert a listing's upload label into a ``datetime.datetime``.

    Three label shapes are handled:

    * text containing "Oggi"  -> today's date plus the HH:MM found in the text;
    * text containing "Ieri"  -> yesterday's date plus the HH:MM found in the text;
    * anything else           -> "<day> <month abbreviation> <word> <HH:MM>", with the
      month given as an Italian three-letter abbreviation.

    Parameters
    ----------
    caricato_il : object with a ``.text`` attribute, or None
        The element holding the upload label.

    Returns
    -------
    datetime.datetime or None
        None when ``caricato_il`` is None, otherwise the parsed timestamp.

    Raises
    ------
    ValueError, IndexError
        If the label does not follow one of the shapes above.
    """
    if caricato_il is None:
        return None

    testo = caricato_il.text
    today = dt.date.today()

    if "Oggi" in testo:
        giorno_data = today
        strorario = testo
    elif "Ieri" in testo:
        giorno_data = today - dt.timedelta(days=1)
        strorario = testo
    else:
        mesi = ["gen", "feb", "mar", "apr", "mag", "giu", "lug", "ago", "set", "ott", "nov", "dic"]
        split = testo.split(" ")
        strorario = split[3]
        giorno = int(split[0])
        mese = mesi.index(split[1]) + 1
        # The label carries no year: pick the most recent year in which this
        # day/month exists and is not in the future (the year was previously
        # hard-coded). Nine candidates cover the longest gap between two
        # 29 February dates.
        for anno in range(today.year, today.year - 9, -1):
            try:
                giorno_data = dt.date(anno, mese, giorno)
            except ValueError:
                # e.g. 29 February in a non-leap year: try the previous year.
                continue
            if giorno_data <= today:
                break
        else:
            raise ValueError("invalid upload date: %r" % testo)

    # The time is always the two digits on each side of the first ":".
    index = strorario.find(":")
    ora = int(strorario[index - 2:index])
    minuti = int(strorario[index + 1:index + 3])
    return dt.datetime.combine(giorno_data, dt.time(ora, minuti))

## Parse the Details Grid

In [54]:
def details_grid(link_page_inside):
    """Download a listing page and extract the car attributes from its details grid.

    Parameters
    ----------
    link_page_inside : str
        URL of the single-listing page.

    Returns
    -------
    tuple
        ``(marca, modello, versione, km, immatricolazione, carburante, euro)``.
        Fields that cannot be found are None. A tuple of seven None values is
        returned when the description contains one of the excluded words
        ("incidente", "incidentata", "incidentato", "van").
    """
    tmp_source = requests.get(link_page_inside).text
    tmp_soup = BeautifulSoup(tmp_source, "lxml")

    description = tmp_soup.find("p", "classes_sbt-text-atom__2GBat classes_token-body__1dLNW size-normal classes_weight-book__3zPi1 jsx-3711062521 description classes_preserve-new-lines__1X-M6")
    # A page without the description paragraph is treated as having no text.
    tmp_detail = description.text if description is not None else ""

    # Every word of the description is checked (previously only the last word
    # was compared, and an empty description raised a NameError).
    excluded_words = {"incidente", "incidentata", "incidentato", "van"}
    if any(word.lower() in excluded_words for word in tmp_detail.split()):
        return (None, None, None, None, None, None, None)

    grid_detail = tmp_soup.find_all("span", class_="classes_sbt-text-atom__2GBat classes_token-body__1dLNW size-normal classes_weight-book__3zPi1 value jsx-3561725324")
    tipo = ["benzina", "diesel", "gpl", "metano", "ibrida", "elettrica"]
    version = tmp_soup.find_all("span", class_="classes_sbt-text-atom__2GBat classes_token-body__1dLNW size-normal classes_weight-book__3zPi1 label jsx-3561725324")
    # Only the first three labels decide whether the third value is the version;
    # slicing tolerates pages with fewer than three labels.
    check_version = [label.text for label in version[:3]]
    has_version = "Versione" in check_version

    marca, modello, versione, km, immatricolazione, carburante, euro = None, None, None, None, None, None, None
    for position, raw in enumerate(grid_detail):
        if position == 0:
            marca = raw.text.strip().upper()
        elif position == 1:
            modello = raw.text.strip().lower()
        elif position == 2 and has_version:
            versione = raw.text.strip().lower()
        else:
            raw_text = raw.text
            if "Km" in raw_text and raw_text != "Km0":
                first_token = raw_text.split(" ")[0].strip()
                # Skip values such as "Km ..." that do not start with a number.
                if first_token.isdigit():
                    km = int(first_token)
            if "/" in raw_text:
                parts = raw_text.split("/")
                if parts[0].isdigit() and parts[1].strip().isdigit():
                    mese = int(parts[0])
                    anno = int(parts[1])
                    if anno > 1900 and 1 <= mese <= 12:
                        # Only month and year are available: use the first of the month.
                        immatricolazione = dt.date(anno, mese, 1)
            if raw_text.strip().lower() in tipo:
                carburante = raw_text.strip().lower()
            if "Euro" in raw_text or "euro" in raw_text:
                tokens = raw_text.split(" ")
                if len(tokens) != 1 and tokens[1].strip().isdigit():
                    euro = int(tokens[1])

    return (marca, modello, versione, km, immatricolazione, carburante, euro)

## Excel format

In [55]:
def excel_date(df):
    """Replace the date columns of ``df`` with ``"%x %X"``-formatted strings.

    The columns "Data_upload" and "Immatricolazione" are overwritten in place
    and the same frame is returned. Missing values (None / NaT / NaN) become
    None.

    Values are processed in row order, so the result stays aligned with the
    rows whatever the index looks like (label-based lookup used to scramble
    rows of a sorted frame whose index had not been reset).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns "Data_upload" and "Immatricolazione".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both columns converted to strings.
    """
    def _format(value):
        # strftime is available on both datetime.date and pandas.Timestamp.
        return None if pd.isna(value) else value.strftime("%x %X")

    new_date_up = [_format(value) for value in df["Data_upload"]]
    new_imm = [_format(value) for value in df["Immatricolazione"]]
    df["Data_upload"] = new_date_up
    df["Immatricolazione"] = new_imm
    return df

# Scraping all the articles uploaded in the last 24 hours

In [56]:
def scraping_web_prices_day_by_day():
    """Scrape the listings uploaded less than one day ago.

    Result pages 1 to 19 of the search URL are visited in order. Cards without
    a title, cards whose title contains the word "van" and cards without an
    upload date are skipped. Scanning stops at the first dated card that is one
    day old or older. For every remaining card the listing page is fetched
    through ``details_grid``; cards for which it returns only None values are
    dropped.

    Returns
    -------
    pandas.DataFrame
        One row per listing, sorted by "Data_upload" (newest first). The frame
        is empty, but has all columns, when nothing was collected. A frame is
        also returned when all pages are exhausted without meeting an older
        listing (previously None was returned in that case).
    """
    columns = ["Data_upload", "Marca", "Modello", "Versione", "Luogo", "Provincia",
               "Immatricolazione", "Km", "Carburante", "Classe_emisionni", "Prezzo", "Link"]
    link1 = "https://www.subito.it/annunci-lombardia/vendita/auto/?o="
    link2 = "&ys=2013&ps=1000&pe=11000&me=20"
    t = []
    reached_older_listing = False
    for iel in range(1, 20):
        link_page = link1 + str(iel) + link2
        source = requests.get(link_page).text
        soup = BeautifulSoup(source, "lxml")
        list_of_car = soup.find_all("div", class_="items__item AdItemBigCard_card__xZSdY")
        for visible_description in list_of_car:
            title = visible_description.find("h2", class_="classes_sbt-text-atom__2GBat classes_token-h6__1ZJNe size-normal classes_weight-semibold__1RkLc ItemTitle_item-title__3xYm- AdItemBigCard_card-title__399Ml")
            if title is None or any(word.lower() == "van" for word in title.text.split(" ")):
                continue
            data = sistema_data_ora(visible_description.find("span", "classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc classes_date__2lOoE classes_with-spacer__3WQbi"))
            if data is None:
                continue
            diff = pd.Timestamp.now() - data
            if not diff < dt.timedelta(days=1):
                # First listing outside the 24-hour window: stop scanning.
                reached_older_listing = True
                break
            link = visible_description.a["href"]
            luogo = visible_description.find("span", "classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc classes_town__W-0Iq").text.strip()
            provincia = visible_description.find("span", "classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc city").text.replace("(", "").replace(")", "")
            prize = visible_description.find("p", class_="classes_price__HmHqw classes_small__38Lur").text.split(" ")[0]
            if u'\xa0' in prize:
                prize = prize.split(u'\xa0')[0]
            # Dots are thousands separators; removing all of them also handles
            # prices with no dot or with more than one.
            int_prize = int(prize.replace(".", ""))
            marca, modello, versione, km, immatricolazione, carburante, euro = details_grid(link)
            if all(x is None for x in (marca, modello, versione, km, immatricolazione, carburante, euro)):
                continue
            tmp = dict(Data_upload=data, Marca=marca, Modello=modello, Versione=versione, Luogo=luogo,
                       Provincia=provincia, Immatricolazione=immatricolazione, Km=km, Carburante=carburante,
                       Classe_emisionni=euro, Prezzo=int_prize, Link=link)
            t.append(tmp)
        if reached_older_listing:
            break

    # Passing ``columns`` keeps the column order and also works for an empty list.
    df = pd.DataFrame(t, columns=columns)
    df = df.sort_values("Data_upload", ascending=False)
    return df

In [57]:
# Run the last-24-hours scrape (performs live HTTP requests) and keep the result as df2.
df2 = scraping_web_prices_day_by_day()

In [120]:
# Show the cell at row position 107, column position 11 of df2 (a listing link in the output below).
print(df2.iloc[107, 11])

https://www.subito.it/auto/citroen-c3-2-serie-2016-milano-370957318.htm


In [58]:
# Links of the rows whose "Classe_emisionni" value is missing.
df2.loc[pd.isna(df2["Classe_emisionni"]), "Link"]

6      https://www.subito.it/auto/renault-twingo-mila...
7      https://www.subito.it/auto/fiat-tipo-1-3-sw-20...
15     https://www.subito.it/auto/ford-fiesta-7-serie...
20     https://www.subito.it/auto/hyundai-i20-2013-be...
41     https://www.subito.it/auto/fiat-500-2015-2019-...
65     https://www.subito.it/auto/fiat-500l-2017-berg...
75     https://www.subito.it/auto/opel-adam-gpl-anno-...
76     https://www.subito.it/auto/renault-twingo-1-2-...
79     https://www.subito.it/auto/fiat-panda-1-2-easy...
88     https://www.subito.it/auto/renault-scenic-1-6-...
89     https://www.subito.it/auto/ford-fiesta-1-5-tdc...
96     https://www.subito.it/auto/dacia-dokker-2017-p...
99     https://www.subito.it/auto/fiat-qubo-a-metano-...
103    https://www.subito.it/auto/fiat-panda-3-serie-...
105    https://www.subito.it/auto/dacia-sandero-ii-20...
107    https://www.subito.it/auto/citroen-c3-2-serie-...
113    https://www.subito.it/auto/peugeot-308-puretec...
114    https://www.subito.it/au

# Scraping all the webpages

In [None]:
def scraping_web_prices():
    """Scrape every listing found on result pages 1 to 119 of the search URL.

    For each card the link, town, province, upload date and price are read from
    the results page, and the remaining attributes come from ``details_grid``.
    Cards without an upload date are skipped, as are cards for which
    ``details_grid`` returns only None values.

    Returns
    -------
    pandas.DataFrame
        One row per listing, sorted by "Data_upload" (newest first) with a fresh
        0..n-1 index. The frame is empty, but has all columns, when nothing was
        collected.
    """
    columns = ["Data_upload", "Marca", "Modello", "Versione", "Luogo", "Provincia",
               "Immatricolazione", "Km", "Carburante", "Classe_emisionni", "Prezzo", "Link"]
    link1 = "https://www.subito.it/annunci-lombardia/vendita/auto/?o="
    link2 = "&ys=2013&ps=1000&pe=11000&me=20"
    t = []
    for iel in range(1, 120):
        link_page = link1 + str(iel) + link2
        source = requests.get(link_page).text
        soup = BeautifulSoup(source, "lxml")
        list_of_car = soup.find_all("div", class_="items__item AdItemBigCard_card__xZSdY")
        for visible_description in list_of_car:
            data = sistema_data_ora(visible_description.find("span", "classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc classes_date__2lOoE classes_with-spacer__3WQbi"))
            if data is None:
                # object from "Vetrina" so duplicate: its row would be dropped
                # below anyway, so skip it before fetching the detail page.
                continue
            link = visible_description.a["href"]
            luogo = visible_description.find("span", "classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc classes_town__W-0Iq").text.strip()
            provincia = visible_description.find("span", "classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc city").text.replace("(", "").replace(")", "")
            prize = visible_description.find("p", class_="classes_price__HmHqw classes_small__38Lur").text.split(" ")[0]
            if u'\xa0' in prize:
                prize = prize.split(u'\xa0')[0]
            # Dots are thousands separators; removing all of them also handles
            # prices with no dot or with more than one.
            int_prize = int(prize.replace(".", ""))
            marca, modello, versione, km, immatricolazione, carburante, euro = details_grid(link)
            if all(x is None for x in (marca, modello, versione, km, immatricolazione, carburante, euro)):
                continue
            tmp = dict(Data_upload=data, Marca=marca, Modello=modello, Versione=versione, Luogo=luogo,
                       Provincia=provincia, Immatricolazione=immatricolazione, Km=km, Carburante=carburante,
                       Classe_emisionni=euro, Prezzo=int_prize, Link=link)
            t.append(tmp)

    # Passing ``columns`` keeps the column order and also works for an empty list.
    df = pd.DataFrame(t, columns=columns)
    df = df.sort_values("Data_upload", ascending=False)
    df = df[~pd.isnull(df["Data_upload"])]
    df = df.reset_index(drop=True)
    return df

In [165]:
from bs4 import BeautifulSoup
import requests
import lxml
import datetime as dt
import pandas as pd
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders
from time import strftime, gmtime
from email.mime.text import MIMEText
import time
import re




def sistema_data_ora(caricato_il):
    """Convert a listing's upload label into a ``datetime.datetime``.

    Three label shapes are handled:

    * text containing "Oggi"  -> today's date plus the HH:MM found in the text;
    * text containing "Ieri"  -> yesterday's date plus the HH:MM found in the text;
    * anything else           -> "<day> <month abbreviation> <word> <HH:MM>", with the
      month given as an Italian three-letter abbreviation.

    Parameters
    ----------
    caricato_il : object with a ``.text`` attribute, or None
        The element holding the upload label.

    Returns
    -------
    datetime.datetime or None
        None when ``caricato_il`` is None, otherwise the parsed timestamp.

    Raises
    ------
    ValueError, IndexError
        If the label does not follow one of the shapes above.
    """
    if caricato_il is None:
        return None

    testo = caricato_il.text
    today = dt.date.today()

    if "Oggi" in testo:
        giorno_data = today
        strorario = testo
    elif "Ieri" in testo:
        giorno_data = today - dt.timedelta(days=1)
        strorario = testo
    else:
        mesi = ["gen", "feb", "mar", "apr", "mag", "giu", "lug", "ago", "set", "ott", "nov", "dic"]
        split = testo.split(" ")
        strorario = split[3]
        giorno = int(split[0])
        mese = mesi.index(split[1]) + 1
        # The label carries no year: pick the most recent year in which this
        # day/month exists and is not in the future (the year was previously
        # hard-coded). Nine candidates cover the longest gap between two
        # 29 February dates.
        for anno in range(today.year, today.year - 9, -1):
            try:
                giorno_data = dt.date(anno, mese, giorno)
            except ValueError:
                # e.g. 29 February in a non-leap year: try the previous year.
                continue
            if giorno_data <= today:
                break
        else:
            raise ValueError("invalid upload date: %r" % testo)

    # The time is always the two digits on each side of the first ":".
    index = strorario.find(":")
    ora = int(strorario[index - 2:index])
    minuti = int(strorario[index + 1:index + 3])
    return dt.datetime.combine(giorno_data, dt.time(ora, minuti))
def details_grid(link_page_inside):
    """Download a listing page and extract the car attributes from its details grid.

    Parameters
    ----------
    link_page_inside : str
        URL of the single-listing page.

    Returns
    -------
    tuple
        ``(marca, modello, versione, km, immatricolazione, carburante, euro)``.
        Fields that cannot be found are None. A tuple of seven None values is
        returned when the description contains one of the excluded words
        ("incidente", "incidentata", "incidentato").

    Notes
    -----
    When the grid gives no emission class, the page's body paragraphs are
    searched for "Euro" immediately followed by a digit (spaces removed first).
    """
    tmp_source = requests.get(link_page_inside).text
    tmp_soup = BeautifulSoup(tmp_source, "lxml")

    description = tmp_soup.find("p",
                                "classes_sbt-text-atom__2GBat classes_token-body__1dLNW size-normal classes_weight-book__3zPi1 jsx-3711062521 description classes_preserve-new-lines__1X-M6")
    # A page without the description paragraph is treated as having no text.
    tmp_detail = description.text if description is not None else ""

    # The description does not change per grid row, so it is checked once up front.
    excluded_words = {"incidente", "incidentata", "incidentato"}
    if any(word.lower() in excluded_words for word in tmp_detail.split()):
        return (None, None, None, None, None, None, None)

    grid_detail = tmp_soup.find_all("span",
                                    class_="classes_sbt-text-atom__2GBat classes_token-body__1dLNW size-normal classes_weight-book__3zPi1 value jsx-3561725324")
    tipo = ["benzina", "diesel", "gpl", "metano", "ibrida", "elettrica"]
    version = tmp_soup.find_all("span",
                                class_="classes_sbt-text-atom__2GBat classes_token-body__1dLNW size-normal classes_weight-book__3zPi1 label jsx-3561725324")
    # Only the first three labels decide whether the third value is the version;
    # slicing tolerates pages with fewer than three labels.
    check_version = [label.text for label in version[:3]]
    has_version = "Versione" in check_version

    marca, modello, versione, km, immatricolazione, carburante, euro = None, None, None, None, None, None, None
    for position, raw in enumerate(grid_detail):
        if position == 0:
            marca = raw.text.strip().upper()
        elif position == 1:
            modello = raw.text.strip().lower()
        elif position == 2 and has_version:
            versione = raw.text.strip().lower()
        else:
            raw_text = raw.text
            if "Km" in raw_text and raw_text != "Km0":
                first_token = raw_text.split(" ")[0].strip()
                # Skip values such as "Km ..." that do not start with a number.
                if first_token.isdigit():
                    km = int(first_token)
            if "/" in raw_text:
                parts = raw_text.split("/")
                if parts[0].isdigit() and parts[1].strip().isdigit():
                    mese = int(parts[0])
                    anno = int(parts[1])
                    if anno > 1900 and 1 <= mese <= 12:
                        # Only month and year are available: use the first of the month.
                        immatricolazione = dt.date(anno, mese, 1)
            if raw_text.strip().lower() in tipo:
                carburante = raw_text.strip().lower()
            if "Euro" in raw_text or "euro" in raw_text:
                tokens = raw_text.split(" ")
                if len(tokens) != 1 and tokens[1].strip().isdigit():
                    euro = int(tokens[1])

    if euro is None:
        # Fallback: look for "Euro<digit>" in the body paragraphs. The capture
        # group replaces the former "Euro*" pattern plus manual indexing, which
        # could index past the end of the text and relied on a bare except.
        second_tmp = tmp_soup.find_all("p", class_="classes_sbt-text-atom__2GBat classes_token-body__1dLNW size-normal classes_weight-book__3zPi1")
        for row in second_tmp:
            txt = row.text.replace(" ", "")
            check = re.search(r"Euro(\d)", txt)
            if check is not None:
                euro = int(check.group(1))

    return (marca, modello, versione, km, immatricolazione, carburante, euro)
def excel_date(df):
    """Replace the date columns of ``df`` with ``"%x %X"``-formatted strings.

    The columns "Data_upload" and "Immatricolazione" are overwritten in place
    and the same frame is returned. Missing values (None / NaT / NaN) become
    None.

    Values are processed in row order, so the result stays aligned with the
    rows whatever the index looks like (label-based lookup used to scramble
    rows of a sorted frame whose index had not been reset).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns "Data_upload" and "Immatricolazione".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both columns converted to strings.
    """
    def _format(value):
        # strftime is available on both datetime.date and pandas.Timestamp.
        return None if pd.isna(value) else value.strftime("%x %X")

    new_date_up = [_format(value) for value in df["Data_upload"]]
    new_imm = [_format(value) for value in df["Immatricolazione"]]
    df["Data_upload"] = new_date_up
    df["Immatricolazione"] = new_imm
    return df
def scraping_web_prices_day_by_day():
    """Scrape the listings uploaded within the last day.

    Result pages 1 to 19 of the search URL are visited in order. Cards without
    a title, cards whose title contains the word "van" and cards without an
    upload date are skipped. Scanning stops at the first dated card that is
    more than one day old. For every remaining card the listing page is fetched
    through ``details_grid``; cards for which it returns only None values are
    dropped.

    Returns
    -------
    pandas.DataFrame
        One row per listing, sorted by "Data_upload" (newest first). The frame
        is empty, but has all columns, when nothing was collected. A frame is
        also returned when all pages are exhausted without meeting an older
        listing (previously None was returned in that case).
    """
    columns = ["Data_upload", "Marca", "Modello", "Versione", "Luogo", "Provincia",
               "Immatricolazione", "Km", "Carburante", "Classe_emisionni", "Prezzo", "Link"]
    link1 = "https://www.subito.it/annunci-lombardia/vendita/auto/?o="
    link2 = "&ys=2013&ps=1000&pe=11000&me=20"
    t = []
    reached_older_listing = False
    for iel in range(1, 20):
        link_page = link1 + str(iel) + link2
        source = requests.get(link_page).text
        soup = BeautifulSoup(source, "lxml")
        list_of_car = soup.find_all("div", class_="items__item AdItemBigCard_card__xZSdY")
        for visible_description in list_of_car:
            title = visible_description.find("h2", class_="classes_sbt-text-atom__2GBat classes_token-h6__1ZJNe size-normal classes_weight-semibold__1RkLc ItemTitle_item-title__3xYm- AdItemBigCard_card-title__399Ml")
            if title is None or any(word.lower() == "van" for word in title.text.split(" ")):
                continue
            data = sistema_data_ora(visible_description.find("span",
                                                             "classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc classes_date__2lOoE classes_with-spacer__3WQbi"))
            if data is None:
                continue
            diff = pd.Timestamp.now() - data
            if diff > dt.timedelta(days=1):
                # First listing outside the 24-hour window: stop scanning.
                reached_older_listing = True
                break
            link = visible_description.a["href"]
            luogo = visible_description.find("span",
                                             "classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc classes_town__W-0Iq").text.strip()
            provincia = visible_description.find("span",
                                                 "classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc city").text.replace(
                "(", "").replace(")", "")
            prize = visible_description.find("p", class_="classes_price__HmHqw classes_small__38Lur").text.split(" ")[0]
            if u'\xa0' in prize:
                prize = prize.split(u'\xa0')[0]
            # Dots are thousands separators; removing all of them also handles
            # prices with no dot or with more than one.
            int_prize = int(prize.replace(".", ""))
            marca, modello, versione, km, immatricolazione, carburante, euro = details_grid(link)
            if all(x is None for x in (marca, modello, versione, km, immatricolazione, carburante, euro)):
                continue
            tmp = dict(Data_upload=data, Marca=marca, Modello=modello, Versione=versione, Luogo=luogo,
                       Provincia=provincia, Immatricolazione=immatricolazione, Km=km, Carburante=carburante,
                       Classe_emisionni=euro, Prezzo=int_prize, Link=link)
            t.append(tmp)
        if reached_older_listing:
            break

    # Passing ``columns`` keeps the column order and also works for an empty list.
    df = pd.DataFrame(t, columns=columns)
    df = df.sort_values("Data_upload", ascending=False)
    return df


# Run the last-24-hours scrape (performs live HTTP requests) and keep the result as df2.
df2 = scraping_web_prices_day_by_day()

In [176]:
# Show the cell at row position 183, column position 11 of df2 (a listing link in the output below).
print(df2.iloc[183,11])

https://www.subito.it/auto/ford-focus-station-wagon-benzina-varese-371012623.htm


In [181]:
# Display df2 ordered by "Prezzo", highest first (df2 itself is not modified).
df2.sort_values("Prezzo", ascending=False)

Unnamed: 0,Data_upload,Marca,Modello,Versione,Luogo,Provincia,Immatricolazione,Km,Carburante,Classe_emisionni,Prezzo,Link
119,2021-03-31 15:15:00,TOYOTA,yaris 3ª serie,yaris 1.5 hybrid 5 porte by d,Milano,MI,2014-04-01,43000,ibrida,5.0,11000,https://www.subito.it/auto/yaris-hybrid-1-5-un...
134,2021-03-31 13:25:00,MINI,mini (f56),mini 1.5 cooper d boost,Pavia,PV,2014-12-01,60421,diesel,6.0,11000,https://www.subito.it/auto/mini-cooper-d-1-5-s...
284,2021-03-30 22:41:00,VOLKSWAGEN,polo 6ª serie,polo business 1.0 evo 5p. trendline bluemotion...,Cava Manara,PV,2018-10-01,50000,benzina,6.0,11000,https://www.subito.it/auto/volkswagen-polo-100...
42,2021-03-31 17:50:00,FORD,fiesta 1ª/2ª serie,,Casei Gerola,PV,2018-01-01,81542,diesel,6.0,11000,https://www.subito.it/auto/ford-fiesta-1-5-tdc...
188,2021-03-31 11:15:00,VOLKSWAGEN,polo 5ª serie,polo 1.4 tdi 5p. comfortline bluemotion techno...,Desenzano del Garda,BS,2017-09-01,68100,diesel,6.0,11000,https://www.subito.it/auto/volkswagen-polo-1-4...
...,...,...,...,...,...,...,...,...,...,...,...,...
130,2021-03-31 13:37:00,FIAT,punto evo,punto evo 1.3 mjt 75 cv dpf 5 porte s&s blue&me,Cesano Maderno,MB,2014-03-01,4990,diesel,5.0,4990,https://www.subito.it/auto/fiat-punto-1-3-mjet...
187,2021-03-31 11:25:00,FIAT,grande punto,grande punto 1.4 gpl 5 porte active,Milano,MI,2013-12-01,99888,gpl,5.0,4950,https://www.subito.it/auto/fiat-grande-punto-g...
133,2021-03-31 13:33:00,FORD,focus 4ª s. 18-->,focus 2.0 ecoblue 150 cv automatico sw busines...,Settimo Milanese,MI,2018-01-01,60000,diesel,6.0,4900,https://www.subito.it/auto/ford-focud-2000-tdi...
78,2021-03-31 17:13:00,RENAULT,twingo 2ª serie,twingo 1.2 16v live,Osio Sopra,BG,2013-02-01,69000,benzina,5.0,4900,https://www.subito.it/auto/renault-twingo-1-2-...


In [162]:
# Scratch check of the "Euro<digit>" lookup on a sample string.
txt = "132312312Euro6dsadas"
# str.replace returns a new string, so the result has to be assigned back.
txt = txt.replace(" ", "")
# Plain "Euro": the former "Euro*" pattern meant "Eur" followed by any number of "o".
x = re.search("Euro", txt)
if x:
    print(x.span())
    # Inspect the character right after "Euro", located from the match start
    # rather than from a hard-coded offset.
    print(txt[x.span()[0] + 4].isdigit())

(9, 13)
True
