In [1]:
from bs4 import BeautifulSoup
import requests
import lxml
import datetime as dt 
import pandas as pd

# Auxiliary functions

## Standard TimeStamp

In [2]:
def _estrai_orario(testo):
    """Return the ``HH:MM`` time found around the first ':' in ``testo``.

    Raises ``ValueError`` when ``testo`` contains no ':' or the digits around
    it are not valid integers / a valid time.
    """
    index = testo.find(":")
    if index == -1:
        raise ValueError("no HH:MM time found in %r" % testo)
    # max(..., 0) keeps a single-digit hour at the very start of the string
    # (e.g. "9:05") from producing a negative start index and an empty slice.
    ora = int(testo[max(index - 2, 0):index])
    minuti = int(testo[index + 1:index + 3])
    return dt.time(ora, minuti)


def sistema_data_ora(caricato_il):
    """Convert the upload label of an ad into a ``datetime.datetime``.

    Three label shapes are handled, selected on the tag's ``.text``:

    * text containing ``"Oggi"``: today's date at the HH:MM found in the text;
    * text containing ``"Ieri"``: yesterday's date at that HH:MM;
    * anything else: expected to be four whitespace-separated tokens
      ``<day> <month> <word> <HH:MM>`` where ``<month>`` is one of the Italian
      three-letter abbreviations listed in ``mesi``.

    The last form carries no year. The year is therefore inferred as the most
    recent occurrence of that day/month that is not in the future (an ad
    cannot have been uploaded after today), instead of a fixed year.

    Parameters
    ----------
    caricato_il : object with a ``.text`` attribute, or None
        The element holding the upload label.

    Returns
    -------
    datetime.datetime or None
        ``None`` when ``caricato_il`` is ``None``.

    Raises
    ------
    ValueError, IndexError
        If the text does not match any of the shapes described above.
    """
    if caricato_il is None:
        return None

    testo = caricato_il.text
    today = dt.date.today()

    if "Oggi" in testo:
        return dt.datetime.combine(today, _estrai_orario(testo))
    if "Ieri" in testo:
        yesterday = today - dt.timedelta(days=1)
        return dt.datetime.combine(yesterday, _estrai_orario(testo))

    mesi = ["gen", "feb", "mar", "apr", "mag", "giu",
            "lug", "ago", "set", "ott", "nov", "dic"]
    parti = testo.split()
    giorno = int(parti[0])
    mese = mesi.index(parti[1].lower()) + 1
    orario = _estrai_orario(parti[3])

    # No year in the label: a day/month later than today must belong to the
    # previous year.
    anno = today.year
    if (mese, giorno) > (today.month, today.day):
        anno -= 1
    return dt.datetime.combine(dt.date(anno, mese, giorno), orario)

## Parse the Details Grid

In [3]:
def _parse_details(tmp_soup):
    """Extract the car details from the parsed HTML of a single ad page.

    Returns the 7-tuple ``(marca, modello, versione, km, immatricolazione,
    carburante, euro)``; every element is ``None`` when it could not be found.
    An all-``None`` tuple is also returned when the description contains one
    of the words "incidente", "incidentata" or "incidentato".
    """
    nessun_dato = (None, None, None, None, None, None, None)

    description = tmp_soup.find("p", "classes_sbt-text-atom__2GBat classes_token-body__1dLNW size-normal classes_weight-book__3zPi1 jsx-3711062521 description classes_preserve-new-lines__1X-M6")
    # A page without the description paragraph is treated as having an empty
    # description instead of failing on ``None.text``.
    tmp_detail = description.text if description is not None else ""
    parole_incidente = {"incidente", "incidentata", "incidentato"}
    if any(word.lower() in parole_incidente for word in tmp_detail.split()):
        return nessun_dato

    grid_detail = tmp_soup.find_all("span", class_= "classes_sbt-text-atom__2GBat classes_token-body__1dLNW size-normal classes_weight-book__3zPi1 value jsx-3561725324")
    labels = tmp_soup.find_all("span", class_= "classes_sbt-text-atom__2GBat classes_token-body__1dLNW size-normal classes_weight-book__3zPi1 label jsx-3561725324")
    # Only the first three labels are inspected; slicing (rather than
    # indexing) tolerates grids with fewer than three labels.
    has_versione = "Versione" in [label.text for label in labels[:3]]

    tipo = ["benzina", "diesel", "gpl", "metano", "ibrida", "elettrica"]
    marca, modello, versione, km, immatricolazione, carburante, euro = nessun_dato

    for position, raw in enumerate(grid_detail):
        if position == 0:
            marca = raw.text.strip().upper()
            continue
        if position == 1:
            modello = raw.text.strip().lower()
            continue
        if position == 2 and has_versione:
            versione = raw.text.strip().lower()
            continue

        # Remaining cells are recognised by their content, not their position.
        raw_text = raw.text
        if "Km" in raw_text and raw_text != "Km0":
            first_token = raw_text.split(" ")[0]
            if first_token != "Km":
                km = int(first_token)
        if "/" in raw_text and raw_text.split("/")[0].isdigit():
            mese = int(raw_text.split("/")[0])
            anno = int(raw_text.split("/")[1])
            if anno > 1900:
                # Only month and year are available: use the 1st of the month.
                immatricolazione = dt.date(anno, mese, 1)
        if raw_text.strip().lower() in tipo:
            carburante = raw_text.strip().lower()
        if "Euro" in raw_text or "euro" in raw_text:
            if len(raw_text.split(" ")) != 1:
                euro = int(raw_text.split(" ")[1])

    return (marca, modello, versione, km, immatricolazione, carburante, euro)


def details_grid(link_page_inside, timeout=30):
    """Download the page of a single ad and parse its details grid.

    Parameters
    ----------
    link_page_inside : str
        URL of the ad page.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that one
        unresponsive page cannot block the whole scrape indefinitely.

    Returns
    -------
    tuple
        ``(marca, modello, versione, km, immatricolazione, carburante, euro)``.
        Missing fields are ``None``; all fields are ``None`` when the
        description mentions an accident ("incidente", "incidentata",
        "incidentato") or when nothing could be parsed from the page.

    Raises
    ------
    requests.exceptions.RequestException
        On network errors, including the timeout expiring.
    ValueError
        If a km, registration-date or Euro-class cell is not numeric where a
        number is expected.
    """
    tmp_source = requests.get(link_page_inside, timeout=timeout).text
    tmp_soup = BeautifulSoup(tmp_source, "lxml")
    return _parse_details(tmp_soup)

## Excel format

In [4]:
def excel_date(df):
    """Replace the date columns of ``df`` with ``"%x %X"`` formatted strings.

    The columns ``"Data_upload"`` and ``"Immatricolazione"`` are overwritten
    in place and the same frame is returned. Missing values (``None``/``NaT``)
    become ``None``.

    Values are converted in row order, so the result stays aligned with the
    other columns whatever the index looks like (for example after sorting
    without resetting the index).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``"Data_upload"`` and
        ``"Immatricolazione"`` holding date/datetime objects or missing values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified.
    """
    def _format(value):
        # pd.isna covers both None and NaT.
        if pd.isna(value):
            return None
        return value.strftime("%x %X")

    new_imm = [_format(value) for value in df["Immatricolazione"]]
    new_date_up = [_format(value) for value in df["Data_upload"]]
    df["Data_upload"] = new_date_up
    df["Immatricolazione"] = new_imm
    return df

# Scraping all the articles uploaded in the last 24 hours

In [5]:
def scraping_web_prices_day_by_day(max_pages=19, timeout=30):
    """Scrape the ads uploaded during the last 24 hours.

    Result pages are visited in order, starting from page 1. Cards without an
    upload date are skipped. Scanning stops at the first dated card that is at
    least 24 hours old, or after ``max_pages`` pages, whichever comes first.
    Ads for which ``details_grid`` returns only ``None`` values are discarded.

    Parameters
    ----------
    max_pages : int, optional
        Maximum number of result pages to visit.
    timeout : float, optional
        Seconds to wait for each result page before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per ad, sorted by ``"Data_upload"`` (most recent first). A
        frame with the expected columns is always returned, even when it is
        empty or when no old ad was met within ``max_pages`` pages.
    """
    columns = ["Data_upload", "Marca", "Modello", "Versione", "Luogo", "Provincia",
               "Immatricolazione", "Km", "Carburante", "Classe_emisionni", "Prezzo", "Link"]
    link1 = "https://www.subito.it/annunci-lombardia/vendita/auto/?o="
    link2 = "&ys=2013&ps=1000&pe=11000&me=20"

    records = []
    reached_old_ads = False
    for page in range(1, max_pages + 1):
        link_page = link1 + str(page) + link2
        source = requests.get(link_page, timeout=timeout).text
        soup = BeautifulSoup(source, "lxml")
        list_of_car = soup.find_all("div", class_="items__item AdItemBigCard_card__xZSdY")
        for visible_description in list_of_car:
            data = sistema_data_ora(visible_description.find("span","classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc classes_date__2lOoE classes_with-spacer__3WQbi"))
            if data is None:
                continue
            if pd.Timestamp.now() - data >= dt.timedelta(days=1):
                # First ad older than 24 hours: stop scanning altogether.
                reached_old_ads = True
                break

            link = visible_description.a["href"]
            luogo = visible_description.find("span", "classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc classes_town__W-0Iq").text.strip()
            provincia = visible_description.find("span", "classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc city").text.replace("(","").replace(")","")
            prize = visible_description.find("p", class_="classes_price__HmHqw classes_small__38Lur").text.split(" ")[0]
            prize = prize.split(u'\xa0')[0]
            # Drop every thousands separator, so prices with no dot or with
            # more than one dot are parsed correctly too.
            int_prize = int(prize.replace(".", ""))

            dettagli = details_grid(link)
            if all(value is None for value in dettagli):
                continue
            marca, modello, versione, km, immatricolazione, carburante, euro = dettagli
            records.append(dict(Data_upload=data, Marca=marca, Modello=modello, Versione=versione,
                                Luogo=luogo, Provincia=provincia, Immatricolazione=immatricolazione,
                                Km=km, Carburante=carburante, Classe_emisionni=euro,
                                Prezzo=int_prize, Link=link))
        if reached_old_ads:
            break

    # Passing ``columns`` keeps the column set and order stable even when no
    # record was collected.
    df = pd.DataFrame(records, columns=columns)
    return df.sort_values("Data_upload", ascending=False)

In [6]:
# Run the last-24-hours scrape and preview the first rows of the result.
df2 = scraping_web_prices_day_by_day()
df2.head()

Unnamed: 0,Data_upload,Marca,Modello,Versione,Luogo,Provincia,Immatricolazione,Km,Carburante,Classe_emisionni,Prezzo,Link
0,2021-03-30 12:06:00,CITROEN,c4 spacetourer,c4 spacetourer bluehdi 150 s&s eat6 shine,Desio,MB,2017-06-01,71000,diesel,6.0,6900,https://www.subito.it/auto/citroen-c4-gran-pic...
1,2021-03-30 11:47:00,SMART,fortwo 3ªs.(c/a453),fortwo 90 0.9 turbo superpassion,Treviglio,BG,2018-04-01,30500,,,10400,https://www.subito.it/auto/smart-fortwo-3s-c-a...
2,2021-03-30 11:40:00,PEUGEOT,308 2ª serie,308 1.6 e-hdi 115 cv stop&start business,Calusco d'Adda,BG,2015-07-01,86215,,,8500,https://www.subito.it/auto/peugeot-308-2-serie...
3,2021-03-30 11:36:00,FIAT,panda 3ª serie,panda 1.2 lounge,Lambrugo,CO,2017-04-01,37924,benzina,6.0,7900,https://www.subito.it/auto/fiat-panda-1-2-loun...
4,2021-03-30 11:36:00,LANCIA,ypsilon,,Brescia,BS,2013-05-01,69900,benzina,,6700,https://www.subito.it/auto/lancia-ypsilon-1-2-...


# Scraping all the webpages

In [None]:
def scraping_web_prices(max_pages=119, timeout=30):
    """Scrape the ads found on the result pages 1..``max_pages``.

    Cards without an upload date are skipped before their detail page is
    downloaded: rows with a missing ``"Data_upload"`` are excluded from the
    result anyway (they come from the "Vetrina" and are duplicates). Ads for
    which ``details_grid`` returns only ``None`` values are discarded too.

    Parameters
    ----------
    max_pages : int, optional
        Number of result pages to visit.
    timeout : float, optional
        Seconds to wait for each result page before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per ad, sorted by ``"Data_upload"`` (most recent first) with a
        fresh 0..n-1 index. The expected columns are present even when no ad
        was collected.
    """
    columns = ["Data_upload", "Marca", "Modello", "Versione", "Luogo", "Provincia",
               "Immatricolazione", "Km", "Carburante", "Classe_emisionni", "Prezzo", "Link"]
    link1 = "https://www.subito.it/annunci-lombardia/vendita/auto/?o="
    link2 = "&ys=2013&ps=1000&pe=11000&me=20"

    records = []
    for page in range(1, max_pages + 1):
        link_page = link1 + str(page) + link2
        source = requests.get(link_page, timeout=timeout).text
        soup = BeautifulSoup(source, "lxml")
        list_of_car = soup.find_all("div", class_="items__item AdItemBigCard_card__xZSdY")
        for visible_description in list_of_car:
            data = sistema_data_ora(visible_description.find("span","classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc classes_date__2lOoE classes_with-spacer__3WQbi"))
            if data is None:
                # Object from "Vetrina", so duplicate: not worth a detail request.
                continue

            link = visible_description.a["href"]
            luogo = visible_description.find("span", "classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc classes_town__W-0Iq").text.strip()
            provincia = visible_description.find("span", "classes_sbt-text-atom__2GBat classes_token-caption__1Ofu6 classes_size-small__3diir classes_weight-semibold__1RkLc city").text.replace("(","").replace(")","")
            prize = visible_description.find("p", class_="classes_price__HmHqw classes_small__38Lur").text.split(" ")[0]
            prize = prize.split(u'\xa0')[0]
            # Drop every thousands separator, so prices with no dot or with
            # more than one dot are parsed correctly too.
            int_prize = int(prize.replace(".", ""))

            dettagli = details_grid(link)
            if all(value is None for value in dettagli):
                continue
            marca, modello, versione, km, immatricolazione, carburante, euro = dettagli
            records.append(dict(Data_upload=data, Marca=marca, Modello=modello, Versione=versione,
                                Luogo=luogo, Provincia=provincia, Immatricolazione=immatricolazione,
                                Km=km, Carburante=carburante, Classe_emisionni=euro,
                                Prezzo=int_prize, Link=link))

    # Passing ``columns`` keeps the column set and order stable even when no
    # record was collected.
    df = pd.DataFrame(records, columns=columns)
    df = df.sort_values("Data_upload", ascending=False)
    return df.reset_index(drop=True)