https://medium.com/saturdays-ai/web-scraping-para-machine-learning-71f7e673bcf3

In [1]:
import pandas as pd
import pandas_profiling  as pd_eda
from tqdm import tqdm_notebook as tqdm

import requests
from bs4 import BeautifulSoup

# 1. Seleccionar página web

- [mobile.de](https://www.mobile.de/es/): Europa, más ordenado
- Autoscout
- Coches.net
- [Milanuncios](https://www.milanuncios.com/coches-de-segunda-mano-en-murcia/?hasta=10000&demanda=n&anod=2010) (coches murcia)
  - Hasta 10000 euros
  - Desde el 2010

# 2. Extraer datos (web scraping)
### Hay 30 coches por página

Cada `<div class="aditem">...</div>` es un **anuncio de un coche**. Y tiene:
- **Título**: `<a class="aditem-detail-title">` (formato `MARCA - MODELO`)
- **Precio**: `<div class="aditem-price">`
- **Año**: `<div class="ano">`
- **Kms**: `<div class="kms">`
- **Cilindrada**: `<div class="cc">`
- **Vendedor**: `<div class="pillSellerTypePro">` o `<div class="pillSellerTypePriv">`
- **Cambio**: `cmanual` o `cauto`

### Otros campos interesantes
- **Fecha de publicación** -> Series temporales
- **Imágenes** -> Computer vision
- **Descripción** -> NLP

In [2]:
def getDiv(className, component="div", ad=None):
    """Return the text of the first matching element of an ad, or None.

    Args:
        className: CSS class to look for.
        component: Tag name to search for (defaults to ``"div"``).
        ad: Object exposing a BeautifulSoup-style ``find(name, class_=...)``
            method to search in. When omitted, the module-level ``anuncio``
            variable is used, so existing calls such as ``getDiv("kms")``
            keep working unchanged.

    Returns:
        The ``get_text()`` result of the matched element, or ``None`` when no
        element matches.
    """
    # Fall back to the global only when no ad is passed explicitly, so the
    # helper can also be called without relying on hidden notebook state.
    scope = anuncio if ad is None else ad
    # Look the element up once instead of once for the check and again for
    # the text.
    node = scope.find(component, class_=className)
    return None if node is None else node.get_text()

In [3]:
def getBinary(className1, className2, component="div", ad=None):
    """Return the text of whichever of two alternative elements is present.

    ``className1`` is tried first and ``className2`` only if the first is
    absent.

    Args:
        className1: CSS class tried first.
        className2: CSS class tried when ``className1`` is not found.
        component: Tag name to search for (defaults to ``"div"``).
        ad: Object exposing a BeautifulSoup-style ``find(name, class_=...)``
            method to search in. When omitted, the module-level ``anuncio``
            variable is used, so existing two-argument calls keep working.

    Returns:
        The ``get_text()`` result of the first element found, or ``None`` when
        neither class is present.
    """
    # Fall back to the global only when no ad is passed explicitly.
    scope = anuncio if ad is None else ad
    for css_class in (className1, className2):
        # One lookup per candidate class; the original searched twice on a hit.
        node = scope.find(component, class_=css_class)
        if node is not None:
            return node.get_text()
    return None

In [4]:
# Search-results URL (used cars in Murcia, `hasta=10000`, `anod=2010`); the page
# number is appended to the trailing "pagina=" parameter.
url = "https://www.milanuncios.com/coches-de-segunda-mano-en-murcia/?hasta=10000&demanda=n&anod=2010&pagina="
REQUEST_TIMEOUT_S = 30  # seconds to wait for each page before giving up
data = []

for pagina in tqdm(range(1,11)):
    # The timeout keeps the cell from hanging on a stalled connection, and
    # raise_for_status() surfaces HTTP errors instead of silently parsing an
    # error page that contains no ads.
    page = requests.get(url+str(pagina), timeout=REQUEST_TIMEOUT_S)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    anuncios = soup.find_all('div', class_='aditem')

    # NOTE: the loop variable must stay named `anuncio`; getDiv/getBinary read
    # it as a global when called without an explicit ad.
    for anuncio in anuncios:
        titulo = getDiv('aditem-detail-title', component="a")
        # The title is split on " - " into brand and model. An ad without a
        # title element, or without the separator, yields None for the missing
        # part instead of aborting the whole scrape.
        partes = titulo.split(" - ") if titulo is not None else []
        marca  = partes[0] if partes else None
        modelo = partes[1] if len(partes) > 1 else None
        precio = getDiv('aditem-price')
        ano    = getDiv('ano')
        kms    = getDiv('kms')
        cc     = getDiv('cc')
        puertas = getDiv('ejes')
        cambio = getBinary("cmanual", "cauto")
        vendedor    = getBinary("pillSellerTypePro", "pillSellerTypePriv")
        combustible = getBinary("die", "gas")
        data.append([marca, modelo, precio, ano, kms, cambio, cc, puertas, vendedor, combustible])

# One row per ad; the column order matches the order of the appended lists.
df = pd.DataFrame(data=data, columns=("Marca",
                                      "Modelo",
                                      "Precio",
                                      "Año",
                                      "Kms",
                                      "Cambio",
                                      "Cc",
                                      "Puertas",
                                      "Vendedor",
                                      "Combust"))
# Display every column except "Modelo".
df[["Marca", "Precio", "Año", "Kms", "Cambio", "Cc", "Puertas", "Vendedor", "Combust"]]

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




Unnamed: 0,Marca,Precio,Año,Kms,Cambio,Cc,Puertas,Vendedor,Combust
0,FORD,9.800€,año 2016,74.913 kms,manual,120 cv,5 puertas,Profesional,diesel
1,OPEL,9.899€,año 2015,87.531 kms,manual,140 cv,5 puertas,Profesional,diesel
2,CITROEN,8.499€,año 2011,149.952 kms,manual,136 cv,5 puertas,Profesional,diesel
3,CITROEN,5.999€,año 2011,139.987 kms,manual,95 cv,5 puertas,Profesional,gasolina
4,PIAGGIO,9.899€,año 2013,45.050 kms,manual,64 cv,2 puertas,Profesional,diesel
...,...,...,...,...,...,...,...,...,...
295,TOYOTA,6.400€,año 2013,100.000 kms,manual,69 cv,5 puertas,Profesional,gasolina
296,PEUGEOT,8.499€,año 2015,95.000 kms,manual,75 cv,5 puertas,Profesional,diesel
297,PEUGEOT,6.405€,año 2017,100.000 kms,manual,,5 puertas,Profesional,gasolina
298,PEUGEOT,5.999€,año 2015,196.000 kms,manual,92 cv,5 puertas,Profesional,diesel


# EDA

In [7]:
# Render an automated exploratory report for the scraped DataFrame.
# NOTE(review): `profile_report` presumably comes from the `pandas_profiling`
# import at the top of the notebook — confirm.
df.profile_report()



In [2]:
# NOTE(review): `source` is not defined in any cell visible here, and this
# cell's execution count (In [2]) is out of order. It looks like a leftover
# that would raise NameError on Restart & Run All — TODO confirm, then define
# it explicitly or remove the cell.
source

Unnamed: 0,Name,Miles_per_Gallon,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,1970-01-01,USA
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,1970-01-01,USA
2,plymouth satellite,18.0,8,318.0,150.0,3436,11.0,1970-01-01,USA
3,amc rebel sst,16.0,8,304.0,150.0,3433,12.0,1970-01-01,USA
4,ford torino,17.0,8,302.0,140.0,3449,10.5,1970-01-01,USA
...,...,...,...,...,...,...,...,...,...
401,ford mustang gl,27.0,4,140.0,86.0,2790,15.6,1982-01-01,USA
402,vw pickup,44.0,4,97.0,52.0,2130,24.6,1982-01-01,Europe
403,dodge rampage,32.0,4,135.0,84.0,2295,11.6,1982-01-01,USA
404,ford ranger,28.0,4,120.0,79.0,2625,18.6,1982-01-01,USA


# Clean the data

# One-hot encoding marca + modelo

# Train Model with target: price

# Predict price for every car

# Compute how good the price is (percentage)