#### First we import the packages

In [21]:
import bs4
import requests
import pandas as pd

#### Then we send one request per results page to download 39 pages of bicycle listings (listing offsets 51 to 1951, in steps of 50).

In [22]:
# Download the listing pages. The offsets run from 51 to 1951 in steps of 50,
# which makes 39 requests in total.
BASE_URL = "https://deportes.mercadolibre.com.co/bicicletas-ciclismo/bicicletas/_Desde_{}"
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request blocks forever

responses = []
# One Session reuses the underlying connection instead of reconnecting per page.
with requests.Session() as session:
    for offset in range(51, 1952, 50):
        # Status codes are inspected in a later cell, so no exception is raised
        # here for non-200 answers.
        response = session.get(BASE_URL.format(offset), timeout=REQUEST_TIMEOUT)
        responses.append(response)

In [23]:
# Number of responses collected (one per requested page).
len(responses)

39

#### Now it's important to make sure that every page was loaded successfully. A status code of "200" means the request succeeded.

In [24]:
# Show the HTTP status code of each downloaded page, in request order.
for page_response in responses:
    print(page_response.status_code)

200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200


#### Now we parse the HTML of each page into a structure that can be analyzed later.

In [25]:
# Parse the HTML text of every response with the "html.parser" backend,
# keeping one BeautifulSoup tree per page in the same order as `responses`.
soups = [bs4.BeautifulSoup(page.text, "html.parser") for page in responses]

In [26]:
# Number of parsed pages; one tree is built per response.
len(soups)

39

#### Now we begin extracting the title of every bicycle offered.

In [27]:
# Collect the text of every element matching "h2 a span.main-title" across all
# parsed pages into one flat list, page by page.
titles_tog = [
    title_tag.string
    for page_soup in soups
    for title_tag in page_soup.select("h2 a span.main-title")
]

In [28]:
# Total number of titles collected across all pages.
len(titles_tog)

1988

In [29]:
# Preview the titles at positions 1 through 9 (position 0 is not shown).
titles_tog[1:10]

[' Bicicleta Roadmaster Storm 29 Shimano + Luces + Maletin ',
 ' Bicicletas Gw Hyena 29 Shimano Integrados 21vel + Obsequio ',
 ' Bicicleta Gw Piranha 29 Grupo Shimano Altus 9 V Hidráulico ',
 ' Bicicleta Ruta Gw K2 Modelo 2020 Aluminio 14vel + Obsequio ',
 ' Bicicletas Gw Hyena Shimano Rin 29 Integradas 21 Vel ',
 ' Marco Ruta Carbono Cuadro Cipollini Bicicletas De Ruta ',
 ' Bicicleta Niños Rin 16 Canasta Pito Frenos ',
 ' Bicicleta Todoterreno Aluminio Shimano Susp. Hidraulica + Ob ',
 ' Bicicleta Mtb Todoterreno 18 Cambios Rin 26 Colores ']

#### Then we extract the prices

In [30]:
# Collect the text of every element matching "div span.price__fraction" across
# all parsed pages. A price whose text is a single space is stored as the
# placeholder string "NA".
prices_tog = [
    "NA" if price_tag.string == " " else price_tag.string
    for page_soup in soups
    for price_tag in page_soup.select("div span.price__fraction")
]

In [31]:
# Total number of prices collected; the DataFrame built from both lists needs
# this to equal the number of titles.
len(prices_tog)

1988

In [32]:
# Preview the prices at positions 1 through 9 (position 0 is not shown).
prices_tog[1:10]

['599.900',
 '659.900',
 '1.499.990',
 '999.900',
 '749.990',
 '2.600.000',
 '220.900',
 '539.900',
 '240.000']

#### We create a DataFrame with those lists.

In [33]:
# Build the table column-wise: row i pairs the i-th title with the i-th price.
listing_columns = {"Titles": titles_tog, "Prices": prices_tog}
data = pd.DataFrame(listing_columns)

In [34]:
# Preview rows at positions 1 through 9.
data.iloc[1:10]

Unnamed: 0,Prices,Titles
1,599.900,Bicicleta Roadmaster Storm 29 Shimano + Luces...
2,659.900,Bicicletas Gw Hyena 29 Shimano Integrados 21v...
3,1.499.990,Bicicleta Gw Piranha 29 Grupo Shimano Altus 9...
4,999.900,Bicicleta Ruta Gw K2 Modelo 2020 Aluminio 14v...
5,749.990,Bicicletas Gw Hyena Shimano Rin 29 Integradas...
6,2.600.000,Marco Ruta Carbono Cuadro Cipollini Bicicleta...
7,220.900,Bicicleta Niños Rin 16 Canasta Pito Frenos
8,539.900,Bicicleta Todoterreno Aluminio Shimano Susp. ...
9,240.000,Bicicleta Mtb Todoterreno 18 Cambios Rin 26 C...


#### From each title we extract the first word, which is usually the name of the item being offered.

In [35]:
# First whitespace-separated word of each title. Splitting on any whitespace
# (rather than on a literal " " and taking index 1) does not rely on every
# title starting with exactly one space. A title with no words yields NaN
# instead of raising IndexError.
data["Name"] = data["Titles"].str.split().str[0]

In [36]:
# Remainder of each title after its first word, re-joined with single spaces.
# Splitting on any whitespace does not depend on the title's leading space and
# leaves no trailing space in the result. A one-word title gives "".
data["N_titles"] = data["Titles"].str.split().str[1:].str.join(" ")

In [37]:
# Preview rows at positions 1 through 4 of the price, name and remaining-title columns.
data.iloc[1:5][["Prices", "Name", "N_titles"]]

Unnamed: 0,Prices,Name,N_titles
1,599.900,Bicicleta,Roadmaster Storm 29 Shimano + Luces + Maletin
2,659.900,Bicicletas,Gw Hyena 29 Shimano Integrados 21vel + Obsequio
3,1.499.990,Bicicleta,Gw Piranha 29 Grupo Shimano Altus 9 V Hidráulico
4,999.900,Bicicleta,Ruta Gw K2 Modelo 2020 Aluminio 14vel + Obsequio


#### Now we remove the "." thousands separators from the price to make later calculations easier. Note that the result is still a string and must be converted to a number before doing arithmetic.

In [38]:
# Remove every "." from the price text. The values remain strings, and the
# "NA" placeholder passes through unchanged.
data["N_prices"] = data["Prices"].map(lambda price_text: price_text.replace(".", ""))

In [39]:
# Preview the cleaned prices at positions 1 through 4.
data["N_prices"].iloc[1:5]

1     599900
2     659900
3    1499990
4     999900
Name: N_prices, dtype: object

In [40]:
# Preview rows at positions 1 through 4 of the cleaned price, name and remaining-title columns.
data.iloc[1:5][["N_prices", "Name", "N_titles"]]

Unnamed: 0,N_prices,Name,N_titles
1,599900,Bicicleta,Roadmaster Storm 29 Shimano + Luces + Maletin
2,659900,Bicicletas,Gw Hyena 29 Shimano Integrados 21vel + Obsequio
3,1499990,Bicicleta,Gw Piranha 29 Grupo Shimano Altus 9 V Hidráulico
4,999900,Bicicleta,Ruta Gw K2 Modelo 2020 Aluminio 14vel + Obsequio
