In [304]:
from bs4 import BeautifulSoup
from threading import Thread
from time import perf_counter, sleep
import math
import pandas as pd
import queue
import requests

url = 'https://www.conocerbarcelona.com/actividades'

# Scraping Project

The main objective of the following code is to (i) generate a database from web scraping with all the tours available in Barcelona. The source of information is "https://www.conocerbarcelona.com/actividades" and (ii) generate concurrent HTTP requests while scraping using queue and threading, to measure time savings in programming.

In [305]:
get_next(soup)

'https://www.conocerbarcelona.com/actividades/2/'

In [308]:
#Choosing the information to be extracted

## 1. Returns the title of each tour in Barcelona
def get_title(soup):    
    return [s.get_text().replace("\n"," ").replace("\ue9ce","").replace("\xa0","").replace("   "," ") 
            for s in soup.select("h3.title--activity-card")]

## 2. Returns the description of each tour
def get_description(soup):
    """ Returns the description of the tour"""
    return [s.get_text().replace("\n"," ").replace("\ue9ce","").replace("\xa0","").replace("   "," ") 
            for s in soup.select("div.m-card__text")]

## 3. Returns the price of each tour
def get_price(soup):    
    """ Returns the price of the tour"""
    return [s.get_text().replace("\n"," ").replace("\ue9ce","").replace("\xa0","").replace("   "," ") 
            for s in soup.select("div.o-container-group")]

## 4. Returns the url of each page
def get_next(soup):
    try:
        return ("{}{}".format ('https://www.conocerbarcelona.com',(soup.select('a.next-element')[0]['href'])))
    except IndexError:
        return None
    
## 5. Using beautifulsoup 
def get_soup(url):
    res = requests.get(url)
    return BeautifulSoup(res.text,'lxml')

## 6. TEST: Extract all the information from all pages without using the library(queue) to estimate time 
%time
def agregated(url):
    titles = []
    description = []
    price= []
    while url:
        soup = get_soup(url)
        titles += get_title(soup)
        description += get_description(soup)
        price += get_price(soup)
        url = get_next(soup)
    return pd.concat([(pd.DataFrame(titles)),(pd.DataFrame(description)),(pd.DataFrame(price))], ignore_index=True, axis=1)


CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 6.91 µs


In [316]:
 ## 7. Estimated time
%time
Final=agregated(url)
Final.columns=["Tour", "Description", "Price"]
Final

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.11 µs


Unnamed: 0,Tour,Description,Price
0,Entrada a la Sagrada Familia sin colas,Visitar el símbolo de Barcelona es una expe...,18 €
1,Free tour por Barcelona ¡Gratis!,Recorre el centro histórico de la Ciudad Cond...,¡Gratis!
2,Tour de Barcelona al completo,¿Quieres conocer lo mejor de Barcelona en u...,51 €
3,Tour de los misterios y leyendas de Barce...,Descubre con esta ruta nocturna por Barcelo...,12 €
4,Sagrada Familia con subida a las torres,En esta visita no solo nos adentraremos en ...,por coche 39 €
5,Free tour de Gaudí y la Barcelona moderni...,Adéntrate en el corazón modernista de Barcelo...,42 €
6,Tarjeta Hola BCN!,La tarjeta Hola BCN! permite el uso ilimita...,¡Gratis!
7,Visita guiada por el Parque Güell,El Parque Güell es una de las obras más rep...,15 €
8,"Excursión a Girona, Figueres y Museo Dalí",En este tour conoceremos dos de las ciudade...,24 €
9,Excursión a Montserrat con tren cremaller...,"Con 1.236 metros de altura, Montserrat es u...",78 €


In [317]:
ScrapingProjectDB = Final.to_csv('TourInformationBarcelona.csv', index=False)

In [318]:
START_URL = 'https://www.conocerbarcelona.com/actividades'
columns = ["Tour" ,"Description", "Price"]
result = pd.DataFrame(columns=columns)
my_queue = queue.Queue()

def make_urls2():
    url_list2 = []
    url_list2.append(START_URL)
    for page in range(2, 6):
        url_list2.append(START_URL + "/" + str(page) +"/")
    return url_list2
    
## Defining queues and threads

def worker(url, queue):
    
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    titles = []
    description = []
    prices = []
        # Find all tour titles
    for h3 in soup.select("h3.title--activity-card"):
        titles.append(h3.text.replace("\n"," ").replace("\ue9ce","").replace("\xa0","").replace("   "," "))     
        # Find all descriptions
    for div in soup.select("div.m-card__text"):
        description.append(div.text.replace("\n"," ").replace("\ue9ce","").replace("\xa0","").replace("   "," "))
        # Find all tour prices       
    for div in soup.select("div.o-container-group"):
        prices.append(div.text.replace("\n"," ").replace("\ue9ce","").replace("\xa0","").replace("   "," "))
    
    queue.put(list(zip(titles, description, prices)))
    
urls = make_urls2()
all_threads = []
for url in urls:
    t = Thread(target=worker, args=(url, my_queue))
    t.start()
    all_threads.append(t)
    
%time
result_count = 0
while result_count < len(urls):
    data = my_queue.get()
    result = result.append(pd.DataFrame(data, columns=columns))
    result_count += 1

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


In [299]:
 ## The CPU time decreases considerably when using queues and threads!

In [302]:
FINAL = pd.DataFrame(result)
FINAL

Unnamed: 0,Tour,Description,Price
0,Paseo en catamarán por el Port Vell,En este tour recorreremos la zona vieja de ...,"7,5 €"
1,Cena y espectáculo en el Tablao Cordobés,Contágiate de los ritmos flamencos en el ta...,44 €
2,Tour en 4x4 por los viñedos del Penedés,Recorre los famosos viñedos del Penedés en ...,74 €
3,Visita guiada por Sant Pau,"El recinto modernista Sant Pau, declarado P...",27 €
4,Tour fotográfico por Barcelona,Disfruta de Barcelona de una forma diferent...,45 €
5,Tour de compras a La Roca Village,Vive un día de shopping de lujo en La Roca Vi...,20 €
6,Visita guiada por la Fábrica de Estrella ...,Conoce de primera mano la Fábrica de Estrel...,8 €
7,Montserrat por libre en autobús,Esta excursión a Montserrat en autobús es i...,25 €
8,Entrada a la Casa Vicens,"Diseñada por Antonio Gaudí, la Casa Vicens ...",16 €
9,Entrada al Zoo de Barcelona,"Acércate al Zoológico de Barcelona, una peq...","21,4 €"
