# Hotels data collector

In [1]:
import pandas as pd
import numpy as np
import scrapy
import os
import logging
from scrapy.crawler import CrawlerProcess
import time
import json
import boto3

# Use it if you load your AWS access keys from a .env file :
from dotenv import load_dotenv
load_dotenv() # set the environment variables from .env file

True

## Goal and perimeter

_This notebook uses the data stored in the file 'weather_data.csv', which you can obtain by running the Weather data collector notebook._

**Starting from the weather data previously collected, we will explore the best hotels in the area of the top-5 cities, combine the data with weather data in one file and store it in Amazon S3 and a database in Amazon RDS, for later use in travel recommendations.**

In [2]:
# Load the weather ranking saved as a CSV file (see the note above about the
# Weather data collector notebook) and display it.
weather_csv_path = 'data/weather_data.csv'
weather_data = pd.read_csv(weather_csv_path)
weather_data

Unnamed: 0,nominatim_place_id,city,lat,lon,feels_like.day,pop,temp_delta,weather_score,ranking
0,297981358,Bayeux,49.276462,-0.702474,25,0,0,100.0,1
1,298137491,Le Havre,49.493898,0.107973,25,0,0,100.0,2
2,297756747,St Malo,48.649518,-2.026041,26,0,1,92.4,3
3,120791766,Chateau du Haut Koenigsbourg,48.24949,7.344296,26,5,1,81.9,4
4,297653650,La Rochelle,46.159113,-1.152043,28,0,3,77.1,5
5,297472400,Lille,50.636565,3.063528,28,0,3,77.1,6
6,297417241,Paris,48.85889,2.320041,29,0,4,69.4,7
7,297534793,Amiens,49.894171,2.295695,29,0,4,69.4,8
8,298516909,Biarritz,43.471144,-1.552727,27,9,2,66.0,9
9,297668227,Besancon,47.238022,6.024362,29,2,4,65.3,10


In [3]:
# Add 'city_' before column names (except the identifiers) to ease distinction when
# this dataframe is merged with the hotels data later.
# The rename is driven by column names rather than positions, and columns that already
# carry the prefix are left untouched, so re-running this cell does not produce
# 'city_city_lat', 'city_city_lon', ...

city_identifier_columns = ['nominatim_place_id', 'city']  # kept as-is: join key and city name
weather_data = weather_data.rename(
    columns=lambda col_name: col_name
    if col_name in city_identifier_columns or col_name.startswith('city_')
    else 'city_' + col_name
)
weather_data.columns

Index(['nominatim_place_id', 'city', 'city_lat', 'city_lon',
       'city_feels_like.day', 'city_pop', 'city_temp_delta',
       'city_weather_score', 'city_ranking'],
      dtype='object')

In [4]:
place_identifier = 'nominatim_place_id' # name of the cities' unique identifier column; reused below as the request metadata key in the spider and as the join key when merging hotels with weather data

## 1. Data scraping from Booking.com

In [5]:
# Scope of the scrape
CITIES_NB = 5   # only the 5 best-ranked cities from the weather ranking are searched
HOTELS_NB = 20  # at most 20 hotels are kept from each city's search results

# Sort once by ranking and keep the first CITIES_NB rows; names and ids are then read
# from that same slice, so the two lists stay aligned position by position.
best_ranked_cities = weather_data.sort_values('city_ranking', ascending=True).head(CITIES_NB)
top_cities = best_ranked_cities['city'].to_list()
top_cities_ids = best_ranked_cities[place_identifier].to_list()

cities_listing = ', '.join(top_cities)
print(f"Collecting info from the {HOTELS_NB} first recommended hotels on booking.com in the following cities : {cities_listing}...")
print()

class HotelsInfoSpider(scrapy.Spider):
    """Spider collecting hotel details from booking.com search results.

    One search-results page is requested per city in ``top_cities``. The city's
    identifier (``top_cities_ids``) travels in the request metadata under the
    ``place_identifier`` key so that every scraped hotel can be linked back to
    its city.
    """

    # Name of the spider
    name = "Hotels_Scraping"

    # One search-results URL per city
    start_urls = [f'https://www.booking.com/searchresults.fr.html?ss={city}' for city in top_cities]

    def start_requests(self):
        """Yield one request per start URL, tagged with the matching city id.

        Overrides the default ``start_requests`` so the city id can be attached
        as request metadata.
        """
        for city_id, search_url in zip(top_cities_ids, self.start_urls):
            yield scrapy.Request(search_url, meta={place_identifier: city_id})

    def parse(self, response):
        """Follow the title links of the first ``HOTELS_NB`` search results."""
        title_links = response.css('h3 a[data-testid="title-link"]')[:HOTELS_NB]
        for title_link in title_links:
            hotel_href = title_link.attrib["href"]
            city_id = response.meta[place_identifier]  # forwarded so the hotel item keeps its city
            yield response.follow(
                hotel_href,
                callback=self.parse_hotel,
                meta={place_identifier: city_id},
            )

    def parse_hotel(self, response):
        """Yield one item (a dict) describing the hotel page in ``response``."""
        hotel_item = {
            place_identifier: response.meta[place_identifier],
            # the title is split into several text nodes (line breaks act as separators),
            # so all of them are glued back together
            'hotel_name': ''.join(response.css('div.hp__hotel-title h2::text').getall()),
            'booking_url': response.url,
            # indexing the attribute raises KeyError when it is absent from the page
            'lat_lon coordinates': response.css('a#hotel_sidebar_static_map').attrib['data-atlas-latlng'],
            'booking_reviews_score': response.css('div[data-testid="review-score-component"] div:first-child::text').get(),
            # every paragraph text found under the description block, as a list of strings
            'description': response.css('#property_description_content p::text').getall(),
        }
        yield hotel_item

# Path of the file where the results will be saved
dir_name = 'data'
file_name = "top_hotels.json"
file_path = f"{dir_name}/{file_name}"

# Make sure the output directory exists (listing a missing directory would raise),
# then delete any previous results file before crawling, because Scrapy will
# concatenate the last and new results otherwise.
os.makedirs(dir_name, exist_ok=True)
if os.path.exists(file_path):
    os.remove(file_path)

# Declare CrawlerProcess settings
## USER_AGENT => Simulates a browser on an OS
## LOG_LEVEL => Minimal Level of Log
## FEEDS => Where the file will be stored
## AUTOTHROTTLE_ENABLED => Adapt scraping speed to avoid getting banned
## More info on built-in settings => https://docs.scrapy.org/en/latest/topics/settings.html?highlight=settings#settings
process = CrawlerProcess(settings = {
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36', # copy of my web browser user agent
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {
        file_path: {"format": "json"},
    },
    "AUTOTHROTTLE_ENABLED": True
})

# Start the crawling using the spider defined above.
# NOTE: process.start() runs the Twisted reactor, which cannot be restarted in the same
# Python process; restart the kernel before running this cell a second time.
process.crawl(HotelsInfoSpider)
process.start()
print()
print("Done !")

2022-08-06 13:22:55 [scrapy.utils.log] INFO: Scrapy 2.6.2 started (bot: scrapybot)
2022-08-06 13:22:55 [scrapy.utils.log] INFO: Versions: lxml 4.8.0.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 22.4.0, Python 3.7.13 (default, Mar 29 2022, 02:18:16) - [GCC 7.5.0], pyOpenSSL 21.0.0 (OpenSSL 1.1.1q  5 Jul 2022), cryptography 3.4.8, Platform Linux-5.10.102.1-microsoft-standard-WSL2-x86_64-with-debian-bullseye-sid
2022-08-06 13:22:55 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'LOG_LEVEL': 20,
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
2022-08-06 13:22:55 [scrapy.extensions.telnet] INFO: Telnet Password: 917c0ad292de22f0
2022-08-06 13:22:55 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extension

Collecting info from the 20 first recommended hotels on booking.com in the following cities : Bayeux, Le Havre, St Malo, Chateau du Haut Koenigsbourg, La Rochelle...



2022-08-06 13:22:55 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2022-08-06 13:22:55 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.ref


Done !


In [7]:
# Load the raw scraping results, then show the rows and the inferred column types.
with open("data/top_hotels.json", 'r') as top_hotels_file:
    top_hotels_json = json.load(top_hotels_file)

scraped_hotels_preview = pd.DataFrame(top_hotels_json)
display(scraped_hotels_preview, scraped_hotels_preview.dtypes)

Unnamed: 0,nominatim_place_id,hotel_name,booking_url,lat_lon coordinates,booking_reviews_score,description
0,297981358,\nDomaine de Bayeux\n,https://www.booking.com/hotel/fr/domaine-de-ba...,"49.27232560,-0.69851010",92,[Le Domaine de Bayeux occupe une maison du XVI...
1,297981358,\nGites les Pourquoi Pas - Résidence de Touris...,https://www.booking.com/hotel/fr/gites-les-pou...,"49.28152260,-0.70813440",93,[Bénéficiant d'une vue sur la ville et d'une c...
2,297981358,\nLe Petit Matin\n,https://www.booking.com/hotel/fr/le-petit-mati...,"49.27606133,-0.70905536",94,[Cet établissement est à 1 minute à pied de la...
3,297981358,\nClos de Bellefontaine B&B\n,https://www.booking.com/hotel/fr/chambres-d-ha...,"49.27267164,-0.69486111",97,[Le Clos de Bellefontaine occupe une maison du...
4,297981358,\nManoir Sainte Victoire\n,https://www.booking.com/hotel/fr/manoir-sainte...,"49.27649171,-0.70575960",96,[Doté d'un jardin et d'une connexion Wi-Fi gra...
...,...,...,...,...,...,...
95,297981358,\nChambre d'hôtes Logis de Saint Jean\n,https://www.booking.com/hotel/fr/logis-de-sain...,"49.27600709,-0.69828361",91,"[Installé dans le centre historique de Bayeux,..."
96,297981358,\nHôtel Le Saint Patrice\n,https://www.booking.com/hotel/fr/saint-patrice...,"49.27981023,-0.71067492",83,[L’Hôtel Le Saint Patrice vous accueille à Bay...
97,297981358,\nImmolidays\n,https://www.booking.com/hotel/fr/immolidays.fr...,"49.27905535,-0.70856206",83,[Vous pouvez bénéficier d'une réduction Genius...
98,297981358,\nAu Loup Historic Apartments\n,https://www.booking.com/hotel/fr/au-loup-histo...,"49.27279800,-0.70755600",85,"[Installé à Bayeux, à 550 mètres de la cathédr..."


nominatim_place_id        int64
hotel_name               object
booking_url              object
lat_lon coordinates      object
booking_reviews_score    object
description              object
dtype: object

## 2. Data cleaning and processing

In [8]:
# Build the cleaned hotels dataframe from the raw scraped records.
scraped_hotels = pd.DataFrame(top_hotels_json)

# 'lat_lon coordinates' holds "<latitude>,<longitude>" in a single string: split it once
coordinate_parts = scraped_hotels['lat_lon coordinates'].str.split(',')

top_hotels_df = (
    scraped_hotels
    .assign(
        # remove the surrounding whitespace / line breaks around the name
        hotel_name=scraped_hotels['hotel_name'].str.strip(),
        # scores use a decimal comma; turn it into a dot before converting to float
        booking_reviews_score=scraped_hotels['booking_reviews_score'].str.replace(',','.').astype(float),
        # the description is a list of paragraphs; join them into one text
        description=scraped_hotels['description'].str.join('\n'),
        latitude=coordinate_parts.map(lambda parts: parts[0]).astype(float),
        longitude=coordinate_parts.map(lambda parts: parts[1]).astype(float),
    )
    .drop(columns=['lat_lon coordinates'])
)

top_hotels_df

Unnamed: 0,nominatim_place_id,hotel_name,booking_url,booking_reviews_score,description,latitude,longitude
0,297981358,Domaine de Bayeux,https://www.booking.com/hotel/fr/domaine-de-ba...,9.2,Le Domaine de Bayeux occupe une maison du XVII...,49.272326,-0.698510
1,297981358,Gites les Pourquoi Pas - Résidence de Tourisme...,https://www.booking.com/hotel/fr/gites-les-pou...,9.3,Bénéficiant d'une vue sur la ville et d'une co...,49.281523,-0.708134
2,297981358,Le Petit Matin,https://www.booking.com/hotel/fr/le-petit-mati...,9.4,Cet établissement est à 1 minute à pied de la ...,49.276061,-0.709055
3,297981358,Clos de Bellefontaine B&B,https://www.booking.com/hotel/fr/chambres-d-ha...,9.7,Le Clos de Bellefontaine occupe une maison du ...,49.272672,-0.694861
4,297981358,Manoir Sainte Victoire,https://www.booking.com/hotel/fr/manoir-sainte...,9.6,Doté d'un jardin et d'une connexion Wi-Fi grat...,49.276492,-0.705760
...,...,...,...,...,...,...,...
95,297981358,Chambre d'hôtes Logis de Saint Jean,https://www.booking.com/hotel/fr/logis-de-sain...,9.1,"Installé dans le centre historique de Bayeux, ...",49.276007,-0.698284
96,297981358,Hôtel Le Saint Patrice,https://www.booking.com/hotel/fr/saint-patrice...,8.3,L’Hôtel Le Saint Patrice vous accueille à Baye...,49.279810,-0.710675
97,297981358,Immolidays,https://www.booking.com/hotel/fr/immolidays.fr...,8.3,Vous pouvez bénéficier d'une réduction Genius ...,49.279055,-0.708562
98,297981358,Au Loup Historic Apartments,https://www.booking.com/hotel/fr/au-loup-histo...,8.5,"Installé à Bayeux, à 550 mètres de la cathédra...",49.272798,-0.707556


In [9]:
# Add 'hotel_' before column names (except the nominatim_place_id and hotel_name) to ease
# distinction when merging with the weather dataframe.
# The rename is driven by column names rather than positions, and columns that already
# start with 'hotel_' are left untouched, so re-running this cell does not produce
# 'hotel_hotel_booking_url', ...

top_hotels_df = top_hotels_df.rename(
    columns=lambda col_name: col_name
    if col_name == 'nominatim_place_id' or col_name.startswith('hotel_')
    else 'hotel_' + col_name
)
top_hotels_df.columns

Index(['nominatim_place_id', 'hotel_name', 'hotel_booking_url',
       'hotel_booking_reviews_score', 'hotel_description', 'hotel_latitude',
       'hotel_longitude'],
      dtype='object')

In [10]:
# Print the booking URL and full description of 5 hotels drawn at random.
print("------------------------------ Showing description of randomly picked hotels : --------------------------------------\n")
random_hotels = top_hotels_df.sample(5)
for booking_url, description in zip(random_hotels['hotel_booking_url'], random_hotels['hotel_description']):
    print(booking_url, "\n", description)
    print("------------------------------------------------------------")
    print()

------------------------------ Showing description of randomly picked hotels : --------------------------------------

https://www.booking.com/hotel/fr/alpha-ocean.fr.html?aid=304142&ucfs=1&arphpl=1&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=19&hapos=19&sr_order=popularity&srpvid=bcd1500b7d2c0208&srepoch=1659784983&from_beach_sr=1&beach_sr_walking_distance=39&beach_rating_score=8.8&from=searchresults 
 Cet établissement est à 1 minute à pied de la plage. L'Alpha Ocean est situé à 30 mètres de la plage de Saint-Malo, où vous pourrez pratiquer des activités nautiques. Cet hôtel à la gestion familiale propose des chambres dotées d'une connexion Wi-Fi gratuite et d'une salle de bains privative. La plupart donnent sur la mer.
Durant votre séjour à l'Alpha Ocean Hotel, vous pourrez vous détendre avec un verre dans le bar ou sur la terrasse qui offre une vue sur la mer. Le dimanche, vous pourrez profiter d'un apéritif en dégustant les fameuses huîtres de Canca

In [11]:
# Combine the city geoloc/weather data with the hotels data into a single dataframe.
# A left join on the city identifier keeps the weather rows of the cities for which
# no hotels were searched (their hotel columns stay empty).

top_destinations_df = pd.merge(
    weather_data,
    top_hotels_df,
    on=place_identifier,
    how='left',
)
top_destinations_df

Unnamed: 0,nominatim_place_id,city,city_lat,city_lon,city_feels_like.day,city_pop,city_temp_delta,city_weather_score,city_ranking,hotel_name,hotel_booking_url,hotel_booking_reviews_score,hotel_description,hotel_latitude,hotel_longitude
0,297981358,Bayeux,49.276462,-0.702474,25,0,0,100.0,1,Domaine de Bayeux,https://www.booking.com/hotel/fr/domaine-de-ba...,9.2,Le Domaine de Bayeux occupe une maison du XVII...,49.272326,-0.698510
1,297981358,Bayeux,49.276462,-0.702474,25,0,0,100.0,1,Gites les Pourquoi Pas - Résidence de Tourisme...,https://www.booking.com/hotel/fr/gites-les-pou...,9.3,Bénéficiant d'une vue sur la ville et d'une co...,49.281523,-0.708134
2,297981358,Bayeux,49.276462,-0.702474,25,0,0,100.0,1,Le Petit Matin,https://www.booking.com/hotel/fr/le-petit-mati...,9.4,Cet établissement est à 1 minute à pied de la ...,49.276061,-0.709055
3,297981358,Bayeux,49.276462,-0.702474,25,0,0,100.0,1,Clos de Bellefontaine B&B,https://www.booking.com/hotel/fr/chambres-d-ha...,9.7,Le Clos de Bellefontaine occupe une maison du ...,49.272672,-0.694861
4,297981358,Bayeux,49.276462,-0.702474,25,0,0,100.0,1,Manoir Sainte Victoire,https://www.booking.com/hotel/fr/manoir-sainte...,9.6,Doté d'un jardin et d'une connexion Wi-Fi grat...,49.276492,-0.705760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,298221742,Collioure,42.525050,3.083155,32,12,7,21.5,31,,,,,,
126,297389050,Ariege,42.945537,1.406554,27,33,2,16.0,32,,,,,,
127,297639071,Grenoble,45.187560,5.735782,32,18,7,9.0,33,,,,,,
128,298222566,Toulouse,43.604462,1.444247,34,14,9,2.1,34,,,,,,


In [12]:
# Write the merged data to a csv file in the local 'data' directory (kept in case a
# local copy is needed); the dataframe index is not written.
dir_name, file_name = 'data', 'top_destinations.csv'
file_path = f"{dir_name}/{file_name}"
top_destinations_df.to_csv(file_path, index=False)

## 3. Storing data in AWS S3

In [13]:
# Get AWS access keys loaded in the environment variables
# (os.getenv returns None when a variable is not set)

AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY') # <- replace with your own AWS access key
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY') # <- replace with your own AWS secret access key

# Create the bucket and store the data in it

REGION = 'eu-west-3' # <- replace with the AWS region you prefer to use
# The session is bound to REGION so that the client talks to the same region as the one
# requested for the bucket below.
session = boto3.Session(aws_access_key_id=AWS_ACCESS_KEY,
                        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                        region_name=REGION)
s3 = session.client("s3")
bucket_name = 'project-etl-scraping'

# 'us-east-1' is the default bucket location and must not be sent as a LocationConstraint;
# any other region has to be given explicitly.
create_bucket_kwargs = {'Bucket': bucket_name}
if REGION != 'us-east-1':
    create_bucket_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': REGION}

try: # Create the bucket if not already existing
    print(f"Creating S3 bucket '{bucket_name}'...")
    s3.create_bucket(**create_bucket_kwargs)
except (s3.exceptions.BucketAlreadyExists, s3.exceptions.BucketAlreadyOwnedByYou):
    # NOTE(review): BucketAlreadyExists can also mean the name is taken by another AWS
    # account, in which case the upload below will fail - confirm the bucket is yours.
    print("Bucket already existing.")

print("Storing the file in bucket...")
csv_body = top_destinations_df.to_csv(index = False).encode('utf-8') # explicit bytes payload
s3.put_object(Bucket = bucket_name, Body = csv_body, Key = file_name) # Storing the file in the bucket
# Granting public read access to the file: anyone with the URL can download it.
# NOTE(review): this call is rejected when ACLs are disabled on the bucket - confirm the
# bucket's Object Ownership / public access settings.
s3.put_object_acl(ACL="public-read", Bucket = bucket_name, Key = file_name)

print()
print(f"File successfully loaded to S3 in path '{bucket_name}/{file_name}'")

Creating S3 bucket 'project-etl-scraping'...
Bucket already existing.
Storing the file in bucket...

File successfully loaded to S3 in path 'project-etl-scraping/top_destinations.csv'
