# Analysis of Airbnb data on multiple locations spread across Spain

---


#### Setup


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys, os
import logging
import pandas as pd
import seaborn as sns
import json
import numpy as np

from copy import deepcopy
import datetime
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from pathlib import Path
from typing import Iterable
from IPython import display as ICD
from matplotlib import pyplot as plt

In [3]:
src_path: str = "../src"
sys.path.append(src_path)
logging.getLogger().setLevel(logging.WARN)

In [4]:
from data_wrangling import *

In [5]:
random_seed: int = 8080

---


In [6]:
pd.read_csv("../data/airbnb/euskadi/calendar.csv.bz2", index_col=0)

Unnamed: 0_level_0,date,available,price,adjusted_price,minimum_nights,maximum_nights
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
132068,2022-09-27,f,$220.00,$220.00,1.0,63.0
132068,2022-09-28,f,$190.00,$190.00,1.0,63.0
132068,2022-09-29,f,$220.00,$220.00,1.0,63.0
132068,2022-09-30,f,$220.00,$220.00,1.0,63.0
132068,2022-10-01,f,$170.00,$170.00,2.0,63.0
...,...,...,...,...,...,...
41535995,2023-09-21,f,$62.00,$62.00,2.0,1125.0
41535995,2023-09-22,f,$62.00,$62.00,2.0,1125.0
41535995,2023-09-23,f,$62.00,$62.00,2.0,1125.0
41535995,2023-09-24,f,$62.00,$62.00,2.0,1125.0


In [7]:
pd.read_csv("../data/airbnb/euskadi/listings.csv.bz2", index_col=0)

Unnamed: 0_level_0,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
132068,https://www.airbnb.com/rooms/132068,20220926182502,2022-09-27,city scrape,Great attic+terrace+parking. Beach views. ESS0...,"Amazing modern, clean & fully equiped flat of...",Gros es el barrio de moda de Donosti. El apart...,https://a0.muscache.com/pictures/22b9f457-1f65...,648938.0,https://www.airbnb.com/users/show/648938,...,4.96,4.94,4.75,,t,1.0,1.0,0.0,0.0,4.54
142760,https://www.airbnb.com/rooms/142760,20220926182502,2022-09-26,city scrape,Piso en la playa cerca de Gaztelugatxe,Beach front apartment close to San Juan de Gaz...,,https://a0.muscache.com/pictures/71900ac6-da9b...,693119.0,https://www.airbnb.com/users/show/693119,...,4.57,4.65,4.74,,f,1.0,1.0,0.0,0.0,0.35
225692,https://www.airbnb.com/rooms/225692,20220926182502,2022-09-27,city scrape,"Parte Vieja, San Sebastian",<b>The space</b><br />Apartment 80 meters in t...,,https://a0.muscache.com/pictures/31866298/de9d...,1176053.0,https://www.airbnb.com/users/show/1176053,...,4.92,4.94,4.72,ESS00994,f,1.0,1.0,0.0,0.0,1.54
309802,https://www.airbnb.com/rooms/309802,20220926182502,2022-09-27,city scrape,Luxury w/terrace near the beach WIF,The apartment is located in the mezzanine of a...,In this neighborhood with a strong personality...,https://a0.muscache.com/pictures/50730314/9cfb...,1589633.0,https://www.airbnb.com/users/show/1589633,...,4.43,5.00,4.33,,f,8.0,7.0,1.0,0.0,0.07
309813,https://www.airbnb.com/rooms/309813,20220926182502,2022-09-27,city scrape,Near the beach with terrace WIFI,Elegant three bedrooms apartment located in a ...,The apartment is located in the area known as ...,https://a0.muscache.com/pictures/47628822/3b0e...,1589633.0,https://www.airbnb.com/users/show/1589633,...,4.58,4.73,4.64,,f,8.0,7.0,1.0,0.0,0.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723327530877091506,https://www.airbnb.com/rooms/723327530877091506,20220926182502,2022-09-26,city scrape,"Habitación, acceso a cocina y aparcamiento gratis",Habitacion privada con acceso a cocina y baño....,,https://a0.muscache.com/pictures/26b9f56e-fcd4...,17367603.0,https://www.airbnb.com/users/show/17367603,...,,,,,f,4.0,0.0,4.0,0.0,
723338417213770077,https://www.airbnb.com/rooms/723338417213770077,20220926182502,2022-09-26,city scrape,"Habitación, acceso a cocina, aparcamiento grat...",Habitacion privada con acceso a cocina y baño....,,https://a0.muscache.com/pictures/91334988-af67...,17367603.0,https://www.airbnb.com/users/show/17367603,...,,,,,f,4.0,0.0,4.0,0.0,
723811793502781496,https://www.airbnb.com/rooms/723811793502781496,20220926182502,2022-09-27,city scrape,Amplio y céntrico piso frente a Teatro Arriaga.,Precioso apartamento reformado frente al teatr...,,https://a0.muscache.com/pictures/miso/Hosting-...,478255347.0,https://www.airbnb.com/users/show/478255347,...,,,,,t,1.0,1.0,0.0,0.0,
724334346105297717,https://www.airbnb.com/rooms/724334346105297717,20220926182502,2022-09-27,city scrape,Gaindegi by FeelFree Rentals,"PLEASE NOTE: Only adults, babies not allowed.<...",,https://a0.muscache.com/pictures/prohost-api/H...,98419892.0,https://www.airbnb.com/users/show/98419892,...,,,,,t,33.0,33.0,0.0,0.0,


## 1. Data exploration

In this section I will be exploring the Airbnb data schema. I will choose Madrid for this task, but all locations follow the same schema.


In [8]:
madrid_files: Path = Path("../data/airbnb/madrid")

In [9]:
listings_schema_df = pd.read_csv(
    madrid_files.parent.joinpath("listings_schema.csv")
).set_index("Field")
listings_df = pd.read_csv(madrid_files.joinpath("listings.csv.bz2"))
calendar_df = pd.read_csv(madrid_files.joinpath("calendar.csv.bz2"))

### 1.1. Listings


Listings fields with descriptions


In [10]:
listings_with_desc = listings_schema_df[["Description"]].dropna()
listings_with_desc

Unnamed: 0_level_0,Description
Field,Unnamed: 1_level_1
id,Airbnb's unique identifier for the listing
scrape_id,"Inside Airbnb ""Scrape"" this was part of"
last_scraped,"UTC. The date and time this listing was ""scrap..."
source,"One of ""neighbourhood search"" or ""previous scr..."
name,Name of the listing
description,Detailed description of the listing
neighborhood_overview,Host's description of the neighbourhood
picture_url,URL to the Airbnb hosted regular sized image f...
host_id,Airbnb's unique identifier for the host/user
host_url,The Airbnb page for the host


Listings fields without descriptions


In [11]:
listings_without_desc = listings_schema_df[["Description"]][
    listings_schema_df["Description"].isna()
]
listings_without_desc.index.tolist()

['listing_url',
 'host_response_time',
 'host_response_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood',
 'amenities',
 'calendar_updated',
 'calendar_last_scraped',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value']

### 1.2. Calendar


In [12]:
calendar_df.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,6369,2022-09-12,f,$77.00,$77.00,21.0,1125.0
1,6369,2022-09-13,f,$77.00,$77.00,21.0,1125.0
2,6369,2022-09-14,f,$77.00,$77.00,21.0,1125.0
3,6369,2022-09-15,f,$77.00,$77.00,21.0,1125.0
4,6369,2022-09-16,f,$77.00,$77.00,21.0,1125.0


## 2. Data wrangling

In this section we will be massaging the data to answer our business questions.


In [13]:
airbnb_files: Path = Path("../data/airbnb")

### 2.1. _What is the average price of each location type per neighbourhood? What are the most expensive neighbourhoods on average?_


In [14]:
most_expensive_hoods = {}
for path in airbnb_files.glob("**/listings.csv.bz2"):
    region_name = path.parent.name
    most_expensive_hoods[region_name.title()] = airbnb_avg_price(path)[1]

most_expensive_hoods_df = pd.DataFrame(most_expensive_hoods).transpose()
most_expensive_hoods_df

Unnamed: 0,Entire home/apt,Hotel room,Private room,Shared room
Sevilla,La Barzola,Santa Catalina,Bellavista,Santa Clara
Valencia,Favara,El Pilar,Rafalell-Vistabella,El Grau
Menorca,Sant Lluís,Mahón,Sant Lluís,Alaior
Mallorca,Andratx,Capdepera,Sencelles,Llucmajor
Euskadi,Laguardia,Karrantza Harana,Zarautz,Vitoria-Gasteiz
Barcelona,La Font D'En Fargues,La Font De La Guatlla,El Guinardó,Les Tres Torres
Girona,Torroella De Fluvià,Tossa De Mar,Sant Feliu De Pallerols,Vidreres
Madrid,Canillejas,Numancia,Arcos,San Pascual
Malaga,Cruz De Humilladero,Churriana,Churriana,Cruz De Humilladero


### 2.2. _What is the average host acceptance rate per location type and neighborhood? In which neighbourhoods is it the highest and in which the lowest?_

This can give us an idea of the negotiating power of the hosts or the desirability of guests.


In [15]:
highest_accept_rate_hoods = {}
for path in airbnb_files.glob("**/listings.csv.bz2"):
    region_name = path.parent.name
    highest_accept_rate_hoods[region_name.title()] = airbnb_avg_accept_rate(path)[1]

highest_accept_rate_hoods_df = pd.DataFrame(highest_accept_rate_hoods).transpose()
highest_accept_rate_hoods_df

Unnamed: 0,Entire home/apt,Hotel room,Private room,Shared room
Sevilla,Huerta Del Pilar,San Vicente,Retiro Obrero,San Vicente
Valencia,Safranar,En Corts,Carpesa,Russafa
Menorca,Alaior,Alaior,Es Migjorn Gran,Alaior
Mallorca,Costitx,Llucmajor,Montuïri,Llucmajor
Euskadi,Zeanuri,Getaria,Deba,Donostia-San Sebastiã¡N
Barcelona,Canyelles,La Sagrada Família,Ciutat Meridiana,Navas
Girona,Riudellots De La Selva,Pals,Madremanya,Ogassa
Madrid,Corralejos,Embajadores,Corralejos,Quintana
Malaga,Centro,Centro,Centro,Este


### 2.3. _What number and proportion of listings per neighbourhood belong to hosts owning different numbers of locations? In which neighbourhoods is the concentration bigger?_


In [16]:
most_dense_hoods = {}
for path in airbnb_files.glob("**/listings.csv.bz2"):
    region_name = path.parent.name
    most_dense_hoods[region_name.title()] = airbnb_hood_hosts(path)[1]

most_dense_hoods_df = pd.DataFrame(most_dense_hoods).transpose()
most_dense_hoods_df

Unnamed: 0,1,2_to_5,6_to_20,21_to_many
Sevilla,Museo,Alfalfa,Museo,San Bartolomé
Valencia,Mont-Olivet,L'Hort De Senabre,El Mercat,El Grau
Menorca,Ciutadella De Menorca,Ferreries,Ferreries,Ciutadella De Menorca
Mallorca,Sant Llorenç Des Cardassar,Santanyí,Andratx,Felanitx
Euskadi,Berango,Sopelana,Hondarribia,Bilbao
Barcelona,Hostafrancs,Sant Antoni,El Putxet I El Farró,La Sagrada Família
Girona,Viladamat,Sant Joan De Les Abadesses,Torroella De Montgrí,L'Escala
Madrid,Imperial,Cortes,Goya,Universidad
Malaga,Bailen-Miraflores,Ciudad Jardin,Cruz De Humilladero,Cruz De Humilladero


### 2.4. _What is the expected average profit per room type and neighborhood when looking at the reservations for the next N weeks? What is the neighbourhood expected to be the most profitable in that period?_

Here we assume that none of the reserved dates will be cancelled and that they are a good representation of the observed period.


In [20]:
n_weeks = 6 * 4

In [21]:
most_profitable_hoods = {}
for listings_path, calendar_path in zip(
    sorted(airbnb_files.glob("**/listings.csv.bz2")),
    sorted(airbnb_files.glob("**/calendar.csv.bz2")),
):
    region_name = listings_path.parent.name
    most_profitable_hoods[region_name.title()] = airbnb_avg_profit(
        listings_path, calendar_path, n_weeks=n_weeks
    )[1]

most_profitable_hoods_df = pd.DataFrame(most_profitable_hoods).transpose()
most_profitable_hoods_df

Unnamed: 0,Entire home/apt,Hotel room,Private room,Shared room
Barcelona,La Font De La Guatlla,El Camp D'En Grassot I Gràcia Nova,El Guinardó,Les Tres Torres
Euskadi,Muskiz,Oiartzun,Zarautz,Labastida
Girona,Cabanelles,Tossa De Mar,Forallac,Vidreres
Madrid,Amposta,Numancia,Argüelles,San Pascual
Malaga,Cruz De Humilladero,Centro,Centro,Cruz De Humilladero
Mallorca,Deyá,Campos,Santanyí,Llucmajor
Menorca,Es Migjorn Gran,Es Mercadal,Ferreries,Alaior
Sevilla,"León Xiii, Los Naranjos",Museo,Feria,Santa Clara
Valencia,Favara,Russafa,El Pla Del Remei,Rafalell-Vistabella


### 2.5. _What listings' factors affect the total profit in the next N weeks? Can the total profit be predicted?_

Here we assume that none of the reserved dates will be cancelled and that they are a good representation of the observed period.


In [22]:
profit_predictions = {}
for listings_path, calendar_path in zip(
    sorted(airbnb_files.glob("**/listings.csv.bz2")),
    sorted(airbnb_files.glob("**/calendar.csv.bz2")),
):
    region_name = listings_path.parent.name
    num_features, cat_features, mean_r2_score = airbnb_predict_profit(
        listings_path,
        calendar_path,
        feature_th=0,
        n_weeks=n_weeks,
        random_seed=random_seed,
    )
    profit_predictions[region_name.title()] = {
        "num_features": len(num_features),
        "cat_features": len(cat_features),
        "mean_r2_score": mean_r2_score,
    }

profit_predictions_df = pd.DataFrame(profit_predictions).transpose()
profit_predictions_df

Unnamed: 0,num_features,cat_features,mean_r2_score
Barcelona,28.0,44.0,0.224911
Euskadi,24.0,47.0,0.740419
Girona,24.0,42.0,0.118887
Madrid,28.0,34.0,0.32785
Malaga,28.0,25.0,0.709042
Mallorca,24.0,41.0,0.42055
Menorca,28.0,33.0,0.853593
Sevilla,28.0,25.0,0.571142
Valencia,24.0,40.0,0.84117
