# Analysis of Airbnb data on multiple locations spread across Spain

---


#### Setup


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys, os
import logging
import pandas as pd
from pathlib import Path
from typing import Iterable
from IPython import display as ICD


In [3]:
src_path: str = "../src"
sys.path.append(src_path)
logging.getLogger().setLevel(logging.INFO)


---


## 1. Data exploration

In this section I will be exploring the Airbnb data schema. I will choose Madrid for this task, but all locations follow the same schema.


In [4]:
files_root: Path = Path("../data/airbnb/madrid")


In [5]:
listings_schema_df = pd.read_csv(
    files_root.parent.joinpath("listings_schema.csv")
).set_index("Field")
listings_df = pd.read_csv(files_root.joinpath("listings.csv.gz"))
calendar_df = pd.read_csv(files_root.joinpath("calendar.csv.gz"))
reviews_df = pd.read_csv(files_root.joinpath("reviews.csv.gz"))


### 1.1. Listings


Listings fields with descriptions


In [6]:
pd.set_option("display.max_colwidth", None)
listings_with_desc = listings_schema_df[["Description"]].dropna()
listings_with_desc


Unnamed: 0_level_0,Description
Field,Unnamed: 1_level_1
id,Airbnb's unique identifier for the listing
scrape_id,"Inside Airbnb ""Scrape"" this was part of"
last_scraped,"UTC. The date and time this listing was ""scraped""."
source,"One of ""neighbourhood search"" or ""previous scrape"". ""neighbourhood search"" means that the listing was found by searching the city, while ""previous scrape"" means that the listing was seen in another scrape performed in the last 65 days, and the listing was confirmed to be still available on the Airbnb site."
name,Name of the listing
description,Detailed description of the listing
neighborhood_overview,Host's description of the neighbourhood
picture_url,URL to the Airbnb hosted regular sized image for the listing
host_id,Airbnb's unique identifier for the host/user
host_url,The Airbnb page for the host


Listings fields without descriptions


In [7]:
listings_without_desc = listings_schema_df[["Description"]][
    listings_schema_df["Description"].isna()
]
listings_without_desc.index.tolist()


['listing_url',
 'host_response_time',
 'host_response_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood',
 'amenities',
 'calendar_updated',
 'calendar_last_scraped',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value']

### 1.2. Calendar


In [8]:
calendar_df.head()


Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,6369,2022-09-12,f,$77.00,$77.00,21.0,1125.0
1,6369,2022-09-13,f,$77.00,$77.00,21.0,1125.0
2,6369,2022-09-14,f,$77.00,$77.00,21.0,1125.0
3,6369,2022-09-15,f,$77.00,$77.00,21.0,1125.0
4,6369,2022-09-16,f,$77.00,$77.00,21.0,1125.0


### 1.3. Reviews


In [9]:
reviews_df.head()


Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,6369,29428,2010-03-14,84790,Nancy,"Simon and Arturo have the ultimate location in Madrid! Steps from the Metro and bus line to everywhere you want to visit. The accomodations are clean, well appointed and fantastic views of Madrid skyline. We felt like it was a spa when walking into the flat. We were greeted warmly and offered lots of information to get up and running for our short stay. We wished we could have stayed longer and we highly recommend this location and neighborhood."
1,6369,31018,2010-03-23,84338,David,"Myself and Kristy originally planned on staying with Arturo for only a week, but when our plans changed, Arturo was very open to working with our changing schedule. Arturo and Simon were very friendly and helpful throughout our ten day visit. Our room was very clean and well kept. The view is amazing and the location is perfect. We're going to be traveling around Europe for another four months, and were joking that Arturo set the standards so high that we highly doubt we're going to be able to do any better, and it's true! Thank you so much Arturo and Simon, you were both very gracious, helpful and respectful of our space. I would highly recommend you and your flat to anyone traveling who wants a quite place with a great views, wonderful/clean rooms, a respectful and honest atmosphere. Thank you so much for being such an outstanding host!"
2,6369,34694,2010-04-10,98655,Marion,"We had a great time at Arturo and Simon's ! A cosy apartment and a wonderful terrasse, and above all, really nice hosts ! Thank you..."
3,6369,37146,2010-04-21,109871,Kurt,"I very much enjoyed the stay. \r<br/>It's a wonderful room and bath in a great apartment with a lovely terrace. The location is perfect for me, especially given the easy access to the the Metro and the airport. I look forward to returning! \r<br/>Kurt S."
4,6369,38168,2010-04-26,98901,Dennis,Arturo and Simon are polite and friendly hosts who provide a very pleasant and convenient place to stay in Madrid.\r<br/>\r<br/>Dennis


## 2. Data wrangling

In this section we will be massaging the data to answer our business questions.


### 2.1. _What is the average price of each location type per neighbourhood? What are the most expensive neighbourhoods on average?_


In [10]:
price_str_to_float = lambda x: float(x.replace("$", "").replace(",", ""))
listings_df["price_num"] = listings_df["price"].apply(price_str_to_float)


In [11]:
df = (
    listings_df[["neighbourhood_group_cleansed", "room_type", "price_num"]]
    .groupby(["neighbourhood_group_cleansed", "room_type"])
    .mean()
    .round(2)
)
df.unstack(level=1)


Unnamed: 0_level_0,price_num,price_num,price_num,price_num
room_type,Entire home/apt,Hotel room,Private room,Shared room
neighbourhood_group_cleansed,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Arganzuela,141.89,38.0,147.51,111.83
Barajas,140.52,,113.77,47.5
Carabanchel,123.61,,47.72,83.2
Centro,151.53,194.2,90.86,79.73
Chamartín,139.86,166.5,98.62,171.67
Chamberí,181.72,90.0,82.79,135.62
Ciudad Lineal,134.61,29.0,86.79,426.91
Fuencarral - El Pardo,164.68,,96.58,900.0
Hortaleza,181.63,29.33,92.19,28.5
Latina,202.8,,97.2,114.18


In [12]:
sorted_sums = df["price_num"].groupby(level=0).sum().sort_values(ascending=False)
df.reindex(sorted_sums.index, level=0).unstack(level=1)


Unnamed: 0_level_0,price_num,price_num,price_num,price_num
room_type,Entire home/apt,Hotel room,Private room,Shared room
neighbourhood_group_cleansed,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Fuencarral - El Pardo,164.68,,96.58,900.0
San Blas - Canillejas,353.0,,288.0,501.75
Moncloa - Aravaca,178.4,254.0,224.94,149.33
Ciudad Lineal,134.61,29.0,86.79,426.91
Salamanca,200.48,220.89,88.93,83.25
Chamartín,139.86,166.5,98.62,171.67
Puente de Vallecas,123.97,350.0,47.49,26.11
Centro,151.53,194.2,90.86,79.73
Chamberí,181.72,90.0,82.79,135.62
Arganzuela,141.89,38.0,147.51,111.83


### 2.2. _What is the average host acceptance rate per location type and neighborhood? In which neighbourhoods is it the highest and in which the lowest?_

This can give us an idea of the negotiating power of the hosts or the desirability of guests.


In [13]:
perc_str_to_float = (
    lambda x: float(x.replace("%", "")) / 100 if isinstance(x, str) else x
)
listings_df["host_acceptance_rate_num"] = listings_df["host_acceptance_rate"].apply(
    perc_str_to_float
)


In [14]:
df = (
    listings_df[
        ["neighbourhood_group_cleansed", "room_type", "host_acceptance_rate_num"]
    ]
    .groupby(["neighbourhood_group_cleansed", "room_type"])
    .mean()
    .round(2)
)
df.unstack(level=1)


Unnamed: 0_level_0,host_acceptance_rate_num,host_acceptance_rate_num,host_acceptance_rate_num,host_acceptance_rate_num
room_type,Entire home/apt,Hotel room,Private room,Shared room
neighbourhood_group_cleansed,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Arganzuela,0.91,,0.77,0.92
Barajas,0.92,,0.79,0.96
Carabanchel,0.8,,0.8,0.99
Centro,0.92,0.98,0.88,0.92
Chamartín,0.85,0.96,0.79,
Chamberí,0.82,0.97,0.73,0.67
Ciudad Lineal,0.89,0.96,0.77,1.0
Fuencarral - El Pardo,0.83,,0.8,
Hortaleza,0.89,0.99,0.8,0.92
Latina,0.87,,0.81,1.0


In [15]:
sorted_sums = (
    df["host_acceptance_rate_num"].groupby(level=0).sum().sort_values(ascending=False)
)
df.reindex(sorted_sums.index, level=0).unstack(level=1)


Unnamed: 0_level_0,host_acceptance_rate_num,host_acceptance_rate_num,host_acceptance_rate_num,host_acceptance_rate_num
room_type,Entire home/apt,Hotel room,Private room,Shared room
neighbourhood_group_cleansed,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Centro,0.92,0.98,0.88,0.92
Salamanca,0.89,1.0,0.8,0.97
Ciudad Lineal,0.89,0.96,0.77,1.0
Hortaleza,0.89,0.99,0.8,0.92
Moncloa - Aravaca,0.85,1.0,0.83,0.54
Chamberí,0.82,0.97,0.73,0.67
Latina,0.87,,0.81,1.0
Retiro,0.87,,0.86,0.95
Barajas,0.92,,0.79,0.96
Usera,0.87,,0.83,0.94


### 2.3. _How is competition in each neighbourhood? What number and proportion of listings belong to hosts owning different numbers of locations?_


In [16]:
listings_df.groupby("neighbourhood_group_cleansed").count()["id"].sort_values(
    ascending=False
)


neighbourhood_group_cleansed
Centro                   9181
Salamanca                1401
Chamberí                 1261
Arganzuela               1085
Tetuán                    937
Carabanchel               722
Retiro                    721
Moncloa - Aravaca         678
Ciudad Lineal             631
Chamartín                 584
Latina                    579
Puente de Vallecas        560
San Blas - Canillejas     466
Hortaleza                 406
Fuencarral - El Pardo     366
Usera                     342
Villaverde                242
Barajas                   151
Moratalaz                 137
Vicálvaro                 135
Villa de Vallecas          96
Name: id, dtype: int64

In [17]:
df = (
    listings_df[["host_id", "neighbourhood_group_cleansed", "id"]]
    .groupby(["host_id", "neighbourhood_group_cleansed"])
    .count()["id"]
    .sort_values(ascending=False)
    .unstack(level=0)
)
df


host_id,7952,13660,17453,31622,53526,66327,67353,71602,75744,75944,...,478551223,478564645,478608136,478615464,478631020,478637567,478795127,478796705,478911429,478917335
neighbourhood_group_cleansed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Arganzuela,,,,,,,,,,,...,,,,,,,,,,
Barajas,,,,,,,,,,,...,,,,,,,,,,
Carabanchel,,,,,,,,,,,...,,,,,,,,,,
Centro,,,1.0,,6.0,,4.0,,1.0,,...,1.0,,1.0,1.0,1.0,1.0,,1.0,,
Chamartín,,1.0,,,,,,,,,...,,,,,,,,,,
Chamberí,,,,1.0,,,,1.0,,,...,,,,,,,,,,
Ciudad Lineal,,,,,,,,,,,...,,,,,,,,,,
Fuencarral - El Pardo,,,,,,,,,,,...,,,,,,,,,,
Hortaleza,,,,,,,,,,,...,,,,,,,1.0,,,
Latina,,,,,,,,,,,...,,,,,,,,,,


In [18]:
from collections import defaultdict


neighborhood_hosts_groups = defaultdict(dict)
for neighborhood, neighborhood_hosts in df.iterrows():
    # Hosts with only one listing
    total = sum(neighborhood_hosts == 1)
    p = total / sum(neighborhood_hosts >= 1) * 100
    neighborhood_hosts_groups[neighborhood]["1"] = f"{total} ({p:.2f}%)"

    # Hosts managing between 2 and 5 listings
    total = sum((neighborhood_hosts >= 2) & (neighborhood_hosts <= 5))
    p = total / sum(neighborhood_hosts >= 1) * 100
    neighborhood_hosts_groups[neighborhood]["2_to_5"] = f"{total} ({p:.2f}%)"

    # Hosts managing between 6 to 20 listings
    total = sum((neighborhood_hosts >= 6) & (neighborhood_hosts <= 20))
    p = total / sum(neighborhood_hosts >= 1) * 100
    neighborhood_hosts_groups[neighborhood]["6_to_20"] = f"{total} ({p:.2f}%)"

    # Hosts managing 21 or more listings
    total = sum(neighborhood_hosts >= 21)
    p = total / sum(neighborhood_hosts >= 1) * 100
    neighborhood_hosts_groups[neighborhood]["21_to_many"] = f"{total} ({p:.2f}%)"


In [33]:
host_counts_df = pd.DataFrame(neighborhood_hosts_groups).transpose()
host_counts_df.sort_values(
    by=host_counts_df.columns.tolist(),
    key=lambda x: [int(r.split(" ")[0]) for r in x],
    ascending=False,
)


Unnamed: 0,1,2_to_5,6_to_20,21_to_many
Centro,3231 (73.82%),878 (20.06%),236 (5.39%),32 (0.73%)
Salamanca,647 (81.38%),119 (14.97%),23 (2.89%),6 (0.75%)
Chamberí,626 (81.51%),115 (14.97%),23 (2.99%),4 (0.52%)
Arganzuela,611 (81.25%),127 (16.89%),14 (1.86%),0 (0.00%)
Tetuán,504 (83.03%),82 (13.51%),20 (3.29%),1 (0.16%)
Ciudad Lineal,419 (84.14%),75 (15.06%),4 (0.80%),0 (0.00%)
Carabanchel,402 (79.60%),95 (18.81%),8 (1.58%),0 (0.00%)
Retiro,378 (80.25%),81 (17.20%),11 (2.34%),1 (0.21%)
Latina,364 (82.17%),75 (16.93%),4 (0.90%),0 (0.00%)
Moncloa - Aravaca,346 (79.00%),83 (18.95%),7 (1.60%),2 (0.46%)


### 2.4. _What is the expected average profit per room type and neighborhood when looking one year ahead of reservations (September 2022 to September 2023)?_
