# Analysis of Airbnb data on multiple locations spread across Spain

---


#### Setup


In [1]:
# Enable IPython's autoreload extension; mode 2 picks up edits to imported modules
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys, os
import logging
import pandas as pd
from pathlib import Path
from typing import Iterable
from IPython import display as ICD


In [3]:
# Directory added to the import path (presumably where the project's own modules
# live -- confirm). The path is relative, so it depends on the kernel's working
# directory.
src_path: str = "../src"
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)
# Set the root logger's threshold to INFO.
logging.getLogger().setLevel(logging.INFO)


---


## 1. Data exploration

In this section I explore the Airbnb data schema. I use Madrid as the example, since all locations follow the same schema.


In [4]:
# Folder holding the data files for the city explored in this section.
files_root: Path = Path("..") / "data" / "airbnb" / "madrid"


### 1.1. Listings


In [5]:
# Field-level schema table, read from the directory above the per-city folder
# and indexed by field name.
listings_schema_df = pd.read_csv(files_root.parent / "listings_schema.csv").set_index(
    "Field"
)
# Listings table for the city selected via files_root.
listings_df = pd.read_csv(files_root / "listings.csv.gz")


Listings fields with descriptions

In [6]:
# Lift the column-width limit so long descriptions are shown in full.
# This is a global pandas option, so it also applies to every later display.
pd.set_option("display.max_colwidth", None)
# Keep only the schema rows whose "Description" is populated.
listings_with_desc = listings_schema_df.loc[
    listings_schema_df["Description"].notna(), ["Description"]
]
listings_with_desc


Unnamed: 0_level_0,Description
Field,Unnamed: 1_level_1
id,Airbnb's unique identifier for the listing
scrape_id,"Inside Airbnb ""Scrape"" this was part of"
last_scraped,"UTC. The date and time this listing was ""scraped""."
source,"One of ""neighbourhood search"" or ""previous scrape"". ""neighbourhood search"" means that the listing was found by searching the city, while ""previous scrape"" means that the listing was seen in another scrape performed in the last 65 days, and the listing was confirmed to be still available on the Airbnb site."
name,Name of the listing
description,Detailed description of the listing
neighborhood_overview,Host's description of the neighbourhood
picture_url,URL to the Airbnb hosted regular sized image for the listing
host_id,Airbnb's unique identifier for the host/user
host_url,The Airbnb page for the host


Listings fields without descriptions


In [7]:
# Schema rows that have no description; only their field names are shown.
listings_without_desc = listings_schema_df.loc[
    listings_schema_df["Description"].isna(), ["Description"]
]
list(listings_without_desc.index)


['listing_url',
 'host_response_time',
 'host_response_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood',
 'amenities',
 'calendar_updated',
 'calendar_last_scraped',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value']

In [8]:
# Load the calendar and reviews tables from the same per-city folder.
calendar_df = pd.read_csv(files_root / "calendar.csv.gz")
reviews_df = pd.read_csv(files_root / "reviews.csv.gz")


In [15]:
# Parse "date" into datetimes (written back onto calendar_df) so the .dt
# accessor can be used for filtering.
calendar_df["date"] = pd.to_datetime(calendar_df["date"])
# Rows dated September 2022, ordered by date.
calendar_df.loc[
    lambda df: (df["date"].dt.year == 2022) & (df["date"].dt.month == 9)
].sort_values("date")

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,6369,2022-09-12,f,$77.00,$77.00,21.0,1125.0
3214213,38500236,2022-09-12,f,$123.00,$98.00,1.0,1125.0
3214578,38504383,2022-09-12,f,$20.00,$20.00,90.0,1095.0
3214943,38505613,2022-09-12,t,$98.00,$98.00,2.0,1125.0
3215308,38505801,2022-09-12,t,$32.00,$29.00,1.0,1125.0
...,...,...,...,...,...,...,...
2006820,28478505,2022-09-30,t,$45.00,$45.00,1.0,1125.0
4557787,50451044,2022-09-30,f,$150.00,$150.00,7.0,1125.0
2007185,28482100,2022-09-30,f,$52.00,$52.00,3.0,15.0
4568007,50563674,2022-09-30,f,$100.00,$100.00,32.0,1125.0


In [17]:
# Display calendar_df as this cell's output.
calendar_df

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,6369,2022-09-12,f,$77.00,$77.00,21.0,1125.0
1,6369,2022-09-13,f,$77.00,$77.00,21.0,1125.0
2,6369,2022-09-14,f,$77.00,$77.00,21.0,1125.0
3,6369,2022-09-15,f,$77.00,$77.00,21.0,1125.0
4,6369,2022-09-16,f,$77.00,$77.00,21.0,1125.0
...,...,...,...,...,...,...,...
7547137,361053,2023-09-07,f,$100.00,$100.00,2.0,365.0
7547138,361053,2023-09-08,f,$100.00,$100.00,2.0,365.0
7547139,361053,2023-09-09,f,$100.00,$100.00,2.0,365.0
7547140,361053,2023-09-10,f,$100.00,$100.00,2.0,365.0


### 1.1. Listings
