# Analysis of Airbnb data on multiple locations spread across Spain

---


#### Setup


In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the project source are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys, os
import logging
import pandas as pd
import seaborn as sns
import json
import numpy as np

from copy import deepcopy
import datetime
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from pathlib import Path
from typing import Iterable
from IPython import display as ICD
from matplotlib import pyplot as plt

In [None]:
# Make the modules under ../src (relative to this notebook) importable.
# The membership guard keeps re-runs of this cell from appending the same
# entry to sys.path over and over.
src_path: str = "../src"
if src_path not in sys.path:
    sys.path.append(src_path)

# Let INFO-level records through the root logger.
logging.getLogger().setLevel(logging.INFO)

In [None]:
from data_wrangling import *

In [None]:
# Seed passed as `random_state` to the cross-validation splitter and the model
# further down, so results are reproducible across runs.
random_seed: int = 8080

---


In [None]:
# Preview the raw Euskadi calendar file, using its first column as the index.
pd.read_csv("../data/airbnb/euskadi/calendar.csv.bz2", index_col=0)

In [None]:
# Preview the raw Euskadi listings file, using its first column as the index.
pd.read_csv("../data/airbnb/euskadi/listings.csv.bz2", index_col=0)

## 1. Data exploration

In this section I explore the Airbnb data schema. I use Madrid as the example, since all locations follow the same schema.


In [None]:
# Directory holding the Madrid files; the listings and calendar CSVs are read
# from here and the shared schema file from its parent directory.
madrid_files: Path = Path("../data/airbnb/madrid")

In [None]:
# The schema file sits one level above the per-city folders and is indexed by
# field name.
schema_path = madrid_files.parent / "listings_schema.csv"
listings_schema_df = pd.read_csv(schema_path).set_index("Field")

# Madrid listings and calendar, loaded with pandas defaults.
listings_df = pd.read_csv(madrid_files / "listings.csv.bz2")
calendar_df = pd.read_csv(madrid_files / "calendar.csv.bz2")

### 1.1. Listings


Listings fields with descriptions


In [None]:
# Schema fields that have a non-missing description.
has_description = listings_schema_df["Description"].notna()
listings_with_desc = listings_schema_df.loc[has_description, ["Description"]]
listings_with_desc

Listings fields without descriptions


In [None]:
# Schema fields whose description is missing; only their names are shown.
lacks_description = listings_schema_df["Description"].isna()
listings_without_desc = listings_schema_df.loc[lacks_description, ["Description"]]
listings_without_desc.index.to_list()

### 1.2. Calendar


In [None]:
# Show the first rows of the Madrid calendar to illustrate its columns.
calendar_df.head()

## 2. Data wrangling

In this section we will be massaging the data to answer our business questions.


In [None]:
# Root data directory; the cells below search it recursively for the
# per-region listings and calendar files.
airbnb_files: Path = Path("../data/airbnb")

### 2.1. _What is the average price of each location type per neighbourhood? What are the most expensive neighbourhoods on average?_


In [None]:
# One entry per region, keyed by the title-cased name of the folder that
# contains the listings file. Element [1] of the helper's result is kept;
# the helper lives in data_wrangling (not shown here), so its exact return
# shape should be checked there.
most_expensive_hoods = {}
# Path.glob yields matches in filesystem order, so sort to keep the row order
# of the summary table stable across runs and machines.
for path in sorted(airbnb_files.glob("**/listings.csv.bz2")):
    region_name = path.parent.name
    most_expensive_hoods[region_name.title()] = airbnb_avg_price(path)[1]

# Regions become rows after the transpose.
most_expensive_hoods_df = pd.DataFrame(most_expensive_hoods).transpose()
most_expensive_hoods_df

### 2.2. _What is the average host acceptance rate per location type and neighbourhood? In which neighbourhoods is it the highest and in which the lowest?_

This can give us an idea of the negotiating power of the hosts or the desirability of guests.


In [None]:
# One entry per region, keyed by the title-cased name of the folder that
# contains the listings file. Element [1] of the helper's result is kept;
# the helper lives in data_wrangling (not shown here), so its exact return
# shape should be checked there.
highest_accept_rate_hoods = {}
# Path.glob yields matches in filesystem order, so sort to keep the row order
# of the summary table stable across runs and machines.
for path in sorted(airbnb_files.glob("**/listings.csv.bz2")):
    region_name = path.parent.name
    highest_accept_rate_hoods[region_name.title()] = airbnb_avg_accept_rate(path)[1]

# Regions become rows after the transpose.
highest_accept_rate_hoods_df = pd.DataFrame(highest_accept_rate_hoods).transpose()
highest_accept_rate_hoods_df

### 2.3. _What number and proportion of listings per neighbourhood belong to hosts owning different numbers of locations? In which neighbourhoods is the concentration bigger?_


In [None]:
# One entry per region, keyed by the title-cased name of the folder that
# contains the listings file. Element [1] of the helper's result is kept;
# the helper lives in data_wrangling (not shown here), so its exact return
# shape should be checked there.
most_dense_hoods = {}
# Path.glob yields matches in filesystem order, so sort to keep the row order
# of the summary table stable across runs and machines.
for path in sorted(airbnb_files.glob("**/listings.csv.bz2")):
    region_name = path.parent.name
    most_dense_hoods[region_name.title()] = airbnb_hood_hosts(path)[1]

# Regions become rows after the transpose.
most_dense_hoods_df = pd.DataFrame(most_dense_hoods).transpose()
most_dense_hoods_df

### 2.4. _What is the expected average profit per room type and neighbourhood when looking at the reservations for the next N weeks? Which neighbourhood is expected to be the most profitable in that period?_

Here we assume that none of the reserved dates will be cancelled and that they are a good representation of the observed period.


In [None]:
# Number of weeks of upcoming reservations to consider; passed as `n_weeks`
# to the profit helper in the next cell.
n_weeks: int = 8

In [None]:
# One entry per region, keyed by the title-cased name of the folder that
# contains the listings file. Element [1] of the helper's result is kept;
# the helper lives in data_wrangling (not shown here), so its exact return
# shape should be checked there.
most_profitable_hoods = {}
for listings_path in sorted(airbnb_files.glob("**/listings.csv.bz2")):
    region_name = listings_path.parent.name

    # Take the calendar from the same folder as the listings file. Zipping two
    # independent globs would silently pair files from different regions as
    # soon as one region lacks either file.
    calendar_path = listings_path.with_name("calendar.csv.bz2")
    if not calendar_path.is_file():
        logging.warning(
            "Skipping region %s: no calendar file at %s", region_name, calendar_path
        )
        continue

    most_profitable_hoods[region_name.title()] = airbnb_avg_profit(
        listings_path, calendar_path, n_weeks=n_weeks
    )[1]

# Regions become rows after the transpose.
most_profitable_hoods_df = pd.DataFrame(most_profitable_hoods).transpose()
most_profitable_hoods_df

### 2.5. _Which listing features affect the number of reservations? Can that number be predicted?_

Here we assume that none of the reserved dates will be cancelled and that they are a good representation of the yearly trend.


In [None]:
# Calendar rows flagged as not available ("f") are treated as reserved days.
reserved_days = calendar_df.loc[
    calendar_df["available"] == "f", ["listing_id", "date"]
]

# Count reserved days per listing, then attach the listing attributes by id.
reservations_per_listing = (
    reserved_days.groupby("listing_id")
    .count()
    .rename(columns={"date": "reservations_count"})
)
listings_reservations_df = reservations_per_listing.join(listings_df.set_index("id"))

# Drop every column whose name mentions availability or calculated values.
excluded_columns = [
    column
    for column in listings_reservations_df.columns
    if "availability" in column or "calculated" in column
]
listings_reservations_df = listings_reservations_df.drop(columns=excluded_columns)
listings_reservations_df.head()

In [None]:
# Spearman rank correlation between every pair of float/int columns.
numeric_listings = listings_reservations_df.select_dtypes(include=(float, int))
corr_matrix_quant = numeric_listings.corr(method="spearman")

# Correlation of each numeric column with the reservation count, ordered by
# absolute strength; undefined (NaN) correlations are discarded.
reservations_quant_corr = corr_matrix_quant["reservations_count"].dropna()
reservations_quant_corr = reservations_quant_corr.sort_values(
    key=abs, ascending=False
)
reservations_quant_corr.head(20)

In [None]:
# Keep only object-dtype columns with at most 10 distinct values, so the
# one-hot encoding below stays small.
object_columns = listings_reservations_df.select_dtypes(include=object)
low_cardinality = object_columns.loc[:, object_columns.nunique() <= 10]

# One-hot encode them, append the target column, and normalise the column
# names to lower case with underscores instead of spaces.
cat_df = pd.concat(
    [pd.get_dummies(low_cardinality), listings_reservations_df["reservations_count"]],
    axis=1,
).rename(columns=lambda name: name.lower().replace(" ", "_"))
cat_df.head()

In [None]:
# Spearman rank correlation between the one-hot columns and the target.
corr_matrix_qual = cat_df.corr(method="spearman")

# Correlation of each encoded column with the reservation count, ordered by
# absolute strength; undefined (NaN) correlations are discarded.
reservations_qual_corr = corr_matrix_qual["reservations_count"].dropna()
reservations_qual_corr = reservations_qual_corr.sort_values(
    key=abs, ascending=False
)
reservations_qual_corr.head(20)

In [None]:
# Feature matrix: every column whose absolute Spearman correlation with
# `reservations_count` exceeds 0.2, with numeric columns taken from the
# listings frame and one-hot columns from `cat_df`.
X = pd.concat(
    [
        listings_reservations_df[
            reservations_quant_corr[reservations_quant_corr.abs() > 0.2].index
        ],
        cat_df[reservations_qual_corr[reservations_qual_corr.abs() > 0.2].index],
    ],
    axis=1,
    # The target appears in both correlation series, so it is selected twice
    # above; remove it from the features.
).drop(columns=["reservations_count"])
X.columns = [c.lower().replace(" ", "_") for c in X.columns]

# We want complete, verified data for training a model, hence we remove all
# listings with NaN values.
X = X.dropna()

# Take the target only for the listings that survived the dropna above.
# Selecting by X.index keeps X and y row-aligned; the cross-validation cell
# indexes both positionally with `iloc`, so a longer y would pair features
# with the wrong targets.
y = listings_reservations_df.loc[X.index, ["reservations_count"]]

In [None]:
# Inspect the first rows of the feature matrix.
X.head()

In [None]:
# Outer evaluation: 10 shuffled folds, seeded for reproducibility.
k_fold_cross_valitor = KFold(n_splits=10, shuffle=True, random_state=random_seed)

# Template estimator. ElasticNetCV picks l1_ratio and alpha from the grids
# below using its own inner 10-fold cross-validation.
model_base = ElasticNetCV(
    l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
    alphas=[0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1],
    max_iter=int(1e5),
    cv=10,
    n_jobs=8,
    random_state=random_seed,
)

k_fold_scores = []
for train_index, test_index in k_fold_cross_valitor.split(X):
    # Fresh model and normalizer for every fold, so nothing carries over.
    model = deepcopy(model_base)
    # NOTE(review): scikit-learn's Normalizer rescales each sample (row) to
    # unit norm; it does not scale features. Confirm this is intended rather
    # than a per-feature scaler.
    normalizer = Normalizer()

    # Fit on the training part of the fold.
    X_train = normalizer.fit_transform(np.ascontiguousarray(X.iloc[train_index]))
    y_train = np.ravel(np.ascontiguousarray(y.iloc[train_index]))
    model.fit(X_train, y_train)

    # Score on the held-out part with R².
    y_test = np.ravel(np.ascontiguousarray(y.iloc[test_index]))
    X_test = normalizer.transform(np.ascontiguousarray(X.iloc[test_index]))
    k_fold_scores.append(r2_score(y_test, model.predict(X_test)))

# Per-fold scores, then their mean.
print(k_fold_scores)
print(np.mean(k_fold_scores))

Using the selected columns, the model explains about X% of the variance in the number of reservations, where X is the mean cross-validated R² printed above multiplied by 100.
