# Analysis of Airbnb data on multiple locations spread across Spain

---


#### Setup


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys, os
import logging
import pandas as pd
import seaborn as sns
import json
import numpy as np

from copy import deepcopy
import datetime
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from pathlib import Path
from typing import Iterable
from IPython import display as ICD
from matplotlib import pyplot as plt

In [3]:
src_path: str = "../src"
sys.path.append(src_path)
logging.getLogger().setLevel(logging.INFO)

In [4]:
random_seed: int = 8080

---


## 1. Data exploration

In this section I will be exploring the Airbnb data schema. I will choose Madrid for this task, but all locations follow the same schema.


In [5]:
files_root: Path = Path("../data/airbnb/madrid")

In [6]:
listings_schema_df = pd.read_csv(
    files_root.parent.joinpath("listings_schema.csv")
).set_index("Field")
listings_df = pd.read_csv(files_root.joinpath("listings.csv.bz2"))
calendar_df = pd.read_csv(files_root.joinpath("calendar.csv.bz2"))

### 1.1. Listings


Listings fields with descriptions


In [7]:
listings_with_desc = listings_schema_df[["Description"]].dropna()
listings_with_desc

Unnamed: 0_level_0,Description
Field,Unnamed: 1_level_1
id,Airbnb's unique identifier for the listing
scrape_id,"Inside Airbnb ""Scrape"" this was part of"
last_scraped,"UTC. The date and time this listing was ""scrap..."
source,"One of ""neighbourhood search"" or ""previous scr..."
name,Name of the listing
description,Detailed description of the listing
neighborhood_overview,Host's description of the neighbourhood
picture_url,URL to the Airbnb hosted regular sized image f...
host_id,Airbnb's unique identifier for the host/user
host_url,The Airbnb page for the host


Listings fields without descriptions


In [8]:
listings_without_desc = listings_schema_df[["Description"]][
    listings_schema_df["Description"].isna()
]
listings_without_desc.index.tolist()

['listing_url',
 'host_response_time',
 'host_response_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood',
 'amenities',
 'calendar_updated',
 'calendar_last_scraped',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value']

### 1.2. Calendar


In [9]:
calendar_df.head()

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,6369,2022-09-12,f,$77.00,$77.00,21.0,1125.0
1,6369,2022-09-13,f,$77.00,$77.00,21.0,1125.0
2,6369,2022-09-14,f,$77.00,$77.00,21.0,1125.0
3,6369,2022-09-15,f,$77.00,$77.00,21.0,1125.0
4,6369,2022-09-16,f,$77.00,$77.00,21.0,1125.0


## 2. Data wrangling

In this section we will be massaging the data to answer our business questions.


### 2.1. _What is the average price of each location type per neighbourhood? What are the most expensive neighbourhoods on average?_


In [10]:
price_str_to_float = lambda x: float(x.replace("$", "").replace(",", ""))
listings_df["price_num"] = listings_df["price"].apply(price_str_to_float)

In [11]:
df = (
    listings_df[["neighbourhood_group_cleansed", "room_type", "price_num"]]
    .groupby(["neighbourhood_group_cleansed", "room_type"])
    .mean()
    .round(2)
)
df.unstack(level=1)["price_num"]

room_type,Entire home/apt,Hotel room,Private room,Shared room
neighbourhood_group_cleansed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Arganzuela,141.89,38.0,147.51,111.83
Barajas,140.52,,113.77,47.5
Carabanchel,123.61,,47.72,83.2
Centro,151.53,194.2,90.86,79.73
Chamartín,139.86,166.5,98.62,171.67
Chamberí,181.72,90.0,82.79,135.62
Ciudad Lineal,134.61,29.0,86.79,426.91
Fuencarral - El Pardo,164.68,,96.58,900.0
Hortaleza,181.63,29.33,92.19,28.5
Latina,202.8,,97.2,114.18


In [12]:
sorted_sums = df["price_num"].groupby(level=0).sum().sort_values(ascending=False)
df.reindex(sorted_sums.index, level=0).unstack(level=1)["price_num"]

room_type,Entire home/apt,Hotel room,Private room,Shared room
neighbourhood_group_cleansed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fuencarral - El Pardo,164.68,,96.58,900.0
San Blas - Canillejas,353.0,,288.0,501.75
Moncloa - Aravaca,178.4,254.0,224.94,149.33
Ciudad Lineal,134.61,29.0,86.79,426.91
Salamanca,200.48,220.89,88.93,83.25
Chamartín,139.86,166.5,98.62,171.67
Puente de Vallecas,123.97,350.0,47.49,26.11
Centro,151.53,194.2,90.86,79.73
Chamberí,181.72,90.0,82.79,135.62
Arganzuela,141.89,38.0,147.51,111.83


### 2.2. _What is the average host acceptance rate per location type and neighborhood? In which neighbourhoods is it the highest and in which the lowest?_

This can give us an idea of the negotiating power of the hosts or the desirability of guests.


In [13]:
perc_str_to_float = (
    lambda x: float(x.replace("%", "")) / 100 if isinstance(x, str) else x
)
listings_df["host_acceptance_rate_num"] = listings_df["host_acceptance_rate"].apply(
    perc_str_to_float
)

In [14]:
df = (
    listings_df[
        ["neighbourhood_group_cleansed", "room_type", "host_acceptance_rate_num"]
    ]
    .groupby(["neighbourhood_group_cleansed", "room_type"])
    .mean()
    .round(2)
)
df.unstack(level=1)

Unnamed: 0_level_0,host_acceptance_rate_num,host_acceptance_rate_num,host_acceptance_rate_num,host_acceptance_rate_num
room_type,Entire home/apt,Hotel room,Private room,Shared room
neighbourhood_group_cleansed,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Arganzuela,0.91,,0.77,0.92
Barajas,0.92,,0.79,0.96
Carabanchel,0.8,,0.8,0.99
Centro,0.92,0.98,0.88,0.92
Chamartín,0.85,0.96,0.79,
Chamberí,0.82,0.97,0.73,0.67
Ciudad Lineal,0.89,0.96,0.77,1.0
Fuencarral - El Pardo,0.83,,0.8,
Hortaleza,0.89,0.99,0.8,0.92
Latina,0.87,,0.81,1.0


In [15]:
sorted_sums = (
    df["host_acceptance_rate_num"].groupby(level=0).sum().sort_values(ascending=False)
)
df.reindex(sorted_sums.index, level=0).unstack(level=1)

Unnamed: 0_level_0,host_acceptance_rate_num,host_acceptance_rate_num,host_acceptance_rate_num,host_acceptance_rate_num
room_type,Entire home/apt,Hotel room,Private room,Shared room
neighbourhood_group_cleansed,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Centro,0.92,0.98,0.88,0.92
Salamanca,0.89,1.0,0.8,0.97
Ciudad Lineal,0.89,0.96,0.77,1.0
Hortaleza,0.89,0.99,0.8,0.92
Moncloa - Aravaca,0.85,1.0,0.83,0.54
Chamberí,0.82,0.97,0.73,0.67
Latina,0.87,,0.81,1.0
Retiro,0.87,,0.86,0.95
Barajas,0.92,,0.79,0.96
Usera,0.87,,0.83,0.94


### 2.3. _How is competition in each neighbourhood? What number and proportion of listings belong to hosts owning different numbers of locations?_


In [16]:
listings_df.groupby("neighbourhood_group_cleansed").count()["id"].sort_values(
    ascending=False
)

neighbourhood_group_cleansed
Centro                   9181
Salamanca                1401
Chamberí                 1261
Arganzuela               1085
Tetuán                    937
Carabanchel               722
Retiro                    721
Moncloa - Aravaca         678
Ciudad Lineal             631
Chamartín                 584
Latina                    579
Puente de Vallecas        560
San Blas - Canillejas     466
Hortaleza                 406
Fuencarral - El Pardo     366
Usera                     342
Villaverde                242
Barajas                   151
Moratalaz                 137
Vicálvaro                 135
Villa de Vallecas          96
Name: id, dtype: int64

In [17]:
df = (
    listings_df[["host_id", "neighbourhood_group_cleansed", "id"]]
    .groupby(["host_id", "neighbourhood_group_cleansed"])
    .count()["id"]
    .sort_values(ascending=False)
    .unstack(level=0)
)
df

host_id,7952,13660,17453,31622,53526,66327,67353,71602,75744,75944,...,478551223,478564645,478608136,478615464,478631020,478637567,478795127,478796705,478911429,478917335
neighbourhood_group_cleansed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Arganzuela,,,,,,,,,,,...,,,,,,,,,,
Barajas,,,,,,,,,,,...,,,,,,,,,,
Carabanchel,,,,,,,,,,,...,,,,,,,,,,
Centro,,,1.0,,6.0,,4.0,,1.0,,...,1.0,,1.0,1.0,1.0,1.0,,1.0,,
Chamartín,,1.0,,,,,,,,,...,,,,,,,,,,
Chamberí,,,,1.0,,,,1.0,,,...,,,,,,,,,,
Ciudad Lineal,,,,,,,,,,,...,,,,,,,,,,
Fuencarral - El Pardo,,,,,,,,,,,...,,,,,,,,,,
Hortaleza,,,,,,,,,,,...,,,,,,,1.0,,,
Latina,,,,,,,,,,,...,,,,,,,,,,


In [18]:
from collections import defaultdict


neighborhood_hosts_groups = defaultdict(dict)
for neighborhood, neighborhood_hosts in df.iterrows():
    # Hosts with only one listing
    total = sum(neighborhood_hosts == 1)
    p = total / sum(neighborhood_hosts >= 1) * 100
    neighborhood_hosts_groups[neighborhood]["1"] = f"{total} ({p:.2f}%)"

    # Hosts managing between 2 and 5 listings
    total = sum((neighborhood_hosts >= 2) & (neighborhood_hosts <= 5))
    p = total / sum(neighborhood_hosts >= 1) * 100
    neighborhood_hosts_groups[neighborhood]["2_to_5"] = f"{total} ({p:.2f}%)"

    # Hosts managing between 6 to 20 listings
    total = sum((neighborhood_hosts >= 6) & (neighborhood_hosts <= 20))
    p = total / sum(neighborhood_hosts >= 1) * 100
    neighborhood_hosts_groups[neighborhood]["6_to_20"] = f"{total} ({p:.2f}%)"

    # Hosts managing 21 or more listings
    total = sum(neighborhood_hosts >= 21)
    p = total / sum(neighborhood_hosts >= 1) * 100
    neighborhood_hosts_groups[neighborhood]["21_to_many"] = f"{total} ({p:.2f}%)"

In [19]:
host_counts_df = pd.DataFrame(neighborhood_hosts_groups).transpose()
host_counts_df.sort_values(
    by=host_counts_df.columns.tolist(),
    key=lambda x: [int(r.split(" ")[0]) for r in x],
    ascending=False,
)

Unnamed: 0,1,2_to_5,6_to_20,21_to_many
Centro,3231 (73.82%),878 (20.06%),236 (5.39%),32 (0.73%)
Salamanca,647 (81.38%),119 (14.97%),23 (2.89%),6 (0.75%)
Chamberí,626 (81.51%),115 (14.97%),23 (2.99%),4 (0.52%)
Arganzuela,611 (81.25%),127 (16.89%),14 (1.86%),0 (0.00%)
Tetuán,504 (83.03%),82 (13.51%),20 (3.29%),1 (0.16%)
Ciudad Lineal,419 (84.14%),75 (15.06%),4 (0.80%),0 (0.00%)
Carabanchel,402 (79.60%),95 (18.81%),8 (1.58%),0 (0.00%)
Retiro,378 (80.25%),81 (17.20%),11 (2.34%),1 (0.21%)
Latina,364 (82.17%),75 (16.93%),4 (0.90%),0 (0.00%)
Moncloa - Aravaca,346 (79.00%),83 (18.95%),7 (1.60%),2 (0.46%)


### 2.4. _What is the expected average profit per room type and neighborhood when looking at the reservations for the next 4 weeks? What is the neighbourhood expected to be the most profitable in that period?_

Here we assume that none of the reserved dates will be cancelled and that they are a good representation of the yearly trend.


In [20]:
calendar_df["date"] = pd.to_datetime(calendar_df["date"])
calendar_df = calendar_df[
    calendar_df["date"] <= (calendar_df["date"].min() + datetime.timedelta(weeks=8))
]

calendar_df["price_num"] = calendar_df["price"].apply(price_str_to_float)
calendar_df["adjusted_price_num"] = calendar_df["adjusted_price"].apply(
    price_str_to_float
)

In [21]:
listings_profits_df = (
    calendar_df[calendar_df["available"] == "f"][["listing_id", "adjusted_price_num"]]
    .groupby("listing_id")
    .sum()
    .join(listings_df.set_index("id"))
)


In [22]:
df = (
    listings_profits_df[
        ["neighbourhood_group_cleansed", "room_type", "adjusted_price_num"]
    ]
    .groupby(["neighbourhood_group_cleansed", "room_type"])
    .mean()
    .round(2)
)


In [23]:
sorted_sums = (
    df["adjusted_price_num"].groupby(level=0).sum().sort_values(ascending=False)
)
df.reindex(sorted_sums.index, level=0).unstack(level=1)["adjusted_price_num"]


room_type,Entire home/apt,Hotel room,Private room,Shared room
neighbourhood_group_cleansed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ciudad Lineal,4250.83,754.0,1925.08,25618.25
Puente de Vallecas,2867.94,19950.0,1377.54,1280.78
Moncloa - Aravaca,5833.16,3774.0,11246.41,1055.5
Salamanca,7814.8,6834.75,3076.06,519.8
Arganzuela,4490.48,2155.0,7466.61,2775.0
Chamberí,6854.11,4406.33,2412.99,1689.29
San Blas - Canillejas,7721.99,,5736.84,1301.25
Centro,5976.28,4427.2,3199.49,854.83
Chamartín,4910.17,3665.67,2582.88,1878.5
Hortaleza,5125.04,2096.0,2222.02,987.5


### 2.5. _What listings' factors affect the number of reservations? Can they be predicted?_

Here we assume that none of the reserved dates will be cancelled and that they are a good representation of the yearly trend.


In [24]:
listings_reservations_df = (
    calendar_df[calendar_df["available"] == "f"][["listing_id", "date"]]
    .groupby("listing_id")
    .count()
    .rename(columns={"date": "reservations_count"})
    .join(listings_df.set_index("id"))
)
listings_reservations_df = listings_reservations_df.drop(
    columns=[
        c
        for c in listings_reservations_df.columns
        if any(s in c for s in ["availability", "calculated"])
    ]
)
listings_reservations_df.head()


Unnamed: 0_level_0,reservations_count,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,reviews_per_month,price_num,host_acceptance_rate_num
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6369,34,https://www.airbnb.com/rooms/6369,20220911230855,2022-09-12,city scrape,"Rooftop terrace room , ensuite bathroom",Excellent connection with the AIRPORT and EXHI...,,https://a0.muscache.com/pictures/683224/4cc318...,13660,...,4.82,4.75,4.85,4.82,4.85,,f,0.65,77.0,0.86
21853,56,https://www.airbnb.com/rooms/21853,20220911230855,2022-09-12,city scrape,Bright and airy room,We have a quiet and sunny room with a good vie...,We live in a leafy neighbourhood with plenty o...,https://a0.muscache.com/pictures/68483181/87bc...,83531,...,4.56,4.75,4.82,4.21,4.67,,f,0.34,31.0,0.0
24805,52,https://www.airbnb.com/rooms/24805,20220911230855,2022-09-12,city scrape,Gran Via Studio Madrid,"Studio located 50 meters from Gran Via, next t...","The area is next to the Gran Via, so people li...",https://a0.muscache.com/pictures/miso/Hosting-...,346366726,...,4.83,5.0,5.0,5.0,4.83,,f,0.25,92.0,
26825,26,https://www.airbnb.com/rooms/26825,20220911230855,2022-09-12,city scrape,Single Room whith private Bathroom,Nice and cozy roon for one person with a priva...,"Es un barrio muy tranquilo, en una zona de Mad...",https://a0.muscache.com/pictures/149358/218d5b...,114340,...,4.93,4.8,4.73,4.73,4.74,,f,1.1,26.0,0.74
30320,7,https://www.airbnb.com/rooms/30320,20220911230855,2022-09-12,city scrape,Great Vacational Apartments,<b>The space</b><br />Fully furnished spacious...,,https://a0.muscache.com/pictures/336868/f67409...,130907,...,4.89,4.85,4.8,4.9,4.71,,f,1.15,120.0,0.67


In [25]:
corr_matrix_quant = listings_reservations_df.select_dtypes(include=(float, int)).corr(
    method="spearman"
)
reservations_quant_corr = (
    corr_matrix_quant["reservations_count"]
    .dropna()
    .sort_values(key=abs, ascending=False)
)
reservations_quant_corr.head(20)


reservations_count             1.000000
number_of_reviews_ltm         -0.357409
reviews_per_month             -0.318977
price_num                     -0.296758
number_of_reviews_l30d        -0.239351
minimum_nights                 0.175172
host_listings_count           -0.173926
minimum_minimum_nights         0.170729
review_scores_checkin          0.169883
review_scores_communication    0.166459
host_total_listings_count     -0.161525
number_of_reviews             -0.141365
accommodates                  -0.140312
host_id                       -0.140167
review_scores_accuracy         0.130488
review_scores_value            0.130194
review_scores_rating           0.110060
host_acceptance_rate_num      -0.109813
minimum_maximum_nights         0.103386
review_scores_location         0.100424
Name: reservations_count, dtype: float64

In [26]:
cat_df = listings_reservations_df.select_dtypes(include=object)
cat_df = cat_df[cat_df.columns[cat_df.nunique() <= 10]]
cat_df = pd.concat(
    [
        pd.get_dummies(cat_df),
        listings_reservations_df["reservations_count"],
    ],
    axis=1,
)
cat_df.columns = [c.lower().replace(" ", "_") for c in cat_df.columns]
cat_df.head()


Unnamed: 0_level_0,last_scraped_2022-09-12,last_scraped_2022-09-19,source_city_scrape,source_previous_scrape,host_response_time_a_few_days_or_more,host_response_time_within_a_day,host_response_time_within_a_few_hours,host_response_time_within_an_hour,host_is_superhost_f,host_is_superhost_t,...,host_identity_verified_t,room_type_entire_home/apt,room_type_hotel_room,room_type_private_room,room_type_shared_room,calendar_last_scraped_2022-09-12,calendar_last_scraped_2022-09-19,instant_bookable_f,instant_bookable_t,reservations_count
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6369,1,0,1,0,0,1,0,0,1,0,...,1,0,0,1,0,1,0,1,0,34
21853,1,0,1,0,0,0,0,0,1,0,...,1,0,0,1,0,1,0,1,0,56
24805,1,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,1,0,52
26825,1,0,1,0,0,0,1,0,1,0,...,1,0,0,1,0,1,0,1,0,26
30320,1,0,1,0,0,0,1,0,1,0,...,0,1,0,0,0,1,0,1,0,7


In [27]:
corr_matrix_qual = cat_df.corr(method="spearman")
reservations_qual_corr = (
    corr_matrix_qual["reservations_count"]
    .dropna()
    .sort_values(key=abs, ascending=False)
)
reservations_qual_corr.head(20)


reservations_count                                     1.000000
source_city_scrape                                    -0.620970
source_previous_scrape                                 0.620970
host_response_time_within_an_hour                     -0.359901
host_identity_verified_t                              -0.128687
host_identity_verified_f                               0.128274
room_type_private_room                                 0.107157
instant_bookable_t                                    -0.103801
instant_bookable_f                                     0.103801
room_type_entire_home/apt                             -0.100849
host_is_superhost_t                                   -0.068437
host_is_superhost_f                                    0.067652
host_verifications_['email',_'phone',_'work_email']   -0.045078
host_response_time_a_few_days_or_more                 -0.044479
host_verifications_['email',_'phone']                  0.041208
host_has_profile_pic_f                  

In [32]:
X = pd.concat(
    [
        listings_reservations_df[
            reservations_quant_corr[abs(reservations_quant_corr) > 0.2].index
        ],
        cat_df[reservations_qual_corr[abs(reservations_qual_corr) > 0.2].index],
    ],
    axis=1,
).drop(columns=["reservations_count"])
X.columns = [c.lower().replace(" ", "_") for c in X.columns]

# we want complete, verified data for training a model, hence we remove all listings with NaN values.
X.dropna(inplace=True)

y = listings_reservations_df[["reservations_count"]]


In [33]:
X.head()


Unnamed: 0_level_0,number_of_reviews_ltm,reviews_per_month,price_num,number_of_reviews_l30d,source_city_scrape,source_previous_scrape,host_response_time_within_an_hour
listing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6369,19,0.65,77.0,0,1,0,0
21853,0,0.34,31.0,0,1,0,0
24805,12,0.25,92.0,1,1,0,0
26825,17,1.1,26.0,1,1,0,0
30320,1,1.15,120.0,0,1,0,0


In [34]:
k_fold_cross_valitor = KFold(10, random_state=random_seed, shuffle=True)
model_base = ElasticNetCV(
    l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
    alphas=[0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1],
    max_iter=int(1e5),
    cv=10,
    n_jobs=8,
    random_state=random_seed,
)

k_fold_scores = []
for train_index, test_index in k_fold_cross_valitor.split(X):
    model = deepcopy(model_base)
    normalizer = Normalizer()
    X_train = normalizer.fit_transform(np.ascontiguousarray(X.iloc[train_index]))
    model.fit(
        X_train,
        np.ravel(np.ascontiguousarray(y.iloc[train_index])),
    )
    k_fold_scores.append(
        r2_score(
            np.ravel(np.ascontiguousarray(y.iloc[test_index])),
            model.predict(
                normalizer.transform(np.ascontiguousarray((X.iloc[test_index])))
            ),
        )
    )

print(k_fold_scores)
print(np.mean(k_fold_scores))


[0.01463093879962718, 0.002381838139946746, 0.009622975112254184, 0.01444252696875925, 0.0018089556237939064, 0.002595667313382455, 0.004576223223146325, 0.004356151885459569, 0.013098130634573435, 0.004184395422128451]
0.00716978031230715


Using the selected columns, the model is able to explain X% of the variance in the number of reservations.
