In [1]:
from tqdm.notebook import tqdm
import pandas as pd
import os
import re
from pandarallel import pandarallel
import reverse_geocode
from scripts.custom.viz.class_transformers import AmenitiesTransformer


pandarallel.initialize(progress_bar=True)
tqdm.pandas()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
datasets = {}

for file in os.listdir("../data/all_cities"):
    pattern = r'_(\w{2})'
    match = re.search(pattern, file)
    result = match.group(1)
    datasets[f"df_{result}"] = pd.read_csv(f"../data/all_cities/{file}")
df = pd.concat([value for key, value in datasets.items()], ignore_index=True)

df.shape

(96015, 75)

# Retrieving location from coordinates


In [3]:
def retrieve_city(row):
    coords = (row["latitude"], row['longitude'])
    row["listing_city"] = reverse_geocode.get(coords)["city"]
    row["listing_city_pop"] = reverse_geocode.get(coords)["population"]
    return row

df = df.parallel_apply(retrieve_city, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=24004), Label(value='0 / 24004')))…

In [4]:
sep_nas = pd.DataFrame(df.isnull().sum(), columns=["NAs"])
sep_nas.loc[sep_nas["NAs"] > 0, :]

Unnamed: 0,NAs
description,2861
neighborhood_overview,41432
host_name,1
host_since,1
host_location,22088
host_about,48562
host_response_time,11774
host_response_rate,11774
host_acceptance_rate,7630
host_is_superhost,2182


## Drop columns with too many NAs

In [5]:
df.drop(["neighborhood_overview",
             "host_about",
             "host_neighbourhood",
             "neighbourhood",
             "neighbourhood_group_cleansed",
             "calendar_updated",
             "license"],
            axis=1,
            inplace=True)

## Drop not useful

In [6]:
df.drop([
    "listing_url",
    "scrape_id",
    "last_scraped",
    "source",
    "name",
    "description",
    "picture_url",
    "host_url",
    "host_name",
    "host_thumbnail_url",
    "host_picture_url",
    "minimum_minimum_nights",
    "maximum_minimum_nights",
    "minimum_maximum_nights",
    "maximum_maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm",
    "has_availability",
    "availability_30",
    "availability_60",
    "availability_90",
    "availability_365",
    "calendar_last_scraped",
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "instant_bookable",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
],
axis=1,
inplace=True)

df.set_index("id", inplace=True)

In [None]:
see_histograms = df.copy()

pd.set_option('future.no_silent_downcasting', True)
see_histograms.replace({'f': 0, 't': 1}, inplace=True)

see_histograms.hist(figsize=(30,20));

In [7]:
print(df.shape)
df_nas_columns = pd.DataFrame({
    'NAs': df.isnull().sum(axis=1),
    'Columns_with_NAs': df.parallel_apply(lambda x: ', '.join(x.index[x.isnull()]), axis=1)
})
df_nas_columns.loc[df_nas_columns["NAs"]>7]

(96015, 39)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=24004), Label(value='0 / 24004')))…

Unnamed: 0_level_0,NAs,Columns_with_NAs
id,Unnamed: 1_level_1,Unnamed: 2_level_1
482422,13,"host_location, host_response_time, host_respon..."
523048,10,"first_review, last_review, review_scores_ratin..."
523221,13,"bathrooms, beds, price, first_review, last_rev..."
531903,10,"first_review, last_review, review_scores_ratin..."
1190641,13,"host_response_time, host_response_rate, host_a..."
...,...,...
1178237255401102369,10,"first_review, last_review, review_scores_ratin..."
1178241933628166985,13,"host_response_time, host_response_rate, host_a..."
1178285228892794868,10,"first_review, last_review, review_scores_ratin..."
1178322393649515459,10,"first_review, last_review, review_scores_ratin..."


In [8]:
more_than_7_missing = df_nas_columns.loc[df_nas_columns["NAs"]>7, :].index.tolist()
df.drop(more_than_7_missing, inplace=True)
df.shape

(81316, 39)

# Handling amenities

In [9]:
amenities_lists = df["amenities"].tolist()

amenities_counter = {}

for el in amenities_lists:
    for e in el.strip('][').split(', '):
        try:
            amenities_counter[e.strip('"')] += 1
        except:
            amenities_counter[e.strip('"')] = 1

amenities_counter = dict(sorted(amenities_counter.items(), key=lambda item: item[1], reverse=True))


## Remapping amenities in smaller dictionary

In [10]:
amenities_remapping = {}


### technology

In [11]:
pattern = r'\b(wifi|internet|ethernet|cable|fibra|dolby|smart|connection|tv|television|netflix|amazon|disney)\b'
regex = re.compile(pattern, re.IGNORECASE)

for am in list(amenities_counter.keys()):
    if regex.search(am):
        amenities_remapping[am] = "technology"

### kitchen

In [12]:
pattern = r'\b(kitchen|cooking|grill|cucina|refrigerator|fridge|oven|stove|dish|coffee|espresso|lavazza|dining|breakfast|microonde|microwave|washer|freezer|glasses|toast|baking)\b'
regex = re.compile(pattern, re.IGNORECASE)

for am in list(amenities_counter.keys()):
    if regex.search(am):
        amenities_remapping[am] = "kitchen"

### toiletry

In [13]:
pattern = r'\b(hair|capelli|soap|sapone|bidet|shampoo|bathtub|gel|laundry|closet|pillow|blanket|shower)\b'
regex = re.compile(pattern, re.IGNORECASE)

for am in list(amenities_counter.keys()):
    if regex.search(am):
        amenities_remapping[am] = "toiletry"

### AC/heating

In [14]:
pattern = r'\b(heating|ac|air|conditioning|fan)\b'
regex = re.compile(pattern, re.IGNORECASE)

for am in list(amenities_counter.keys()):
    if regex.search(am):
        amenities_remapping[am] = "AC/heating"

### benefits

In [15]:
pattern = r'\b(garden|backyard|skyline|beach|gym|fitness|view|outdoor|balcony|waterfront|bed linen|workspace|aid|luggage|elevator|free|safe|lock|security|bike|estinguisher)\b'
regex = re.compile(pattern, re.IGNORECASE)

for am in list(amenities_counter.keys()):
    if regex.search(am):
        amenities_remapping[am] = "benefits"

### other

In [16]:
pattern = r'\b(wifi|internet|ethernet|cable|fibra|dolby|smart|connection|tv|television|netflix|amazon|disney|kitchen|cooking|grill|cucina|refrigerator#|fridge|oven|stove|dish|coffee|espresso|lavazza|dining|breakfast|microonde|microwave|washer|freezer|glasses|toast|baking|hair|capelli|soap|sapone|bidet#|shampoo|bathtub|gel|laundry|closet|pillow|blanket|showers|heating|ac|air|conditioning|fan|garden|backyard|skyline|beach|gym|fitness|view|outdoor#|balcony|waterfront|bed linen|workspace|aid|luggage|elevator|free|safe|lock|security|bike)\b'

regex = re.compile(pattern, re.IGNORECASE)

for am in list(amenities_counter.keys()):
    if not regex.search(am):
        amenities_remapping[am] = "other"

## Remapping amenities in dataframe

In [17]:
def unwrap_remap_amenities(value):
    element = [e.strip('"') for e in value.strip('][').split(', ')]
    remapped_amenities = (pd.Series(element)).map(amenities_remapping)
    return remapped_amenities.tolist()

df["amenities"] = df["amenities"].parallel_apply(unwrap_remap_amenities)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=20329), Label(value='0 / 20329')))…

In [18]:
def return_amenity_counter(row):
    amenities = ["AC/heating", "technology", "kitchen", "benefits", "toiletry", "other"]
    counts = {amenity: row["amenities"].count(amenity) for amenity in amenities}
    for amenity, count in counts.items():
        row[f'amenities_{amenity}'] = count
    return row

df = df.parallel_apply(return_amenity_counter, axis=1)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=20329), Label(value='0 / 20329')))…

## Manage Property type

In [32]:
property_type_list = df["property_type"].tolist()

In [33]:
properties_frequencies = {x:property_type_list.count(x) for x in property_type_list}

In [34]:
properties_frequencies

{'private_room': 15419,
 'entire_property': 64767,
 'other': 743,
 'shared_room': 387}

In [22]:
property_type_remapping = {}

### Entire property

In [23]:
pattern = r'\b(entire|tiny home)\b'
regex = re.compile(pattern, re.IGNORECASE)

for am in list(properties_frequencies.keys()):
    if regex.search(am):
        property_type_remapping[am] = "entire_property"

### Private room

In [24]:
pattern = r'\b(private room|room in serviced apartment|room in bed and breakfast|room in hotel|room in resort)\b'
regex = re.compile(pattern, re.IGNORECASE)

for am in list(properties_frequencies.keys()):
    if regex.search(am):
        property_type_remapping[am] = "private_room"


### Shared room

In [25]:
pattern = r'\b(shared room|shared)\b'
regex = re.compile(pattern, re.IGNORECASE)

for am in list(properties_frequencies.keys()):
    if regex.search(am):
        property_type_remapping[am] = "shared_room"



### Other

In [26]:
pattern = r'\b(entire|tiny home|private room|room in serviced apartment|room in bed and breakfast|room in hotel|room in resort|shared room|shared)\b'
regex = re.compile(pattern, re.IGNORECASE)

for am in list(properties_frequencies.keys()):
    if not regex.search(am):
        property_type_remapping[am] = "other"

In [30]:
df["property_type"] = df['property_type'].map(property_type_remapping)

In [31]:
df.head()

Unnamed: 0_level_0,host_id,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,...,review_scores_value,reviews_per_month,listing_city,listing_city_pop,amenities_AC/heating,amenities_technology,amenities_kitchen,amenities_benefits,amenities_toiletry,amenities_other
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31840,380378,2011-02-07,"Florence, Italy",within an hour,100%,100%,f,39,41,"['email', 'phone', 'work_email']",...,5,1,Florence,367150,2,2,5,2,3,14
32120,99235,2010-03-26,"Florence, Italy",within an hour,100%,50%,f,1,1,"['email', 'phone']",...,5,0,Ponte a Ema,8412,2,2,1,0,0,2
32180,13925330,2014-04-05,"Florence, Italy",within a day,100%,67%,t,1,1,"['email', 'phone']",...,5,0,Florence,367150,1,2,6,0,1,10
39115,167739,2010-07-15,"Florence, Italy",within a few hours,100%,62%,f,10,12,"['email', 'phone', 'work_email']",...,5,0,Florence,367150,2,3,0,5,5,10
39165,167739,2010-07-15,"Florence, Italy",within a few hours,100%,62%,f,10,12,"['email', 'phone', 'work_email']",...,4,0,Florence,367150,2,3,0,4,5,12


## Fill Host Locations NAs with the values from the same HostID or same listing city

In [None]:
def fill_host_location(row):
    if pd.isna(row["host_location"]):
        row["host_location"] = row["listing_city"]+", Italy"
    return row

df = df.parallel_apply(fill_host_location, axis=1)

In [None]:
df.head()