In [None]:
from tqdm.notebook import tqdm
import pandas as pd
import os
import re
from pandarallel import pandarallel
import reverse_geocode
from scripts.custom.viz.class_transformers import AmenitiesTransformer


# Initialise pandarallel (with progress bars) so the `parallel_apply` calls
# used throughout this notebook are available on pandas objects.
pandarallel.initialize(progress_bar=True)
# Register tqdm's pandas integration (presumably for `progress_apply`; it is
# not used in the cells visible here).
tqdm.pandas()

In [None]:
# Load every per-city file from the data folder into `datasets`, keyed by the
# two-character code that follows the first underscore in the file name
# (e.g. "..._mi..." -> "df_mi"), then stack them into a single frame.
datasets = {}
data_dir = "../data/all_cities"
city_code_regex = re.compile(r'_(\w{2})')

# sorted(): os.listdir returns entries in arbitrary order, which would make
# the row order of the concatenated frame differ between runs/machines.
for file in sorted(os.listdir(data_dir)):
    file_path = os.path.join(data_dir, file)
    # Skip hidden entries and directories (e.g. ".ipynb_checkpoints",
    # ".DS_Store") that would otherwise be handed to read_csv.
    if file.startswith(".") or not os.path.isfile(file_path):
        continue
    match = city_code_regex.search(file)
    if match is None:
        # No "_xx" code in the name: not one of the city files.
        continue
    result = match.group(1)
    # NOTE(review): two files sharing the same two-character code overwrite
    # each other here, exactly as before — confirm codes are unique.
    datasets[f"df_{result}"] = pd.read_csv(file_path)
df = pd.concat(list(datasets.values()), ignore_index=True)

df.shape[0]

# Retrieving location from coordinates


In [None]:
def retrieve_city(row):
    """Add the nearest known city and its population to a listing row.

    Parameters
    ----------
    row : pandas.Series
        A listing row exposing ``latitude`` and ``longitude``.

    Returns
    -------
    pandas.Series
        The same row with ``listing_city`` and ``listing_city_pop`` set from
        the ``city`` and ``population`` fields of the reverse-geocode result.
    """
    coords = (row["latitude"], row['longitude'])
    # Look the coordinates up once and reuse the result for both fields.
    location = reverse_geocode.get(coords)
    row["listing_city"] = location["city"]
    row["listing_city_pop"] = location["population"]
    return row

# Run retrieve_city on every row (axis=1) in parallel; the returned rows carry
# the added `listing_city` and `listing_city_pop` columns.
df = df.parallel_apply(retrieve_city, axis=1)

In [None]:
# Count missing values per column and show them as a two-column table
# (column name, number of NAs).
# To list only the columns that actually have missing values, filter with
# sep_nas.loc[sep_nas["NAs"] > 0, :].
na_counts = df.isnull().sum()
sep_nas = na_counts.to_frame(name="NAs")
sep_nas.reset_index()

## Drop columns with too many NAs

In [None]:
# Columns removed in this step (the section heading gives the reason:
# too many missing values).
sparse_columns = [
    "neighborhood_overview",
    "host_about",
    "host_neighbourhood",
    "neighbourhood",
    "neighbourhood_group_cleansed",
    "calendar_updated",
    "license",
]
df.drop(columns=sparse_columns, inplace=True)

## Drop columns that are not useful

In [None]:
# Columns this notebook treats as not useful and removes in one go.
unused_columns = [
    "listing_url",
    "scrape_id",
    "last_scraped",
    "source",
    "name",
    "description",
    "picture_url",
    "host_url",
    "host_name",
    "host_thumbnail_url",
    "host_picture_url",
    "minimum_minimum_nights",
    "maximum_minimum_nights",
    "minimum_maximum_nights",
    "maximum_maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm",
    "has_availability",
    "availability_30",
    "availability_60",
    "availability_90",
    "availability_365",
    "calendar_last_scraped",
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "instant_bookable",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
]
df.drop(columns=unused_columns, inplace=True)

# From here on rows are addressed by the listing id.
df.set_index("id", inplace=True)

In [None]:
# Histogram of every numeric column, with the 't'/'f' flag columns recoded to
# 1/0 so they are plotted too. `df` itself is left untouched.
pd.set_option('future.no_silent_downcasting', True)

# With silent downcasting disabled, replace() leaves the recoded columns as
# object dtype, and DataFrame.hist only plots numeric columns. infer_objects()
# converts them to a numeric dtype so the flags actually show up.
see_histograms = df.replace({'f': 0, 't': 1}).infer_objects()

see_histograms.hist(figsize=(30,20));

In [None]:
print(df.shape)

# Per-row summary of missing data: how many cells are NA, and which columns.
missing_per_row = df.isnull().sum(axis=1)
missing_column_names = df.parallel_apply(
    lambda row: ', '.join(row.index[row.isnull()]),
    axis=1,
)
df_nas_columns = pd.DataFrame({
    'NAs': missing_per_row,
    'Columns_with_NAs': missing_column_names,
})

# Show the rows with more than 7 missing values.
df_nas_columns.loc[df_nas_columns["NAs"] > 7]

In [None]:
# Remove every listing that has more than 7 missing values, then report the
# remaining shape.
too_many_missing = df_nas_columns["NAs"] > 7
more_than_7_missing = df_nas_columns[too_many_missing].index.tolist()
df.drop(index=more_than_7_missing, inplace=True)
df.shape

# Handling amenities

In [None]:
# Count how often each amenity name occurs across all listings.
# Each `amenities` cell is a string such as '["Wifi", "Kitchen"]'; it is split
# on ', ' after trimming the surrounding brackets, and the double quotes around
# each name are removed.
amenities_lists = df["amenities"].tolist()

amenities_counter = {}

for el in amenities_lists:
    for e in el.strip('][').split(', '):
        amenity_name = e.strip('"')
        # dict.get replaces the former bare `except:`, which would also have
        # hidden unrelated errors (and KeyboardInterrupt) raised in this loop.
        amenities_counter[amenity_name] = amenities_counter.get(amenity_name, 0) + 1

# Most frequent amenities first.
amenities_counter = dict(sorted(amenities_counter.items(), key=lambda item: item[1], reverse=True))


## Remapping amenities into a smaller set of categories

In [None]:
# Maps each raw amenity name to its category; filled in by the cells below.
amenities_remapping = {}


### technology

In [None]:
# Whole-word, case-insensitive keywords that mark an amenity as "technology".
pattern = r'\b(wifi|internet|ethernet|cable|fibra|dolby|smart|connection|tv|television|netflix|amazon|disney)\b'
regex = re.compile(pattern, re.IGNORECASE)

# Tag every counted amenity whose name contains one of the keywords.
amenities_remapping.update(
    {am: "technology" for am in amenities_counter if regex.search(am)}
)

### kitchen

In [None]:
# Whole-word, case-insensitive keywords that mark an amenity as "kitchen".
pattern = r'\b(kitchen|cooking|grill|cucina|refrigerator|fridge|oven|stove|dish|coffee|espresso|lavazza|dining|breakfast|microonde|microwave|washer|freezer|glasses|toast|baking)\b'
regex = re.compile(pattern, re.IGNORECASE)

# Matching amenities are (re)assigned to "kitchen", overriding any category
# given by an earlier cell.
amenities_remapping.update(
    {am: "kitchen" for am in amenities_counter if regex.search(am)}
)

### toiletry

In [None]:
# Whole-word, case-insensitive keywords that mark an amenity as "toiletry".
pattern = r'\b(hair|capelli|soap|sapone|bidet|shampoo|bathtub|gel|laundry|closet|pillow|blanket|shower)\b'
regex = re.compile(pattern, re.IGNORECASE)

# Matching amenities are (re)assigned to "toiletry", overriding any category
# given by an earlier cell.
amenities_remapping.update(
    {am: "toiletry" for am in amenities_counter if regex.search(am)}
)

### AC/heating

In [None]:
# Whole-word, case-insensitive keywords that mark an amenity as "AC/heating".
pattern = r'\b(heating|ac|air|conditioning|fan)\b'
regex = re.compile(pattern, re.IGNORECASE)

# Matching amenities are (re)assigned to "AC/heating", overriding any category
# given by an earlier cell.
amenities_remapping.update(
    {am: "AC/heating" for am in amenities_counter if regex.search(am)}
)

### benefits

In [None]:
# Whole-word, case-insensitive keywords that mark an amenity as "benefits".
# "extinguisher" is added next to the original "estinguisher" spelling, which
# is a typo and cannot match an amenity such as "Fire extinguisher".
pattern = r'\b(garden|backyard|skyline|beach|gym|fitness|view|outdoor|balcony|waterfront|bed linen|workspace|aid|luggage|elevator|free|safe|lock|security|bike|estinguisher|extinguisher)\b'
regex = re.compile(pattern, re.IGNORECASE)

# Matching amenities are (re)assigned to "benefits", overriding any category
# given by an earlier cell.
for am in list(amenities_counter.keys()):
    if regex.search(am):
        amenities_remapping[am] = "benefits"

### other

In [None]:
# Everything that none of the category cells above picked up becomes "other".
#
# This used to re-test a hand-copied union of all category keywords, but the
# copy had drifted from the originals ("refrigerator#", "bidet#", "outdoor#",
# "showers" instead of "shower"). As a result some amenities that had already
# been categorised (e.g. a plain "Refrigerator") were overwritten with "other",
# and others ended up with no category at all. Deriving "other" from the
# mapping itself cannot drift and guarantees every counted amenity has exactly
# one category.
for am in amenities_counter:
    amenities_remapping.setdefault(am, "other")

## Remapping amenities in dataframe

In [None]:
def unwrap_remap_amenities(value):
    """Turn a raw amenities string into the list of its amenity categories.

    Parameters
    ----------
    value : str
        Amenities as stored in the data, e.g. ``'["Wifi", "Kitchen"]'``. It is
        parsed the same way as when ``amenities_counter`` is built: brackets
        trimmed, split on ``', '``, double quotes stripped.

    Returns
    -------
    list
        One entry per amenity, in the original order: the category from
        ``amenities_remapping``, or NaN when the amenity has no mapping
        (the same result ``Series.map`` gave before).
    """
    names = (item.strip('"') for item in value.strip('][').split(', '))
    # Plain dict lookup: building a pandas Series for every row just to call
    # .map() was the dominant cost of this step.
    return [amenities_remapping.get(name, float("nan")) for name in names]

# Replace each raw amenities string with its list of categories (in parallel).
df["amenities"] = df["amenities"].parallel_apply(unwrap_remap_amenities)

In [None]:
def return_amenity_counter(row):
    """Add one ``amenities_<category>`` count per amenity category to a row.

    Parameters
    ----------
    row : pandas.Series
        A listing row whose ``amenities`` entry is a list of category names.

    Returns
    -------
    pandas.Series
        The same row with six extra entries holding how many times each
        category occurs in ``row["amenities"]``.
    """
    categories = ("AC/heating", "technology", "kitchen", "benefits", "toiletry", "other")
    listed = row["amenities"]
    # Count first, assign afterwards, so the counts are all taken from the
    # untouched list.
    occurrences = [listed.count(category) for category in categories]
    for category, total in zip(categories, occurrences):
        row[f'amenities_{category}'] = total
    return row

# Add the six amenities_<category> count columns to every row (in parallel).
df = df.parallel_apply(return_amenity_counter, axis=1)


## Manage Property type

In [None]:
# All property_type values, one per listing, as a plain Python list.
property_type_list = df["property_type"].tolist()

In [None]:
# Number of listings per property type, keyed in order of first appearance.
# Single pass: the previous comprehension called list.count() once per
# listing, which is quadratic in the number of rows.
properties_frequencies = {}
for property_type in property_type_list:
    properties_frequencies[property_type] = properties_frequencies.get(property_type, 0) + 1

In [None]:
# Display the property-type counts.
properties_frequencies

In [None]:
# Maps each raw property type to its coarse category; filled in by the cells below.
property_type_remapping = {}

### Entire property

In [None]:
# Whole-word, case-insensitive phrases that mark a property type as an
# entire property.
pattern = r'\b(entire|tiny home)\b'
regex = re.compile(pattern, re.IGNORECASE)

property_type_remapping.update(
    {am: "entire_property" for am in properties_frequencies if regex.search(am)}
)

### Private room

In [None]:
# Whole-word, case-insensitive phrases that mark a property type as a
# private room.
pattern = r'\b(private room|room in serviced apartment|room in bed and breakfast|room in hotel|room in resort)\b'
regex = re.compile(pattern, re.IGNORECASE)

# Matching types are (re)assigned, overriding any earlier category.
property_type_remapping.update(
    {am: "private_room" for am in properties_frequencies if regex.search(am)}
)


### Shared room

In [None]:
# Whole-word, case-insensitive phrases that mark a property type as a
# shared room.
pattern = r'\b(shared room|shared)\b'
regex = re.compile(pattern, re.IGNORECASE)

# Matching types are (re)assigned, overriding any earlier category.
property_type_remapping.update(
    {am: "shared_room" for am in properties_frequencies if regex.search(am)}
)



### Other

In [None]:
# Union of the phrases used by the three category cells above; any property
# type that matches none of them is labelled "other".
pattern = r'\b(entire|tiny home|private room|room in serviced apartment|room in bed and breakfast|room in hotel|room in resort|shared room|shared)\b'
regex = re.compile(pattern, re.IGNORECASE)

property_type_remapping.update(
    {am: "other" for am in properties_frequencies if not regex.search(am)}
)

In [None]:
# Replace each raw property type with its category from property_type_remapping.
df["property_type"] = df['property_type'].map(property_type_remapping)

In [None]:
# Preview the first rows after remapping property_type.
df.head()

## Fill Host Locations NAs with the values from the same HostID or same listing city

In [None]:
def fill_host_location(row):
    """Fill a missing ``host_location`` with the listing's city.

    Only the listing-city fallback is implemented here; nothing in this
    function looks at other listings of the same host.

    Parameters
    ----------
    row : pandas.Series
        A listing row exposing ``host_location`` and ``listing_city``.

    Returns
    -------
    pandas.Series
        The same row; when ``host_location`` was NA it now reads
        ``"<listing_city>, Italy"``, otherwise it is unchanged.
    """
    if pd.notna(row["host_location"]):
        return row
    city = row["listing_city"]
    row["host_location"] = city + ", Italy"
    return row

# Fill missing host_location values row by row (in parallel).
df = df.parallel_apply(fill_host_location, axis=1)

In [None]:
# Preview the first rows after filling host_location.
df.head()