In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm
import json
import statistics
import re
from sklearn.impute import KNNImputer

## December 2023

In [2]:
dic2023_reviews = pd.read_csv("data/2023dic/reviews.csv")
dic2023_calendar = pd.read_csv("data/2023dic/calendar.csv", dtype={"listing_id": str,
                                                   "date": str,
                                                   "available": str,
                                                   "price": str,
                                                   "adjusted_price": str,
                                                   "minimum_nights": str,
                                                   "maximum_nights": str})
dic2023_listings = pd.read_csv("data/2023dic/listings.csv")
dic2023_neighbourhoods = pd.read_csv("data/2023dic/neighbourhoods.csv")
# dic2023_geo_neighbourhoods  # GeoJson
dic2023_d_listings = pd.read_csv("data/2023dic/d_listings.csv")
dic2023_d_reviews = pd.read_csv("data/2023dic/d_reviews.csv")


## Listings

In [3]:
dic2023_d_listings = dic2023_d_listings[['id', 'name', 'neighborhood_overview', 'host_id', 'host_since', 'host_location', 'host_about',
                   'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
                   'host_listings_count', 'host_total_listings_count', 'host_verifications',
                   'host_has_profile_pic', 'host_identity_verified', 'neighbourhood_cleansed',
                   'latitude', 'longitude', 'property_type', 'room_type',
                   'accommodates', 'bathrooms_text', 'beds', 'price',
                   'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability', 'availability_30',
                   'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'first_review',
                   'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                   'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
                   'review_scores_value', 'calculated_host_listings_count',
                   'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms',
                   'calculated_host_listings_count_shared_rooms', 'reviews_per_month']]


In [4]:
dic2023_d_listings.neighborhood_overview = dic2023_d_listings.neighborhood_overview.fillna("")
dic2023_d_listings.host_about = dic2023_d_listings.host_about.fillna("")

We can consider the distance between the Host house and the actual property

In [5]:
#location_geo = {}
#for l in tqdm(dic2023_d_listings.host_location.unique().tolist()):
#    geolocator = Nominatim(user_agent="Host to listing distance")
#    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.1)
#    host_location = geolocator.geocode(l)
#    location_geo[l] = (host_location.latitude, host_location.longitude)
#    
#with open("data/2023dic/hosts_locations.json", 'w') as f:
#    json.dump(location_geo, f)
#

### Host and listing location

In [6]:
with open("data/2023dic/hosts_locations.json", 'r') as f:
    location_geo = json.load(f)

dic2023_d_listings["host_location"] = dic2023_d_listings["host_location"].apply(lambda x: location_geo.get(x))

def geodesic_distancer(row):
    try:
        coords_1 = (row['host_location'][0], row['host_location'][1])
        coords_2 = (row["latitude"], row["longitude"])
        return geodesic(coords_1, coords_2).km
    except:
        return None

dic2023_d_listings['host_to_listing_geodesic_km'] = dic2023_d_listings.apply(geodesic_distancer, axis=1)
dic2023_d_listings.host_to_listing_geodesic_km = dic2023_d_listings.host_to_listing_geodesic_km.fillna(statistics.mode(dic2023_d_listings["host_to_listing_geodesic_km"]))
dic2023_d_listings.drop("host_location", axis=1, inplace=True)

### Response time

In [7]:
dic2023_d_listings.host_response_time = dic2023_d_listings.host_response_time.fillna("MISSING")
categorial_response_time = CategoricalDtype(categories=['within an hour',
                                                        'within a few hours',
                                                        'within a day',
                                                        'a few days or more',
                                                        'MISSING'],
                                            ordered=True)
dic2023_d_listings.host_response_time = dic2023_d_listings.host_response_time.astype(categorial_response_time)

### Response rate

In [8]:
dic2023_d_listings["host_response_rate"] = dic2023_d_listings["host_response_rate"].str.rstrip('%').astype(float)
dic2023_d_listings.host_response_rate = dic2023_d_listings.host_response_rate.fillna(statistics.mode(dic2023_d_listings["host_response_rate"]))


### Acceptance rate

In [9]:
dic2023_d_listings["host_acceptance_rate"] = dic2023_d_listings["host_acceptance_rate"].str.rstrip('%').astype(float)
dic2023_d_listings.host_acceptance_rate = dic2023_d_listings.host_acceptance_rate.fillna(statistics.mode(dic2023_d_listings["host_acceptance_rate"]))


### Is superhost

In [10]:
dic2023_d_listings.host_is_superhost = dic2023_d_listings.host_is_superhost.fillna(statistics.mode(dic2023_d_listings["host_is_superhost"]))
dic2023_d_listings.host_is_superhost = dic2023_d_listings.host_is_superhost.astype(CategoricalDtype(categories=["t", "f"], ordered=False))

### Bathrooms and bathrooms_text

In [11]:
dic2023_d_listings.bathrooms_text = dic2023_d_listings.bathrooms_text.fillna(statistics.mode(dic2023_d_listings["bathrooms_text"]))

#### Create bathrooms feature

In [12]:
def extract_digits(text):
    if "half" in text.lower():
        return '0.5'
    digits = re.findall(r'\d+\.\d+|\d+', text)
    return ''.join(digits)

dic2023_d_listings["bathrooms"] = dic2023_d_listings["bathrooms_text"].apply(extract_digits)
dic2023_d_listings["bathrooms"] = dic2023_d_listings["bathrooms"].astype(float)

#### Edit bathrooms text_feature

In [13]:
def remove_digits(text):
    return re.sub(r'\d', '', text).strip()
dic2023_d_listings["bathrooms_text"] = dic2023_d_listings["bathrooms_text"].apply(remove_digits)

remap_baths = {
    'baths': 'single',
    'bath': 'single',
    'private bath': 'private',
    'shared bath': 'shared',
    'shared baths': 'shared',
    'Shared half-bath': 'shared',
    '. baths': 'single',
    '. shared baths': 'shared',
    'Half-bath': 'single',
    'Private half-bath': 'private'
}

dic2023_d_listings["bathrooms_text"] = dic2023_d_listings["bathrooms_text"].replace(remap_baths)


### Beds

In [14]:
dic2023_d_listings.beds = dic2023_d_listings.beds.fillna(statistics.mode(dic2023_d_listings["beds"]))

### Availability

In [15]:
dic2023_d_listings.has_availability = dic2023_d_listings.has_availability.fillna(statistics.mode(dic2023_d_listings["has_availability"]))

## Dealing with Types before NAs imputation

In [16]:
dic2023_d_listings["host_id"] = dic2023_d_listings["host_id"].astype(str)
dic2023_d_listings["host_since"] = pd.to_datetime(dic2023_d_listings["host_since"])


In [17]:
dic2023_d_listings["email_verification"] = False
dic2023_d_listings["phone_verification"] = False
dic2023_d_listings["work_email_verification"] = False

def allocate_verifications_to_variables(row):
    if "email" in row["host_verifications"]:
        row["email_verification"] = True
    if "phone" in row["host_verifications"]:
        row["phone_verification"] = True
    if "work_email" in row["host_verifications"]:
        row["work_email_verification"] = True
    return row

dic2023_d_listings = dic2023_d_listings.apply(allocate_verifications_to_variables, axis=1)

In [18]:
dic2023_d_listings.host_has_profile_pic = dic2023_d_listings.host_has_profile_pic.astype(CategoricalDtype(categories=["t", "f"], ordered=False))

In [19]:
dic2023_d_listings.host_identity_verified = dic2023_d_listings.host_identity_verified.astype(CategoricalDtype(categories=["t", "f"], ordered=False))

In [20]:
new_neighbourhoods_levels = {'Cannaregio': 'Centro Storico',
                             'San Marco':'Centro Storico',
                             'Isola San Giorgio': 'Centro Storico',
                             'San Polo':'Centro Storico',
                             'Castello': 'Centro Storico',
                             "Sant'Elena": 'Centro Storico',
                             'Dorsoduro': 'Centro Storico',
                             'Sacca Fisola': 'Centro Storico',
                             'Giudecca': 'Centro Storico',
                             'Tronchetto': 'Centro Storico',
                             'Santa Croce': 'Centro Storico',
                             "Ca' Emiliani": 'Terraferma',
                             'Marghera Zona Industriale': 'Terraferma',
                             'Marghera Catene': 'Terraferma',
                             'Marghera': 'Terraferma',
                             "Ca' Sabbioni":'Terraferma',
                             'Giustizia': 'Terraferma',
                             'San Lorenzo XXV Aprile': 'Terraferma',
                             'Bissuola': 'Terraferma',
                             'Cipressina': 'Terraferma',
                             'Zona Commerciale via Torino': 'Terraferma',
                             'Carpenedo': 'Terraferma',
                             'Villabona': 'Terraferma',
                             'Santa Barbara': 'Terraferma',
                             'Altobello': 'Terraferma',
                             'Piave 1860': 'Terraferma',
                             'La Favorita': 'Terraferma',
                             'Villaggio Sartori': 'Terraferma',
                             'Villaggio San Marco': 'Terraferma',
                             'Gazzera': 'Terraferma',
                             'Asseggiano': 'Terraferma',
                             "Pra' Secco": 'Terraferma',
                             'Gatta - Bondu?': 'Terraferma',
                             'Quartiere Pertini': 'Terraferma',
                             'Campalto CEP': 'Terraferma',
                             'Mestre': 'Terraferma',
                             "Scaramuzza": "Terraferma",
                             'Alberoni': 'Isole',
                             'Malamocco': 'Isole',
                             'Lido': 'Isole',
                             "Sant'Erasmo": 'Isole',
                             'Burano': 'Isole',
                             'San Pietro in Volta': 'Isole',
                             'Mazzorbo': 'Isole',
                             'Pellestrina': 'Isole',
                             'Murano': 'Isole',
                             'Torcello': 'Isole',
                             'Favaro': 'Terraferma',
                             'Case Dosa': 'Terraferma',
                             'Marocco Terraglio': 'Terraferma',
                             'Campalto Gobbi': 'Terraferma',
                             'Malcontenta': 'Terraferma',
                             'Zelarino': 'Terraferma',
                             'Chirignago': 'Terraferma',
                             'Campalto Bagaron': 'Terraferma',
                             'Dese': 'Terraferma',
                             'Torre Antica': 'Terraferma',
                             'Aeroporto': 'Terraferma',
                             'Tessera':'Terraferma',
                             'Campalto': 'Terraferma',
                             'other city': 'Terraferma'}

dic2023_d_listings['neighbourhood_cleansed'] = dic2023_d_listings['neighbourhood_cleansed'].replace(new_neighbourhoods_levels)
neighbourhoods_dummies = pd.get_dummies(dic2023_d_listings['neighbourhood_cleansed'], drop_first=True)
#dic2023_d_listings = pd.concat([dic2023_d_listings, neighbourhoods_dummies], axis=0)

In [36]:
# TODO: concatenate dummy neighbourhoods with main dataframe
# TODO: continue with the data transformation to categorial etc
# TODO: fill NAs of reviews with the KNNImputer

In [34]:
dic2023_d_listings["neighbourhood_cleansed"]

0       Centro Storico
1       Centro Storico
2       Centro Storico
3       Centro Storico
4       Centro Storico
             ...      
7881    Centro Storico
7882        Terraferma
7883    Centro Storico
7884    Centro Storico
7885    Centro Storico
Name: neighbourhood_cleansed, Length: 7886, dtype: object

In [22]:
property_types_groupings = {
    'Entire rental unit': 'Entire Place',
    'Entire home': 'Entire Place',
    'Entire vacation home': 'Entire Place',
    'Entire serviced apartment': 'Entire Place',
    'Entire condo': 'Entire Place',
    'Entire loft': 'Entire Place',
    'Entire guesthouse': 'Entire Place',
    'Entire villa': 'Entire Place',
    'Entire townhouse': 'Entire Place',
    'Entire bungalow': 'Entire Place',
    'Entire guest suite': 'Entire Place',
    'Entire cottage': 'Entire Place',
    'Entire chalet': 'Entire Place',
    'Entire place': 'Entire Place',
    'Entire home/apt': 'Entire Place',
    'Private room in bed and breakfast': 'Private Room',
    'Private room in boat': 'Private Room',
    'Private room in rental unit': 'Private Room',
    'Private room in guest suite': 'Private Room',
    'Private room in villa': 'Private Room',
    'Private room in condo': 'Private Room',
    'Private room in home': 'Private Room',
    'Private room in guesthouse': 'Private Room',
    'Private room in serviced apartment': 'Private Room',
    'Private room in farm stay': 'Private Room',
    'Private room in loft': 'Private Room',
    'Private room in townhouse': 'Private Room',
    'Private room in vacation home': 'Private Room',
    'Private room in chalet': 'Private Room',
    'Private room in casa particular': 'Private Room',
    'Private room in pension': 'Private Room',
    'Private room in hostel': 'Private Room',
    'Shared room in bed and breakfast': 'Shared Room',
    'Shared room in rental unit': 'Shared Room',
    'Shared room in condo': 'Shared Room',
    'Shared room in home': 'Shared Room',
    'Shared room in hostel': 'Shared Room',
    'Castle': 'Unique Stays',
    'Boat': 'Unique Stays',
    'Houseboat': 'Unique Stays',
    'Tiny home': 'Unique Stays',
    'Casa particular': 'Unique Stays',
    'Room in bed and breakfast': 'Rooms in Commercial Establishments',
    'Room in boutique hotel': 'Rooms in Commercial Establishments',
    'Room in hotel': 'Rooms in Commercial Establishments',
    'Room in serviced apartment': 'Rooms in Commercial Establishments',
    'Room in aparthotel': 'Rooms in Commercial Establishments',
    'Room in hostel': 'Rooms in Commercial Establishments',
    'Room in heritage hotel': 'Rooms in Commercial Establishments',
    'Floor': 'Shared Room'
}

dic2023_d_listings['property_type'] = dic2023_d_listings['property_type'].replace(property_types_groupings)


In [33]:
dic2023_d_listings["neighbourhood_cleansed"].unique()

array(['Centro Storico', 'Terraferma', 'Isole'], dtype=object)

### Reviews


In [24]:
dic2023_d_listings["first_review"] = pd.to_datetime(dic2023_d_listings["first_review"])
dic2023_d_listings["last_review"] = pd.to_datetime(dic2023_d_listings["last_review"])


In [25]:
dic2023_d_listings.dtypes

id                                                       int64
name                                                    object
neighborhood_overview                                   object
host_id                                                 object
host_since                                      datetime64[ns]
host_about                                              object
host_response_time                                      object
host_response_rate                                     float64
host_acceptance_rate                                   float64
host_is_superhost                                       object
host_listings_count                                      int64
host_total_listings_count                                int64
host_verifications                                      object
host_has_profile_pic                                  category
host_identity_verified                                category
neighbourhood_cleansed                                 

In [26]:
dic2023_d_listings.head()

Unnamed: 0,id,name,neighborhood_overview,host_id,host_since,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,...,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,host_to_listing_geodesic_km,bathrooms,email_verification,phone_verification,work_email_verification
0,6623,Rental unit in Venice · ★4.94 · 2 bedrooms · 4...,Close by is the Frari Church (known as Tiziano...,15016,2009-04-27,"I'm usually happy and busy, often compelled to...",within a day,100.0,74.0,t,...,3,3,0,0,1.08,0.573759,2.0,True,True,False
1,6624,Rental unit in Venice · ★4.98 · 2 bedrooms · 6...,We are in the middle of a residential area cal...,15016,2009-04-27,"I'm usually happy and busy, often compelled to...",within a day,100.0,74.0,t,...,3,3,0,0,0.59,0.634166,2.0,True,True,False
2,12074,Rental unit in Venice · ★4.94 · 2 bedrooms · 4...,"There are plenty of bars, restaurants and pizz...",15016,2009-04-27,"I'm usually happy and busy, often compelled to...",within a day,100.0,74.0,t,...,3,3,0,0,1.21,0.430166,2.0,True,True,False
3,27116,Bed and breakfast in Venice · ★4.84 · 1 bedroo...,"The area is very beautiful and characteristic,...",116144,2010-04-30,,within an hour,100.0,100.0,f,...,3,1,2,0,2.55,0.491334,1.0,True,True,False
4,44527,Rental unit in Venice · ★4.85 · 3 bedrooms · 3...,"Cannaregio is a well-connected, truly Venetian...",120215,2010-05-07,"I (Marc) have lived in Venice all my life, stu...",within an hour,100.0,100.0,t,...,2,2,0,0,0.83,1.040278,2.0,True,True,True


In [27]:
# create dataset for imputing reviews NAs
df_imputation = dic2023_d_listings.drop(["price"], axis=1)

In [28]:
imputer = KNNImputer(n_neighbors=5, weights="distance", metric="nan_euclidean", copy=False)
imputer.fit_transform()

TypeError: TransformerMixin.fit_transform() missing 1 required positional argument: 'X'

In [None]:
dic2023_d_listings.isnull().sum(axis=0)

In [None]:
dic2023_d_reviews = dic2023_d_reviews[["listing_id", "id", "date", "comments"]]