In [1]:
from distutils.command.clean import clean

import pandas as pd
pd.options.display.float_format = '{:.0f}'.format
from pandas.api.types import CategoricalDtype
import numpy as np
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm
import json
import statistics
import re
from sklearn.impute import KNNImputer
from decimal import Decimal

# For sentiment analysis of text
import nltk
#nltk.download("all") # Only first time
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


## December 2023

In [2]:
# Every calendar column is read as a raw string; no numeric/date parsing happens here.
calendar_string_columns = ["listing_id", "date", "available", "price",
                           "adjusted_price", "minimum_nights", "maximum_nights"]

dic2023_reviews = pd.read_csv("data/2023dic/reviews.csv")
dic2023_calendar = pd.read_csv("data/2023dic/calendar.csv",
                               dtype=dict.fromkeys(calendar_string_columns, str))
dic2023_listings = pd.read_csv("data/2023dic/listings.csv")
dic2023_neighbourhoods = pd.read_csv("data/2023dic/neighbourhoods.csv")
# dic2023_geo_neighbourhoods  # GeoJson
# The "d_" files are the ones the rest of this notebook works on.
dic2023_d_listings = pd.read_csv("data/2023dic/d_listings.csv")
dic2023_d_reviews = pd.read_csv("data/2023dic/d_reviews.csv")


## Listings

- Reviews interpretation
    - Accuracy: how closely the listing description matches the actual property
    - Rating: the average of the ratings in the other categories

In [3]:
# Columns retained for the analysis, in the order they appear in the resulting frame.
listing_columns_to_keep = [
    # listing identity and free text
    'id', 'name', 'neighborhood_overview',
    # host attributes
    'host_id', 'host_since', 'host_location', 'host_about',
    'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count', 'host_verifications',
    'host_has_profile_pic', 'host_identity_verified',
    # location and property
    'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type',
    'accommodates', 'bathrooms_text', 'beds', 'price',
    # stay length and availability
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability', 'availability_30',
    'availability_60', 'availability_90', 'availability_365',
    # reviews
    'number_of_reviews', 'first_review', 'last_review',
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
    'review_scores_value',
    # host listing counts and review frequency
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms', 'reviews_per_month',
]
dic2023_d_listings = dic2023_d_listings[listing_columns_to_keep]


In [4]:
# Missing free-text fields become empty strings.
for text_column in ("neighborhood_overview", "host_about"):
    dic2023_d_listings[text_column] = dic2023_d_listings[text_column].fillna("")

We can also consider the distance between the host's home location and the listed property.

In [5]:
#location_geo = {}
#for l in tqdm(dic2023_d_listings.host_location.unique().tolist()):
#    geolocator = Nominatim(user_agent="Host to listing distance")
#    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.1)
#    host_location = geolocator.geocode(l)
#    location_geo[l] = (host_location.latitude, host_location.longitude)
#    
#with open("data/2023dic/hosts_locations.json", 'w') as f:
#    json.dump(location_geo, f)
#

### Host and listing location

In [6]:
# Load the cached mapping from host_location string to [latitude, longitude].
with open("data/2023dic/hosts_locations.json", 'r') as hosts_file:
    location_geo = json.loads(hosts_file.read())

# Replace each host_location string by its cached coordinates (None when not in the cache).
dic2023_d_listings["host_location"] = dic2023_d_listings["host_location"].apply(location_geo.get)

def geodesic_distancer(row, from_loc: str):
    """Return the geodesic distance in km between ``row[from_loc]`` and the row's own coordinates.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose ``"latitude"`` and ``"longitude"``, plus ``from_loc`` holding an
        indexable pair whose items 0 and 1 are used as the origin coordinates.
    from_loc : str
        Key of the field in ``row`` that holds the origin coordinates.

    Returns
    -------
    float or None
        The distance in kilometres, or ``None`` when the origin is missing or not
        indexable, or when the coordinates are rejected.
    """
    try:
        origin = row[from_loc]
        coords_1 = (origin[0], origin[1])
        coords_2 = (row["latitude"], row["longitude"])
        return geodesic(coords_1, coords_2).km
    except (TypeError, KeyError, IndexError, ValueError):
        # Only data problems are treated as "no distance": a missing or None/NaN origin,
        # an origin with fewer than two items, or invalid coordinate values.
        # Anything else (e.g. a NameError from a missing import) now surfaces
        # instead of silently yielding a column of None.
        return None

# Distance (km) between each host's geocoded location and the listing itself.
host_distance_km = dic2023_d_listings.apply(lambda row: geodesic_distancer(row, "host_location"), axis=1)
host_distance_km = pd.to_numeric(host_distance_km, errors="coerce")

# Impute missing distances with the median. The previous statistics.mode() call is not
# meaningful for a continuous variable: when all values are distinct it returns the
# first element it sees, which can itself be NaN and leave the gaps unfilled.
dic2023_d_listings["host_to_listing_geodesic_km"] = host_distance_km.fillna(host_distance_km.median())

# The raw coordinates are no longer needed once the distance has been derived.
dic2023_d_listings = dic2023_d_listings.drop(columns="host_location")

### Strategic points distancer

In [7]:
# Reference points as [latitude, longitude], keyed by place name.
strategic_locations_geo = dict([
    ("Aereoporto Marco Polo", [45.50354, 12.34258]),
    ("Piazza Erminio Ferretto", [45.49479, 12.24251]),
    ("Piazzale Roma", [45.43801, 12.31885]),
    ("Ponte di Rialto", [45.43805, 12.33593]),
    ("Piazza San Marco", [45.434, 12.338]),
])

# Persist the reference points so they can be reloaded from disk.
with open("data/strategic_locations.json", 'w') as locations_file:
    locations_file.write(json.dumps(strategic_locations_geo))


In [8]:
# Reload the reference points written to disk.
with open("data/strategic_locations.json", 'r') as locations_file:
    strategic_locations_geo = json.loads(locations_file.read())

In [9]:
# Output column -> key of the reference point in `strategic_locations_geo`.
landmark_distance_columns = {
    "airport_distance_km": "Aereoporto Marco Polo",
    "ferretto_square_distance_km": "Piazza Erminio Ferretto",
    "roma_square_distance_km": "Piazzale Roma",
    "rialto_bridge_distance_km": "Ponte di Rialto",
    "san_marco_square_distance_km": "Piazza San Marco",
}

# One loop replaces five copy-pasted "init / fill / compute" triplets.
for distance_column, landmark in landmark_distance_columns.items():
    landmark_coords = strategic_locations_geo[landmark]
    # geodesic_distancer reads its origin from a field of the row, so the landmark
    # coordinates are parked in the target column before being overwritten by the distance.
    dic2023_d_listings[distance_column] = pd.Series(
        [landmark_coords] * len(dic2023_d_listings), index=dic2023_d_listings.index
    )
    dic2023_d_listings[distance_column] = dic2023_d_listings.apply(
        lambda row, column=distance_column: geodesic_distancer(row, column), axis=1
    )



### Response time

In [10]:
# Ordered levels from no answer recorded ("MISSING") to the fastest response.
response_time_levels = ['MISSING',
                        'a few days or more',
                        'within a day',
                        'within a few hours',
                        'within an hour']
categorial_response_time = CategoricalDtype(categories=response_time_levels, ordered=True)

# Missing values get their own level before the column is cast to the ordered categorical.
dic2023_d_listings["host_response_time"] = (
    dic2023_d_listings["host_response_time"]
    .fillna("MISSING")
    .astype(categorial_response_time)
)

### Response rate

In [11]:
# "97%" -> 97.0, then fill the gaps with the most frequent rate.
response_rate = dic2023_d_listings["host_response_rate"].str.rstrip('%').astype(float)
dic2023_d_listings["host_response_rate"] = response_rate.fillna(statistics.mode(response_rate))


### Acceptance rate

In [12]:
# "85%" -> 85.0, then fill the gaps with the most frequent rate.
acceptance_rate = dic2023_d_listings["host_acceptance_rate"].str.rstrip('%').astype(float)
dic2023_d_listings["host_acceptance_rate"] = acceptance_rate.fillna(statistics.mode(acceptance_rate))


### Is superhost

In [13]:
# Fill gaps with the most frequent flag, then encode "t" as 1 and everything else as 0.
superhost_flag = dic2023_d_listings["host_is_superhost"]
superhost_flag = superhost_flag.fillna(statistics.mode(superhost_flag))
dic2023_d_listings["host_is_superhost"] = superhost_flag.apply(lambda flag: 1 if flag == "t" else 0)


### Bathrooms and bathrooms_text

In [14]:
# Missing bathroom descriptions take the most frequent description.
most_common_bathrooms_text = statistics.mode(dic2023_d_listings["bathrooms_text"])
dic2023_d_listings["bathrooms_text"] = dic2023_d_listings["bathrooms_text"].fillna(most_common_bathrooms_text)

#### Create bathrooms feature

In [15]:
def extract_digits(text):
    """Return the numeric part of a bathroom description as a string.

    Any text containing "half" (case-insensitive) yields '0.5'. Otherwise every
    integer or decimal number found in the text is concatenated, giving '' when
    there is none.
    """
    if text.lower().find("half") != -1:
        return '0.5'
    return ''.join(match.group() for match in re.finditer(r'\d+\.\d+|\d+', text))

# Numeric bathroom count derived from the text description.
bathroom_counts = dic2023_d_listings["bathrooms_text"].apply(extract_digits)
dic2023_d_listings["bathrooms"] = bathroom_counts.astype(float)

#### Edit bathrooms text_feature

In [16]:
def remove_digits(text):
    """Drop every digit from ``text`` and trim surrounding whitespace.

    Decimal points are left in place, so "1.5 baths" becomes ". baths".
    """
    without_digits = re.sub(r'\d', '', text)
    return without_digits.strip()
dic2023_d_listings["bathrooms_text"] = dic2023_d_listings["bathrooms_text"].apply(remove_digits)

# Digit-free descriptions grouped by the bathroom kind they collapse to.
# The ". ..." variants are what is left of decimal counts such as "1.5 baths".
bath_labels_by_kind = {
    'single': ['baths', 'bath', '. baths', 'Half-bath'],
    'private': ['private bath', 'Private half-bath'],
    'shared': ['shared bath', 'shared baths', 'Shared half-bath', '. shared baths'],
}
remap_baths = {
    label: kind
    for kind, labels in bath_labels_by_kind.items()
    for label in labels
}

dic2023_d_listings["bathrooms_text"] = dic2023_d_listings["bathrooms_text"].replace(remap_baths)


### Beds

In [17]:
# Missing bed counts take the most frequent value.
most_common_beds = statistics.mode(dic2023_d_listings["beds"])
dic2023_d_listings["beds"] = dic2023_d_listings["beds"].fillna(most_common_beds)

### Availability

In [18]:
# Missing availability flags take the most frequent value.
most_common_availability = statistics.mode(dic2023_d_listings["has_availability"])
dic2023_d_listings["has_availability"] = dic2023_d_listings["has_availability"].fillna(most_common_availability)

## Fixing column types before NA imputation

In [19]:
# Identifiers are labels, not quantities: store them as strings.
dic2023_d_listings = dic2023_d_listings.astype({"host_id": str, "id": str})
# Parse the host registration date.
dic2023_d_listings["host_since"] = pd.to_datetime(dic2023_d_listings["host_since"])


**TODO: possibly remove this section**

In [20]:
# One boolean flag per verification method, all starting as False.
for verification_flag in ("email_verification", "phone_verification", "work_email_verification"):
    dic2023_d_listings[verification_flag] = False

def allocate_verifications_to_variables(row):
    """Set the per-method verification flags of ``row`` from ``row["host_verifications"]``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must contain ``"host_verifications"`` and accept assignment to
        ``"email_verification"``, ``"phone_verification"`` and
        ``"work_email_verification"``.

    Returns
    -------
    The same ``row``, with a flag set to True for each method that is listed.
    Flags are never reset to False, and a missing (non-string, non-list) value
    leaves the row unchanged.
    """
    raw_verifications = row["host_verifications"]
    if isinstance(raw_verifications, str):
        # Split into whole words so that "work_email" no longer counts as "email"
        # (a plain substring test matched both).
        verified_methods = set(re.findall(r"\w+", raw_verifications))
    elif isinstance(raw_verifications, (list, tuple, set, frozenset)):
        verified_methods = set(raw_verifications)
    else:
        # NaN / None: nothing is known to be verified.
        return row

    if "email" in verified_methods:
        row["email_verification"] = True
    if "phone" in verified_methods:
        row["phone_verification"] = True
    if "work_email" in verified_methods:
        row["work_email_verification"] = True
    return row

# Run the row-wise function only on the columns it reads and writes. Applying it to the
# whole frame rebuilds every column from object-typed rows and re-infers the dtypes,
# which is slow and drops the ordered categorical set earlier on host_response_time.
verification_columns = ["host_verifications",
                        "email_verification",
                        "phone_verification",
                        "work_email_verification"]
dic2023_d_listings[verification_columns] = (
    dic2023_d_listings[verification_columns].apply(allocate_verifications_to_variables, axis=1)
)

In [21]:
# Encode "t" as 1; any other value (including missing) becomes 0.
dic2023_d_listings["host_has_profile_pic"] = dic2023_d_listings["host_has_profile_pic"].apply(
    lambda flag: 1 if flag == "t" else 0
)

In [22]:
# Encode "t" as 1; any other value (including missing) becomes 0.
dic2023_d_listings["host_identity_verified"] = dic2023_d_listings["host_identity_verified"].apply(
    lambda flag: 1 if flag == "t" else 0
)

In [23]:
# Neighbourhoods grouped by the macro-area they are collapsed into.
neighbourhoods_by_macro_area = {
    'Centro Storico': [
        'Cannaregio', 'San Marco', 'Isola San Giorgio', 'San Polo', 'Castello',
        "Sant'Elena", 'Dorsoduro', 'Sacca Fisola', 'Giudecca', 'Tronchetto',
        'Santa Croce',
    ],
    'Terraferma': [
        "Ca' Emiliani", 'Marghera Zona Industriale', 'Marghera Catene', 'Marghera',
        "Ca' Sabbioni", 'Giustizia', 'San Lorenzo XXV Aprile', 'Bissuola',
        'Cipressina', 'Zona Commerciale via Torino', 'Carpenedo', 'Villabona',
        'Santa Barbara', 'Altobello', 'Piave 1860', 'La Favorita',
        'Villaggio Sartori', 'Villaggio San Marco', 'Gazzera', 'Asseggiano',
        "Pra' Secco", 'Gatta - Bondu?', 'Quartiere Pertini', 'Campalto CEP',
        'Mestre', "Scaramuzza",
        'Favaro', 'Case Dosa', 'Marocco Terraglio', 'Campalto Gobbi',
        'Malcontenta', 'Zelarino', 'Chirignago', 'Campalto Bagaron', 'Dese',
        'Torre Antica', 'Aeroporto', 'Tessera', 'Campalto', 'other city',
    ],
    'Isole': [
        'Alberoni', 'Malamocco', 'Lido', "Sant'Erasmo", 'Burano',
        'San Pietro in Volta', 'Mazzorbo', 'Pellestrina', 'Murano', 'Torcello',
    ],
}

# Flat lookup: neighbourhood name -> macro-area.
new_neighbourhoods_levels = {
    neighbourhood: macro_area
    for macro_area, neighbourhoods in neighbourhoods_by_macro_area.items()
    for neighbourhood in neighbourhoods
}

# Collapse the neighbourhoods, then append dummy columns (first level dropped).
dic2023_d_listings['neighbourhood_cleansed'] = dic2023_d_listings['neighbourhood_cleansed'].replace(new_neighbourhoods_levels)
neighbourhoods_dummies = pd.get_dummies(dic2023_d_listings['neighbourhood_cleansed'], drop_first=True)
dic2023_d_listings = pd.concat([dic2023_d_listings, neighbourhoods_dummies], axis=1)

In [24]:
# Raw property types grouped by the coarse category they are collapsed into.
property_types_by_group = {
    'Entire Place': [
        'Entire rental unit', 'Entire home', 'Entire vacation home',
        'Entire serviced apartment', 'Entire condo', 'Entire loft',
        'Entire guesthouse', 'Entire villa', 'Entire townhouse', 'Entire bungalow',
        'Entire guest suite', 'Entire cottage', 'Entire chalet', 'Entire place',
        'Entire home/apt',
    ],
    'Private Room': [
        'Private room in bed and breakfast', 'Private room in boat',
        'Private room in rental unit', 'Private room in guest suite',
        'Private room in villa', 'Private room in condo', 'Private room in home',
        'Private room in guesthouse', 'Private room in serviced apartment',
        'Private room in farm stay', 'Private room in loft',
        'Private room in townhouse', 'Private room in vacation home',
        'Private room in chalet', 'Private room in casa particular',
        'Private room in pension', 'Private room in hostel', 'Private room',
    ],
    'Shared Room': [
        'Shared room in bed and breakfast', 'Shared room in rental unit',
        'Shared room in condo', 'Shared room in home', 'Shared room in hostel',
        'Floor',
    ],
    'Unique Stays': [
        'Castle', 'Boat', 'Houseboat', 'Tiny home', 'Casa particular',
    ],
    'Rooms in Commercial Establishments': [
        'Room in bed and breakfast', 'Room in boutique hotel', 'Room in hotel',
        'Room in serviced apartment', 'Room in aparthotel', 'Room in hostel',
        'Room in heritage hotel',
    ],
}

# Flat lookup: raw property type -> coarse category.
property_types_groupings = {
    raw_type: group
    for group, raw_types in property_types_by_group.items()
    for raw_type in raw_types
}

dic2023_d_listings['property_type'] = dic2023_d_listings['property_type'].replace(property_types_groupings)

# Ordered categorical over the coarse categories; values outside the list become NaN on cast.
property_type_levels = ['Shared Room',
                        'Private Room',
                        'Rooms in Commercial Establishments',
                        'Entire Place',
                        'Unique Stays']
categorial_property_type = CategoricalDtype(categories=property_type_levels, ordered=True)
dic2023_d_listings['property_type'] = dic2023_d_listings['property_type'].astype(categorial_property_type)

In [25]:
# Ordered categorical for the bathroom kind: shared < single < private.
bathroom_kind_dtype = CategoricalDtype(categories=["shared", "single", "private"], ordered=True)
dic2023_d_listings["bathrooms_text"] = dic2023_d_listings["bathrooms_text"].astype(bathroom_kind_dtype)

In [26]:
def remove_symbols(text):
    """Strip ``$`` and ``,`` from a price string and trim surrounding whitespace.

    Parameters
    ----------
    text : str or any
        Raw price such as ``"$1,234.00"``.

    Returns
    -------
    str or None
        The cleaned string, or ``None`` when ``text`` is not a string
        (e.g. a NaN read from the CSV).
    """
    # Explicit type check instead of a bare `except:`, which hid every kind of error.
    if not isinstance(text, str):
        return None
    return re.sub(r'[$,]', '', text).strip()
# Clean the price strings, then convert to float (missing prices end up as NaN).
price_without_symbols = dic2023_d_listings["price"].apply(remove_symbols)
dic2023_d_listings["price"] = price_without_symbols.astype(float)

In [27]:
# Encode "t" as 1; any other value becomes 0.
dic2023_d_listings["has_availability"] = dic2023_d_listings["has_availability"].apply(
    lambda flag: 1 if flag == "t" else 0
)

### Reviews


In [28]:
# Parse both review date columns as datetimes.
for review_date_column in ("first_review", "last_review"):
    dic2023_d_listings[review_date_column] = pd.to_datetime(dic2023_d_listings[review_date_column])


*To impute the missing review scores, the dataset must contain only numeric data.
For this reason, I apply a simple sentiment analysis to the free-text fields
(`neighborhood_overview` and `host_about`), so that at least some of their information is retained as a number.*

In [29]:
# create dataset for imputing reviews NAs and eliminate categorial variables
# Columns left out of the imputation frame (identifiers, dates, categoricals, price).
columns_excluded_from_imputation = [
    "id",
    "name",
    "host_id",
    "host_since",
    "host_response_time",
    "host_verifications",     # to remove permanently
    "neighbourhood_cleansed", # to remove permanently
    "property_type",
    "bathrooms_text",
    "first_review",
    "last_review",
    "price",
]
df_imputation = dic2023_d_listings.drop(columns=columns_excluded_from_imputation)

# Remember the column order so the imputer's output can be labelled again.
column_names = df_imputation.columns.tolist()

In [30]:
def preprocess_text(text):
    """Lower-case, tokenize, drop English stopwords and lemmatize ``text``.

    Parameters
    ----------
    text : str
        Raw free text.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    # Build the stopword set once per call; it used to be re-read for every token.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token not in english_stopwords
    ]
    return ' '.join(lemmatized_tokens)

# Single shared analyzer instance reused by every call below.
analyzer = SentimentIntensityAnalyzer()
def get_sentiment(text):
    """Return the 'pos' component of the analyzer's polarity scores for ``text``."""
    return analyzer.polarity_scores(text)['pos']

In [31]:
# Replace each free-text column by the positive-sentiment score of its preprocessed text.
for text_column in ("neighborhood_overview", "host_about"):
    preprocessed_text = df_imputation[text_column].apply(preprocess_text)
    df_imputation[text_column] = preprocessed_text.apply(get_sentiment)

In [32]:
# Fill the remaining NaNs from the 5 nearest rows, weighted by inverse distance.
imputer = KNNImputer(n_neighbors=5, weights="distance", metric="nan_euclidean", copy=False)
imputed_values = imputer.fit_transform(df_imputation)

# Keep the original row index: the later pd.concat(axis=1) aligns on it, and a fresh
# 0..n-1 index would only line up while the listings frame still has its default index.
imputed_df = pd.DataFrame(imputed_values, columns=column_names, index=df_imputation.index)

### Merge datasets, adding back the categorical columns

In [33]:
# Columns taken from the pre-imputation frame in their original (non-numeric) form.
dropped_columns = [
    "id",
    "name",
    "host_id",
    "host_since",
    "host_response_time",
    "property_type",
    "bathrooms_text",
    "first_review",
    "last_review",
    "neighborhood_overview", # add again string form
    "host_about",            # add again string form
    "price",
]

# The sentiment scores were only needed for the imputation; the raw text is restored below.
imputed_df = imputed_df.drop(columns=["neighborhood_overview", "host_about"])

pre_concat_dic = dic2023_d_listings.loc[:, dropped_columns]
clean_dic_listings = pd.concat([pre_concat_dic, imputed_df], axis=1)

In [34]:
# Listings without a first review fall back to the host's registration date.
missing_first_review = clean_dic_listings["first_review"].isna()
clean_dic_listings.loc[missing_first_review, "first_review"] = clean_dic_listings.loc[missing_first_review, "host_since"]

# Listings without a last review get the fixed date 26 December 2023.
missing_last_review = clean_dic_listings["last_review"].isna()
clean_dic_listings.loc[missing_last_review, "last_review"] = pd.to_datetime("26/12/2023", dayfirst=True)

# Remaining missing values per column.
clean_dic_listings.isnull().sum(axis=0)


id                                                0
name                                              0
host_id                                           0
host_since                                        0
host_response_time                                0
property_type                                     0
bathrooms_text                                    0
first_review                                      0
last_review                                       0
neighborhood_overview                             0
host_about                                        0
price                                           173
host_response_rate                                0
host_acceptance_rate                              0
host_is_superhost                                 0
host_listings_count                               0
host_total_listings_count                         0
host_has_profile_pic                              0
host_identity_verified                            0
latitude    

In [35]:
# Persist the cleaned December listings.
clean_dic_listings.to_pickle(path="data/pickles/december_listings.pkl")