In [1]:
import pandas as pd
from pandas import Timestamp

pd.options.display.float_format = '{:.0f}'.format
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from geopy.extra.rate_limiter import RateLimiter
import json
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
import re



In [2]:
df_listings = pd.read_csv("../data/2023dic/d_listings.csv")
#df_listings = df_listings[['id', 'neighborhood_overview', 'host_id', 'host_since', 'host_location', 'host_about',
#                           'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
#                           'host_listings_count', 'host_total_listings_count', 'host_verifications',
#                           'host_has_profile_pic', 'host_identity_verified', 'neighbourhood_cleansed',
#                           'latitude', 'longitude', 'room_type',
#                           'accommodates', 'bathrooms_text', 'beds', 'price',
#                           'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability', 'availability_30',
#                           'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'first_review',
#                           'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
#                           'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
#                           'review_scores_value', 'calculated_host_listings_count',
#                           'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms',
#                           'calculated_host_listings_count_shared_rooms', 'reviews_per_month']]

In [3]:
df_listings.drop(labels=["listing_url", "name", "scrape_id", "last_scraped", "source", "description", "picture_url", "host_url",
                         "host_name", "host_thumbnail_url", "host_picture_url", "host_neighbourhood", "neighbourhood",
                         "neighbourhood_group_cleansed", "property_type", "amenities", "minimum_minimum_nights",
                         "maximum_minimum_nights", "minimum_maximum_nights", "maximum_maximum_nights", "minimum_nights_avg_ntm",
                         "maximum_nights_avg_ntm", "has_availability", "availability_30", "availability_60", "availability_90",
                         "availability_365", "calendar_updated", "calendar_last_scraped", "number_of_reviews_ltm",
                         "number_of_reviews_l30d", "license", "instant_bookable"],
                 axis=1,
                 inplace=True)

In [4]:
## Drop rows with NaN in target 
#df_listings = df_listings.loc[df_listings['price'].notnull(), :]
#df_listings.price.isnull().sum()

In [5]:
#X = df_listings.drop(["price"], axis=1, inplace=False)
#y = df_listings["price"]

In [6]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=874631)

In [7]:
### Feature engineering
#- `first_review` to `last_review` as date span
#- `host_listings_count` as a % of `host_total_listings_count`
#- Manage `neighbourhoods_cleansed` as a OHE of most frequent categories
#- Distance between host home and listing location
#- Distance between listing and relevant locations in town
#- `host_since` encoded as *days of activity until period end (end of dataset scraping)*
#- Sentiment of `neighborhood_overview` (investigate best sentiment technique for descriptions of appartments)
#- Sentiment of `host_about` (investigate best sentiment technique for description of people)
#- `host_id` and `id` as categorial
#- `host_response_time` as ordinal variable
#- string manipulation for `host_response_rate` and `host_acceptance_rate`
#- `host_is_superhost` as binary categorial
#- `host_verifications` as encoded in previous script
#- `host_has_profile_pic` as binary
#- `host_identity_verified` as binary
#- keep `room_type` instead of `property_type` and make `room_type` a categorial with OHE
#- `accomodates` used with `baths`, `beds` to compute the rate of beds and bathrooms for every person
#- `price` with string manipulation
#- `minimum_nighs_avg_ntm` as float
#- `maximum_nights_avg_ntm` as float
#- `has_availability` as binary
#- all the `has_availability_NUMBER` as a % of the NUMBER of the feature
#- `number_of_reviews` as an integer
#- `review_scores_rating` as float
#- all the reviews scores as float
#- remove `calculated_host_listings_count` and keep the other three BUT **set them as % of the total host listings**
#- `reviews_per_month` as float
#- `longitude` and `latitude` standardization (because the values are both negatives and positives)

## Transform feature datatypes

In this section we execute the feature engineering without dealing with null values.
We do it because once the types are cleaned, we want to plot a bit the data and explore it to see what is going on
with NAs, frequency distribution, numeric distributions etc.
In order to do so, we need to:
1. Generalise the pipeline, because we would like to apply this script also to other similar dataset
2. Return exceptions for NAs, to carry them on to the data exploration section

> ***NOTE*** that the `feature-engine` library enables us to split the dataset into train and test just after the data type and feature engineering. This because the library contains some functions for [preprocessing](https://feature-engine.trainindata.com/en/latest/user_guide/preprocessing/index.html) that can deal with removed rows and features afterwards

- [Useful library for feature engineering](https://feature-engine.trainindata.com/en/latest/quickstart/index.html)

## Split features into groups based on the data type

- Split features for data types (***remember to insert the case where the columns with more than 50% NaN are not included in the splitting at all***)
    - Then the pipeline is build to transform the data types
    - Based on the previous splitting, apply Imputation methods to all the features. This is done because we don't know if other datasets will have the same null values ripartition
    - At this point we need to **drop** the columns not included in the splitting of data types. This because the columns not included will be the ones with a lot of NAs from the start (more than 50%)

> *Eventually we could compare the result of this approach with the result of a parallel approach whereby no columns are dropped and the NaNs are all Imputed. Then see how the two models perform*

In [8]:
## To decide if will be included or not in the pipeline
from sklearn.preprocessing import FunctionTransformer

def drop_features_with_many_nan(x: pd.DataFrame) -> pd.DataFrame:
    nulls_summary = pd.DataFrame(df_listings.isnull().sum())
    more_than_null_features = nulls_summary.loc[nulls_summary.iloc[:, 0] > df_listings.shape[0]*0.5, :].index.tolist()
    return x.drop(more_than_null_features, axis=1)

fun_tr_drop_features_with_many_nan = FunctionTransformer(drop_features_with_many_nan)

## Define groups for data transformation

The following class serves as definition of some general functions to be used for geographic handling

In [9]:
strategic_locations_geo = {"Aereoporto Marco Polo": [45.50354, 12.34258],
                       "Piazza Erminio Ferretto": [45.49479, 12.24251],
                       "Piazzale Roma": [45.43801, 12.31885],
                       "Ponte di Rialto": [45.43805, 12.33593],
                       "Piazza San Marco": [45.434, 12.338]
                       }


class GeoDataHandler:
    def __init__(self, user_agent: str = "GeoDataHandler"):
        """
        Initializes the GeoDataHandler with a user agent for Nominatim.
        :param user_agent: A string representing the user agent for Nominatim.
        """
        self.geolocator = Nominatim(user_agent=user_agent)
        self.geocode = RateLimiter(self.geolocator.geocode, min_delay_seconds=1.1)
    
    def retrieve_host_location(self, df: pd.DataFrame) -> dict:
        """
        From a dataset of listings, extracts the list of unique host locations
        and retrieve latitude and longitude of every location.
        :param df: pandas DataFrame of listings.
        :return: dict of locations: [latitude, longitude]
        """
        location_geo = {}
        try:
            for location in df['host_location'].unique().tolist():
                host_location = self.geocode(location)
                if host_location:
                    location_geo[location] = (host_location.latitude, host_location.longitude)
                else:
                    location_geo[location] = (None, None)
            return location_geo
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    def export_to_json(self, dict_object: dict, path: str) -> None:
        """
        Given a dict with host locations, saves it to a custom path.
        :param dict_object: dictionary to be saved as JSON.
        :param path: str with the path where to save JSON.
        :return: None
        """
        try:
            with open(path, 'w') as f:
                json.dump(dict_object, f)
        except Exception as e:
            print(f"An error occurred while exporting to JSON: {e}")

    def import_from_json(self, path: str) -> dict:
        """
        Import host location from saved JSON.
        :param path: path where the JSON is saved.
        :return: JSON in dictionary form.
        """
        try:
            with open(path, 'r') as f:
                dict_object = json.load(f)
            return dict_object
        except Exception as e:
            print(f"An error occurred while importing from JSON: {e}")
            return None


In [10]:
handler = GeoDataHandler()
#locations = handler.retrieve_host_location(df_listings)
#handler.export_to_json(locations, "../data/2023dic/host_locations.json")
locations = handler.import_from_json("../data/2023dic/host_locations.json")

strategic_locations_geo = {"Aereoporto Marco Polo": [45.50354, 12.34258],
                       "Piazza Erminio Ferretto": [45.49479, 12.24251],
                       "Piazzale Roma": [45.43801, 12.31885],
                       "Ponte di Rialto": [45.43805, 12.33593],
                       "Piazza San Marco": [45.434, 12.338]
                       }
#handler.export_to_json(strategic_locations_geo, "../data/strategic_locations.json")
strategic_locations = handler.import_from_json("../data/strategic_locations.json")

### Geographical Features

In [11]:
df_listings.dtypes
#df_listings

id                                                int64
neighborhood_overview                            object
host_id                                           int64
host_since                                       object
host_location                                    object
host_about                                       object
host_response_time                               object
host_response_rate                               object
host_acceptance_rate                             object
host_is_superhost                                object
host_listings_count                               int64
host_total_listings_count                         int64
host_verifications                               object
host_has_profile_pic                             object
host_identity_verified                           object
neighbourhood_cleansed                           object
latitude                                        float64
longitude                                       

In [12]:
geo_features = ["host_location"] # + strategic_locations

In [13]:
class GeographicTransformer(BaseEstimator, TransformerMixin):
    # https://datascience.stackexchange.com/questions/117200/creating-new-features-as-linear-combination-of-others-as-part-of-a-scikit-learn
    # https://www.andrewvillazon.com/custom-scikit-learn-transformers/
    def __init__(self, locations: dict = locations, column: str ="host_location"):
        
        self.column = column
        self.locations = locations
    def fit(self, X, y=None):
        return self

    def transform(self, X: pd.DataFrame, y=None):
        if self.column == "host_location":
            X = self.transform_to_coordinates(X, self.locations)
            X[self.column] = X.apply(lambda row: self.geodesic_distancer(row, from_loc="host_location"), axis=1)
            return X
        else:
            X = self.create_strategic_locations_features(X)
            X = self.apply_location_to_feature(X)
            X = self.apply_distancer_to_strategic_locations(X)
            return X
    
    def transform_to_coordinates(self, X, locations: dict):
        """
        Given an entry and a dictionary, returns the latitude, longitude for
        the entry that are saved in the dictionary
        :param entry: entry (from dataframe)
        :param locations: dict of locations:[latitude, longitude]
        :return: [latitude, longitude]
        """
        try:
            X[self.column] = X[self.column].apply(lambda x: locations.get(x))
            return X
        except:
            return X
        
    def geodesic_distancer(self, row, from_loc: str):
        try:
            coords_1 = (row[from_loc][0], row[from_loc][1])
            coords_2 = (row["latitude"], row["longitude"])
            return geodesic(coords_1, coords_2).km
        except:
            return None
    
    @staticmethod
    def create_strategic_locations_features(X: pd.DataFrame) -> pd.DataFrame:
        X["airport_distance_km"] = None
        X["ferretto_square_distance_km"] = None
        X["roma_square_distance_km"] = None
        X["rialto_bridge_distance_km"] = None
        X["san_marco_square_distance_km"] = None
        return X
    
    def apply_location_to_feature(self, X: pd.DataFrame) -> pd.DataFrame:
        X["airport_distance_km"] = X["airport_distance_km"].apply(lambda x: self.locations["Aereoporto Marco Polo"])
        X["ferretto_square_distance_km"] = X["ferretto_square_distance_km"].apply(lambda x: self.locations["Piazza Erminio Ferretto"])
        X["roma_square_distance_km"] = X["roma_square_distance_km"].apply(lambda x: self.locations["Piazzale Roma"])
        X["rialto_bridge_distance_km"] = X["rialto_bridge_distance_km"].apply(lambda x: self.locations["Ponte di Rialto"])
        X["san_marco_square_distance_km"] = X["san_marco_square_distance_km"].apply(lambda x: self.locations["Piazza San Marco"])
        return X
    
    def apply_distancer_to_strategic_locations(self, X: pd.DataFrame) -> pd.DataFrame:
        X['airport_distance_km'] = X.apply(lambda row: self.geodesic_distancer(row=row, from_loc="airport_distance_km"), axis=1)
        X['ferretto_square_distance_km'] = X.apply(lambda row: self.geodesic_distancer(row=row, from_loc="ferretto_square_distance_km"), axis=1)
        X['roma_square_distance_km'] = X.apply(lambda row: self.geodesic_distancer(row=row, from_loc="roma_square_distance_km"), axis=1)
        X['rialto_bridge_distance_km'] = X.apply(lambda row: self.geodesic_distancer(row=row, from_loc="rialto_bridge_distance_km"), axis=1)
        X['san_marco_square_distance_km'] = X.apply(lambda row: self.geodesic_distancer(row=row, from_loc="san_marco_square_distance_km"), axis=1)
        return X

        

In [14]:
geographic_pipeline = Pipeline(steps=[
    ('Host location transformer', GeographicTransformer(column="host_location", locations=locations)),
    ("Strategic locations distance", GeographicTransformer(column="strategic_locations", locations=strategic_locations))
])

### String features

In [15]:
string_features = ["neighborhood_overview",
                   "host_about"]

Procedure for the string features in order to extract encoded features from text:
- use the tf-idf in order to gain a vector of encoded normalized word scores
- Use the vector as a feature in the dataset
- the vector does not need other normalization aspects

In [16]:
def trasform_nan_unicode(text_series):
    return text_series.fillna("").astype('U')

text_encoding_pipeline = Pipeline(steps=[
    ("text preprocessing", FunctionTransformer(trasform_nan_unicode, validate=False)),
    ("tf-idf vectorizer", TfidfVectorizer(encoding='utf-8',
                                          decode_error='ignore',
                                          strip_accents='unicode',
                                          lowercase=True,
                                          analyzer='word',
                                          max_df=0.8,
                                          use_idf=True,
                                          smooth_idf=True)
     )
])

### ID features

In [17]:
id_feature = ["id",
              "host_id"]

In [18]:
def id_to_string(id_object) -> str:
    return id_object.astype(str)

In [19]:
id_pipeline = Pipeline(steps=[
    ("From ID to string", FunctionTransformer(id_to_string))
])

### Rates features

In [20]:
rate_feature = ["host_response_rate",
                "host_acceptance_rate"]

In [21]:
def from_string_to_rate(rate_string: str) -> float:
    return rate_string.str.rstrip('%').astype(float)
    

In [22]:
rates_pipeline = Pipeline(steps=[
    ("Transform response rate", FunctionTransformer(from_string_to_rate))
])

### Time features

In [23]:
time_feature = ["host_since",
                "first_review",
                "last_review"]

In [24]:
def trasform_to_datetime(text_date: str) -> pd.Timestamp | pd.Timestamp:
    return pd.to_datetime(text_date)

In [25]:
timestamp_pipeline = Pipeline(steps=[
    ("Trasform to timestamp", FunctionTransformer(trasform_to_datetime))
])

 ## Categorial features
 
### Neighbourhoods features

In [26]:
neighbourhood_feature = ["neighbourhood_cleansed"]

new_neighbourhoods_levels = {'Cannaregio': 'Centro Storico',
                             'San Marco':'Centro Storico',
                             'Isola San Giorgio': 'Centro Storico',
                             'San Polo':'Centro Storico',
                             'Castello': 'Centro Storico',
                             "Sant'Elena": 'Centro Storico',
                             'Dorsoduro': 'Centro Storico',
                             'Sacca Fisola': 'Centro Storico',
                             'Giudecca': 'Centro Storico',
                             'Tronchetto': 'Centro Storico',
                             'Santa Croce': 'Centro Storico',
                             "Ca' Emiliani": 'Terraferma',
                             'Marghera Zona Industriale': 'Terraferma',
                             'Marghera Catene': 'Terraferma',
                             'Marghera': 'Terraferma',
                             "Ca' Sabbioni":'Terraferma',
                             'Giustizia': 'Terraferma',
                             'San Lorenzo XXV Aprile': 'Terraferma',
                             'Bissuola': 'Terraferma',
                             'Cipressina': 'Terraferma',
                             'Zona Commerciale via Torino': 'Terraferma',
                             'Carpenedo': 'Terraferma',
                             'Villabona': 'Terraferma',
                             'Santa Barbara': 'Terraferma',
                             'Altobello': 'Terraferma',
                             'Piave 1860': 'Terraferma',
                             'La Favorita': 'Terraferma',
                             'Villaggio Sartori': 'Terraferma',
                             'Villaggio San Marco': 'Terraferma',
                             'Gazzera': 'Terraferma',
                             'Asseggiano': 'Terraferma',
                             "Pra' Secco": 'Terraferma',
                             'Gatta - Bondu?': 'Terraferma',
                             'Quartiere Pertini': 'Terraferma',
                             'Campalto CEP': 'Terraferma',
                             'Mestre': 'Terraferma',
                             "Scaramuzza": "Terraferma",
                             'Alberoni': 'Isole',
                             'Malamocco': 'Isole',
                             'Lido': 'Isole',
                             "Sant'Erasmo": 'Isole',
                             'Burano': 'Isole',
                             'San Pietro in Volta': 'Isole',
                             'Mazzorbo': 'Isole',
                             'Pellestrina': 'Isole',
                             'Murano': 'Isole',
                             'Torcello': 'Isole',
                             'Favaro': 'Terraferma',
                             'Case Dosa': 'Terraferma',
                             'Marocco Terraglio': 'Terraferma',
                             'Campalto Gobbi': 'Terraferma',
                             'Malcontenta': 'Terraferma',
                             'Zelarino': 'Terraferma',
                             'Chirignago': 'Terraferma',
                             'Campalto Bagaron': 'Terraferma',
                             'Dese': 'Terraferma',
                             'Torre Antica': 'Terraferma',
                             'Aeroporto': 'Terraferma',
                             'Tessera':'Terraferma',
                             'Campalto': 'Terraferma',
                             'other city': 'Terraferma'}

#handler.export_to_json(new_neighbourhoods_levels, "../data/2023dic/neighbourhoods_levels.json")
neighbourhood_levels = handler.import_from_json("../data/2023dic/neighbourhoods_levels.json")

In [27]:
class NeighborhoodMapper(BaseEstimator, TransformerMixin):
    def __init__(self, mapping):
        self.mapping = mapping
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = pd.DataFrame(X)
        return X.replace(self.mapping)

neighbourhood_pipeline = Pipeline(steps=[
    ('Neighbourhood Mapper', NeighborhoodMapper(mapping=neighbourhood_levels))
])

### Verifications Feature

In [28]:
def new_features_for_verifications(df: pd.DataFrame) -> pd.DataFrame:
    df['email_verification'] = 'f'
    df['phone_verification'] = 'f'
    df['work_email_verification'] = 'f'
    return df

def allocate_verifications_to_variables(row):
    if "email" in row["host_verifications"]:
        row["email_verification"] = 't'
    if "phone" in row["host_verifications"]:
        row["phone_verification"] = 't'
    if "work_email" in row["host_verifications"]:
        row["work_email_verification"] = 't'
    return row

def apply_on_every_row(df: pd.DataFrame) -> pd.DataFrame:
    return df.apply(allocate_verifications_to_variables, axis=1)

verifications_pipeline = Pipeline(steps=[
    ('Create features', FunctionTransformer(new_features_for_verifications)),
    ('Allocate verifications', FunctionTransformer(apply_on_every_row))
])


### Bathrooms text feature

In [29]:
bathroom_text_feature = ["bathrooms_text"]

remap_baths = {
    'baths': 'single',
    'bath': 'single',
    'private bath': 'private',
    'shared bath': 'shared',
    'shared baths': 'shared',
    'Shared half-bath': 'shared',
    '. baths': 'single',
    '. shared baths': 'shared',
    'Half-bath': 'single',
    'Private half-bath': 'private'
}

handler.export_to_json(remap_baths, '../data/2023dic/baths.json')
remap_baths = handler.import_from_json('../data/2023dic/baths.json')

class BathroomsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, mapping):
        self.mapping = mapping

    def extract_digits(self, text):
        if pd.isna(text):
            return '0'
        if "half" in text.lower():
            return '0.5'
        digits = re.findall(r'\d+\.\d+|\d+', str(text))
        return ''.join(digits) if digits else '0'

    def remove_digits(self, text):
        if pd.isna(text):
            return ''
        return re.sub(r'\d', '', str(text)).strip()

    def create_baths_column(self, df: pd.DataFrame) -> pd.DataFrame:
        df['bathrooms'] = df['bathrooms_text'].apply(self.extract_digits)
        df['bathrooms'] = df['bathrooms'].astype(float)
        return df

    def clean_bathrooms_text(self, df: pd.DataFrame) -> pd.DataFrame:
        df['bathrooms_text'] = df['bathrooms_text'].apply(self.remove_digits)
        return df
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = pd.DataFrame(X)
        X = self.create_baths_column(X)
        X = self.clean_bathrooms_text(X)
        return X.replace(self.mapping) 
    
bathrooms_pipeline = Pipeline(steps=[
    ('Remap bathrooms text', BathroomsTransformer(remap_baths))
])    

### Price feature

In [30]:
price_feature = ['price']

def remove_symbols(text):
    try:
        cleaned_text = re.sub(r'[$,]', '', text)
        return cleaned_text.strip()
    except:
        return None
    
def remove_dollar_sign(df: pd.DataFrame) -> pd.DataFrame:
    df['price'] = df['price'].apply(remove_symbols).astype(float)
    return df
    
price_pipeline = Pipeline(steps=[
    ("Trim price feature", FunctionTransformer(remove_dollar_sign))
])
    

## Aggregate visualization dataset

In [44]:
from sklearn import set_config
set_config(transform_output = "pandas")

preprocessor = ColumnTransformer(transformers=[
    ("Geographic", geographic_pipeline, df_listings),
    ("Text encoding", text_encoding_pipeline, string_features),
    ("Id", id_pipeline, id_feature),
    ("Rates", rates_pipeline, rate_feature),
    ("Timestamp", timestamp_pipeline, time_feature),
    ("Neighbourhood", neighbourhood_pipeline, neighbourhood_feature),
    ("Verifications", verifications_pipeline, df_listings),
    ("Bathrooms", bathrooms_pipeline, bathroom_text_feature),
    ("Price", price_pipeline, price_feature)
],
    remainder="passthrough",
    n_jobs=-1
)

In [45]:
from sklearn.utils import estimator_html_repr
with open('visualization_pipeline.html', 'w') as f:  
    f.write(estimator_html_repr(preprocessor))


In [47]:
preprocessor.fit_transform(df_listings)

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed

# Pipeline for visualization ENDED

## Numerical features

In [None]:
num_features = ["host_listings_count", "host_total_listings_count", "accommodates", "bathrooms", "bedrooms", "beds",
                "minimum_nights", "maximum_nights", "number_of_reviews", "review_scores_rating", "review_scores_accuracy",
                "review_scores_cleanliness", "review_scores_checkin", "review_scores_communication",
                "review_scores_location", "review_scores_value", "calculated_host_listings_count",
                "calculated_host_listings_count_entire_homes", "calculated_host_listings_count_private_rooms",
                "calculated_host_listings_count_shared_rooms", "reviews_per_month"
                ]

### Add and manipulate features