In [1]:
import pandas as pd
pd.options.display.float_format = '{:.0f}'.format
import numpy as np
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from geopy.extra.rate_limiter import RateLimiter
from tqdm import tqdm
import json
from sklearn.model_selection import train_test_split
#from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from feature_engine.imputation import AddMissingIndicator, MeanMedianImputer
from feature_engine.pipeline import Pipeline


In [2]:
# Load the raw listings table from the CSV export.
df_listings = pd.read_csv(filepath_or_buffer="../data/2023dic/d_listings.csv")
# Disabled alternative: select an explicit whitelist of columns here instead of
# dropping the unwanted columns in a later cell.
#df_listings = df_listings[['id', 'neighborhood_overview', 'host_id', 'host_since', 'host_location', 'host_about',
#                           'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
#                           'host_listings_count', 'host_total_listings_count', 'host_verifications',
#                           'host_has_profile_pic', 'host_identity_verified', 'neighbourhood_cleansed',
#                           'latitude', 'longitude', 'room_type',
#                           'accommodates', 'bathrooms_text', 'beds', 'price',
#                           'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability', 'availability_30',
#                           'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'first_review',
#                           'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
#                           'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
#                           'review_scores_value', 'calculated_host_listings_count',
#                           'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms',
#                           'calculated_host_listings_count_shared_rooms', 'reviews_per_month']]

In [3]:
# Drop the columns that are not carried forward. The result is re-assigned instead
# of mutating the frame in place, and errors="ignore" skips columns that are already
# absent, so re-running this cell no longer raises a KeyError.
# NOTE(review): `minimum_nights_avg_ntm` and `maximum_nights_avg_ntm` are dropped here
# although the feature-engineering notes list them as float features - confirm.
df_listings = df_listings.drop(
    columns=["listing_url", "name", "scrape_id", "last_scraped", "source", "description", "picture_url", "host_url",
             "host_name", "host_thumbnail_url", "host_picture_url", "host_neighbourhood", "neighbourhood",
             "neighbourhood_group_cleansed", "property_type", "amenities", "minimum_minimum_nights",
             "maximum_minimum_nights", "minimum_maximum_nights", "maximum_maximum_nights", "minimum_nights_avg_ntm",
             "maximum_nights_avg_ntm", "calendar_updated", "calendar_last_scraped", "number_of_reviews_ltm",
             "number_of_reviews_l30d", "license", "instant_bookable"],
    errors="ignore",
)

In [None]:
# Preview the first five rows of the listings frame.
df_listings.head(n=5)

## Feature engineering

- `first_review` to `last_review` as date span
- `host_listings_count` as a % of `host_total_listings_count`
- Manage `neighbourhood_cleansed` with a one-hot encoding (OHE) of the most frequent categories
- Distance between host home and listing location
- Distance between listing and relevant locations in town
- `host_since` encoded as *days of activity until period end (end of dataset scraping)*
- Sentiment of `neighborhood_overview` (investigate the best sentiment technique for descriptions of apartments)
- Sentiment of `host_about` (investigate best sentiment technique for description of people)
- `host_id` and `id` as categorical
- `host_response_time` as ordinal variable
- string manipulation for `host_response_rate` and `host_acceptance_rate`
- `host_is_superhost` as binary categorical
- `host_verifications` as encoded in previous script
- `host_has_profile_pic` as binary
- `host_identity_verified` as binary
- keep `room_type` instead of `property_type` and make `room_type` a categorical with OHE
- `accommodates` used with `baths`, `beds` to compute the ratio of beds and bathrooms per person
- `price` with string manipulation
- `minimum_nights_avg_ntm` as float
- `maximum_nights_avg_ntm` as float
- `has_availability` as binary
- all the `availability_NUMBER` columns as a % of the NUMBER in the feature name
- `number_of_reviews` as an integer
- `review_scores_rating` as float
- all the reviews scores as float
- remove `calculated_host_listings_count` and keep the other three BUT **set them as % of the total host listings**
- `reviews_per_month` as float
- `longitude` and `latitude` standardization (because the values are both negatives and positives)

## Transform feature datatypes

In this section we execute the feature engineering without dealing with null values.
We do this because, once the types are cleaned, we want to plot the data and explore it to see what is going on
with NAs, frequency distributions, numeric distributions, etc.
In order to do so, we need to:
1. Generalise the pipeline, because we would like to apply this script to other similar datasets as well
2. Return exceptions for NAs, to carry them on to the data exploration section

> ***NOTE*** that the `feature-engine` library enables us to split the dataset into train and test just after the data type and feature engineering. This is because the library contains some functions for [preprocessing](https://feature-engine.trainindata.com/en/latest/user_guide/preprocessing/index.html) that can deal with removed rows and features afterwards.

- [Useful library for feature engineering](https://feature-engine.trainindata.com/en/latest/quickstart/index.html)

In [None]:
# Show the dtype of every column currently in the listings frame.
df_listings.dtypes

In [None]:
# https://feature-engine.trainindata.com/en/latest/quickstart/index.html

### Transform data types

In [None]:
def retrieve_host_location(df: pd.DataFrame) -> dict:
    """
    From a dataset of listings, extracts the unique host locations
    and retrieves latitude and longitude of every location.

    Every lookup goes through the rate-limited geocoder, and a location that
    cannot be resolved is skipped instead of discarding all results.
    :param df: pandas dataframe of listings with a `host_location` column
    :return: dict of location: (latitude, longitude) for the locations that
             could be geocoded; None if `host_location` cannot be read from df
    """
    try:
        # Missing host locations cannot be geocoded, so drop them up front.
        locations = df.host_location.dropna().unique().tolist()
    except AttributeError:
        # Keep the original best-effort contract when the column is not there.
        return None

    # Build the geocoder and its rate limiter once, not once per location.
    geolocator = Nominatim(user_agent="Host to listing distance")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.1)

    location_geo = {}
    for location in locations:
        try:
            # Call the rate-limited wrapper, not geolocator.geocode directly.
            host_location = geocode(location)
        except Exception:
            # One failing lookup must not throw away the other results.
            continue
        if host_location is None:
            # The geocoder found no match for this location string.
            continue
        location_geo[location] = (host_location.latitude, host_location.longitude)
    return location_geo


def export_to_json(dict_object: dict, path: str) -> None:
    """
    Serialise a dictionary as JSON and write it to a file.
    :param dict_object: dictionary to write out
    :param path: str with the destination file path (opened in write mode)
    :return: None
    """
    with open(path, mode="w") as json_file:
        json.dump(dict_object, fp=json_file)


def import_from_json(path: str) -> dict:
    """
    Read a JSON file and return its parsed content.
    :param path: path of the JSON file to read
    :return: the parsed JSON content
    """
    with open(path, mode="r") as json_file:
        return json.load(json_file)


def transform_to_coordinates(entry, locations: dict) -> "list[float] | None":
    """
    Given an entry and a dictionary, returns the coordinates stored in the
    dictionary for that entry.
    :param entry: entry (from dataframe) used as the lookup key
    :param locations: dict of locations:[latitude, longitude]
    :return: the value stored for `entry` (expected to be [latitude, longitude]),
             or None when the entry is not in the dictionary or the lookup
             cannot be performed
    """
    try:
        return locations.get(entry)
    except (TypeError, AttributeError):
        # TypeError: `entry` is unhashable; AttributeError: `locations` has no
        # `.get` (e.g. it is None). Both mean "no coordinates available".
        return None


def geodesic_distancer(row, from_loc: str) -> "float | None":
    """
    Computes the geodesic distance, in kilometres, between the coordinates stored
    in the `from_loc` field of a row and the row's own latitude/longitude.
    :param row: mapping-like row exposing `from_loc`, "latitude" and "longitude"
    :param from_loc: name of the field holding the origin coordinates; its first
                     two items are used (presumably latitude, longitude, matching
                     the listing coordinates - confirm against the caller)
    :return: distance in km, or None when the distance cannot be computed
             (missing field, missing coordinates, invalid values)
    """
    try:
        origin = row[from_loc]
        coords_1 = (origin[0], origin[1])
        coords_2 = (row["latitude"], row["longitude"])
        return geodesic(coords_1, coords_2).km
    except Exception:
        # Best effort: rows without usable coordinates yield None, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return None


def compute_distance(entry) -> float:
    """
    Placeholder that is not implemented yet: the argument is ignored.
    :param entry: currently unused
    :return: always None for now, despite the float annotation
    """
    return None

In [None]:
def transform_to_string(text) -> "str | None":
    """
    Converts a value to its string representation while preserving missing values.
    :param text: value to convert
    :return: str(text), or None when the value is missing (None / NaN / NaT)
             or cannot be converted
    """
    # str(None) and str(nan) succeed and would yield the literal strings 'None'
    # and 'nan', hiding the missing values from later NA handling.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return None
    try:
        return str(text)
    except Exception:
        # A custom __str__ may raise; treat the value as not convertible.
        return None


def transform_to_date(date) -> "pd.Timestamp | None":
    """
    Parses a value into a pandas datetime using `pd.to_datetime`.
    :param date: value to parse (e.g. a date string)
    :return: whatever `pd.to_datetime` returns for the value, or None when
             pandas raises because the value cannot be parsed
    """
    try:
        return pd.to_datetime(date)
    except Exception:
        # Unparseable values become None; KeyboardInterrupt / SystemExit are
        # no longer swallowed as they were by the bare except.
        return None
        




### Add and manipulate features