In [1]:
import numpy as np
import pandas as pd
from pandarallel import pandarallel
from tqdm.notebook import tqdm
tqdm.pandas()
import matplotlib.pyplot as plt

In [2]:
## Load Visualization Pipeline for all the periods
## Insert one of the following arguments to run the visualization:
## - sep
## - dic
## - mar
## - jun

#!python3 -m scripts.visualization_pipeline sep
#!python3 -m scripts.visualization_pipeline dic
#!python3 -m scripts.visualization_pipeline mar
#!python3 -m scripts.visualization_pipeline jun

## Run the following to compute the mappings for host locations
#!python3 -m scripts.mappings_host_location

In [3]:
df = pd.read_pickle("data/pickles/listings_viz_sep.pkl")
df.dtypes


id                                                      object
host_id                                                 object
host_response_rate                                     float64
host_acceptance_rate                                   float64
neighbourhood_cleansed                                  object
price                                                  float64
host_since                                      datetime64[ns]
first_review                                    datetime64[ns]
last_review                                     datetime64[ns]
neighborhood_overview                                   object
host_location                                          float64
host_about                                              object
host_response_time                                      object
host_is_superhost                                       object
host_listings_count                                      int64
host_total_listings_count                              

### IDs

https://feature-engine.trainindata.com/en/latest/user_guide/encoding/index.html

In [4]:
df_plot = df.copy()
df_ids = pd.DataFrame(df_plot.groupby(by=["host_id"]).count()["id"])

df_ids['group'] = df_ids["id"].apply(lambda x: x if x > 5 else "other")

df_ids['index'] = df_ids.index

def set_custom_index(row):
    if row['group']=='other':
        row['index'] = 'other'
    return row

df_ids = df_ids.apply(lambda x: set_custom_index(x), axis=1)

df_ids.reset_index(inplace=True)
df_ids.drop(["host_id", "group"], inplace=True, axis=1)
df_ids.set_index('index', inplace=True)

df_ids = pd.DataFrame(df_ids.groupby(by=['index']).sum())


In [5]:
#plt.style.use('_mpl-gallery')
#
#plt.figure(figsize=(10, 6))
#plt.bar(df_ids.index, df_ids['id'], color='skyblue')
#plt.xlabel('Unique Host IDs')
#plt.ylabel('Count of Rows')
#plt.title('Count of Rows for Unique Host IDs')
##plt.xticks(host_id_counts.index)
#plt.grid(axis='y')
#
#plt.show()

## Review dates

From `first_review` and `last_review` to `reviewed_time_span`

In [6]:
df.isnull().sum()

id                                                 0
host_id                                            0
host_response_rate                               829
host_acceptance_rate                             535
neighbourhood_cleansed                             0
price                                              0
host_since                                         0
first_review                                     806
last_review                                      806
neighborhood_overview                           3208
host_location                                   1754
host_about                                      3529
host_response_time                               829
host_is_superhost                                205
host_listings_count                                0
host_total_listings_count                          0
host_has_profile_pic                               0
host_identity_verified                             0
latitude                                      

In [29]:
import pandas as pd
from feature_engine.datetime import DatetimeSubtraction
from feature_engine.pipeline import Pipeline
from feature_engine.creation import RelativeFeatures
from feature_engine.encoding import OneHotEncoder, CountFrequencyEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer

In [8]:
review_dates_feature = ["first_review", "last_review"]

review_dates_pipeline = Pipeline(steps=[
    ('Review_dates_engineering', DatetimeSubtraction(variables="last_review",
                                                     reference="first_review",
                                                     output_unit="D",
                                                     drop_original=True,
                                                     new_variables_names=["days_active_reviews"],
                                                     missing_values="ignore"
                                                     )
     ),
    ("Imputation", SimpleImputer(strategy='median')),
    ("Standardize", MinMaxScaler()),
])


## Host Listings Count

`host_listings_count` as a percentage of `host_total_listings_count` and drop original columns

In [9]:
host_listings_feature = ["host_listings_count",
                         "host_total_listings_count"]

host_listings_pipeline = Pipeline(steps=[
    ("Imputation", SimpleImputer(strategy="median")),
    ("relative_feature", RelativeFeatures(
        variables=['host_listings_count'],
        reference=['host_total_listings_count'],
        func=['div'],
        drop_original=True
    )),
    ("Standardize", MinMaxScaler()),
])


## One-hot-encoding transformation

*This pipeline is also suited for all the binary variables at hand because of the `drop_last_binary` option*

In [10]:
ohe_feature = ["neighbourhood_cleansed", # categorical
                "host_is_superhost",       # binary
                "host_has_profile_pic",
                "host_identity_verified",
                "email_verification",
                "phone_verification",
                "work_email_verification"]

ohe_pipeline = Pipeline(steps=[
    ("Imputation", SimpleImputer(strategy="most_frequent")),
    ("One-hot-encoder", OneHotEncoder(drop_last_binary=True,
                                      drop_last=True))
])

## Ordinal categorical encoding



In [11]:
ordinal_feature = ["host_response_time",
                   "room_type",
                   "bathrooms_text"]

ordinal_pipeline = Pipeline(steps=[
    ("Imputation", SimpleImputer(strategy="most_frequent")),
    ("Ordinal_encoder", OrdinalEncoder(categories="auto"))
])

## Host ID

**The high cardinality of unique hosts and the severely skewed distribution in the number of listings for every host
could lead us to drop also the `host_id` feature in order to avoid overfitting.**

**The alternative is to use some Categorical Encoding functions from `feature_engine` library that seem to be well suited to handle this kind cardinality** like
- the Rare Label Encoding,
- Decision Tree Encoding,
- Count Frequency Encoding (that enables us to handle also unseen categories by allocating the code $0$ for the prediction [see documentation])


In [12]:
host_id_feature = ["host_id"]

host_id_pipeline = Pipeline(steps=[
    ("Count_frequency_encoding", CountFrequencyEncoder(encoding_method="count",
                               missing_values="ignore",
                               unseen="encode"))
])

## Host since

In [13]:
host_since_feature = ["host_since"]

host_since_pipeline = Pipeline(steps=[
    ('date_engineering', DatetimeSubtraction(variables=max("last_review"),
                                             reference="host_since",
                                             output_unit="D",
                                             drop_original=True,
                                             new_variables_names=["host_since_days"],
                                             missing_values="ignore"
                                             )
     ),
    ("Imputation", SimpleImputer(strategy='median')),
    ("Standardize", MinMaxScaler()),
])

## Numerical features

In [14]:
numerical_feature = ["host_response_rate",
                     "host_acceptance_rate",
                     "host_location",
                     "minimum_nights",
                     "maximum_nights",
                     "number_of_reviews",
                     "review_scores_rating",
                     "review_scores_accuracy",
                     "review_scores_cleanliness",
                     "review_scores_checkin",
                     "review_scores_communication",
                     "review_scores_location",
                     "review_scores_value",
                     "reviews_per_month",
                     "airport_distance_km",
                     "ferretto_square_distance_km",
                     "roma_square_distance_km",
                     "rialto_bridge_distance_km",
                     "san_marco_square_distance_km"
                     ]

numerical_pipeline = Pipeline(steps=[
    ("Imputer", KNNImputer(n_neighbors=5, weights="uniform")),
    ("Standardize", MinMaxScaler())
])

## Coordinates (numerical)

In [15]:
coordinates_feature = ["latitude",
                       "longitude"]

coordinates_pipeline = Pipeline(steps=[
    ("Imputer", SimpleImputer(strategy="most_frequent")),
    ("Standardization", StandardScaler())
])

## Engineer Accomodates vs Beds and Bathrooms and Bedrooms

In [16]:
accomodates_vs_feature = ["accommodates",
                          "bathrooms",
                          "bedrooms",
                          "beds"]

accomodates_vs_pipeline = Pipeline(steps=[
    ("Feature_engineering", RelativeFeatures(
        variables=[
            "bathrooms",
            "bedrooms",
            "beds"],
        reference=["accommodates"],
        func=["div"],
        fill_value=None,
        missing_values="ignore",
        drop_original=True
    )
     ),
    ("Imputation", SimpleImputer(strategy="median")),
    ("Standardization", MinMaxScaler())
])

## Engineer Beds vs Rooms

In [17]:
bedrooms_feature = ["beds", 
                    "bedrooms"]

bedrooms_pipeline = Pipeline(steps=[
    ("Feature_engineering", RelativeFeatures(
        variables=["beds"],
        reference=["bedrooms"],
        func=["div"],
        fill_value=None,
        missing_values="ignore",
        drop_original=True
    )
     ),
    ("Imputation", SimpleImputer(strategy="median")),
    ("Standardization", MinMaxScaler())
])

## Calculated host listings features

In [18]:
calculated_listings_feature = ["calculated_host_listings_count",
                          "calculated_host_listings_count_entire_homes",
                          "calculated_host_listings_count_private_rooms",
                          "calculated_host_listings_count_shared_rooms"
                          ]

calculated_listings_pipeline = Pipeline(steps=[
    ("Feature_engineering", RelativeFeatures(
        variables=[
            "calculated_host_listings_count_entire_homes",
            "calculated_host_listings_count_private_rooms",
            "calculated_host_listings_count_shared_rooms"
        ],
        reference=["calculated_host_listings_count"],
        func=["div"],
        fill_value=None,
        missing_values="ignore",
        drop_original=True
    )
     ),
    ("Imputation", SimpleImputer(strategy="median")),
    ("Standardization", MinMaxScaler())
])

# Final transformer

In [19]:
df.head()

Unnamed: 0,id,host_id,host_response_rate,host_acceptance_rate,neighbourhood_cleansed,price,host_since,first_review,last_review,neighborhood_overview,...,calculated_host_listings_count_shared_rooms,reviews_per_month,airport_distance_km,ferretto_square_distance_km,roma_square_distance_km,rialto_bridge_distance_km,san_marco_square_distance_km,email_verification,phone_verification,work_email_verification
0,27491536,185612295,50.0,32.0,Centro Storico,211.0,2018-04-22,2018-10-01,2023-05-28,,...,0,0.03,6.874816,8.510272,0.665043,1.165896,1.560678,t,t,f
1,829284975573971493,321485673,,,Centro Storico,1000.0,2019-12-27,NaT,NaT,,...,0,,6.707799,8.91165,1.112071,0.768878,1.238334,t,t,f
2,22925979,7074201,,,Centro Storico,55.0,2013-06-23,NaT,NaT,,...,0,,7.706987,10.683423,2.371963,1.082086,0.83885,t,t,f
3,33146077,184936719,100.0,55.0,Isole,190.0,2018-04-18,2019-11-04,2019-11-04,0.0,...,0,0.02,10.213909,13.28363,4.64474,3.659282,3.218084,t,t,f
4,603168,2985972,100.0,100.0,Centro Storico,80.0,2012-07-19,2012-10-27,2023-08-11,,...,0,3.84,6.561673,8.436345,0.990182,1.218631,1.664298,t,t,f


In [20]:
## Drop rows with NaN in target 
df = df.loc[df['price'].notnull(), :]

X = df.drop(["price"], axis=1, inplace=False)
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=874631)

In [21]:
preprocessor = ColumnTransformer(
    transformers=[
        ("Review dates", review_dates_pipeline, review_dates_feature),
        ("Host listings", host_listings_pipeline, host_listings_feature),
        ("OHE", ohe_pipeline, ohe_feature),
        ("Ordinal", ordinal_pipeline, ordinal_feature),
        ("Host Id", host_id_pipeline, host_id_feature),
        ("Host since", host_since_pipeline, host_since_feature),
        ("Numerical", numerical_pipeline, numerical_feature),
        ("Coordinates", coordinates_pipeline, coordinates_feature),
        ("Accomodates VS", accomodates_vs_pipeline, accomodates_vs_feature),
        ("Bedrooms VS", bedrooms_pipeline, bedrooms_feature),
        ("Listings count VS", calculated_listings_pipeline, calculated_listings_feature)
    ],
    remainder="drop",
    n_jobs=-1,
    verbose=True,
    verbose_feature_names_out=True
)

In [22]:
from sklearn.ensemble import RandomForestRegressor
from feature_engine.preprocessing import MatchVariables

transform = Pipeline(
   steps=[
       ("processing", preprocessor),
       ("RandomForestRegressor", RandomForestRegressor()),
   ]
)

In [23]:
model = transform.fit(X_train,y_train)
#model.score(X_test, y_test)


KeyError: "None of [Index(['host_listings_count'], dtype='object')] are in the [columns]"