In [1]:
import numpy as np
import pandas as pd
from pandarallel import pandarallel
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from src.tools import JsonHandler, concatenate_listings_datasets, return_cleaned_col_names, preprocess_text
from src.class_transformers import (
    GeographicTransformer,
    BathroomsTransformer,
    CreateVerificationsTransformer,
    AmenitiesTransformer,
    OfflineLocationFinder,
    PropertyTypeTransformer,
    HostLocationImputer,
)
from src.function_transformers import (
    fun_tr_id_to_string,
    fun_tr_from_string_to_rate,
    fun_tr_transform_to_datetime,
    fun_tr_remove_dollar_sign,
)
from sklearn.utils import estimator_html_repr
from sklearn import set_config
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from sklearn.pipeline import Pipeline
from src.class_transformers import ColumnDropperTransformer, IntoBinaryTransformer, CoordinatesTransformer
from sklearn import set_config
import plotly.express as px

set_config(transform_output="pandas")

pandarallel.initialize()
pd.options.display.float_format = "{:.0f}".format
handler = JsonHandler()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
df = pd.read_csv("data/all_cities/listings_ve.csv")

In [4]:
df.head(1)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,6623,https://www.airbnb.com/rooms/6623,20240607164321,2024-06-08,city scrape,Venice Blue on Canal apartment,"I'm thrilled to share my ""Blue on Venice Canal...",Close by is the Frari Church (known as Tiziano...,https://a0.muscache.com/pictures/492258/834683...,15016,...,5,5,5,,f,1,1,0,0,1


In [None]:
print("Importing dataset and other data...")
#df_listings = concatenate_listings_datasets()
host_locations = handler.import_from_json("data/mappings/host_locations.json")
remap_baths = handler.import_from_json("data/mappings/baths.json")
print("Data imported!")

# Viz pipeline

In [None]:
df.isna().sum()

In [None]:
df.head()

## Drop features with too many NAs

In [None]:
# Drop features with too many NAs
df.drop([
    "neighborhood_overview",
    "host_about",
    "host_neighbourhood",
    "neighbourhood",
    "calendar_updated",
    "license"
],
inplace=True,
axis=1)

## Drop features not useful at all

In [None]:
df.drop([
    "listing_url",
    "scrape_id",
    "last_scraped",
    "source",
    "name",
    "picture_url",
    "host_url",
    "host_name",
    "host_thumbnail_url",
    "host_picture_url",
    "minimum_minimum_nights",
    "maximum_minimum_nights",
    "minimum_maximum_nights",
    "maximum_maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm",
    "has_availability",
    "availability_30",
    "availability_60",
    "availability_90",
    "availability_365",
    "calendar_last_scraped",
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "instant_bookable",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms"
],
inplace=True,
axis=1)

In [None]:
#test = df.loc[df["review_scores_accuracy"].notnull(), ]

In [None]:
#test.shape

In [None]:
df.set_index("id", inplace=True)

df_nas_columns = pd.DataFrame(
    {
        "NAs": df.isnull().sum(axis=1),
        "Columns_with_NAs": df.apply(
            lambda x: ", ".join(x.index[x.isnull()]), axis=1
        ),
    }
)

In [None]:
more_than_7_missing = df_nas_columns.loc[df_nas_columns["NAs"] > 8, :]

In [None]:
more_than_8_missing = df_nas_columns.loc[df_nas_columns["NAs"] > 8, :].index.tolist()
df.drop(more_than_8_missing, inplace=True)
print("Columns and rows dropping completed!")

In [None]:
id_feature = ["host_id"]
rate_feature = ["host_response_rate", "host_acceptance_rate"]
time_feature = ["host_since", "first_review", "last_review"]
neighbourhood_feature = ["neighbourhood_cleansed"]
price_feature = ["price"]

# Amenities
internet_pattern: str = r"\b(wifi|internet|ethernet|fibra|connection)\b"
self_checkin_pattern: str = r"\b(self checkin|self check-in|self-checkin)\b"
host_greeting_pattern: str = r"\b(host greeting|host greets you)\b"
pool_pattern: str = r"\b(pool|pool view|shared pool)\b"
oven_pattern: str = r"\b(oven)\b"
microwave_pattern: str = r"\b(microwave|microonde)\b"
garden_pattern: str = r"\b(garden|park|backyard)\b"
streaming_pattern: str = r"\b(netflix|amazon|disney+|chromecast|apple tv|hbo|hbo max)\b"
gym_pattern: str = r"\b(exercise|gym|fitness|private gym in building|shared gym|gym nearby|workout bench)\b"
elevator_pattern: str = r"\b(elevator)\b"
heating_pattern: str = r"\b(heating)\b"
ac_pattern: str = r"\b(central air conditioning|ac|air conditioning)\b"
safe_pattern: str = r"\b(safe|locker|lock|security|guard)\b"
workspace_pattern: str = r"\b(workspace|work)\b"
freezer_pattern: str = r"\b(freezer|refrigerator)\b"
aid_pattern: str = r"\b(first aid kit|aid)\b"
dishwasher_pattern: str = r"\b(dishwasher)\b"
long_term_stays_pattern: str = r"\b(long term stays)\b"
pets_pattern: str = r"\b(pets allowed)\b"
bathtube_pattern: str = r"\b(bathtube)\b"
bbq_grill_pattern: str = r"\b(bbq grill|grill|barbeque|barbeque utensils)\b"
lake_bay_pattern: str = r"\b(lake view|bay view|harbor view|beach view)\b"

In [None]:
set_amenities_remapper = [
    (internet_pattern, "internet"),
    (self_checkin_pattern, "self-checkin"),
    (host_greeting_pattern, "host-greeting"),
    (pool_pattern, "pool"),
    (oven_pattern, "oven"),
    (microwave_pattern, "microwave"),
    (garden_pattern, "garden"),
    (streaming_pattern, "streaming"),
    (gym_pattern, "gym"),
    (elevator_pattern, "elevator"),
    (heating_pattern, "heating"),
    (ac_pattern, "air-conditioning"),
    (workspace_pattern, "workspace"),
    (freezer_pattern, "freezer"),
    (aid_pattern, "first-aid-kit"),
    (dishwasher_pattern, "dishwasher"),
    (long_term_stays_pattern, "long-term-stays"),
    (pets_pattern, "pets-allowed"),
    (bathtube_pattern, "bathtube"),
    (bbq_grill_pattern, "bbq-grill"),
    (lake_bay_pattern, "lake-bay-view")
]

In [None]:
entire_property_pattern = r"\b(entire|tiny home)\b"
private_room_pattern = r"\b(private room|room in serviced apartment|room in bed and breakfast|room in hotel|room in resort)\b"
shared_room_pattern = r"\b(shared room|shared)\b"
other_room_pattern = r"\b(entire|tiny home|private room|room in serviced apartment|room in bed and breakfast|room in hotel|room in resort|shared room|shared)\b"

set_property_type_remapper = [
    (entire_property_pattern, "entire_property"),
    (private_room_pattern, "private_room"),
    (shared_room_pattern, "shared_room"),
    (other_room_pattern, "other"),
]

In [None]:
id_pipeline = Pipeline(steps=[("From ID to string", fun_tr_id_to_string)], verbose=True)

rates_pipeline = Pipeline(
    steps=[("Transform response rate", fun_tr_from_string_to_rate)], verbose=True
)

timestamp_pipeline = Pipeline(
    steps=[("Transform to timestamp", fun_tr_transform_to_datetime)], verbose=True
)

price_pipeline = Pipeline(
    steps=[("Trim price feature", fun_tr_remove_dollar_sign)], verbose=True
)

# Apply to all dataset (feature engineering using other features)
feature_creation_pipeline = Pipeline(
    steps=[
        ("Listing Locations", OfflineLocationFinder()),
        ("Host Locations imputer", HostLocationImputer()),
        (
            "Host location",
            GeographicTransformer(column="host_location", locations=host_locations),
        ),
        ("Host verifications", CreateVerificationsTransformer()),
        ("Bathrooms", BathroomsTransformer(remap_baths)),
        (
            "Amenities",
            AmenitiesTransformer(df=df, remapper=set_amenities_remapper),
        ),
        (
            "Property type",
            PropertyTypeTransformer(
                df=df, remapper=set_property_type_remapper
            ),
        ),
    ],
    verbose=True,
)


In [None]:
print("Executing Feature Creation Pipeline...")
df = feature_creation_pipeline.fit_transform(df)
print("Feature Creation Pipeline completed!")

print("Executing preprocessing on features...")
feature_preprocessor = ColumnTransformer(
    remainder="passthrough",
    n_jobs=-1,
    force_int_remainder_cols=False,
    transformers=[
        ("Id", id_pipeline, id_feature),
        ("Rates", rates_pipeline, rate_feature),
        ("Price", price_pipeline, price_feature),
        ("Timestamp", timestamp_pipeline, time_feature),
    ],
    verbose=True,
)

df = feature_preprocessor.fit_transform(df)
print("Preprocessing on features completed!")

df.columns = return_cleaned_col_names(df.columns)
print("Cleaned feature names retrieved")

In [None]:
df.drop(["description"], axis=1, inplace=True)

# Exploration pipeline


In [None]:
hist_cat_variables = [
    "host_response_time",
    "host_is_superhost",
    "host_has_profile_pic",
    "neighbourhood_group_cleansed",
    "neighbourhood_cleansed",
    "host_identity_verified",
    "property_type",
    "room_type",
    "listing_city",
    "bathrooms_text",
    "email_verification",
    "phone_verification",
    "work_email_verification",
    'amenities_internet',
    'amenities_self-checkin',
    'amenities_host-greeting',
    'amenities_pool',
    'amenities_oven',
    'amenities_microwave',
    'amenities_garden',
    'amenities_streaming',
    'amenities_gym',
    'amenities_elevator',
    'amenities_heating',
    "amenities_air-conditioning",
    "amenities_workspace",
    "amenities_freezer",
    "amenities_first-aid-kit",
    "amenities_dishwasher",
    "amenities_long-term-stays",
    "amenities_pets-allowed",
    "amenities_bathtube",
    "amenities_bbq-grill",
    "amenities_lake-bay-view"
]

In [None]:
hist_num_variables = [
    "host_response_rate",
    "host_acceptance_rate",
    "price",
    "host_location",
    "host_listings_count",
    "host_total_listings_count",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "minimum_nights",
    "maximum_nights",
    "number_of_reviews",
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    "reviews_per_month",
    "listing_city_pop"
    ]

## Corr matrix for numerical variables

In [None]:
df_corr = df.copy()
corr_full = df_corr[hist_num_variables].corr()
colors = ['green' if val < 0.45 else 'red' for val in corr_full.values.flatten()]
mask = np.where(corr_full < 0.45, 0, 1)

In [None]:
fig = px.imshow(corr_full,
                text_auto=True,
                aspect="auto",
                width=900,
                height=900
                )
fig.show()


In [None]:
fig = px.imshow(corr_full,
                text_auto=True,
                aspect="auto",
                width=900,
                height=900,
                color_continuous_scale=["green", "red"])

fig.update_traces(z=mask, colorscale=[[0, "green"], [1, "red"]])

fig.show()

In [None]:
to_drop_corr = [
    "host_acceptance_rate",
    "host_total_listings_count",
    "bathrooms",
    "bedrooms",
    "beds",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    "amenities"
]


In [None]:
eng_after_exploration_pipeline = Pipeline(
    steps=[
        ("Drop columns", ColumnDropperTransformer(columns=to_drop_corr)),
        (
            "Transform Response Rate",
            IntoBinaryTransformer(
                feature="host_response_rate", cat1="100", cond="x==100", cat2="lower"
            ),
        ),
        (
            "Transform Minimum Nights",
            IntoBinaryTransformer(
                feature="minimum_nights", cat1="1", cond="x<=1", cat2="more_than_1"
            ),
        ),
        (
            "Transform Maximum Nights",
            IntoBinaryTransformer(
                feature="maximum_nights",
                cat1="less_than_100",
                cond="x<=100",
                cat2="more_than_100",
            ),
        ),
        (
            "Transform City Population",
            IntoBinaryTransformer(
                feature="listing_city_pop",
                cat1="less_than_300k",
                cond="x<=300000",
                cat2="more_than_300k",
            ),
        ),
        (
            "Transform Review Score Rating",
            IntoBinaryTransformer(
                feature="review_scores_rating",
                cat1="less_than_4.8",
                cond="x<4.8",
                cat2="more_than_4.8",
            ),
        ),

        (
            "Transform Host Response Time",
            IntoBinaryTransformer(
                feature="host_response_time",
                cat1="within_an_hour",
                cond="x=='within an hour'",
                cat2="more_than_one_hour",
            ),
        ),
        (
            "Transform Property Type",
            IntoBinaryTransformer(
                feature="property_type",
                cat1="entire_property",
                cond="x=='entire_property'",
                cat2="other",
            ),
        ),
        (
            "Transform Room Type",
            IntoBinaryTransformer(
                feature="room_type",
                cat1="entire_home",
                cond="x=='Entire home/apt'",
                cat2="other",
            ),
        ),
        (
            "Transform Bathrooms Text",
            IntoBinaryTransformer(
                feature="bathrooms_text",
                cat1="single",
                cond="x=='single'",
                cat2="other",
            ),
        ),
        #(
        #    "Coordinates to spatial",
        #    CoordinatesTransformer()
        #),
    ],
    verbose=True,
)

df = eng_after_exploration_pipeline.fit_transform(df)

# Final pipeline and model fitting

In [None]:
import pandas as pd
import numpy as np
from feature_engine.datetime import DatetimeSubtraction
from feature_engine.creation import RelativeFeatures
from feature_engine.encoding import OneHotEncoder, CountFrequencyEncoder, OrdinalEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures, PowerTransformer, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import QuantileRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
import sys
from sklearn.neural_network import MLPRegressor

In [None]:
review_dates_feature = [
    "first_review",
    "last_review"
]

ohe_feature = [
    "neighbourhood_group_cleansed",
    "host_is_superhost",
    "host_response_time",
    "host_has_profile_pic",
    "property_type",
    "room_type",
    "bathrooms_text",
    "host_response_rate",
    "minimum_nights",
    "maximum_nights",
    "listing_city_pop",
    "review_scores_rating",
    'amenities_internet',
    'amenities_self-checkin',
    'amenities_host-greeting',
    'amenities_pool',
    'amenities_oven',
    'amenities_microwave',
    'amenities_garden',
    'amenities_streaming',
    'amenities_gym',
    'amenities_elevator',
    'amenities_heating',
    "amenities_air-conditioning",
    "amenities_workspace",
    "amenities_freezer",
    "amenities_first-aid-kit",
    "amenities_dishwasher",
    "amenities_long-term-stays",
    "amenities_pets-allowed",
    "amenities_bathtube",
    "amenities_bbq-grill",
    "amenities_lake-bay-view",
]

ohe_most_frequent = [
    "listing_city",
    "neighbourhood_cleansed"
]

host_id_feature = [
    "host_id"
]

host_since_feature = [
    "host_since"
]

numerical_feature = [
                        "host_listings_count",
                        "host_location",
                        "number_of_reviews",
                        "reviews_per_month",
                        "accommodates",
                    ] #+ description_features

coordinates_feature = [
    "latitude",
    "longitude"
]

In [None]:
df["scraping_date"] = max(df["last_review"])

# Drop rows with NaN in target
df = df.loc[df["price"].notnull(), :]

X = df.drop(["price"], axis=1, inplace=False)
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=874631
)

In [None]:
from src.class_transformers import ColumnDropperTransformer

In [None]:
param_grid={
    "regressor__n_estimators": [400], #, 400, 800],
    "regressor__criterion": ["squared_error", "absolute_error"],
    "regressor__random_state": [874631],
    "regressor__verbose": [True],
    #"regressor__min_impurity_decrease": [0, 0.1, 0.2, 0.3],
    "regressor__bootstrap": [True],
    "regressor__oob_score": [True],
}

In [None]:
wizard_pipe = Pipeline(
    steps=[
        # Review Dates (RD)
        (
            "RD_engineering",
            DatetimeSubtraction(
                variables="last_review",
                reference="first_review",
                output_unit="D",
                drop_original=False,
                new_variables_names=["days_active_reviews"],
                missing_values="ignore",
            ),
        ),
        (
            "RD_imputation",
            MeanMedianImputer(
                imputation_method="median", variables=["days_active_reviews"]
            ),
        ),
        # ========================
        # One-hot-encoding (OHE)
        (
            "OHE_imputation",
            CategoricalImputer(
                imputation_method="frequent",
                variables=ohe_feature,
                return_object=True,
                ignore_format=False,
            ),
        ),
        (
            "OHE_encoding",
            OneHotEncoder(
                top_categories=None,
                drop_last=True,
                drop_last_binary=True,
                ignore_format=False,
                variables=ohe_feature,
            ),
        ),
        # ========================
        # One-hot-encoding Top Frequent (OHETF)
        (
            "OHETF_imputation",
            CategoricalImputer(
                imputation_method="frequent",
                variables=ohe_most_frequent,
                return_object=True,
                ignore_format=False,
            ),
        ),
        (
            "OHETF_encoding",
            OneHotEncoder(
                top_categories=7,
                drop_last=True,
                drop_last_binary=True,
                ignore_format=False,
                variables=ohe_most_frequent,
            ),
        ),
        # =======================
        # Host ID (HID)
        (
            "HID_imputation",
            CategoricalImputer(
                imputation_method="missing",
                variables=host_id_feature,
                fill_value="MISSING",
            ),
        ),
        (
            "HID_encoding",
            CountFrequencyEncoder(
                encoding_method="count", missing_values="ignore", unseen="encode"
            ),
        ),
        # =========================
        # Host since (HS)
        (
            "HS_engineering",
            DatetimeSubtraction(
                variables=["scraping_date"],
                reference=["host_since"],
                output_unit="D",
                drop_original=False,
                new_variables_names=["host_since_days"],
                missing_values="ignore",
            ),
        ),
        (
            "HS_imputation",
            MeanMedianImputer(
                imputation_method="median", variables=["host_since_days"]
            ),
        ),
        # ==========================
        # Numerical features (NF)
        (
            "NF_imputation",
            SklearnTransformerWrapper(
                transformer=KNNImputer(n_neighbors=5, weights="uniform"),
                variables=numerical_feature,
            ),
        ),
        # ============================
        # Coordinates numerical (COO)
        (
            "COO_imputation",
            MeanMedianImputer(
                imputation_method="median", variables=coordinates_feature
            ),
        ),
        # ========================
        # Drop features not needed
        # ========================
        (
            "ColumnDropperTransformer",
            ColumnDropperTransformer(
                columns=[
                    "last_review",
                    "first_review",
                    "scraping_date",
                    "host_since",
                ]
            )
        ),
        # =======================
        # Scaling
        # =======================
        (
            "PowerTransformer",
            SklearnTransformerWrapper(
                transformer=PowerTransformer(
                    method="yeo-johnson",
                    standardize=True,
                    copy=False
                ),
                variables=[
                              "days_active_reviews",
                              "host_since_days",
                          ]
                          + numerical_feature
            )
        ),
        (
            "StandardScaler",
            SklearnTransformerWrapper(
                transformer=StandardScaler(), variables=coordinates_feature
            ),
        ),
        # ============
        # Prediction
        # ============
        (
          "RFR",
          GridSearchCV(
              estimator=TransformedTargetRegressor(
                  regressor=RandomForestRegressor(),
                  transformer=PowerTransformer(
                      method="yeo-johnson",
                      standardize=True,
                      copy=False,
                  )
              ),
              param_grid=param_grid,
              cv=5,
              n_jobs=-1,
              verbose=True,
              scoring=["neg_mean_absolute_error", "neg_mean_squared_error", "r2"],
              refit="r2",
              error_score="raise",
          ),
        ),
    ],
    verbose=True,
)

In [None]:
model = wizard_pipe.fit(X_train, y_train)

In [None]:
pred = wizard_pipe.predict(X_test)

In [None]:
print(
    f"\nExplained variance score is {explained_variance_score(y_true=y_test, y_pred=pred)}",
    f"\nMean Absolute Error is {mean_absolute_error(y_true=y_test, y_pred=pred)}",
    f"\nMean Squared Error is {mean_squared_error(y_true=y_test, y_pred=pred)}",
    f"\nR^2 Error is {r2_score(y_true=y_test, y_pred=pred)}",
)