In [1]:
import sys
import plotly.express as px
import pandas as pd
from pandarallel import pandarallel
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from src.tools import JsonHandler, concatenate_listings_datasets, return_cleaned_col_names, preprocess_text
from src.class_transformers import (
    GeographicTransformer,
    BathroomsTransformer,
    CreateVerificationsTransformer,
    AmenitiesTransformer,
    OfflineLocationFinder,
    PropertyTypeTransformer,
    HostLocationImputer,
    ScrapingDateTransformer
)
from src.function_transformers import (
    fun_tr_id_to_string,
    fun_tr_from_string_to_rate,
    fun_tr_transform_to_datetime,
    fun_tr_remove_dollar_sign,
)
from sklearn.utils import estimator_html_repr
from sklearn import set_config
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

# --- Notebook-wide configuration -------------------------------------------
# Have scikit-learn transformers hand back pandas objects.
set_config(transform_output="pandas")

# Start pandarallel; `parallel_apply` is used further down in the notebook.
pandarallel.initialize()

# Render floats without decimal places when pandas displays them.
pd.set_option("display.float_format", "{:.0f}".format)

# Helper used below to read the JSON mapping files.
handler = JsonHandler()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Viz pipeline

In [2]:
# ---------------------------------------------------------------------------
# 1) Load the listings and the JSON mapping files.
# ---------------------------------------------------------------------------
print("Importing dataset and other data...")
df_listings = concatenate_listings_datasets()
host_locations = handler.import_from_json("data/mappings/host_locations.json")
remap_baths = handler.import_from_json("data/mappings/baths.json")
print("Data imported!")

# ---------------------------------------------------------------------------
# 2) Drop the columns that are removed up-front.
# ---------------------------------------------------------------------------
print("Dropping columns and rows with too many NAs...")
columns_removed_upfront = [
    # free text / neighbourhood descriptors
    "description",
    "neighborhood_overview",
    "host_about",
    "host_neighbourhood",
    "neighbourhood",
    "neighbourhood_group_cleansed",
    "calendar_updated",
    "license",
    # scraping metadata, names and URLs
    "listing_url",
    "scrape_id",
    "last_scraped",
    "source",
    "name",
    "picture_url",
    "host_url",
    "host_name",
    "host_thumbnail_url",
    "host_picture_url",
    # derived min/max night statistics
    "minimum_minimum_nights",
    "maximum_minimum_nights",
    "minimum_maximum_nights",
    "maximum_maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm",
    # availability and calendar
    "has_availability",
    "availability_30",
    "availability_60",
    "availability_90",
    "availability_365",
    "calendar_last_scraped",
    # review counters and booking flag
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "instant_bookable",
    # calculated host listing counters
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
]
df_listings.drop(columns=columns_removed_upfront, inplace=True)

df_listings.set_index("id", inplace=True)

# ---------------------------------------------------------------------------
# 3) Per-listing missing-value report: how many NAs, and in which columns.
# ---------------------------------------------------------------------------
df_nas_columns = pd.DataFrame(
    {
        "NAs": df_listings.isnull().sum(axis=1),
        "Columns_with_NAs": df_listings.apply(
            lambda row: ", ".join(row.index[row.isnull()]), axis=1
        ),
    }
)

# 4) Listings with more than 7 missing fields are removed.
too_sparse = df_nas_columns["NAs"] > 7
more_than_7_missing = df_nas_columns[too_sparse].index.tolist()
df_listings.drop(index=more_than_7_missing, inplace=True)
print("Columns and rows dropping completed!")

Importing dataset and other data...
Data imported!
Dropping columns and rows with too many NAs...
Columns and rows dropping completed!


In [3]:
# Column groups. The id / rate / price / time groups are wired to their
# single-step pipelines by the ColumnTransformer built later in the notebook.
id_feature = ["host_id"]
rate_feature = ["host_response_rate", "host_acceptance_rate"]
time_feature = ["host_since", "first_review", "last_review"]
neighbourhood_feature = ["neighbourhood_cleansed"]
price_feature = ["price"]

# Amenities
# One regex per amenity category. Every alternative is wrapped in \b...\b so
# it only matches whole words. The patterns are all lower-case; presumably the
# transformer lower-cases the amenity text before matching (TODO confirm).
internet_pattern: str = r"\b(wifi|internet|ethernet|fibra|connection)\b"
self_checkin_pattern: str = r"\b(self checkin|self check-in|self-checkin)\b"
host_greeting_pattern: str = r"\b(host greeting|host greets you)\b"
pool_pattern: str = r"\b(pool|pool view|shared pool)\b"
oven_pattern: str = r"\b(oven)\b"
microwave_pattern: str = r"\b(microwave|microonde)\b"
garden_pattern: str = r"\b(garden|park|backyard)\b"
# "disney" (without "+"): an unescaped "+" is a regex quantifier, and a literal
# "+" could not be followed by \b anyway. "disney" still matches "disney+".
streaming_pattern: str = r"\b(netflix|amazon|disney|chromecast|apple tv|hbo|hbo max)\b"
gym_pattern: str = r"\b(exercise|gym|fitness|private gym in building|shared gym|gym nearby|workout bench)\b"
elevator_pattern: str = r"\b(elevator)\b"
heating_pattern: str = r"\b(heating)\b"
ac_pattern: str = r"\b(central air conditioning|ac|air conditioning)\b"
# NOTE(review): safe_pattern is defined but not part of set_amenities_remapper
# below, so it is currently unused in this cell.
safe_pattern: str = r"\b(safe|locker|lock|security|guard)\b"
workspace_pattern: str = r"\b(workspace|work)\b"
freezer_pattern: str = r"\b(freezer|refrigerator)\b"
aid_pattern: str = r"\b(first aid kit|aid)\b"
dishwasher_pattern: str = r"\b(dishwasher)\b"
long_term_stays_pattern: str = r"\b(long term stays)\b"
pets_pattern: str = r"\b(pets allowed)\b"
# The misspelling "bathtube" alone can never match the word "bathtub", so both
# spellings are accepted. The variable name and the "bathtube" label are kept
# because later cells refer to the column "amenities_bathtube".
bathtube_pattern: str = r"\b(bathtub|bathtube)\b"
bbq_grill_pattern: str = r"\b(bbq grill|grill|barbeque|barbeque utensils)\b"
lake_bay_pattern: str = r"\b(lake view|bay view|harbor view|beach view)\b"

# (regex, label) pairs handed to AmenitiesTransformer. The labels reappear
# later in the notebook as column names of the form "amenities_<label>".
set_amenities_remapper = [
    (internet_pattern, "internet"),
    (self_checkin_pattern, "self-checkin"),
    (host_greeting_pattern, "host-greeting"),
    (pool_pattern, "pool"),
    (oven_pattern, "oven"),
    (microwave_pattern, "microwave"),
    (garden_pattern, "garden"),
    (streaming_pattern, "streaming"),
    (gym_pattern, "gym"),
    (elevator_pattern, "elevator"),
    (heating_pattern, "heating"),
    (ac_pattern, "air-conditioning"),
    (workspace_pattern, "workspace"),
    (freezer_pattern, "freezer"),
    (aid_pattern, "first-aid-kit"),
    (dishwasher_pattern, "dishwasher"),
    (long_term_stays_pattern, "long-term-stays"),
    (pets_pattern, "pets-allowed"),
    (bathtube_pattern, "bathtube"),
    (bbq_grill_pattern, "bbq-grill"),
    (lake_bay_pattern, "lake-bay-view"),
]

# Property type
entire_property_pattern = r"\b(entire|tiny home)\b"
private_room_pattern = r"\b(private room|room in serviced apartment|room in bed and breakfast|room in hotel|room in resort)\b"
shared_room_pattern = r"\b(shared room|shared)\b"
# NOTE(review): this is the union of the three patterns above, yet it is
# mapped to "other". Presumably PropertyTypeTransformer treats the last entry
# as "matches none of these" - confirm against its implementation.
other_room_pattern = r"\b(entire|tiny home|private room|room in serviced apartment|room in bed and breakfast|room in hotel|room in resort|shared room|shared)\b"

# (regex, label) pairs handed to PropertyTypeTransformer.
set_property_type_remapper = [
    (entire_property_pattern, "entire_property"),
    (private_room_pattern, "private_room"),
    (shared_room_pattern, "shared_room"),
    (other_room_pattern, "other"),
]

# Single-step pipelines, one per column group. They are attached to their
# columns by the ColumnTransformer built later in the notebook.
id_pipeline = Pipeline(
    [("From ID to string", fun_tr_id_to_string)],
    verbose=True,
)
rates_pipeline = Pipeline(
    [("Transform response rate", fun_tr_from_string_to_rate)],
    verbose=True,
)
timestamp_pipeline = Pipeline(
    [("Transform to timestamp", fun_tr_transform_to_datetime)],
    verbose=True,
)
price_pipeline = Pipeline(
    [("Trim price feature", fun_tr_remove_dollar_sign)],
    verbose=True,
)


In [4]:
# Feature engineering applied to the whole dataset (new features derived from
# existing ones). Steps run in the order listed.
# NOTE(review): the Amenities and Property type transformers receive
# `df_listings` at construction time, i.e. the frame as it is before this
# pipeline runs - confirm that is what they expect.
feature_creation_steps = [
    ("Listing Locations", OfflineLocationFinder()),
    ("Host Locations imputer", HostLocationImputer()),
    (
        "Host location",
        GeographicTransformer(column="host_location", locations=host_locations),
    ),
    ("Host verifications", CreateVerificationsTransformer()),
    ("Bathrooms", BathroomsTransformer(remap_baths)),
    (
        "Amenities",
        AmenitiesTransformer(df=df_listings, remapper=set_amenities_remapper),
    ),
    (
        "Property type",
        PropertyTypeTransformer(df=df_listings, remapper=set_property_type_remapper),
    ),
]
feature_creation_pipeline = Pipeline(steps=feature_creation_steps, verbose=True)

print("Executing Feature Creation Pipeline...")
df_listings = feature_creation_pipeline.fit_transform(df_listings)
print("Feature Creation Pipeline completed!")

Executing Feature Creation Pipeline...
[Pipeline] . (step 1 of 7) Processing Listing Locations, total=  36.1s
[Pipeline]  (step 2 of 7) Processing Host Locations imputer, total=   5.0s
[Pipeline] ..... (step 3 of 7) Processing Host location, total=  27.5s
[Pipeline]  (step 4 of 7) Processing Host verifications, total=  14.8s
[Pipeline] ......... (step 5 of 7) Processing Bathrooms, total=   1.6s
[Pipeline] ......... (step 6 of 7) Processing Amenities, total= 5.1min
[Pipeline] ..... (step 7 of 7) Processing Property type, total=   0.2s
Feature Creation Pipeline completed!


In [5]:
print("Executing preprocessing on features...")

# (name, pipeline, columns) triples; every column not listed here is passed
# through unchanged because of remainder="passthrough".
column_wise_transformers = [
    ("Id", id_pipeline, id_feature),
    ("Rates", rates_pipeline, rate_feature),
    ("Price", price_pipeline, price_feature),
    ("Timestamp", timestamp_pipeline, time_feature),
]
feature_preprocessor = ColumnTransformer(
    transformers=column_wise_transformers,
    remainder="passthrough",
    force_int_remainder_cols=False,
    n_jobs=-1,
    verbose=True,
)

cleaned_df = feature_preprocessor.fit_transform(df_listings)
print("Preprocessing on features completed!")

# Replace the column names produced by the ColumnTransformer with the ones
# returned by the project helper.
cleaned_df.columns = return_cleaned_col_names(cleaned_df.columns)
print("Cleaned feature names retrieved")

# Only cleaned_df is used from here on.
del df_listings

Executing preprocessing on features...
Preprocessing on features completed!
Cleaned feature names retrieved


In [6]:
## Description preprocessing
#print("Preprocessing listings descriptions")
#cleaned_df["description"] = cleaned_df["description"].parallel_apply(preprocess_text)
#print("Preprocessing listings descriptions ended")

In [7]:
#print("Descriptions word count computation")
#cleaned_df['description_word_count'] = cleaned_df['description'].parallel_apply(lambda x: len(x.split()))
#print("Descriptions word count computation ended")

In [8]:
#print("Description sentiment and polarity computation")
#cleaned_df['description_sentiment_polarity'] = cleaned_df['description'].parallel_apply(lambda x: TextBlob(x).sentiment.polarity)
#cleaned_df['description_sentiment_subjectivity'] = cleaned_df['description'].parallel_apply(lambda x: TextBlob(x).sentiment.subjectivity)
#print("Description sentiment and polarity computation ended")

In [9]:
#n_features_vec = 100
#print(f"Startup TF-IDF vectorizer with {n_features_vec} features")
#tfidf = TfidfVectorizer(max_features=n_features_vec,
#                        max_df=0.95,
#                        min_df=0.05,
#                        use_idf=True,
#                        )
#print("Creating the TF-IDF vectorized dataset for description feature")
#tfidf_matrix = tfidf.fit_transform(cleaned_df['description'])
#print("Creating the TF-IDF vectorized dataset for description feature ended")

In [10]:
#print("Converting TF-IDF matrix to DataFrame, then concatenating with original DataFrame")
#tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

In [11]:
#cleaned_df = pd.concat([cleaned_df, tfidf_df], axis=1)
#print("Description preprocessing ended")

### Tourism data

In [12]:
# Tourism statistics: sheet "Data", with the first spreadsheet column used as
# the index.
tourism_city_data = pd.read_excel(
    "data/city_data/urb_ctour_page_spreadsheet.xlsx",
    sheet_name="Data",
    index_col=0,
)

# Swap the spaces in the column names for underscores.
original_tourism_columns = tourism_city_data.columns.tolist()
col_names_tourism_data = [
    name.replace(" ", "_") for name in original_tourism_columns
]
rename_tourism_columns = dict(zip(original_tourism_columns, col_names_tourism_data))

tourism_city_data.rename(columns=rename_tourism_columns, inplace=True)

In [13]:
# Italian city name -> short city code. The codes are looked up against
# cleaned_df["df_city_location"] in the cells below (presumably the same
# two-letter codes - verify against the listings data).
rename_cities = dict(
    Venezia="ve",
    Milano="mi",
    Bergamo="bg",
    Roma="rm",
    Firenze="fi",
    Bologna="bo",
    Napoli="na",
)

In [14]:
# Re-label the index (city names) with the short city codes.
tourism_city_data = tourism_city_data.rename(index=rename_cities)

In [15]:
# Copy every tourism statistic onto the listings: each listing receives the
# value of its city, or None when the city is not in the tourism table.
listing_city_codes = cleaned_df["df_city_location"]
for tourism_col in tourism_city_data.columns:
    cleaned_df[tourism_col] = listing_city_codes.parallel_apply(
        lambda city: (
            tourism_city_data.loc[city, tourism_col]
            if city in tourism_city_data.index
            else None
        )
    )

In [16]:
# The tourism columns were copied onto cleaned_df above; drop the lookup table.
del tourism_city_data

### Data about city welfare

#### Residents satisfied with living in their city

In [17]:
# Sheet "Tav. 1" of the QOL workbook: skip the two rows above the header and
# keep only the columns at positions 1 to 3.
happiness_citizens_to_live_in_city = pd.read_excel(
    "data/city_data/Tavole-di-dati-QOL_23052024.xlsx",
    sheet_name="Tav. 1",
    skiprows=2,
).iloc[:, 1:4]

# Underscores instead of spaces in the column names.
happiness_citizens_to_live_in_city.columns = (
    happiness_citizens_to_live_in_city.columns.str.replace(" ", "_")
)


In [18]:
# Replace city names with their short codes, then index the table by city.
happiness_citizens_to_live_in_city = (
    happiness_citizens_to_live_in_city
    .replace(rename_cities)
    .set_index("Città")
)

In [19]:
# Give each listing the satisfaction figure of its city (None when the city is
# not in the table).
cleaned_df["people_happy_of_the_city"] = cleaned_df["df_city_location"].parallel_apply(
    lambda city: (
        happiness_citizens_to_live_in_city.loc[
            city, "Persone_soddisfatte_di_vivere_nella_propria_città"
        ]
        if city in happiness_citizens_to_live_in_city.index
        else None
    )
)

In [20]:
# The value was copied onto cleaned_df above; drop the lookup table.
del happiness_citizens_to_live_in_city

#### Perception of efficiency of public services

In [21]:
# Sheet "Tav. 3" of the QOL workbook (two rows skipped above the header).
public_services_perception = pd.read_excel(
    "data/city_data/Tavole-di-dati-QOL_23052024.xlsx",
    sheet_name="Tav. 3",
    skiprows=2,
)
public_services_perception.columns = (
    public_services_perception.columns.str.replace(" ", "_")
)

# Drop the first and the last column, switch city names to their short codes
# and index the table by city.
public_services_perception = (
    public_services_perception
    .iloc[:, 1:-1]
    .replace(rename_cities)
    .set_index("Città")
)

In [22]:
# Copy every column except the first onto the listings, looked up by city code
# (None when the city is not in the table).
for service_col in public_services_perception.columns[1:]:
    cleaned_df[service_col] = cleaned_df["df_city_location"].parallel_apply(
        lambda city: (
            public_services_perception.loc[city, service_col]
            if city in public_services_perception.index
            else None
        )
    )

In [23]:
# The columns were copied onto cleaned_df above; drop the lookup table.
del public_services_perception

#### Satisfaction with air quality and noise level

In [24]:
# Sheet "Tav. 4" of the QOL workbook (two rows skipped above the header).
air_sound_quality = pd.read_excel(
    "data/city_data/Tavole-di-dati-QOL_23052024.xlsx",
    sheet_name="Tav. 4",
    skiprows=2,
)
air_sound_quality.columns = air_sound_quality.columns.str.replace(" ", "_")

# Short city codes instead of names, then index by city.
air_sound_quality = air_sound_quality.replace(rename_cities).set_index("Città")

In [25]:
# Copy every column except the first onto the listings, looked up by city code
# (None when the city is not in the table).
for air_sound_col in air_sound_quality.columns[1:]:
    cleaned_df[air_sound_col] = cleaned_df["df_city_location"].parallel_apply(
        lambda city: (
            air_sound_quality.loc[city, air_sound_col]
            if city in air_sound_quality.index
            else None
        )
    )

In [26]:
# The columns were copied onto cleaned_df above; drop the lookup table.
del air_sound_quality

#### Means of transport

In [27]:
# Sheet "Tav. 5" of the QOL workbook (two rows skipped above the header).
transport_rails = pd.read_excel(
    "data/city_data/Tavole-di-dati-QOL_23052024.xlsx",
    sheet_name="Tav. 5",
    skiprows=2,
)
transport_rails.columns = transport_rails.columns.str.replace(" ", "_")

# Short city codes instead of names, index by city, and shorten the long
# public-transport column name.
transport_rails = (
    transport_rails
    .replace(rename_cities)
    .set_index("Città")
    .rename(
        columns={
            "Trasporto_pubblico_urbano_(autobus,_tram_o_metropolitana)": "Trasporto_pubblico"
        }
    )
)


In [28]:
# Copy every column except the first onto the listings, looked up by city code
# (None when the city is not in the table).
for transport_col in transport_rails.columns[1:]:
    cleaned_df[transport_col] = cleaned_df["df_city_location"].parallel_apply(
        lambda city: (
            transport_rails.loc[city, transport_col]
            if city in transport_rails.index
            else None
        )
    )

In [29]:
# The columns were copied onto cleaned_df above; drop the lookup table.
del transport_rails

#### City is a good place to live for minorities

In [30]:
# Sheet "Tav. 11" of the QOL workbook (two rows skipped above the header).
minorities_fitness = pd.read_excel(
    "data/city_data/Tavole-di-dati-QOL_23052024.xlsx",
    sheet_name="Tav. 11",
    skiprows=2,
)
minorities_fitness.columns = minorities_fitness.columns.str.replace(" ", "_")

# Short city codes instead of names, then index by city.
minorities_fitness = minorities_fitness.replace(rename_cities).set_index("Città")

In [31]:
# Copy every column except the first onto the listings, looked up by city code
# (None when the city is not in the table).
for minorities_col in minorities_fitness.columns[1:]:
    cleaned_df[minorities_col] = cleaned_df["df_city_location"].parallel_apply(
        lambda city: (
            minorities_fitness.loc[city, minorities_col]
            if city in minorities_fitness.index
            else None
        )
    )

In [32]:
# The columns were copied onto cleaned_df above; drop the lookup table.
del minorities_fitness

#### Night safety perception

In [33]:
# Sheet "Tav. 12" of the QOL workbook (two rows skipped above the header).
night_safety = pd.read_excel(
    "data/city_data/Tavole-di-dati-QOL_23052024.xlsx",
    sheet_name="Tav. 12",
    skiprows=2,
)
night_safety.columns = night_safety.columns.str.replace(" ", "_")

# Short city codes instead of names, index by city, then keep only the columns
# at positions 1 and 2 of the indexed table.
night_safety = (
    night_safety
    .replace(rename_cities)
    .set_index("Città")
    .iloc[:, 1:3]
)

In [34]:
# night_safety was trimmed to at most two columns when it was loaded, so
# skipping the first one leaves at most a single column to copy onto the
# listings (None when the city is not in the table).
for safety_col in night_safety.columns[1:]:
    cleaned_df[safety_col] = cleaned_df["df_city_location"].parallel_apply(
        lambda city: (
            night_safety.loc[city, safety_col]
            if city in night_safety.index
            else None
        )
    )

In [35]:
# The column was copied onto cleaned_df above; drop the lookup table.
del night_safety

In [36]:
from sklearn.pipeline import Pipeline
from src.class_transformers import ColumnDropperTransformer, IntoBinaryTransformer, CoordinatesTransformer
from sklearn import set_config

# Same settings as in the first cell of the notebook, applied again here:
# pandas output from scikit-learn transformers, floats shown without decimals.
set_config(transform_output="pandas")
pd.set_option("display.float_format", "{:.0f}".format)


In [37]:
# Work on a copy so cleaned_df itself stays untouched by this stage.
df = cleaned_df.copy()

# Columns to drop because of high correlation
to_drop_corr = [
    "host_acceptance_rate",
    "host_total_listings_count",
    "bathrooms",
    "bedrooms",
    "beds",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    "amenities",
]

# Columns dropped because their values are heavily unbalanced.
widely_unbalanced_features = [
    "host_has_profile_pic",
    "host_identity_verified",
    "email_verification",
    "phone_verification",
    "work_email_verification",
]

# One row per IntoBinaryTransformer step, in execution order:
# (step name, feature, cat1, cond, cat2).
# NOTE(review): presumably rows satisfying `cond` become `cat1` and all other
# rows `cat2` - confirm against IntoBinaryTransformer, in particular how
# missing values are categorised.
binary_recoding_rules = [
    ("Transform Response Rate", "host_response_rate", "100", "x==100", "lower"),
    ("Transform Minimum Nights", "minimum_nights", "1", "x<=1", "more_than_1"),
    (
        "Transform Maximum Nights",
        "maximum_nights",
        "less_than_100",
        "x<=100",
        "more_than_100",
    ),
    (
        "Transform City Population",
        "listing_city_pop",
        "less_than_300k",
        "x<=300000",
        "more_than_300k",
    ),
    (
        "Transform Review Score Rating",
        "review_scores_rating",
        "less_than_4.8",
        "x<4.8",
        "more_than_4.8",
    ),
    (
        "Transform Host Response Time",
        "host_response_time",
        "within_an_hour",
        "x=='within an hour'",
        "more_than_one_hour",
    ),
    (
        "Transform Property Type",
        "property_type",
        "entire_property",
        "x=='entire_property'",
        "other",
    ),
    ("Transform Room Type", "room_type", "entire_home", "x=='Entire home/apt'", "other"),
    ("Transform Bathrooms Text", "bathrooms_text", "single", "x=='single'", "other"),
]

# Assemble the steps: two column drops, the binary recodings, then the
# coordinate transformation.
exploration_steps = [
    ("Drop columns", ColumnDropperTransformer(columns=to_drop_corr)),
    (
        "Drop unbalanced columns",
        ColumnDropperTransformer(columns=widely_unbalanced_features),
    ),
]
exploration_steps += [
    (
        step_name,
        IntoBinaryTransformer(feature=feature, cat1=cat1, cond=cond, cat2=cat2),
    )
    for step_name, feature, cat1, cond, cat2 in binary_recoding_rules
]
exploration_steps.append(("Coordinates to spatial", CoordinatesTransformer()))

eng_after_exploration_pipeline = Pipeline(steps=exploration_steps, verbose=True)

df = eng_after_exploration_pipeline.fit_transform(df)

[Pipeline] ..... (step 1 of 12) Processing Drop columns, total=   0.0s
[Pipeline]  (step 2 of 12) Processing Drop unbalanced columns, total=   0.0s
[Pipeline]  (step 3 of 12) Processing Transform Response Rate, total=   1.1s
[Pipeline]  (step 4 of 12) Processing Transform Minimum Nights, total=   1.1s
[Pipeline]  (step 5 of 12) Processing Transform Maximum Nights, total=   1.1s
[Pipeline]  (step 6 of 12) Processing Transform City Population, total=   1.1s
[Pipeline]  (step 7 of 12) Processing Transform Review Score Rating, total=   1.2s
[Pipeline]  (step 8 of 12) Processing Transform Host Response Time, total=   1.2s
[Pipeline]  (step 9 of 12) Processing Transform Property Type, total=   1.2s
[Pipeline]  (step 10 of 12) Processing Transform Room Type, total=   1.2s
[Pipeline]  (step 11 of 12) Processing Transform Bathrooms Text, total=   1.2s
[Pipeline]  (step 12 of 12) Processing Coordinates to spatial, total=   0.1s


## ML pipeline

In [38]:
import pandas as pd
import numpy as np
from feature_engine.datetime import DatetimeSubtraction
from feature_engine.creation import RelativeFeatures
from feature_engine.encoding import OneHotEncoder, CountFrequencyEncoder, OrdinalEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures, PowerTransformer, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import QuantileRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
import sys
from sklearn.neural_network import MLPRegressor


In [39]:
# Grid handed to GridSearchCV. The "regressor__" prefix addresses the
# `regressor` parameter of the TransformedTargetRegressor being searched,
# i.e. the RandomForestRegressor. Every entry currently has a single value.
param_grid = dict(
    regressor__n_estimators=[400],  # , 400, 800],
    regressor__criterion=["absolute_error"],
    regressor__random_state=[874631],
    regressor__verbose=[True],
    # regressor__min_impurity_decrease=[0, 0.1, 0.2, 0.3],
    regressor__bootstrap=[True],
    regressor__oob_score=[True],
)

In [40]:

# Review date columns.
review_dates_feature = ["first_review", "last_review"]

# --- Columns imputed with the most frequent value and fully one-hot encoded --
listing_categoricals = [
    "df_city_location",
    "host_is_superhost",
    "host_response_time",
    "property_type",
    "room_type",
    "bathrooms_text",
    "host_response_rate",
    "minimum_nights",
    "maximum_nights",
    "listing_city_pop",
    "review_scores_rating",
]
amenity_labels = (
    "internet",
    "self-checkin",
    "host-greeting",
    "pool",
    "oven",
    "microwave",
    "garden",
    "streaming",
    "gym",
    "elevator",
    "heating",
    "air-conditioning",
    "workspace",
    "freezer",
    "first-aid-kit",
    "dishwasher",
    "long-term-stays",
    "pets-allowed",
    "bathtube",
    "bbq-grill",
    "lake-bay-view",
)
ohe_feature = listing_categoricals + [f"amenities_{label}" for label in amenity_labels]

# --- Columns one-hot encoded on their most frequent categories only ---------
ohe_most_frequent = ["listing_city", "neighbourhood_cleansed"]

host_id_feature = ["host_id"]

host_since_feature = ["host_since"]

# --- Numerical columns, grouped by where they come from ---------------------
listing_numericals = [
    "host_listings_count",
    "host_location",
    "number_of_reviews",
    "reviews_per_month",
    "accommodates",
]
tourism_numericals = [
    "Total_nights_spent_in_tourist_accommodation_establishments",
    "Nights_spent_in_tourist_accommodation_establishments_by_residents",
    "Nights_spent_in_tourist_accommodation_establishments_by_non-residents",
    "Total_nights_spent_in_tourist_accommodation_establishments_per_resident_population",
]
satisfaction_numericals = [
    "people_happy_of_the_city",
    "Indice_sintetico_sulla_percezione_dell’efficienza_dei_servizi_pubblici_della_città",
    "Persone_soddisfatte_dei_trasporti_pubblici",
    "Persone_soddisfatte_degli_spazi_verdi",
    "Persone_soddisfatte_delle_infrastrutture_sportive",
    "Persone_soddisfatte_delle_infrastrutture_culturali",
    "Persone_soddisfatte_delle_scuole_e_degli_altri_servizi_di_formazione",
    "Persone_soddisfatte_di_servizi_sanitari,_medici_e_ospedali",
    "Persone_soddisfatte_degli_spazi_pubblici",
    "Persone_soddisfatte_della_qualità_dell'aria",
    "Persone_soddisfatte_del_livello_di_rumore",
]
transport_numericals = [
    "Automobile",
    "Motocicletta",
    "Bicicletta",
    "A_piedi",
    "Treno",
    "Trasporto_pubblico",
]
liveability_numericals = [
    "La_città_è_un_buon_posto_in_cui_vivere_per_le_persone_in_generale",
    "La_città_è_un_buon_posto_in_cui_vivere_per_le_minoranze_etniche",
    "La_città_è_un_buon_posto_in_cui_vivere_per_le_persone_omosessuali",
    "La_città_è_un_buon_posto_in_cui_vivere_per_gli_immigrati_provenienti_da_altri_paesi",
    "La_città_è_un_buon_posto_in_cui_vivere_per_le_famiglie_con_bambini_piccoli",
    "La_città_è_un_buon_posto_in_cui_vivere_per_le_persone_anziane",
    "Persone_che_si_sentono_sicure_a_camminare_da_sole_di_notte_nella_propria_città",
]
numerical_feature = (
    listing_numericals
    + tourism_numericals
    + satisfaction_numericals
    + transport_numericals
    + liveability_numericals
)  # + description_features

coordinates_feature = ["x_coord", "y_coord", "z_coord"]

# Keep only the listings whose target (price) is known.
has_price = df["price"].notnull()
df = df[has_price]

# Features / target, then a 70/30 split with a fixed seed.
X = df.drop(columns="price")
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=874631,
)

# Drop the rows from the train set with outliers in price
#mask = y_train <= 10000
#X_train: np.array = X_train[mask]
#y_train: np.array = y_train[mask]

# End-to-end modelling pipeline: date engineering, imputation, encoding and
# scaling steps, followed by a grid-searched random forest whose target is
# transformed by TransformedTargetRegressor. Steps run in the order listed.
# Because the final step is a GridSearchCV, the pipeline offers fit/predict but
# not fit_transform (see the AttributeError recorded further down).
wizard_pipe = Pipeline(
    steps=[
        # Runs first; the "scraping_date" column used by HS_engineering below
        # is presumably added here (TODO confirm against ScrapingDateTransformer).
        (
            "ScrapingDate-add",
            ScrapingDateTransformer()
        ),
        # Review Dates (RD)
        # days_active_reviews = last_review - first_review, in days; missing
        # results are then filled with the median.
        (
            "RD_engineering",
            DatetimeSubtraction(
                variables="last_review",
                reference="first_review",
                output_unit="D",
                drop_original=False,
                new_variables_names=["days_active_reviews"],
                missing_values="ignore",
            ),
        ),
        (
            "RD_imputation",
            MeanMedianImputer(
                imputation_method="median", variables=["days_active_reviews"]
            ),
        ),
        # ========================
        # One-hot-encoding (OHE)
        # Most-frequent imputation followed by a full one-hot encoding of the
        # columns in ohe_feature.
        (
            "OHE_imputation",
            CategoricalImputer(
                imputation_method="frequent",
                variables=ohe_feature,
                return_object=True,
                ignore_format=False,
            ),
        ),
        (
            "OHE_encoding",
            OneHotEncoder(
                top_categories=None,
                drop_last=True,
                drop_last_binary=True,
                ignore_format=False,
                variables=ohe_feature,
            ),
        ),
        # ========================
        # One-hot-encoding Top Frequent (OHETF)
        # Same idea for ohe_most_frequent, but with top_categories=30.
        (
            "OHETF_imputation",
            CategoricalImputer(
                imputation_method="frequent",
                variables=ohe_most_frequent,
                return_object=True,
                ignore_format=False,
            ),
        ),
        (
            "OHETF_encoding",
            OneHotEncoder(
                top_categories=30,
                drop_last=True,
                drop_last_binary=True,
                ignore_format=False,
                variables=ohe_most_frequent,
            ),
        ),
        # =======================
        # Host ID (HID)
        # Missing host ids are filled with "MISSING", then count-encoded.
        (
            "HID_imputation",
            CategoricalImputer(
                imputation_method="missing",
                variables=host_id_feature,
                fill_value="MISSING",
            ),
        ),
        # NOTE(review): unlike HID_imputation, no `variables` argument is passed
        # here, so the encoder selects its variables on its own - confirm that
        # host_id is the only column it should encode at this point.
        (
            "HID_encoding",
            CountFrequencyEncoder(
                encoding_method="count", missing_values="ignore", unseen="encode"
            ),
        ),
        # =========================
        # Host since (HS)
        # host_since_days = scraping_date - host_since, in days; missing
        # results are then filled with the median.
        (
            "HS_engineering",
            DatetimeSubtraction(
                variables=["scraping_date"],
                reference=["host_since"],
                output_unit="D",
                drop_original=False,
                new_variables_names=["host_since_days"],
                missing_values="ignore",
            ),
        ),
        (
            "HS_imputation",
            MeanMedianImputer(
                imputation_method="median", variables=["host_since_days"]
            ),
        ),
        # ==========================
        # Numerical features (NF)
        # KNN imputation (5 neighbours, uniform weights) restricted to the
        # columns in numerical_feature.
        (
            "NF_imputation",
            SklearnTransformerWrapper(
                transformer=KNNImputer(n_neighbors=5, weights="uniform"),
                variables=numerical_feature,
            ),
        ),
        # ============================
        # Coordinates numerical (COO)
        (
            "COO_imputation",
            MeanMedianImputer(
                imputation_method="median", variables=coordinates_feature
            ),
        ),
        # ========================
        # Drop features not needed
        # ========================
        # The date columns have served their purpose in the subtraction steps
        # above and are removed before scaling and modelling.
        (
            "ColumnDropperTransformer",
            ColumnDropperTransformer(
                columns=[
                    "last_review",
                    "first_review",
                    "scraping_date",
                    "host_since",
                ]
            )
        ),
        # =======================
        # Scaling
        # ======================================================================
        # (disabled alternative: Yeo-Johnson power transform of the features)
        #(
        #    "PowerTransformer",
        #    SklearnTransformerWrapper(
        #        transformer=PowerTransformer(
        #            method="yeo-johnson",
        #            standardize=True,
        #            copy=False
        #        ),
        #        variables=[
        #                      "days_active_reviews",
        #                      "host_since_days",
        #                  ]
        #                  + numerical_feature
        #    )
        #),
        # Min-max scaling of the coordinate, numerical and engineered day-count
        # columns.
        ( 
            "MinMaxScaling",
            SklearnTransformerWrapper(
                transformer=MinMaxScaler(),
                variables=coordinates_feature + numerical_feature + ["days_active_reviews", "host_since_days"]
            ),
        ),
        # ============
        # Prediction
        # ============
        # (disabled alternative: a single random forest without grid search)
        #(
        #    "TransformedTarget-RandomForestRegressor",
        #    TransformedTargetRegressor(regressor=RandomForestRegressor(
        #        n_estimators=100,
        #        criterion="squared_error",
        #        bootstrap=True,
        #        max_samples=0.7,
        #        oob_score=True,
        #        n_jobs=-1,
        #        random_state=874631,
        #    ),
        #        transformer=PowerTransformer(
        #            method="yeo-johnson",
        #            standardize=True,
        #            copy=False
        #        ),
        #    )
        #),
        # Final estimator: 5-fold grid search over param_grid. The random
        # forest is wrapped in a TransformedTargetRegressor with a Yeo-Johnson
        # PowerTransformer as the target transformer. Three metrics are scored
        # and refit="r2" selects the candidate to refit.
        (
            "RFR",
            GridSearchCV(
                estimator=TransformedTargetRegressor(
                    regressor=RandomForestRegressor(),
                    transformer=PowerTransformer(
                        method="yeo-johnson",
                        standardize=True,
                        copy=False,
                    )
                ),
                param_grid=param_grid,
                cv=5,
                n_jobs=-1,
                verbose=True,
                scoring=["neg_mean_absolute_error", "neg_mean_squared_error", "r2"],
                refit="r2",
                error_score="raise",
            ),
        ),
        # (disabled alternative: multi-layer perceptron regressor)
        #(
        #    "MLPRegressor",
        #    MLPRegressor(hidden_layer_sizes=(100, 100, 100),
        #                 activation="relu",
        #                 solver="adam",
        #                 alpha=0.01,
        #                 batch_size=500,
        #                 learning_rate="constant",
        #                 learning_rate_init=0.001,
        #                 max_iter=200,
        #                 shuffle=True,
        #                 random_state=874631,
        #                 tol=1e-4,
        #                 verbose=True,
        #                 warm_start=True,
        #                 momentum=0.9,
        #                 early_stopping=False,
        #                 validation_fraction=0.1,
        #                 n_iter_no_change=20
        #    )
        #),
    ],
    verbose=True,
)


In [41]:
# Scratch copy of the training features, kept for the commented-out
# `fit_transform` experiment in the next cell.
test = X_train.copy()

In [42]:
#test = wizard_pipe.fit_transform(test)

AttributeError: This 'Pipeline' has no attribute 'fit_transform'

In [None]:
#test.head(100)

In [None]:
# Fit the whole pipeline, grid search included, on the training split.
# `Pipeline.fit` returns the pipeline itself, so `fitting_model` and
# `wizard_pipe` refer to the same fitted object.
fitting_model = wizard_pipe.fit(X_train, y_train)

[Pipeline] . (step 1 of 16) Processing ScrapingDate-add, total=   0.1s
[Pipeline] ... (step 2 of 16) Processing RD_engineering, total=   0.2s
[Pipeline] .... (step 3 of 16) Processing RD_imputation, total=   0.1s
[Pipeline] ... (step 4 of 16) Processing OHE_imputation, total=   0.6s
[Pipeline] ..... (step 5 of 16) Processing OHE_encoding, total=   2.5s
[Pipeline] . (step 6 of 16) Processing OHETF_imputation, total=   0.1s
[Pipeline] ... (step 7 of 16) Processing OHETF_encoding, total=   2.3s
[Pipeline] ... (step 8 of 16) Processing HID_imputation, total=   0.2s
[Pipeline] ..... (step 9 of 16) Processing HID_encoding, total=   0.2s
[Pipeline] .. (step 10 of 16) Processing HS_engineering, total=   0.2s
[Pipeline] ... (step 11 of 16) Processing HS_imputation, total=   0.1s
[Pipeline] ... (step 12 of 16) Processing NF_imputation, total=  27.9s
[Pipeline] .. (step 13 of 16) Processing COO_imputation, total=   0.2s
[Pipeline]  (step 14 of 16) Processing ColumnDropperTransformer, total=   0.0



In [None]:
# Predict on the held-out split and report the regression metrics.
pred = wizard_pipe.predict(X_test)

# (label, metric function) pairs, printed in this order. R^2 is a score
# (higher is better), so it is labelled as such rather than as an error.
test_metrics = [
    ("Explained variance score", explained_variance_score),
    ("Mean Absolute Error", mean_absolute_error),
    ("Mean Squared Error", mean_squared_error),
    ("R^2 score", r2_score),
]
print(
    *(
        f"\n{label} is {metric(y_true=y_test, y_pred=pred)}"
        for label, metric in test_metrics
    )
)


In [None]:
#plot_loss = pd.DataFrame(wizard_pipe["MLPRegressor"].loss_curve_, columns=["loss"])

In [None]:
#fig = px.line(plot_loss, x = plot_loss.index, y = "loss")
#fig.show()