In [1]:
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.utils import estimator_html_repr

from scripts.custom.tools import JsonHandler
from scripts.custom.viz.class_transformers import BathroomsTransformer
from scripts.custom.viz.class_transformers import CreateStrategicLocationTransformer
from scripts.custom.viz.class_transformers import CreateVerificationsTransformer
from scripts.custom.viz.class_transformers import GeographicTransformer
from scripts.custom.viz.class_transformers import NeighborhoodMapper
from scripts.custom.viz.class_transformers import VectorToDataFrame
from scripts.custom.viz.function_transformers import fun_tr_from_string_to_rate
from scripts.custom.viz.function_transformers import fun_tr_id_to_string
from scripts.custom.viz.function_transformers import fun_tr_remove_dollar_sign
from scripts.custom.viz.function_transformers import fun_tr_transform_nan_unicode
from scripts.custom.viz.function_transformers import fun_tr_transform_to_datetime
# Make scikit-learn transformers return pandas objects from transform().
set_config(transform_output="pandas")

# Render floats without decimals when pandas displays them.
pd.set_option("display.float_format", "{:.0f}".format)

# Helper used by the next cell to read the JSON mapping files.
handler = JsonHandler()

In [2]:
# Load the JSON mapping files consumed by the transformers further down.
# The paths are read in the same order as the names on the left-hand side.
host_locations, strategic_locations, neighbourhood_levels, remap_baths = (
    handler.import_from_json(mapping_path)
    for mapping_path in (
        "data/mappings/host_locations.json",
        "data/mappings/strategic_locations.json",
        "data/mappings/neighbourhoods_levels.json",
        "data/mappings/baths.json",
    )
)


In [3]:
# Columns removed immediately after loading the raw listings file.
columns_to_drop = [
    "listing_url",
    "name",
    "scrape_id",
    "last_scraped",
    "source",
    "description",
    "picture_url",
    "host_url",
    "host_name",
    "host_thumbnail_url",
    "host_picture_url",
    "host_neighbourhood",
    "neighbourhood",
    "neighbourhood_group_cleansed",
    "property_type",
    "amenities",
    "minimum_minimum_nights",
    "maximum_minimum_nights",
    "minimum_maximum_nights",
    "maximum_maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm",
    "has_availability",
    "availability_30",
    "availability_60",
    "availability_90",
    "availability_365",
    "calendar_updated",
    "calendar_last_scraped",
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "license",
    "instant_bookable",
]

# Read and trim in one expression so the cell yields the same frame on every run.
df_listings = pd.read_csv("data/data_dic/d_listings.csv").drop(columns=columns_to_drop)

**The following chunk is probably needed after this visualization pipeline**

In [None]:
## Drop rows with NaN in target 
#df_listings = df_listings.loc[df_listings['price'].notnull(), :]
#df_listings.price.isnull().sum()

#X = df_listings.drop(["price"], axis=1, inplace=False)
#y = df_listings["price"]

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=874631)


In [None]:
#def drop_features_with_many_nan(x: pd.DataFrame) -> pd.DataFrame:
#    nulls_summary = pd.DataFrame(df_listings.isnull().sum())
#    more_than_null_features = nulls_summary.loc[nulls_summary.iloc[:, 0] > df_listings.shape[0]*0.5, :].index.tolist()
#    return x.drop(more_than_null_features, axis=1)
#
#fun_tr_drop_features_with_many_nan = FunctionTransformer(drop_features_with_many_nan)

## Define groups for data transformation

### Geographical Features

***Handled directly in `feature_creation_pipeline` (see "Aggregate visualization dataset" below).***


### String features

In [4]:
# Free-text columns targeted by the TF-IDF encoding.
string_features = ["neighborhood_overview", "host_about"]

# `fun_tr_transform_nan_unicode` is provided by
# scripts.custom.viz.function_transformers. Judging by the inline draft it
# replaced, it fills NaN with "" and casts to unicode -- verify in the module.
#
# NOTE(review): this pipeline is not wired into `feature_preprocessor` (its
# entry there is commented out). TfidfVectorizer consumes one iterable of
# documents, so passing both `string_features` columns at once probably needs
# one vectorizer per column -- confirm before enabling it.
tfidf_vectorizer = TfidfVectorizer(
    encoding="utf-8",
    decode_error="ignore",
    strip_accents="unicode",
    lowercase=True,
    analyzer="word",
    max_df=0.8,        # ignore terms present in more than 80% of the documents
    use_idf=True,
    smooth_idf=True,
    max_features=30,   # keep only the 30 most frequent terms
)

text_encoding_pipeline = Pipeline(steps=[
    ("text preprocessing", fun_tr_transform_nan_unicode),
    ("tf-idf vectorizer", tfidf_vectorizer),
    ("Vectors into dataframe", VectorToDataFrame()),
])


### ID features

In [5]:
# Identifier columns handled by `id_pipeline`.
id_feature = ["id", "host_id"]

# `fun_tr_id_to_string` is provided by scripts.custom.viz.function_transformers.
# Judging by the inline draft it replaced, it casts the columns with
# `.astype(str)` -- verify in the module.
id_pipeline = Pipeline(steps=[
    ("From ID to string", fun_tr_id_to_string),
])


### Rates features

In [6]:
# Host rate columns handled by `rates_pipeline`.
rate_feature = ["host_response_rate", "host_acceptance_rate"]

# `fun_tr_from_string_to_rate` is provided by
# scripts.custom.viz.function_transformers. Judging by the inline draft it
# replaced, it strips the trailing "%" and casts each column to float --
# verify in the module.
rates_pipeline = Pipeline(steps=[
    ("Transform response rate", fun_tr_from_string_to_rate),
])

### Time features

In [7]:
# Date columns handled by `timestamp_pipeline`.
time_feature = ["host_since", "first_review", "last_review"]

# `fun_tr_transform_to_datetime` is provided by
# scripts.custom.viz.function_transformers. Judging by the inline draft it
# replaced, it applies `pd.to_datetime` to the selected columns -- verify in
# the module.
timestamp_pipeline = Pipeline(steps=[
    ("Transform to timestamp", fun_tr_transform_to_datetime),
])

 ## Categorical features
 
### Neighbourhoods features

In [8]:
# Column handled by `neighbourhood_pipeline`.
neighbourhood_feature = ["neighbourhood_cleansed"]

# The mapper is configured with the levels loaded from neighbourhoods_levels.json.
neighbourhood_mapper = NeighborhoodMapper(mapping=neighbourhood_levels)

neighbourhood_pipeline = Pipeline(steps=[("Neighbourhood Mapper", neighbourhood_mapper)])


### Verifications Feature

***Handled by the `Host verifications` step of `feature_creation_pipeline` (see "Aggregate visualization dataset" below).***

### Bathrooms text feature

***Handled by the `Bathrooms` step of `feature_creation_pipeline` (see "Aggregate visualization dataset" below).***

### Price feature

In [9]:
# Column handled by `price_pipeline`.
price_feature = ["price"]

# `fun_tr_remove_dollar_sign` is provided by
# scripts.custom.viz.function_transformers. Judging by the inline draft it
# replaced, it removes "$" and "," from `price` and casts the column to float
# -- verify in the module.
price_pipeline = Pipeline(steps=[
    ("Trim price feature", fun_tr_remove_dollar_sign),
])
    

## Aggregate visualization dataset

In [10]:
# These steps are applied to the whole dataset (feature engineering that uses
# other features), unlike the per-column pipelines above.
feature_creation_steps = [
    ("Strategic locations distance", CreateStrategicLocationTransformer(locations=strategic_locations)),
    ("Host location", GeographicTransformer(column="host_location", locations=host_locations)),
    ("Host verifications", CreateVerificationsTransformer()),
    ("Bathrooms", BathroomsTransformer(remap_baths)),
]
feature_creation_pipeline = Pipeline(steps=feature_creation_steps)

# Rebinds `df_listings`: re-running only this cell feeds the already
# transformed frame back into the pipeline, so re-run from the loading cell.
df_listings = feature_creation_pipeline.fit_transform(df_listings)

In [11]:
# (name, pipeline, columns) triples; the order here is the order passed to
# the ColumnTransformer.
column_transformers = [
    # ("Text encoding", text_encoding_pipeline, string_features),  # currently disabled
    ("Id", id_pipeline, id_feature),
    ("Rates", rates_pipeline, rate_feature),
    ("Neighbourhood", neighbourhood_pipeline, neighbourhood_feature),
    ("Price", price_pipeline, price_feature),
    ("Timestamp", timestamp_pipeline, time_feature),
]

# Columns not named above are passed through unchanged; n_jobs=-1 uses all cores.
feature_preprocessor = ColumnTransformer(
    transformers=column_transformers,
    remainder="passthrough",
    n_jobs=-1,
)

In [12]:
# Fit the preprocessor on the listings, transform them, and persist the
# resulting dataset as a pickle.
listings_viz = feature_preprocessor.fit_transform(df_listings)
pd.to_pickle(listings_viz, "data/pickles/listings_viz_dic.pkl")

In [13]:
# Save the HTML diagram of the fitted preprocessor.
# The encoding is set explicitly: without it `open()` falls back to the
# platform's locale encoding, which can raise UnicodeEncodeError (or produce a
# mis-encoded file) if the generated HTML contains non-ASCII characters.
with open('data/visual/feature_preprocessor.html', 'w', encoding='utf-8') as f:
    f.write(estimator_html_repr(feature_preprocessor))


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



# Pipeline for visualization ENDED

## Numerical features

In [None]:
# Numerical columns of the listings dataset, one per line.
num_features = [
    "host_listings_count",
    "host_total_listings_count",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "minimum_nights",
    "maximum_nights",
    "number_of_reviews",
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
    "reviews_per_month",
]

### Add and manipulate features