In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from scripts.custom.tools import JsonHandler
from scripts.custom.transformers import GeographicTransformer
from scripts.custom.transformers import CreateStrategicLocationTransformer
from scripts.custom.transformers import VectorToDataFrame
from scripts.custom.transformers import NeighborhoodMapper
from scripts.custom.transformers import BathroomsTransformer


# Notebook-wide setup: show floats without decimals and create the JSON helper
# used to load the mapping files.
pd.set_option("display.float_format", '{:.0f}'.format)
handler = JsonHandler()

In [None]:
# Load the JSON mapping files. The targets on the left are matched by position
# with the paths on the right, and the files are read in that order.
(
    host_locations,
    strategic_locations,
    neighbourhood_levels,
    remap_baths,
) = (
    handler.import_from_json(mapping_path)
    for mapping_path in (
        "data/mappings/host_locations.json",
        "data/mappings/strategic_locations.json",
        "data/mappings/neighbourhoods_levels.json",
        "data/mappings/baths.json",
    )
)


In [None]:
# Columns of the raw listings export that are discarded right after loading.
unused_columns = [
    "listing_url", "name", "scrape_id", "last_scraped", "source", "description", "picture_url", "host_url",
    "host_name", "host_thumbnail_url", "host_picture_url", "host_neighbourhood", "neighbourhood",
    "neighbourhood_group_cleansed", "property_type", "amenities", "minimum_minimum_nights",
    "maximum_minimum_nights", "minimum_maximum_nights", "maximum_maximum_nights", "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm", "has_availability", "availability_30", "availability_60", "availability_90",
    "availability_365", "calendar_updated", "calendar_last_scraped", "number_of_reviews_ltm",
    "number_of_reviews_l30d", "license", "instant_bookable",
]

# Read the listings file and drop the unused columns in one chained expression.
df_listings = pd.read_csv("data/2023dic/d_listings.csv").drop(columns=unused_columns)

**The following chunk will probably be needed after this visualization pipeline**

In [None]:
## Drop rows with NaN in target 
#df_listings = df_listings.loc[df_listings['price'].notnull(), :]
#df_listings.price.isnull().sum()

#X = df_listings.drop(["price"], axis=1, inplace=False)
#y = df_listings["price"]

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=874631)


In [None]:
def drop_features_with_many_nan(x: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
    """Drop the columns of ``x`` whose share of missing values exceeds ``threshold``.

    The null counts are computed on ``x`` itself (the frame handed to the
    transformer), not on a notebook-level global, so the function gives the
    same answer regardless of which other variables exist in the kernel.

    Parameters
    ----------
    x : pd.DataFrame
        Frame to prune.
    threshold : float, default 0.5
        A column is dropped when its number of nulls is strictly greater than
        ``len(x) * threshold``.

    Returns
    -------
    pd.DataFrame
        A new frame without the sparse columns; ``x`` is left untouched.
    """
    null_counts = x.isnull().sum()
    # Strict ">" keeps columns that sit exactly on the threshold.
    sparse_columns = null_counts[null_counts > x.shape[0] * threshold].index.tolist()
    return x.drop(columns=sparse_columns)

# Wrap the column-pruning function in a FunctionTransformer so it can be used as a scikit-learn step.
fun_tr_drop_features_with_many_nan = FunctionTransformer(drop_features_with_many_nan)

## Define groups for data transformation

### Geographical Features

***Handled directly in the feature creation pipeline (`feature_creation_pipeline`)***


### String features

In [None]:
# Free-text columns that go through the tf-idf encoding.
string_features = [
    "neighborhood_overview",
    "host_about",
]


def transform_nan_unicode(text_series):
    """Replace missing entries with an empty string, then cast the values to unicode."""
    without_missing = text_series.fillna("")
    return without_missing.astype('U')

# First step: blank out missing texts and cast to unicode before vectorizing.
nan_to_unicode_step = FunctionTransformer(transform_nan_unicode, validate=False)

# Second step: word-level tf-idf, capped at 30 features; max_df=0.8 ignores
# terms that appear in more than 80% of the documents.
tfidf_step = TfidfVectorizer(
    encoding='utf-8',
    decode_error='ignore',
    strip_accents='unicode',
    lowercase=True,
    analyzer='word',
    max_df=0.8,
    use_idf=True,
    smooth_idf=True,
    max_features=30,
)

# Third step: VectorToDataFrame (project transformer) receives the tf-idf output.
text_encoding_pipeline = Pipeline(steps=[
    ("text preprocessing", nan_to_unicode_step),
    ("tf-idf vectorizer", tfidf_step),
    ("Vectors into dataframe", VectorToDataFrame()),
])


### ID features

In [None]:
id_feature = ["id",
              "host_id"]

def id_to_string(id_object) -> str:
    return id_object.astype(str)

# Single-step pipeline that casts the identifier columns to strings.
id_to_string_step = FunctionTransformer(id_to_string)
id_pipeline = Pipeline(steps=[("From ID to string", id_to_string_step)])


### Rates features

In [None]:
rate_feature = ["host_response_rate",
                "host_acceptance_rate"]

def from_string_to_rate(rate_string: str) -> float:
    return rate_string.str.rstrip('%').astype(float)

# Single-step pipeline that converts percentage strings to floats.
rate_parsing_step = FunctionTransformer(from_string_to_rate)
rates_pipeline = Pipeline(steps=[("Transform response rate", rate_parsing_step)])

### Time features

In [None]:
time_feature = ["host_since",
                "first_review",
                "last_review"]

def transform_to_datetime(text_date: str) -> pd.Timestamp | pd.Timestamp:
    return pd.to_datetime(text_date)

# Single-step pipeline that parses the date columns.
datetime_parsing_step = FunctionTransformer(transform_to_datetime)
timestamp_pipeline = Pipeline(steps=[("Transform to timestamp", datetime_parsing_step)])

## Categorical features
 
### Neighbourhoods features

In [None]:
# Column holding the cleansed neighbourhood name.
neighbourhood_feature = ["neighbourhood_cleansed"]

# Single-step pipeline: NeighborhoodMapper (project transformer) configured
# with the neighbourhood-level mapping loaded from JSON.
neighbourhood_mapper_step = NeighborhoodMapper(mapping=neighbourhood_levels)
neighbourhood_pipeline = Pipeline(steps=[('Neighbourhood Mapper', neighbourhood_mapper_step)])


### Verifications Feature

In [None]:
def new_features_for_verifications(df: pd.DataFrame) -> pd.DataFrame:
    """Add the three verification flag columns, all initialised to ``'f'``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame.

    Returns
    -------
    pd.DataFrame
        A new frame with ``email_verification``, ``phone_verification`` and
        ``work_email_verification`` set to ``'f'`` on every row. The input
        frame is not modified, so re-running a cell cannot leave stale flag
        columns on the caller's data.
    """
    return df.assign(
        email_verification='f',
        phone_verification='f',
        work_email_verification='f',
    )

def _verification_tokens(raw_verifications) -> set:
    """Return the set of verification method names found in ``raw_verifications``.

    Strings are split into word tokens, so ``"['phone', 'work_email']"`` yields
    ``{"phone", "work_email"}``. Lists, tuples and sets are used as they are.
    Anything else (for example NaN) yields an empty set.
    """
    if isinstance(raw_verifications, str):
        return set(re.findall(r"\w+", raw_verifications))
    if isinstance(raw_verifications, (list, tuple, set, frozenset)):
        return {str(item) for item in raw_verifications}
    return set()

def allocate_verifications_to_variables(row):
    """Set a row's verification flags to ``'t'`` for each method it lists.

    Membership is checked against whole tokens rather than substrings:
    a plain ``"email" in text`` test would also match ``"work_email"`` and
    would raise ``TypeError`` on a missing (NaN) value.

    Parameters
    ----------
    row : mapping-like (e.g. pd.Series)
        Must provide ``row["host_verifications"]``.

    Returns
    -------
    The same ``row`` object, with the matching flags set to ``'t'``. Flags for
    methods that are not listed are left as they were.
    """
    verified = _verification_tokens(row["host_verifications"])
    if "email" in verified:
        row["email_verification"] = 't'
    if "phone" in verified:
        row["phone_verification"] = 't'
    if "work_email" in verified:
        row["work_email_verification"] = 't'
    return row

def apply_on_every_row(df: pd.DataFrame) -> pd.DataFrame:
    """Apply ``allocate_verifications_to_variables`` to every row of ``df``."""
    return df.apply(allocate_verifications_to_variables, axis=1)

# Two steps: create the flag columns first, then fill them in row by row.
create_flags_step = FunctionTransformer(new_features_for_verifications)
allocate_flags_step = FunctionTransformer(apply_on_every_row)
verifications_pipeline = Pipeline(steps=[
    ('Create features', create_flags_step),
    ('Allocate verifications', allocate_flags_step),
])


### Bathrooms text feature

In [None]:
# Column holding the free-text bathroom description.
bathroom_text_feature = ["bathrooms_text"]

# Single-step pipeline: BathroomsTransformer (project transformer) configured
# with the bathroom remapping loaded from JSON.
bathrooms_remap_step = BathroomsTransformer(remap_baths)
bathrooms_pipeline = Pipeline(steps=[('Remap bathrooms text', bathrooms_remap_step)])

### Price feature

In [None]:
price_feature = ['price']

def remove_symbols(text):
    """Strip ``$`` and ``,`` from a price string and trim surrounding whitespace.

    Returns ``None`` for any non-string input (for example NaN). The explicit
    type check replaces a bare ``except:``, which also hid unrelated errors.
    """
    if not isinstance(text, str):
        return None
    return re.sub(r'[$,]', '', text).strip()

def remove_dollar_sign(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``price`` column is converted to float.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``price`` column of strings such as ``"$1,234.00"``.

    Returns
    -------
    pd.DataFrame
        A new frame; non-string prices become NaN. The input frame is left
        untouched so that re-running the cell works on the original strings.
    """
    numeric_price = df['price'].apply(remove_symbols).astype(float)
    return df.assign(price=numeric_price)
    
# Single-step pipeline that turns the price strings into floats.
price_cleaning_step = FunctionTransformer(remove_dollar_sign)
price_pipeline = Pipeline(steps=[("Trim price feature", price_cleaning_step)])
    

## Aggregate visualization dataset

In [None]:
from sklearn import set_config
from sklearn.compose import ColumnTransformer

# Ask scikit-learn transformers to return pandas DataFrames.
set_config(transform_output = "pandas")

# Apply to all dataset (feature engineering using other features)
feature_creation_pipeline = Pipeline(steps=[
    ("Create Strategic locations distance", CreateStrategicLocationTransformer(locations=strategic_locations)),
    ('Host location transformer', GeographicTransformer(column="host_location", locations=host_locations)),
])

# Features pipeline: apply to single columns elaboration.
# (name, transformer, columns) triples are the ColumnTransformer format; a
# Pipeline only accepts (name, estimator) pairs and fails on triples at fit time.
visualization_pipeline = ColumnTransformer(
    transformers=[
        # The tf-idf pipeline is registered once per text column with a scalar
        # selector, so the vectorizer receives a 1-D column of documents.
        *[
            (f"Text encoding {text_column}", text_encoding_pipeline, text_column)
            for text_column in string_features
        ],
        ("Id", id_pipeline, id_feature),
        ("Rates", rates_pipeline, rate_feature),
        ("Timestamp", timestamp_pipeline, time_feature),
        ("Neighbourhood", neighbourhood_pipeline, neighbourhood_feature),
        # The selector must name columns, not be the DataFrame itself.
        ("Verifications", verifications_pipeline, ["host_verifications"]),
        ("Bathrooms", bathrooms_pipeline, bathroom_text_feature),
        ("Price", price_pipeline, price_feature),
    ],
    # Keep the columns no transformer touches.
    remainder="passthrough",
)
# NOTE(review): sub-pipelines registered with a list selector receive a DataFrame
# restricted to those columns - confirm each project transformer accepts that.

#preprocessor = ColumnTransformer(transformers=[
#    ("Geographic", geographic_pipeline, df_listings),
#    ("Text encoding", text_encoding_pipeline, string_features),
#    ("Id", id_pipeline, id_feature),
#    ("Rates", rates_pipeline, rate_feature),
#    ("Timestamp", timestamp_pipeline, time_feature),
#    ("Neighbourhood", neighbourhood_pipeline, neighbourhood_feature),
#    ("Verifications", verifications_pipeline, df_listings),
#    ("Bathrooms", bathrooms_pipeline, bathroom_text_feature),
#    ("Price", price_pipeline, price_feature)
#],
#    remainder="passthrough",
#    n_jobs=-1
#)

In [None]:
from sklearn.utils import estimator_html_repr

# Save the HTML diagram of the pipeline to disk. The encoding is set explicitly
# so the write does not depend on the platform's default text encoding.
with open('data/visual/visualization_pipeline.html', 'w', encoding='utf-8') as f:
    f.write(estimator_html_repr(visualization_pipeline))


In [None]:
# Fit the visualization pipeline on the listings; the transformed result is the cell output.
visualization_pipeline.fit_transform(df_listings)

# Pipeline for visualization ENDED

## Numerical features

In [None]:
# Numerical columns, listed one per line and grouped by theme.
num_features = [
    # host portfolio
    "host_listings_count",
    "host_total_listings_count",
    # listing capacity
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    # stay length
    "minimum_nights",
    "maximum_nights",
    # reviews
    "number_of_reviews",
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    # calculated host listing counts
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
    # review frequency
    "reviews_per_month",
]

### Add and manipulate features