In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from pandarallel import pandarallel

In [None]:
# Load the listings frame from the project's pickle.
# Unpickling can execute arbitrary code, so only load pickles from a trusted source.
df = pd.read_pickle("../data/pickles/total_listings_viz.pkl")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Columns plotted as categorical histograms in the "Plots" section.
# NOTE(review): "host_listings_count" is also listed in hist_num_variables —
# confirm it is meant to be treated as both categorical and numerical.
hist_cat_variables = [
    # host_* columns
    "host_response_time", "host_is_superhost", "host_listings_count",
    "host_has_profile_pic", "host_identity_verified",
    # listing description columns
    "property_type", "room_type", "bathrooms_text",
    # *_verification columns
    "email_verification", "phone_verification", "work_email_verification",
]

In [None]:
# Columns fed to the correlation matrix and the numerical histograms.
# NOTE(review): "host_location" is listed as numerical — confirm it is stored
# as a numeric column in this pickle, since `.corr()` is run on this list.
hist_num_variables = [
    # host_* columns and price
    "host_response_rate", "host_acceptance_rate", "price", "host_location",
    "host_listings_count", "host_total_listings_count",
    # capacity and stay-length columns
    "accommodates", "bathrooms", "bedrooms", "beds",
    "minimum_nights", "maximum_nights",
    # review columns
    "number_of_reviews", "review_scores_rating", "review_scores_accuracy",
    "review_scores_cleanliness", "review_scores_checkin",
    "review_scores_communication", "review_scores_location",
    "review_scores_value", "reviews_per_month",
    # city population and amenities_* columns
    "listing_city_pop", "amenities_AC/heating", "amenities_technology",
    "amenities_kitchen", "amenities_benefits", "amenities_toiletry",
    "amenities_other",
]

## Correlation matrix for numerical variables

In [None]:
# Threshold on |r| at or above which two numerical variables are treated as
# highly correlated in the binary heatmap.
CORR_THRESHOLD = 0.45

df_corr = df.copy()
corr_full = df_corr[hist_num_variables].corr()

# Compare the absolute value so that strong *negative* correlations are
# flagged too (a signed comparison classed e.g. r = -0.9 as "low").
# Any cell that is not below the threshold — including NaN — is marked high,
# which matches the previous handling of NaN.
is_low_corr = corr_full.abs() < CORR_THRESHOLD
colors = ['green' if low else 'red' for low in is_low_corr.values.flatten()]
mask = np.where(is_low_corr, 0, 1)



In [None]:
# Annotated heatmap of the full correlation matrix.
full_heatmap_kwargs = {
    "text_auto": True,
    "aspect": "auto",
    "width": 900,
    "height": 900,
}
fig = px.imshow(corr_full, **full_heatmap_kwargs)
fig.show()

In [None]:
# Same layout as the full heatmap, but drawn on a green-to-red scale.
binary_heatmap_kwargs = {
    "text_auto": True,
    "aspect": "auto",
    "width": 900,
    "height": 900,
    "color_continuous_scale": ["green", "red"],
}
fig = px.imshow(corr_full, **binary_heatmap_kwargs)

# Replace the plotted values with the 0/1 mask so each cell is either green
# (below the threshold) or red.
# NOTE(review): because z is swapped for the mask, the text_auto annotations
# presumably show 0/1 rather than the correlation value — confirm this is intended.
green_red_steps = [[0, "green"], [1, "red"]]
fig.update_traces(z=mask, colorscale=green_red_steps)

fig.show()

# Drop highly correlated variables

In [None]:
# Columns removed from `df` in the "Drop highly correlated variables" step.
# Note that "amenities" is not part of hist_num_variables.
to_drop_corr = [
    # host_* columns
    "host_acceptance_rate", "host_total_listings_count",
    # room-count columns
    "bathrooms", "bedrooms", "beds",
    # review_scores_* sub-scores
    "review_scores_accuracy", "review_scores_cleanliness",
    "review_scores_checkin", "review_scores_communication",
    "review_scores_location", "review_scores_value",
    # amenities columns
    "amenities_other", "amenities_toiletry", "amenities",
]

In [None]:
# Drop the highly correlated columns.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell re-runnable: once the columns are gone, a second execution is a no-op
# rather than a KeyError.
df = df.drop(columns=to_drop_corr, errors="ignore")

In [None]:
# Numerical columns that are not in the drop list.
left_num_var = [col for col in hist_num_variables if col not in to_drop_corr]

# Recompute the correlation matrix on the remaining columns only.
df_corr = df.copy()
corr_full = df_corr[left_num_var].corr()

# Smaller annotated heatmap of the reduced matrix.
fig = px.imshow(corr_full, text_auto=True, aspect="auto", width=600, height=600)
fig.show()

In [None]:
# (rows, columns) of `df` at this point; when run top to bottom this is after the column drop.
df.shape

# Plots

In [None]:
# One histogram per categorical column.
for column in hist_cat_variables:
    fig = px.histogram(df, x=column)
    fig.show()

## Numerical variables and price

In [None]:
# One histogram per numerical column that was kept after the correlation drop.
for column in left_num_var:
    fig = px.histogram(df, x=column)
    fig.show()

In [None]:
# Shows whatever `fig` currently holds; when run top to bottom that is the
# last histogram produced by the loop in the previous cell.
# NOTE(review): looks like a leftover cell — confirm whether it is still needed.
fig.show()

## Turning skewed numerical variables into categorical ones

In [None]:
# Work on a copy so the categorical recoding in the cells below leaves `df` unchanged.
df_man = df.copy()

In [None]:
def _label_response_rate(rate):
    """Return "100" when the rate equals 100, otherwise "lower"."""
    return "100" if rate == 100 else "lower"


# Read from the untouched `df` rather than from `df_man`, so that re-running
# this cell does not re-label the already recoded strings (which would turn
# every row into "lower", because "100" != 100).
# NOTE(review): missing rates also end up as "lower" — confirm that is intended.
df_man["host_response_rate"] = df["host_response_rate"].apply(_label_response_rate)

In [None]:
# Histogram of the recoded "100" / "lower" response-rate labels.
fig = px.histogram(df_man, x = "host_response_rate")
fig.show()

In [None]:
def _label_minimum_nights(nights):
    """Return "1" for values of at most 1, otherwise "more_than_1"."""
    return "1" if nights <= 1 else "more_than_1"


# Read from the untouched `df` so the cell can be re-run; recoding the already
# recoded `df_man` column would compare strings with an int and raise TypeError.
# NOTE(review): NaN fails the `<= 1` test and is labelled "more_than_1" — confirm.
df_man["minimum_nights"] = df["minimum_nights"].apply(_label_minimum_nights)
fig = px.histogram(df_man, x="minimum_nights")
fig.show()

In [None]:
def _label_maximum_nights(nights):
    """Return "less_than_100" for values of at most 100, otherwise "more_than_100"."""
    return "less_than_100" if nights <= 100 else "more_than_100"


# Read from the untouched `df` so the cell can be re-run; recoding the already
# recoded `df_man` column would compare strings with an int and raise TypeError.
# NOTE(review): exactly 100 is labelled "less_than_100", and NaN is labelled
# "more_than_100" — confirm both are intended.
df_man["maximum_nights"] = df["maximum_nights"].apply(_label_maximum_nights)
fig = px.histogram(df_man, x="maximum_nights")
fig.show()


In [None]:
def _label_city_pop(population):
    """Return "less_than_300k" for values of at most 300000, otherwise "more_than_300k"."""
    return "less_than_300k" if population <= 300000 else "more_than_300k"


# Read from the untouched `df` so the cell can be re-run; recoding the already
# recoded `df_man` column would compare strings with an int and raise TypeError.
# NOTE(review): NaN fails the `<=` test and is labelled "more_than_300k" — confirm.
df_man["listing_city_pop"] = df["listing_city_pop"].apply(_label_city_pop)
fig = px.histogram(df_man, x="listing_city_pop")
fig.show()


In [None]:
def _label_review_rating(rating):
    """Return "less_than_4.8" for values below 4.8, otherwise "more_than_4.8"."""
    return "less_than_4.8" if rating < 4.8 else "more_than_4.8"


# Read from the untouched `df` so the cell can be re-run; recoding the already
# recoded `df_man` column would compare strings with a float and raise TypeError.
# NOTE(review): NaN fails the `< 4.8` test, so rows without a rating would be
# labelled "more_than_4.8" — confirm that is intended.
df_man["review_scores_rating"] = df["review_scores_rating"].apply(_label_review_rating)
fig = px.histogram(df_man, x="review_scores_rating")
fig.show()


In [None]:
# Persist `df` for the next stage.
# NOTE(review): this writes `df`, not `df_man`, so the categorical recoding done
# on `df_man` in the cells above is not saved — confirm that is intended.
pd.to_pickle(df, "../data/pickles/total_listings_exploration_handling.pkl")