In [2]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
import math
import plotly.express as px
from sklearn.neighbors import LocalOutlierFactor

# Register tqdm with pandas so the `progress_apply` / `progress_map` variants
# become available on Series and DataFrame objects in later cells.
tqdm.pandas()

In [None]:
## Run the following to compute the mappings for host locations
#!python3 -m scripts.mappings_host_location_all_cities

## Load Visualization Pipeline
#!python3 -m scripts.pipeline_all_cities_viz

## Load ML pipeline
#!python3 -m scripts.pipeline_ML

In [None]:
# Load the listings frame used by the neighbourhood visualisation cells below.
# NOTE(review): `df` is rebound to a different pickle in the "Testing" section,
# so cells further down do not operate on this frame.
# Unpickling can execute arbitrary code: only load pickles produced by this project.
df = pd.read_pickle("data/pickles/total_listings_viz.pkl")

In [None]:
# Show every listing whose `amenities_host_greeting` value differs from 0.
df[df["amenities_host_greeting"].ne(0)]

In [None]:
# Preview the first rows of `df` (pandas shows five by default).
df.head()

In [None]:
# Show all column labels as a plain Python list, so none are elided in the output.
df.columns.tolist()

## Visualize neighbourhoods

In [None]:
# Count listings per neighbourhood.
# The previous version called `neigh_list.count(x)` once per row, which is
# quadratic in the number of listings; a single pass gives the same counts.
neigh_list = df["neighbourhood_cleansed"].tolist()
neigh_counter = {}
for neighbourhood in neigh_list:
    neigh_counter[neighbourhood] = neigh_counter.get(neighbourhood, 0) + 1

# Order by count, largest first. `sorted` is stable, so neighbourhoods with the
# same count keep their first-appearance order, exactly as before.
neigh_counter = dict(sorted(neigh_counter.items(), key=lambda item: item[1], reverse=True))

neigh_counter


## Testing

In [3]:
# Rebind `df` to the exploration/handling pickle; every cell from here on
# works on this frame, not on the one read from total_listings_viz.pkl earlier.
# Unpickling can execute arbitrary code: only load pickles produced by this project.
df = pd.read_pickle("data/pickles/total_listings_exploration_handling.pkl")

In [4]:
# Preview the first rows of the exploration frame (pandas shows five by default).
df.head()

Unnamed: 0,host_id,host_response_rate,price,host_since,first_review,last_review,description,host_location,host_response_time,host_is_superhost,...,sofa,station,stay,steps,tv,walk,wifi,x_coord,y_coord,z_coord
31840,380378,100,122.0,2011-02-07,2010-06-23,2024-06-08,nice private quiet double room classic style b...,0.857548,within_an_hour,f,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.248455,-0.946968,-0.20377
32120,99235,100,95.0,2010-03-26,2010-09-26,2023-10-09,apartment minute walk ponte vecchio piazzale m...,1.976853,within_an_hour,f,...,0.204476,0.0,0.422793,0.0,0.0,0.0,0.0,0.271372,-0.937242,-0.218939
32180,13925330,100,109.0,2014-04-05,2010-06-17,2023-12-06,,0.992744,more_than_one_hour,t,...,0.0,0.0,0.249367,0.0,0.0,0.0,0.226435,0.239778,-0.94732,-0.212348
39115,167739,100,104.0,2010-07-15,2010-09-07,2024-05-15,double twin room shared bathroom comfortable l...,0.654212,more_than_one_hour,f,...,0.0,0.419241,0.0,0.0,0.0,0.427326,0.0,0.248757,-0.946499,-0.205571
39165,167739,100,121.0,2010-07-15,2010-12-17,2024-04-08,double room private bathroom comfortable large...,0.654212,more_than_one_hour,f,...,0.0,0.17546,0.0,0.0,0.0,0.357688,0.0,0.248757,-0.946499,-0.205571


In [None]:
# Distinct values of `listing_city`, converted to a plain Python list.
df["listing_city"].unique().tolist()

In [None]:
# Histogram of `price`, restricted to rows with a price below 1000.
fig = px.histogram(data_frame=df[df["price"].lt(1000)], x="price")
fig.show()

In [None]:
# Bucket listings by price. Rows at or above 400 get the top label first;
# then each lower threshold overwrites the label of the rows beneath it, so the
# thresholds must be applied from the highest to the lowest.
# Rows whose price fails every comparison (e.g. missing prices) are left untouched.
df.loc[df["price"] >= 400, "price_category"] = "high_high_price"
for upper_bound, label in (
    (400, "normal_high_price"),
    (160, "normal_medium_price"),
    (100, "normal_low_price"),
    (60, "low_low_price"),
):
    df.loc[df["price"] < upper_bound, "price_category"] = label



In [None]:
# Bar-style histogram of how many rows fall into each `price_category` label.
fig = px.histogram(data_frame=df, x="price_category")
fig.show()

In [None]:
# Distinct values present in `host_response_rate`.
df["host_response_rate"].unique()

In [None]:
# Scatter plot of `host_response_rate` (y) against `price` (x) over all rows.
fig = px.scatter(data_frame=df, x="price", y="host_response_rate")
fig.show()

In [None]:
# Table of column dtypes (indexed by column name, single column "type"),
# keeping only the float64 and int64 columns.
# The dtype table is built once and filtered through a callable `.loc` indexer
# instead of being reconstructed three times inside one expression.
df_only_num = pd.DataFrame(df.dtypes, columns=["type"]).loc[
    lambda dtype_table: (dtype_table["type"] == "float64") | (dtype_table["type"] == "int64")
]

In [None]:
# Keep only the columns of `df` named in the index of the dtype table.
# NOTE(review): this rebinds `df_only_num` from the dtype table to the data
# itself, so re-running this cell alone would use row labels as column names.
df_only_num = df.loc[:, df_only_num.index.tolist()]

In [None]:
# Inspect the rows whose `price` exceeds 5000.
df[df["price"].gt(5000)]

In [None]:
# Inspect the listings in the "normal" price bands.
# The labels assigned to `price_category` are "normal_high_price",
# "normal_medium_price" and "normal_low_price" (plus the high_high / low_low
# extremes); there is no "normal_price" label, so an equality test against it
# always returned an empty frame. Match the `normal_` prefix instead.
# `na=False` treats rows without a category as non-matching.
df.loc[df["price_category"].str.startswith("normal_", na=False), :]

In [None]:
# Empirical CDF of `x_coord`. The curve shows that the coordinate
# distributions are multimodal.
# The data is nonetheless scaled with StandardScaler.
fig = px.ecdf(data_frame=df, x="x_coord")
fig.show()

In [None]:
# Preview the first rows of `df` again (pandas shows five by default).
df.head()

In [None]:
# Distribution of the `x_coord` column.
fig = px.histogram(data_frame=df, x="x_coord")
fig.show()

In [None]:

# Distribution of the `y_coord` column.
fig = px.histogram(data_frame=df, x="y_coord")
fig.show()


In [None]:
# Distribution of the `z_coord` column.
fig = px.histogram(data_frame=df, x="z_coord")
fig.show()

### Normalize numerical data distribution with `power_transform`

In [None]:
# Histogram of `number_of_reviews`, restricted to rows with fewer than 100 reviews.
fig = px.histogram(data_frame=df[df["number_of_reviews"].lt(100)], x="number_of_reviews")
fig.show()

In [None]:
# Distribution of the `accommodates` column.
fig = px.histogram(data_frame=df, x="accommodates")
fig.show()


In [None]:
# Numeric columns whose skewness is inspected in the next cell.
# NOTE(review): the name suggests these are all strictly positive — confirm
# before applying a transform that requires positive input.
numerical_positive = [
    "host_listings_count", "host_location", "accommodates",
    "number_of_reviews", "reviews_per_month", "amenities_benefits",
]

In [None]:
# Skewness of each column in `numerical_positive`, most right-skewed first.
df.loc[:, numerical_positive].skew().sort_values(ascending=False)