In [None]:
import math
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.neighbors import LocalOutlierFactor
from tqdm.notebook import tqdm

# Hook tqdm into pandas so the `progress_*` variants (e.g. `progress_apply`)
# are available on DataFrames/Series for the rest of the notebook.
tqdm.pandas()

In [None]:
# Optional pipeline steps, kept commented out. Uncomment a line to run it.
# NOTE(review): presumably these produce the pickles under data/pickles/ that
# the cells below load — confirm against the scripts.

## Compute the mappings for host locations
#!python3 -m scripts.mappings_host_location_all_cities

## Run the visualization pipeline
#!python3 -m scripts.pipeline_all_cities_viz

## Run the ML pipeline
#!python3 -m scripts.pipeline_ML

In [None]:
# Load the pickled listings frame used for the visualization cells.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle("data/pickles/total_listings_viz.pkl")

In [None]:
# Show every column name as a plain list to see which fields are available.
df.columns.tolist()

## Visualize neighbourhoods

In [None]:
# Count listings per neighbourhood in a single pass. The previous version called
# list.count() once per element, which is quadratic in the number of listings.
# Counter keeps keys in first-seen order and most_common() sorts by count
# (descending) with a stable sort, so ties stay in first-appearance order.
neigh_list = df["neighbourhood_cleansed"].tolist()
neigh_counter = dict(Counter(neigh_list).most_common())

neigh_counter


## Testing

In [None]:
# Rebinds `df` to a different pickle: from here on `df` is the
# exploration/handling dataset, not the visualization one loaded earlier.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle("data/pickles/total_listings_exploration_handling.pkl")

In [None]:
# Histogram of the raw `price` column.
fig = px.histogram(data_frame=df, x="price")
fig.show()

In [None]:
# Bucket listings into price bands. The open-ended top band is written first;
# the "< bound" bands are then applied from the highest bound to the lowest,
# so each narrower band overwrites the wider one assigned before it.
df.loc[df["price"] >= 400, "price_category"] = "high_high_price"
for upper_bound, band_label in (
    (400, "normal_high_price"),
    (160, "normal_medium_price"),
    (100, "normal_low_price"),
    (60, "low_low_price"),
):
    df.loc[df["price"] < upper_bound, "price_category"] = band_label



In [None]:
# Number of listings in each price band.
fig = px.histogram(data_frame=df, x="price_category")
fig.show()

In [None]:
# Inspect the distinct values present in `host_response_rate`.
df["host_response_rate"].unique()

In [None]:
# Scatter of price against host response rate to eyeball any relationship.
fig = px.scatter(data_frame=df, x="price", y="host_response_rate")
fig.show()

In [None]:
# One-column table of dtypes, indexed by column name. It is built once here;
# the previous version rebuilt the identical frame three times in one expression.
dtype_table = pd.DataFrame(df.dtypes, columns=["type"])

# Keep only the rows (i.e. column names of `df`) whose dtype is float64 or int64.
is_float64 = dtype_table["type"] == "float64"
is_int64 = dtype_table["type"] == "int64"
df_only_num = dtype_table.loc[is_float64 | is_int64]

In [None]:
# Numeric (float64 / int64) columns of `df`, in their original order.
# This is derived from `df` directly rather than from the previous value of
# `df_only_num`, so re-running the cell gives the same result instead of
# indexing `df` with whatever `df_only_num` happened to hold.
df_only_num = df.select_dtypes(include=["float64", "int64"])

In [None]:
# Rows with a price above 5000, shown for manual inspection.
df[df["price"] > 5000]

In [None]:
# Rows in any of the "normal" price bands. The banding cell only assigns
# "normal_low_price", "normal_medium_price" and "normal_high_price"; the old
# "normal_price" label is never assigned, so filtering on it matched nothing.
# NOTE(review): assumes the intent was "all normal_* bands" — confirm.
df.loc[
    df["price_category"].isin(
        ["normal_low_price", "normal_medium_price", "normal_high_price"]
    ),
    :,
]

In [None]:
# Author's reading of this ECDF: the coordinate distributions are multimodal.
# Nonetheless, StandardScaler is used to scale the data.
fig = px.ecdf(data_frame=df, x="x_coord")
fig.show()

In [None]:
# Preview the first rows of the current `df`.
df.head()

In [None]:
# Distribution of the `x_coord` column.
fig = px.histogram(data_frame=df, x="x_coord")
fig.show()

In [None]:

# Distribution of the `y_coord` column.
fig = px.histogram(data_frame=df, x="y_coord")
fig.show()


In [None]:
# Distribution of the `z_coord` column.
fig = px.histogram(data_frame=df, x="z_coord")
fig.show()

### Normalize numerical data distribution with `power_transform`

In [None]:
# Histogram of review counts, limited to listings with fewer than 100 reviews
# (presumably to keep the long tail from dominating the plot).
fig = px.histogram(
    data_frame=df[df["number_of_reviews"] < 100],
    x="number_of_reviews",
)
fig.show()

In [None]:
# Distribution of the `accommodates` column.
fig = px.histogram(data_frame=df, x="accommodates")
fig.show()


In [None]:
# Columns whose skewness is inspected in the next cell.
# NOTE(review): the name suggests these are all non-negative numeric features;
# `host_location` is presumably a numeric encoding produced upstream — confirm.
numerical_positive = [
    "host_listings_count",
    "host_location",
    "accommodates",
    "number_of_reviews",
    "reviews_per_month",
    "amenities_benefits"
]

In [None]:
# Skewness of each selected column, sorted from the largest value to the smallest.
df[numerical_positive].skew().sort_values(ascending=False)