In [3]:
import math
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.neighbors import LocalOutlierFactor
from tqdm.notebook import tqdm

# Register tqdm's progress-bar variants of the pandas apply methods
# (e.g. `Series.progress_apply`) on pandas objects.
tqdm.pandas()

In [None]:
## Run the following to compute the mappings for host locations
#!python3 -m scripts.mappings_host_location_all_cities

## Load Visualization Pipeline
#!python3 -m scripts.pipeline_all_cities_viz

## Load ML pipeline
#!python3 -m scripts.pipeline_ML

In [4]:
# Load the pickled listings table (the "viz" variant, going by the file name).
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# only load pickles from a trusted source — presumably this one is produced by
# the pipeline scripts in the commented-out commands above; confirm.
df = pd.read_pickle("data/pickles/total_listings_viz.pkl")

In [6]:
# Show every listing whose `amenities_host_greeting` value is non-zero.
df.loc[df["amenities_host_greeting"].ne(0)]

Unnamed: 0_level_0,host_id,host_response_rate,host_acceptance_rate,price,host_since,first_review,last_review,host_location,host_response_time,host_is_superhost,...,amenities_pool,amenities_oven,amenities_microwave,amenities_garden,amenities_streaming,amenities_gym,amenities_elevator,amenities_heating,amenities_ac,amenities_safe
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32180,13925330,100.0,67.0,109.0,2014-04-05,2010-06-17,2023-12-06,0.992744,within a day,t,...,0,1,1,0,0,0,0,1,0,0
39115,167739,100.0,62.0,104.0,2010-07-15,2010-09-07,2024-05-15,0.654212,within a few hours,f,...,0,0,0,1,0,0,1,1,0,0
39165,167739,100.0,62.0,121.0,2010-07-15,2010-12-17,2024-04-08,0.654212,within a few hours,f,...,0,0,0,0,0,0,1,1,0,0
39822,154769,100.0,100.0,120.0,2010-06-29,2010-09-13,2023-09-10,0.990769,within an hour,f,...,0,0,1,0,0,0,0,1,0,0
43369,4156491,100.0,99.0,228.0,2012-11-15,2014-04-14,2023-08-24,0.618787,within an hour,f,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1167456420647971669,21960026,100.0,100.0,197.0,2014-09-30,2024-06-08,2024-06-08,2.714651,within an hour,f,...,0,1,0,0,0,0,1,1,0,0
1168793242297109991,580720201,100.0,100.0,195.0,2024-05-31,2024-06-13,2024-06-13,3.775035,within an hour,f,...,0,0,1,1,1,0,1,1,0,0
1169580034869144725,580913909,100.0,100.0,82.0,2024-06-01,2024-06-13,2024-06-13,7.337922,within an hour,f,...,0,1,0,0,0,0,1,1,0,0
1171497635161879977,3134996,97.0,99.0,139.0,2012-08-02,2024-06-08,2024-06-08,35.834042,within an hour,f,...,0,1,0,0,0,0,1,1,0,0


In [None]:
# List all column names of the loaded frame.
list(df.columns)

## Visualize neighbourhoods

In [None]:
# Count listings per neighbourhood, most frequent first.
# `Counter` tallies the column in a single pass, instead of running one full
# `list.count` scan per row (quadratic in the number of listings).
# `most_common()` orders by descending count and keeps first-seen order among
# ties, i.e. the same ordering a stable descending sort of the counts gives.
neigh_list = df["neighbourhood_cleansed"].tolist()
neigh_counter = dict(Counter(neigh_list).most_common())

neigh_counter


## Testing

In [None]:
# Rebinds `df`: from this cell on it holds the "exploration_handling" pickle
# instead of the "viz" pickle loaded near the top of the notebook.
# NOTE(review): unpickling can execute arbitrary code — load trusted files only.
df = pd.read_pickle("data/pickles/total_listings_exploration_handling.pkl")

In [None]:
# Distribution of listing prices.
fig = px.histogram(data_frame=df, x="price")
fig.show()

In [None]:
# Bucket listings by price. The ">= 400" bucket is written first; the "< bound"
# rules are then applied from the widest bound to the narrowest, so each
# narrower rule overwrites the wider one for the rows it matches.
# Rows whose price satisfies none of the comparisons (e.g. NaN) are left untouched.
df.loc[df["price"] >= 400, "price_category"] = "high_high_price"
for tier_upper_bound, tier_label in (
    (400, "normal_high_price"),
    (160, "normal_medium_price"),
    (100, "normal_low_price"),
    (60, "low_low_price"),
):
    df.loc[df["price"] < tier_upper_bound, "price_category"] = tier_label



In [None]:
# Number of listings in each price bucket.
fig = px.histogram(data_frame=df, x="price_category")
fig.show()

In [None]:
# Inspect the distinct response-rate values present in the data.
df.loc[:, "host_response_rate"].unique()

In [None]:
# Price against host response rate, one point per row.
fig = px.scatter(data_frame=df, x="price", y="host_response_rate")
fig.show()

In [None]:
# Build a one-column table of each column's dtype (indexed by column name)
# and keep only the rows whose dtype is float64 or int64.
dtype_table = pd.DataFrame(df.dtypes, columns=["type"])
is_float_or_int = (dtype_table["type"] == "float64") | (dtype_table["type"] == "int64")
df_only_num = dtype_table.loc[is_float_or_int]

In [None]:
# Keep only the float64 / int64 columns of `df`.
# Selecting by dtype straight from `df` makes this cell safe to re-run: it does
# not read the previous value of `df_only_num`, which this same cell overwrites
# with a different kind of object (a data frame rather than a dtype table).
df_only_num = df.select_dtypes(include=["float64", "int64"])

In [None]:
# Inspect extreme listings: rows priced above 5000.
df.loc[df["price"].gt(5000)]

In [None]:
# NOTE(review): none of the price buckets assigned earlier in this notebook is
# called "normal_price" (they are "high_high_price", "normal_high_price",
# "normal_medium_price", "normal_low_price" and "low_low_price"), so unless the
# pickle already carries that value this filter returns no rows — confirm which
# bucket was intended.
df.loc[df["price_category"]=="normal_price", :]

In [None]:
# The empirical CDF below shows that the coordinate distributions are multimodal.
# The data is scaled with the StandardScaler nonetheless.
fig = px.ecdf(data_frame=df, x="x_coord")
fig.show()

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# Distribution of the `x_coord` column.
fig = px.histogram(data_frame=df, x="x_coord")
fig.show()

In [None]:

# Distribution of the `y_coord` column.
fig = px.histogram(data_frame=df, x="y_coord")
fig.show()


In [None]:
# Distribution of the `z_coord` column.
fig = px.histogram(data_frame=df, x="z_coord")
fig.show()

### Normalize numerical data distribution with `power_transform`

In [None]:
# Distribution of review counts, restricted to rows with fewer than 100 reviews.
fig = px.histogram(
    data_frame=df.loc[df["number_of_reviews"].lt(100)],
    x="number_of_reviews",
)
fig.show()

In [None]:
# Distribution of the `accommodates` column.
fig = px.histogram(data_frame=df, x="accommodates")
fig.show()


In [None]:
# Columns whose skewness is inspected in the following cell.
# NOTE(review): the name suggests these are numeric, strictly positive
# features — not verified here; confirm before applying any transform that
# requires positive input.
numerical_positive = [
    "host_listings_count",
    "host_location",
    "accommodates",
    "number_of_reviews",
    "reviews_per_month",
    "amenities_benefits"
]

In [None]:
# Skewness of each selected column, largest first.
column_skew = df[numerical_positive].skew()
column_skew.sort_values(ascending=False)