In [1]:
import math
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.neighbors import LocalOutlierFactor
from tqdm.notebook import tqdm

# Hook tqdm into pandas so the progress_* variants (e.g. progress_apply)
# are available and render as notebook progress bars.
tqdm.pandas()

In [None]:
## Uncomment and run the following to compute the host-location mappings
#!python3 -m scripts.mappings_host_location_all_cities

## Uncomment to run the visualization pipeline
#!python3 -m scripts.pipeline_all_cities_viz

## Uncomment to run the ML pipeline
#!python3 -m scripts.pipeline_ML

In [None]:
# Load the pickled listings frame used for the visualization section.
# NOTE: unpickling executes arbitrary code; only load pickles produced locally.
df = pd.read_pickle(filepath_or_buffer="data/pickles/total_listings_viz.pkl")

In [None]:
# Show every column name of the loaded frame as a plain Python list.
list(df.columns)

## Visualize neighbourhoods

In [None]:
# Count how many listings fall in each neighbourhood.
# Counter tallies the list in a single pass (calling list.count() once per
# element is quadratic in the number of listings).
neigh_list = df["neighbourhood_cleansed"].tolist()

# most_common() orders by count, highest first; neighbourhoods with equal
# counts keep their first-appearance order. dict() preserves that ordering.
neigh_counter = dict(Counter(neigh_list).most_common())

neigh_counter


## Testing

In [2]:
# Rebind df to the pickled frame used for the exploration cells below.
# NOTE: unpickling executes arbitrary code; only load pickles produced locally.
df = pd.read_pickle(
    filepath_or_buffer="data/pickles/total_listings_exploration_handling.pkl"
)

In [None]:
# Histogram of the listing prices.
fig = px.histogram(data_frame=df, x="price")
fig.show()

In [None]:
# Bucket every listing into a price tier stored in "price_category".
# The ">= 400" tier is written first; each "<" rule then overwrites the rows
# below its threshold, so the rules must run from the widest threshold to the
# narrowest. Rows whose price is missing match no rule and are left untouched.
price_values = df["price"]
df.loc[price_values >= 400, "price_category"] = "high_high_price"

descending_tiers = (
    (400, "normal_high_price"),
    (160, "normal_medium_price"),
    (100, "normal_low_price"),
    (60, "low_low_price"),
)
for upper_bound, tier_label in descending_tiers:
    df.loc[price_values < upper_bound, "price_category"] = tier_label



In [None]:
# Number of listings per price tier.
fig = px.histogram(data_frame=df, x="price_category")
fig.show()

In [None]:
# Distinct values present in the host_response_rate column.
pd.unique(df["host_response_rate"])

In [None]:
# Scatter of price (x) against host response rate (y).
fig = px.scatter(data_frame=df, x="price", y="host_response_rate")
fig.show()

In [None]:
# Build a one-column table (index: column name, "type": dtype) and keep only
# the rows whose dtype is float64 or int64.
dtype_table = pd.DataFrame(df.dtypes, columns=["type"])
is_float64 = dtype_table["type"] == "float64"
is_int64 = dtype_table["type"] == "int64"
df_only_num = dtype_table.loc[is_float64 | is_int64]

In [None]:
# Keep only the float64 / int64 columns of df.
# The selection is made straight from df's dtypes instead of reading the
# current value of df_only_num, so this cell gives the same result however
# many times it is run.
df_only_num = df.select_dtypes(include=["float64", "int64"])

In [None]:
# Inspect the listings priced above 5000.
df[df["price"] > 5000]

In [None]:
# Inspect the listings in the "normal" price tiers.
# No tier is literally named "normal_price" (the labels are
# "normal_high_price", "normal_medium_price" and "normal_low_price"), so an
# equality test against that string can never match. Match on the shared
# "normal_" prefix instead; na=False treats rows without a category as
# non-matching.
df.loc[df["price_category"].str.startswith("normal_", na=False), :]

In [None]:
# The empirical CDF shows that the coordinate distributions are multimodal.
# Nonetheless, the data is scaled with StandardScaler.
fig = px.ecdf(data_frame=df, x="x_coord")
fig.show()

In [3]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0_level_0,host_id,host_response_rate,price,host_since,first_review,last_review,host_location,host_response_time,host_is_superhost,host_listings_count,...,reviews_per_month,listing_city,listing_city_pop,amenities_AC/heating,amenities_technology,amenities_kitchen,amenities_benefits,x_coord,y_coord,z_coord
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31840,380378,100.0,122.0,2011-02-07,2010-06-23,2024-06-08,0.857548,within_an_hour,f,39.0,...,0.77,Florence,367150,2,2,6,2,0.248455,-0.946968,-0.20377
32120,99235,100.0,95.0,2010-03-26,2010-09-26,2023-10-09,1.976853,within_an_hour,f,1.0,...,0.16,Ponte a Ema,8412,2,2,1,0,0.271372,-0.937242,-0.218939
32180,13925330,100.0,109.0,2014-04-05,2010-06-17,2023-12-06,0.992744,more_than_one_hour,t,1.0,...,0.2,Florence,367150,1,2,7,0,0.239778,-0.94732,-0.212348
39115,167739,100.0,104.0,2010-07-15,2010-09-07,2024-05-15,0.654212,more_than_one_hour,f,10.0,...,0.46,Florence,367150,2,3,0,5,0.248757,-0.946499,-0.205571
39165,167739,100.0,121.0,2010-07-15,2010-12-17,2024-04-08,0.654212,more_than_one_hour,f,10.0,...,0.13,Florence,367150,2,3,1,4,0.248757,-0.946499,-0.205571


In [None]:
# Distribution of the x coordinate.
fig = px.histogram(data_frame=df, x="x_coord")
fig.show()

In [None]:

# Distribution of the y coordinate.
fig = px.histogram(data_frame=df, x="y_coord")
fig.show()


In [None]:
# Distribution of the z coordinate.
fig = px.histogram(data_frame=df, x="z_coord")
fig.show()

### Normalize numerical data distribution with `power_transform`

In [None]:
# Distribution of number_of_reviews, restricted to listings with fewer than
# 100 reviews.
fig = px.histogram(
    data_frame=df.loc[df["number_of_reviews"] < 100],
    x="number_of_reviews",
)
fig.show()

In [None]:
# Distribution of the accommodates column.
fig = px.histogram(data_frame=df, x="accommodates")
fig.show()


In [None]:
# Numeric columns whose skewness is inspected in the next cell.
numerical_positive = [
    "host_listings_count",
    "host_location",
    "accommodates",
    "number_of_reviews",
    "reviews_per_month",
    "amenities_benefits",
]

In [None]:
# Skewness of each selected column, most skewed first.
(
    df.loc[:, numerical_positive]
    .skew()
    .sort_values(ascending=False)
)