In [20]:
import math
import os
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.neighbors import LocalOutlierFactor
from tqdm.notebook import tqdm

# Register tqdm's pandas integration (per the tqdm docs this adds progress-bar
# variants such as `progress_apply` to pandas objects).
tqdm.pandas()

In [2]:
## Run the following to compute the mappings for host locations
#!python3 -m scripts.mappings_host_location_all_cities

## Load Visualization Pipeline
#!python3 -m scripts.pipeline_all_cities_viz

## Load ML pipeline
#!python3 -m scripts.pipeline_ML

In [58]:
# Load the listings frame from the "viz" pickle into `df`.
# NOTE(review): presumably produced by the commented-out
# `scripts.pipeline_all_cities_viz` step — confirm. Unpickling can execute
# arbitrary code, so only load pickles generated locally / from a trusted source.
df = pd.read_pickle("data/pickles/total_listings_viz.pkl")

In [22]:
# Preview the first rows of the frame that was just loaded.
df.head()

Unnamed: 0_level_0,host_id,host_response_rate,host_acceptance_rate,price,host_since,first_review,last_review,description,host_location,host_response_time,...,amenities_air-conditioning,amenities_workspace,amenities_freezer,amenities_first-aid-kit,amenities_dishwasher,amenities_long-term-stays,amenities_pets-allowed,amenities_bathtube,amenities_bbq-grill,amenities_lake-bay-view
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
31840,380378,100.0,100.0,122.0,2011-02-07,2010-06-23,2024-06-08,"Nice, private and quiet double room, classic s...",0.857548,within an hour,...,t,f,t,f,f,f,f,f,f,f
32120,99235,100.0,50.0,95.0,2010-03-26,2010-09-26,2023-10-09,Apartment at a 30 minute walk from Ponte Vecch...,1.976853,within an hour,...,t,f,f,f,f,f,f,f,f,f
32180,13925330,100.0,67.0,109.0,2014-04-05,2010-06-17,2023-12-06,.,0.992744,within a day,...,f,f,t,f,t,t,f,f,f,f
39115,167739,100.0,62.0,104.0,2010-07-15,2010-09-07,2024-05-15,Double (or Twin) Room Shared Bathroom: comfort...,0.654212,within a few hours,...,t,t,f,f,f,f,f,f,f,f
39165,167739,100.0,62.0,121.0,2010-07-15,2010-12-17,2024-04-08,"Double room private bathroom: comfortable, lar...",0.654212,within a few hours,...,t,t,t,f,f,f,f,f,f,f


In [None]:
# Full list of column names (the `head()` preview elides the middle columns).
df.columns.tolist()

## Visualize neighbourhoods

In [None]:
# Number of listings per neighbourhood, most frequent first.
# `Counter` tallies the values in a single pass; calling `list.count` inside a
# comprehension rescanned the whole list once per row (quadratic in the number
# of listings).
# `most_common()` orders by count, descending, and keeps first-seen order
# between equal counts, so the resulting dict matches the previous ordering.
neigh_list = df["neighbourhood_cleansed"].tolist()
neigh_counter = dict(Counter(neigh_list).most_common())

neigh_counter


## Testing

In [None]:
# Rebind `df` to the "exploration_handling" pickle. The frame loaded earlier
# from "total_listings_viz.pkl" is no longer reachable under this name.
# NOTE(review): every later cell reads `df`, so its result depends on which
# load cell ran last — consider distinct names for the two datasets.
df = pd.read_pickle("data/pickles/total_listings_exploration_handling.pkl")

In [None]:
# Preview the first rows of the reloaded frame.
df.head()

In [None]:
# Distinct values of `listing_city`, as a plain Python list.
df["listing_city"].unique().tolist()

In [None]:
# Price histogram restricted to listings priced below 1000
# (presumably so that extreme prices do not stretch the x axis — confirm).
listings_under_1000 = df.loc[df["price"] < 1000]
fig = px.histogram(listings_under_1000, x="price")
fig.show()

In [None]:
# Bucket every listing into a price tier stored in `price_category`.
# Start with the open-ended top tier, then overwrite with successively lower
# tiers: each later assignment wins for the rows it matches, so the order of
# the table (highest upper bound first) is what makes the tiers disjoint.
# Rows with a missing price match no condition and are left untouched.
df.loc[df["price"] >= 400, "price_category"] = "high_high_price"
for upper_bound, tier_label in (
    (400, "normal_high_price"),
    (160, "normal_medium_price"),
    (100, "normal_low_price"),
    (60, "low_low_price"),
):
    df.loc[df["price"] < upper_bound, "price_category"] = tier_label



In [None]:
# Number of listings in each `price_category` tier.
fig = px.histogram(df, x = "price_category")
fig.show()

In [None]:
# Distinct values taken by `host_response_rate`.
df["host_response_rate"].unique()

In [None]:
# Scatter of host response rate (y) against price (x), one point per row of `df`.
fig = px.scatter(df, x = "price", y = "host_response_rate")
fig.show()

In [None]:
# Table of column dtypes (one row per column of `df`, a single "type" column),
# restricted to the columns stored as float64 or int64.
# At this point `df_only_num` holds dtype metadata, not listing data.
column_types = pd.DataFrame(df.dtypes, columns=["type"])
is_float64 = column_types["type"] == "float64"
is_int64 = column_types["type"] == "int64"
df_only_num = column_types.loc[is_float64 | is_int64]

In [None]:
# Keep only the float64 / int64 columns of `df`.
# The column list is derived from `df.dtypes` directly rather than from the
# previous value of `df_only_num`: reading and rebinding the same name made
# the cell fail on a second run, because by then `df_only_num.index` held row
# labels instead of column names.
numeric_columns = [
    column
    for column, dtype in df.dtypes.items()
    if dtype == "float64" or dtype == "int64"
]
df_only_num = df[numeric_columns]

In [None]:
# Rows whose price exceeds 5000 (all columns), for manual inspection.
df.loc[df["price"]>5000, :]

In [None]:
# Rows that fall in any of the "normal_*" price tiers.
# The tiers assigned to `price_category` in this notebook are
# "normal_high_price", "normal_medium_price" and "normal_low_price" (plus the
# "high_high_price" / "low_low_price" extremes). The literal "normal_price" is
# never assigned, so an equality test against it always returned an empty frame.
# NOTE(review): assumes the intent was "all normal tiers" — confirm.
df.loc[df["price_category"].str.startswith("normal_", na=False), :]

In [None]:
# Empirical CDF of `x_coord`.
# Author's reading: the coordinate distributions are multimodal.
# Nonetheless, StandardScaler is used to scale the data (the scaling itself is
# not performed in this cell).
fig = px.ecdf(df, x = "x_coord")
fig.show()

In [None]:
# Preview the first rows of `df` in its current state.
df.head()

In [None]:
# Histogram of the `x_coord` column.
fig = px.histogram(df, x = "x_coord")
fig.show()

In [None]:

# Histogram of the `y_coord` column.
fig = px.histogram(df, x = "y_coord")
fig.show()


In [None]:
# Histogram of the `z_coord` column.
fig = px.histogram(df, x = "z_coord")
fig.show()

### Normalize numerical data distribution with `power_transform`

In [None]:
# Histogram of review counts, restricted to listings with fewer than 100
# reviews (presumably to keep the heavy right tail off the plot — confirm).
listings_under_100_reviews = df.loc[df["number_of_reviews"] < 100]
fig = px.histogram(listings_under_100_reviews, x="number_of_reviews")
fig.show()

In [None]:
# Histogram of the `accommodates` column.
fig = px.histogram(df, x = "accommodates")
fig.show()


In [None]:
# Column names whose skewness is examined in this section.
# NOTE(review): the variable name suggests these columns are expected to hold
# only positive numeric values — confirm before applying any transform that
# requires strictly positive input.
numerical_positive = [
    "host_listings_count",
    "host_location",
    "accommodates",
    "number_of_reviews",
    "reviews_per_month",
    "amenities_benefits"
]

In [None]:
# Skewness of each column in `numerical_positive`, largest (most right-skewed) first.
df[numerical_positive].skew().sort_values(ascending=False)

In [47]:
# Full list of the column names currently present in `df`.
df.columns.tolist()

['host_id',
 'host_response_rate',
 'host_acceptance_rate',
 'price',
 'host_since',
 'first_review',
 'last_review',
 'description',
 'host_location',
 'host_response_time',
 'host_is_superhost',
 'host_listings_count',
 'host_total_listings_count',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bathrooms_text',
 'bedrooms',
 'beds',
 'amenities',
 'minimum_nights',
 'maximum_nights',
 'number_of_reviews',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'reviews_per_month',
 'df_city_location',
 'listing_city',
 'listing_city_pop',
 'email_verification',
 'phone_verification',
 'work_email_verification',
 'amenities_internet',
 'amenities_self-checkin',
 'amenities_host-greeting',
 'amenities_pool',
 'amenities_oven',
 'a

## Cross-city comparison

In [70]:
# Median listing price per `df_city_location`.
# The column is selected *before* aggregating: `GroupBy.median()` has no
# column argument (its first parameter is `numeric_only`), so
# `.median("price")` aggregated every numeric column and only afterwards kept
# one of them.
city_confr_price = (
    df.groupby("df_city_location")["price"]
    .median()
    .reset_index()
)

# `px.histogram` sums `y` per `x` value; with exactly one row per city the
# bar height is that city's median.
fig = px.histogram(city_confr_price,
                   x="df_city_location",
                   y="price",
                   title="Median price per city",
                   )
fig.update_xaxes(categoryorder="total descending")  # tallest bar first
fig.show()

In [71]:
# Median `review_scores_rating` per `df_city_location`.
# The column is selected *before* aggregating: `GroupBy.median()` has no
# column argument (its first parameter is `numeric_only`), which is why the
# misspelt name previously passed to it ("review_score_rating") went unnoticed
# while every numeric column was being aggregated.
city_confr_review_scores_rating = (
    df.groupby("df_city_location")["review_scores_rating"]
    .median()
    .reset_index()
)

# `px.histogram` sums `y` per `x` value; with exactly one row per city the
# bar height is that city's median.
fig = px.histogram(city_confr_review_scores_rating,
                   x="df_city_location",
                   y="review_scores_rating",
                   title="Median review score (rating) per city",
                   )
fig.update_xaxes(categoryorder="total descending")  # tallest bar first
fig.show()


In [74]:
# Median `number_of_reviews` per `df_city_location`.
# The column is selected *before* aggregating: `GroupBy.median()` has no
# column argument (its first parameter is `numeric_only`), so
# `.median("number_of_reviews")` aggregated every numeric column and only
# afterwards kept one of them.
city_confr_num_reviews = (
    df.groupby("df_city_location")["number_of_reviews"]
    .median()
    .reset_index()
)

# `px.histogram` sums `y` per `x` value; with exactly one row per city the
# bar height is that city's median.
fig = px.histogram(city_confr_num_reviews,
                   x="df_city_location",
                   y="number_of_reviews",
                   title="Median number of reviews per city",
                   )
fig.update_xaxes(categoryorder="total descending")  # tallest bar first
fig.show()

In [78]:
# Median `accommodates` per `df_city_location`.
# The column is selected *before* aggregating: `GroupBy.median()` has no
# column argument (its first parameter is `numeric_only`), so
# `.median("accommodates")` aggregated every numeric column and only
# afterwards kept one of them.
city_confr_accommodates = (
    df.groupby("df_city_location")["accommodates"]
    .median()
    .reset_index()
)

# `px.histogram` sums `y` per `x` value; with exactly one row per city the
# bar height is that city's median.
fig = px.histogram(city_confr_accommodates,
                   x="df_city_location",
                   y="accommodates",
                   title="Median guest capacity (accommodates) per city",
                   )
fig.update_xaxes(categoryorder="total descending")  # tallest bar first
fig.show()


In [79]:
# Median `listing_city_pop` per `df_city_location`.
# The column is selected *before* aggregating: `GroupBy.median()` has no
# column argument (its first parameter is `numeric_only`), so
# `.median("listing_city_pop")` aggregated every numeric column and only
# afterwards kept one of them.
# NOTE(review): the population is presumably constant within a city, in which
# case the median simply collapses it to one value per city — verify.
city_confr_population = (
    df.groupby("df_city_location")["listing_city_pop"]
    .median()
    .reset_index()
)

# `px.histogram` sums `y` per `x` value; with exactly one row per city the
# bar height is that city's value.
fig = px.histogram(city_confr_population,
                   x="df_city_location",
                   y="listing_city_pop",
                   title="Listing-city population per city",
                   )
fig.update_xaxes(categoryorder="total descending")  # tallest bar first
fig.show()


## Neighbourhoods

In [91]:
# Scatter of price (y) against listing-city population (x), one point per row
# of `df`, with both axes on a logarithmic scale.
# NOTE(review): rows with a non-positive price or population cannot be placed
# on a log axis — check whether any exist.
fig = px.scatter(df,
                 x = "listing_city_pop",
                 y = "price",
                 log_x=True,
                 log_y=True
                 )
fig.show()