## Data preprocessing Pt.1 - Listings in Amsterdam

In [None]:
#import libraries

import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import seaborn as sns
from scipy import stats


In [None]:
#kneed is used later to find the optimal k for k-means; if it is not available, install it first with the command below
#!conda install -c conda-forge kneed

In [None]:
# Read the raw listings export into a DataFrame.
df = pd.read_csv('listings.csv', low_memory=False)

# Remove pandas' display limits so wide/long frames are shown in full in the notebook.
for display_option in ('display.max_rows',
                       'display.max_columns',
                       'display.width',
                       'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# Show the (rows, columns) dimensions of the raw listings table.
df.shape

In [None]:
# Preview the first rows of the raw table.
df.head()

In [None]:
# Keep only the columns we consider most relevant for the analysis.
selected_columns = [
    'id',
    'host_id',
    'host_response_rate',
    'host_acceptance_rate',
    'host_total_listings_count',
    'host_has_profile_pic',
    'host_identity_verified',
    'street',
    'neighbourhood',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'bed_type',
    'price',
    'minimum_nights',
    'security_deposit',
    'cleaning_fee',
    'guests_included',
    'extra_people',
    'calendar_updated',
    'has_availability',
    'number_of_reviews',
    'review_scores_rating',
    'instant_bookable',
    'is_business_travel_ready',
    'cancellation_policy',
    'require_guest_profile_picture',
    'require_guest_phone_verification',
    'calculated_host_listings_count',
    'reviews_per_month',
]

# Take an explicit copy: the following cells assign into X column by column,
# which on a plain slice of df raises SettingWithCopyWarning.
X = df[selected_columns].copy()

In [None]:
# Strip currency / percentage formatting from the monetary and rate columns so
# they can be cast to numbers later on.
pricecolumns=['price','security_deposit','cleaning_fee','host_response_rate','host_acceptance_rate','extra_people']

# Regex patterns removed from every value, applied in this order.
# Raw strings are used because '\$' and '\%' in ordinary string literals are
# invalid escape sequences.
strip_patterns = [r'\$', r'%', r'" "', r',']
for c in pricecolumns:
    for pattern in strip_patterns:
        X[c] = X[c].replace({pattern: ''}, regex=True)

# Binary-encode the 't'/'f' flag columns as 1/0.
binarycolumns =  ['host_identity_verified', 'host_has_profile_pic','instant_bookable','require_guest_profile_picture','has_availability',
                  'require_guest_phone_verification','is_business_travel_ready']
for c in binarycolumns:
    # Exact-value replacement: only cells equal to 't' / 'f' are converted.
    # (A regex replacement would also hit any value merely containing the letter.)
    X[c] = X[c].replace({'t': 1, 'f': 0})

In [None]:
# Preview the first five rows after cleaning.
X.head(5)

In [None]:
# Count missing values per column.
X.isnull().sum()

In [None]:
# Imputation policy:
#   * bedrooms / beds / bathrooms: a missing value is treated as a data gap and
#     replaced with the column median.
#   * cleaning fee, security deposit, review metrics, host listing count and host
#     flags: a missing value is not assumed to be negligence - it may simply mean
#     "none" (e.g. a new property without reviews, or no deposit / cleaning fee) -
#     so it is replaced with 0.
for col in ('bedrooms', 'beds', 'bathrooms'):
    X[col] = X[col].fillna(X[col].median())

for col in ('cleaning_fee',
            'security_deposit',
            'review_scores_rating',
            'reviews_per_month',
            'host_total_listings_count',
            'host_has_profile_pic',
            'host_identity_verified'):
    X[col] = X[col].fillna(0)

# Drop the columns that had a lot of NaN values.
# Note: review_scores_rating and reviews_per_month were zero-filled just above
# and are removed here anyway.
for col in ('review_scores_rating',
            'reviews_per_month',
            'neighbourhood',
            'host_response_rate',
            'host_acceptance_rate',
            'is_business_travel_ready'):
    del X[col]



In [None]:
# Re-count missing values per column after imputation and column removal.
X.isnull().sum()

In [None]:
# List the distinct values of each categorical feature.
for categorical_col in ('property_type', 'room_type', 'bed_type', 'cancellation_policy'):
    print(X[categorical_col].unique())


In [None]:
# Count listings per property type to see which types are most common in Amsterdam.
# Property types with very few listings are of little use for the analysis.
property_occurence=X['property_type'].value_counts()
property_occurence

In [None]:
# Focus on the most common property types (apartments) and exclude the rest.
# A property type is dropped when it has at most this many listings.
rare_threshold = 14000

value_counts = X['property_type'].value_counts()
remove = value_counts[value_counts <= rare_threshold].index

# .copy() makes the filtered frame independent, so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
X = X[~X.property_type.isin(remove)].copy()
X['property_type'].unique()

In [None]:
# Cast the cleaned monetary columns to float.
# price is the only one that still gets its missing values filled (with 0) here.
X["price"] = X["price"].fillna(0).astype(float)
for money_col in ("security_deposit", "cleaning_fee", "extra_people"):
    X[money_col] = X[money_col].astype(float)



In [None]:
# Print the maximum of the features that seem to have a wide range of values.
for wide_col in ('price', 'minimum_nights', 'security_deposit', 'accommodates', 'extra_people'):
    print(X[wide_col].max())

In [None]:
# One-hot encode the remaining categorical features; drop_first=True omits the
# first level of each feature. property_type is not encoded here.
categorical_columns = ['bed_type', 'room_type', 'cancellation_policy']
X2 = pd.get_dummies(X, columns=categorical_columns, drop_first=True)


In [None]:
#we remove outliers
def remove_outlier(df,col_name):
    """Return the rows of *df* whose *col_name* value lies within the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. Values exactly on
    a fence are kept, so a column with zero IQR (e.g. mostly one value) does not
    lose every row. Rows whose value is NaN are dropped.

    Args:
        df: DataFrame to filter; it is not modified.
        col_name: Name of the numeric column used to detect outliers.

    Returns:
        A new DataFrame with the outlier rows removed. The caller must use the
        return value - *df* itself is left unchanged.
    """
    q1 = df[col_name].quantile(0.25)
    q3 = df[col_name].quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    # Series.between is inclusive on both ends by default.
    return df.loc[df[col_name].between(low, high)]

In [None]:
# Run the outlier filter on each wide-ranging feature.
# NOTE(review): the result of each call is not assigned to anything, so X2 keeps
# all of its rows after this cell.
for outlier_col in ('minimum_nights', 'price', 'security_deposit',
                    'beds', 'extra_people', 'accommodates'):
    remove_outlier(X2, outlier_col)

In [None]:
# Keep every non-object column of X2 as input for the K-means model.
numeric=X2.select_dtypes(exclude=[object])

## Implementing Kmeans algorithm for property classification

In [None]:
# Fit K-means for k = 1..11 and record each model's inertia (within-cluster sum
# of squared distances) for the elbow search below.
# NOTE(review): `numeric` is not scaled at this point and, judging by the columns
# later deleted from scaled_final_df, still holds id/host_id/latitude/longitude -
# confirm this is the intended input.
K = range(1, 12)
Sum_of_squared_distances = []
for k in K:
    km = KMeans(n_clusters=k, n_init=12).fit(numeric)
    Sum_of_squared_distances.append(km.inertia_)

In [None]:
# Use kneed to find the best k: locate the knee of the inertia curve.
y = Sum_of_squared_distances
x = range(1, len(y) + 1)

from kneed import KneeLocator
kn = KneeLocator(x, y, curve='convex', direction='decreasing')
print(kn.knee)
plt.xlabel('number of clusters k')
plt.ylabel('Sum of squared distances')
plt.plot(x, y, 'bx-')
# kn.knee is None when no knee is detected; only draw the marker when there is one.
if kn.knee is not None:
    plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')

# Elbow method to determine the cluster number: inertia for k = 1..9.
SSE = []
for i in range(1, 10):
    # Fit once per k (the model was previously fitted twice on the same data).
    kmeans = KMeans(n_clusters=i).fit(numeric)
    SSE.append(kmeans.inertia_)
plt.plot(range(1, 10), SSE)
plt.title('Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('SSE') #within cluster sum of squares
plt.show()

In [None]:
# Scale the numeric features with RobustScaler and wrap the resulting array back
# into a DataFrame that carries the original column names.
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaled_final_df = pd.DataFrame(scaler.fit_transform(numeric), columns=numeric.columns)

scaled_final_df.head()

In [None]:
# Remove the identifier and coordinate columns: they are not used as
# classification parameters.
for unused_col in ('id', 'host_id', 'latitude', 'longitude'):
    del scaled_final_df[unused_col]


In [None]:
# Preview the first two rows of the scaled feature matrix.
scaled_final_df.head(2)

In [None]:
# Run K-means with k=3 on the scaled features.
num_clusters = 3
# random_state pins the centroid initialisation so the cluster labels are the
# same on every run (later cells refer to clusters by label). n_init matches the
# value used for the elbow search.
kmeans = KMeans(n_clusters=num_clusters, n_init=12, random_state=0)
y_kmeans = kmeans.fit_predict(scaled_final_df)
print(y_kmeans)

In [None]:
# Compare the number of cluster labels with the number of rows in X2 before
# attaching the labels.
print(y_kmeans.shape)
print(X2.shape)

In [None]:
# Store each listing's cluster label in a new 'Cluster' column.
X2['Cluster']=y_kmeans
X2.head()

In [None]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import json 

!pip install geopy
from geopy.geocoders import Nominatim 
!pip install geocoder

import requests 
from pandas.io.json import json_normalize 


import matplotlib.cm as cm
import matplotlib.colors as colors

!pip install folium
import folium

print('Libraries imported.')

In [None]:
# Use a geocoder to find the coordinates of Amsterdam; they are used to centre
# the maps that visualise the clusters.
address = 'Amsterdam, AMS'

geolocator = Nominatim(user_agent="TO_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Amsterdam are {}, {}.'.format(latitude, longitude))

In [None]:
# Map of Amsterdam with one small circle per listing, coloured by cluster label.
map_airbnb = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster, as hex strings indexed by label.
colors_array = cm.rainbow(np.linspace(0, 1, num_clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for lat, lon, cluster in zip(X2['latitude'], X2['longitude'], X2['Cluster']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=0.1,
        popup=label,
        # Index by the label itself (labels start at 0) rather than label-1,
        # which only worked for label 0 through negative-index wrap-around.
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=1).add_to(map_airbnb)

map_airbnb


In [None]:
# Split X2 into one frame per cluster label: clus1 holds label 0, clus2 label 1
# and clus3 label 2.
clus1, clus2, clus3 = (X2.loc[X2.Cluster == label] for label in (0, 1, 2))
cluster_list = [part.values for part in (clus1, clus2, clus3)]

In [None]:
# Summary statistics of the listings with cluster label 0.
clus1.describe()

In [None]:
# Summary statistics of the listings with cluster label 1.
clus2.describe()

In [None]:
# Summary statistics of the listings with cluster label 2.
clus3.describe()

##### Running K-means clustering a second time on the densest cluster (Cluster 1), as the cluster sizes are very disproportionate

In [None]:
# Build the feature matrix for re-clustering clus1 the same way as in the first
# pass: numeric columns only, without the identifier / coordinate columns, and
# scaled with RobustScaler. The 'Cluster' label is dropped as well because it is
# constant within clus1. Without this, the raw id / host_id values would
# dominate the distance computation.
from sklearn.preprocessing import RobustScaler

clus1_features = clus1.select_dtypes(exclude=[object]).drop(
    columns=['id', 'host_id', 'latitude', 'longitude', 'Cluster'])
clus1_numeric = pd.DataFrame(
    RobustScaler().fit_transform(clus1_features),
    columns=clus1_features.columns,
    index=clus1.index)

In [None]:
# Repeat the elbow search on the first cluster: fit K-means for k = 1..11 and
# record each model's inertia.
K = range(1, 12)
Sum_of_squared_distances = []
for k in K:
    km = KMeans(n_clusters=k, n_init=12).fit(clus1_numeric)
    Sum_of_squared_distances.append(km.inertia_)

In [None]:

# Locate the knee of the inertia curve for the second clustering pass.
y = Sum_of_squared_distances
x = range(1, len(y) + 1)

from kneed import KneeLocator
kn = KneeLocator(x, y, curve='convex', direction='decreasing')
print(kn.knee)
plt.xlabel('number of clusters k')
plt.ylabel('Sum of squared distances')
plt.plot(x, y, 'bx-')
# kn.knee is None when no knee is detected; only draw the marker when there is one.
if kn.knee is not None:
    plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')

In [None]:
# Second-pass K-means on the first cluster, again with three clusters.
clusters = 3
# random_state makes the labels reproducible between runs; n_init matches the
# value used for the elbow search.
kmeans = KMeans(n_clusters=clusters, n_init=12, random_state=0)
y_kmeans = kmeans.fit_predict(clus1_numeric)
print(y_kmeans)

In [None]:
# Attach the second-pass labels as a 'clustering' column. assign() builds a new
# frame, so no column is written into a filtered slice of X2 (which would raise
# SettingWithCopyWarning).
clus1 = clus1.assign(clustering=y_kmeans)

In [None]:
# Map of the first cluster's listings, coloured by their second-pass label.
map_airbnb2 = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per sub-cluster, as hex strings indexed by label.
colors_array = cm.rainbow(np.linspace(0, 1, clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for lat, lon, cluster in zip(clus1['latitude'], clus1['longitude'], clus1['clustering']):
    label = folium.Popup(' clustering ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=0.1,
        popup=label,
        # Index by the label itself (labels start at 0) rather than label-1,
        # which only worked for label 0 through negative-index wrap-around.
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=1).add_to(map_airbnb2)

map_airbnb2


#### *Continued at Data Preprocessing Pt.2 - Calendar_Bookings*