In [None]:
import numpy as np
import pandas as pd

In [None]:
# Input files: the raw listings export and the pre-built amenities table
LISTINGS_CSV = 'listings.csv'
AMENITIES_CSV = 'amenities_columns.csv'

df = pd.read_csv(LISTINGS_CSV)
amenities_columns = pd.read_csv(AMENITIES_CSV)

In [None]:
# Keep only the useful columns (the order below is the resulting column order)
useful_columns = [
    # host attributes
    'host_response_time', 'host_response_rate', 'host_acceptance_rate',
    'host_is_superhost', 'host_identity_verified',
    # listing attributes
    'neighbourhood_cleansed', 'room_type', 'accommodates', 'bathrooms_text',
    'bedrooms', 'beds', 'amenities', 'price',
    # booking constraints and availability
    'minimum_nights', 'maximum_nights', 'has_availability',
    # review statistics
    'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
    # remaining fields
    'instant_bookable', 'reviews_per_month', 'latitude', 'longitude',
]
df = df[useful_columns]

In [None]:
# Strip the formatting symbols so the rate and price columns can be parsed as
# floats. The patterns are raw strings: '\%', '\$' and '\,' are invalid escape
# sequences in ordinary string literals and raise a SyntaxWarning on
# Python 3.12+.
for rate_column in ('host_response_rate', 'host_acceptance_rate'):
    df[rate_column] = df[rate_column].replace({r'%': ''}, regex=True).astype(float)

# One character class removes both the currency sign and the thousands
# separator; '$' is escaped because it is a regex metacharacter.
df['price'] = df['price'].replace({r'[\$,]': ''}, regex=True).astype(float)

In [None]:
# Fix the bathrooms_text column: derive a numeric bath count and a shared flag,
# then drop the free-text columns that are no longer needed.
bath_text = df['bathrooms_text']

# regex=True is passed explicitly: pandas >= 2.0 treats the str.replace pattern
# as a literal string by default, which would leave the letters in place and
# make the float conversion below fail.
number_of_baths = bath_text.str.replace(r'[a-zA-Z]', '', regex=True)
# Any entry that still contains a hyphen after the letters are removed has no
# usable number, so the whole cell becomes missing.
number_of_baths = number_of_baths.replace({r'-': np.nan}, regex=True)
df['number_of_baths'] = number_of_baths.astype(float)

# Flag a shared bath when the word "shared" appears anywhere in the text.
# Testing only the second word misses descriptions that start with it
# (presumably values such as "Shared half-bath" - TODO confirm against the data).
# Missing descriptions count as not shared.
df['shared_bath'] = bath_text.str.contains('shared', case=False, na=False)

df = df.drop(columns=['bathrooms_text', 'amenities'])

In [None]:
# Append the amenities columns to the raw dataset, side by side
# (pd.concat with a column axis aligns the two frames on their index)
frames_to_join = [df, amenities_columns]
df = pd.concat(frames_to_join, axis='columns')

In [None]:
# Calculate the geodesic distance of every listing from the Acropolis, in metres
from geopy.distance import geodesic

acropolis = (37.97171, 23.72603)  # (latitude, longitude)

# A bare `df.longitude.dropna` only looks the method up without calling it, so
# it filters nothing. Missing coordinates are handled explicitly instead: such
# a listing gets a NaN distance rather than being passed to geopy
# (NOTE(review): geopy is expected to reject non-finite coordinates - confirm).
# Zipping the two columns avoids building a full row object per listing.
distances = [
    geodesic(acropolis, (latitude, longitude)).kilometers * 1000
    if pd.notna(latitude) and pd.notna(longitude)
    else np.nan
    for latitude, longitude in zip(df['latitude'], df['longitude'])
]
df['distance'] = distances

In [None]:
# Drop highly correlated columns, together with the 'Unnamed: 0' column
# (presumably a saved index picked up from one of the CSV files - verify)
redundant_columns = [
    'Unnamed: 0',
    'review_scores_accuracy',
    'reviews_per_month',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'balcony',
    'bedrooms',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]
df = df.drop(columns=redundant_columns)

In [None]:
# Discard every row that is missing a value in at least one of these columns
required_columns = [
    'instant_bookable',
    'has_availability',
    'host_identity_verified',
    'host_is_superhost',
    'beds',
]
df = df.dropna(how='any', subset=required_columns)

In [None]:
# Apply filters: remove listings whose value exceeds the upper limit of any
# column below. A row with a missing value never exceeds a limit, so it is kept.
upper_limits = {
    'price': 150,
    'beds': 8,
    'minimum_nights': 200,
    'number_of_reviews': 600,
    'maximum_nights': 1200,
    'number_of_baths': 5,
}
outlier = pd.Series(False, index=df.index)
for column, limit in upper_limits.items():
    outlier |= df[column] > limit

# Also remove listings with at most one bed that are priced above 90
outlier |= (df['beds'] <= 1) & (df['price'] > 90)

df = df.loc[~outlier].copy()

In [None]:
# Label-encode host_response_time: each category is replaced by its position
# in the list below (0 for the first entry, 3 for the last). Values that are
# not listed, including missing ones, become NaN.
response_time_order = [
    'within an hour',
    'within a few hours',
    'within a day',
    'a few days or more',
]
map_strategy = {label: code for code, label in enumerate(response_time_order)}

df['host_response_time'] = df['host_response_time'].map(map_strategy)

In [None]:
# Turn the 't' / 'f' text flags into 1 / 0
flag_codes = {'t': 1, 'f': 0}
text_flag_columns = (
    'instant_bookable',
    'has_availability',
    'host_identity_verified',
    'host_is_superhost',
)
for flag_column in text_flag_columns:
    df[flag_column] = df[flag_column].replace(flag_codes)

# shared_bath holds real booleans rather than text, so it has its own mapping
df['shared_bath'] = df['shared_bath'].replace({True: 1, False: 0})

In [None]:
# One-hot encode the categorical columns: the indicator columns are appended
# at the end of the frame and the original text columns are removed.
categorical_columns = ['room_type', 'neighbourhood_cleansed']
df_hot = pd.get_dummies(df[categorical_columns])
df = pd.concat([df.drop(columns=categorical_columns), df_hot], axis='columns')

In [None]:
# Write the prepared dataset to disk for the modelling step.
# index=True is the pandas default: the row index is stored as the first,
# unnamed column of the file.
output_path = 'knn_imputed.csv'
df.to_csv(output_path, index=True)