In [None]:
# Data Preprocessing

## Data Cleaning

LIST
* Drop columns that are not relevant to the problem. Example: URL, host picture etc.
* Find missing values for each column.
* Convert columns to their correct data type.
* One-hot-encode the categorical variables.

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import data
df = pd.read_csv('listings.csv') 
amenities_columns = pd.read_csv('amenities_columns.csv')

In [3]:
# -*- coding: utf-8 -*-
"""
Created on Sat Apr  3 18:02:27 2021

@author: leontaridiss
"""

df = df[['host_response_time','host_response_rate','host_acceptance_rate', 'host_is_superhost', 'host_identity_verified' ,'neighbourhood_cleansed','room_type', 'accommodates', 'bathrooms_text', 'bedrooms','beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'has_availability', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'instant_bookable', 'reviews_per_month' ]]

# Get list of categorical variables
s = (df.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

df.host_response_rate = df.host_response_rate.replace({'\%':''}, regex = True).astype(float)
df.host_acceptance_rate = df.host_acceptance_rate.replace({'\%':''}, regex = True).astype(float)

df.price = df.price.replace({'\$':''}, regex = True)
df.price = df.price.replace({'\,':''}, regex = True).astype(float)

# df['number_of_baths'] = df.bathrooms_text.str.replace(r'[^0-9]+', '')
# df['text_of_baths'] = df.bathrooms_text.str.extract('(\D+)', expand=False)

# df['number_of_baths'] = df.bathrooms_text.str.replace('^[^\d]*', '')

df['number_of_baths'] = df.bathrooms_text.str.replace(r"[a-zA-Z]",'')
df['number_of_baths'] = df.number_of_baths.replace({'\-':np.nan}, regex = True)
df['number_of_baths'] = df['number_of_baths'].astype(float)

 
c = df.bathrooms_text.str.split(' ', expand = True)
df['shared_bath'] = c[1]

df.shared_bath = df.shared_bath == 'shared'
df = df.drop(columns = ['bathrooms_text', 'amenities'], axis = 1)
df = pd.concat([df, amenities_columns], axis=1)


Categorical variables:
['host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_identity_verified', 'neighbourhood_cleansed', 'room_type', 'bathrooms_text', 'amenities', 'price', 'has_availability', 'instant_bookable']
  df['number_of_baths'] = df.bathrooms_text.str.replace(r"[a-zA-Z]",'')


In [4]:
df.shape

(9467, 59)

In [None]:
df.isna().sum()

In [6]:
df.drop( ['Unnamed: 0'],axis=1,inplace=True)

 # Find missing values for each column #

In [None]:
df.nunique()  # unique values in new dataframe

In [None]:
df.isna().sum()

In [None]:
df.isna().sum()/len(df)*100 #percentage of missing values for each col

## FIX MISSING VALUES AND FILL THE T/F COLS
LIST 
* instant_bookable
* has_availability
* host_identity_verified
* host_is_superhost

In [7]:
df=df.dropna(subset=['instant_bookable','has_availability','host_identity_verified','host_is_superhost','beds'],how='any')

In [8]:
ColList=['instant_bookable','has_availability','host_identity_verified','host_is_superhost']
df['instant_bookable']=df['instant_bookable'].replace({'t':1,'f':0})
df['has_availability']=df['has_availability'].replace({'t':1,'f':0})
df['host_identity_verified']=df['host_identity_verified'].replace({'t':1,'f':0})
df['host_is_superhost']=df['host_is_superhost'].replace({'t':1,'f':0})
df['shared_bath']=df['shared_bath'].replace({True:1,False:0})
       

## FIX THE HOST COLS
### Fill the NaN with strategy  


In [9]:
fill_strategy={
    'host_response_time': df['host_response_time'].mode()[0],
    'host_response_rate': df['host_response_rate'].mode()[0],
    'host_acceptance_rate': df['host_acceptance_rate'].mode()[0]
}

df=df.fillna(fill_strategy)

## FIX THE BEDROOMS NANS equals zero

In [10]:
df['bedrooms']=df['bedrooms'].fillna(0)

# FIX REVIEWS WITH MEAN 

In [11]:
df['review_scores_rating']=df['review_scores_rating'].fillna(df['review_scores_rating'].median())
df['review_scores_accuracy']=df['review_scores_accuracy'].fillna(df['review_scores_accuracy'].median())
df['review_scores_cleanliness']=df['review_scores_cleanliness'].fillna(df['review_scores_cleanliness'].median())
df['review_scores_checkin']=df['review_scores_checkin'].fillna(df['review_scores_checkin'].median())
df['review_scores_communication']=df['review_scores_communication'].fillna(df['review_scores_communication'].median())
df['review_scores_location']=df['review_scores_location'].fillna(df['review_scores_location'].median())
df['review_scores_value']=df['review_scores_value'].fillna(df['review_scores_value'].median())
df['reviews_per_month']=df['reviews_per_month'].fillna(df['reviews_per_month'].median())
df['number_of_baths']=df['number_of_baths'].fillna(df['number_of_baths'].median())

# ENCODING COLS

In [23]:
s = (df.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
[]


In [13]:
map_strategy={
    'within an hour': 0,
    'within a few hours': 1,
    'within a day': 2,
    'a few days or more': 3
}

df['host_response_time']=df['host_response_time'].map(map_strategy)

In [14]:
df_hot=pd.get_dummies(df['room_type'])
df = pd.concat([df, df_hot], axis=1)

In [15]:
df['Private room'].astype(float)
df['Entire home/apt'].astype(float)
df['Hotel room'].astype(float)
df['Shared room'].astype(float)

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
9462    0.0
9463    0.0
9464    0.0
9465    0.0
9466    0.0
Name: Shared room, Length: 9409, dtype: float64

In [16]:
df.drop( ['room_type'],axis=1,inplace=True)

In [17]:
df_hot_two=pd.get_dummies(df['neighbourhood_cleansed'])
df_hot_two.astype(float)
df = pd.concat([df, df_hot_two], axis=1)

In [22]:
df.drop( ['neighbourhood_cleansed'],axis=1,inplace=True)

In [26]:
df.shape

(9409, 104)

In [25]:
df.columns

Index(['host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_identity_verified', 'accommodates',
       'bedrooms', 'beds', 'price', 'minimum_nights',
       ...
       'ΠΕΤΡΑΛΩΝΑ', 'ΠΛΑΤΕΙΑ ΑΜΕΡΙΚΗΣ', 'ΠΛΑΤΕΙΑ ΑΤΤΙΚΗΣ', 'ΠΟΛΥΓΩΝΟ',
       'ΠΡΟΜΠΟΝΑ', 'ΡΗΓΙΛΛΗΣ', 'ΡΙΖΟΥΠΟΛΗ', 'ΣΕΠΟΛΙΑ', 'ΣΤΑΔΙΟ',
       'ΣΤΑΘΜΟΣ ΛΑΡΙΣΗΣ'],
      dtype='object', length=104)

#  Transform to int 

In [19]:
df.dtypes

host_response_time          int64
host_response_rate        float64
host_acceptance_rate      float64
host_is_superhost           int64
host_identity_verified      int64
                           ...   
ΡΗΓΙΛΛΗΣ                    uint8
ΡΙΖΟΥΠΟΛΗ                   uint8
ΣΕΠΟΛΙΑ                     uint8
ΣΤΑΔΙΟ                      uint8
ΣΤΑΘΜΟΣ ΛΑΡΙΣΗΣ             uint8
Length: 105, dtype: object

In [27]:
df.to_csv('ListingsClean.csv')





