In [100]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, RobustScaler

In [101]:
df_singa_airbnb = pd.read_csv('listings.csv')
df_singa_airbnb.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,49091,COZICOMFORT LONG TERM STAY ROOM 2,266763,Francesca,North Region,Woodlands,1.44255,103.7958,Private room,83,180,1,2013-10-21,0.01,2,365
1,50646,Pleasant Room along Bukit Timah,227796,Sujatha,Central Region,Bukit Timah,1.33235,103.78521,Private room,81,90,18,2014-12-26,0.28,1,365
2,56334,COZICOMFORT,266763,Francesca,North Region,Woodlands,1.44246,103.79667,Private room,69,6,20,2015-10-01,0.2,2,365
3,71609,Ensuite Room (Room 1 & 2) near EXPO,367042,Belinda,East Region,Tampines,1.34541,103.95712,Private room,206,1,14,2019-08-11,0.15,9,353
4,71896,B&B Room 1 near Airport & EXPO,367042,Belinda,East Region,Tampines,1.34567,103.95963,Private room,94,1,22,2019-07-28,0.22,9,355


At a glance of the features given, the processs to prepare them ML models are summarised below

### Prediction column
- price

### Feature columns
**Columns to be removed**
- host_name
- name
- last_review
- id
- host_id (since calculated_host_listings_count is available, host_id does not seem to add much value)

**Categorical encoding**

*Categorical values*
(This is applicable when there is a ranking in the categorical number)
- room_type


*Binary encoding*
(This is applicable when there is no ranking)
- neighbourhood
- neighbourhood_group


**Scaling**

*MinMaxScaler (for those features that do not seem to have outliers and more evenly distributed)*
- calculated_host_listings_count
- availability_365
- latitude
- longitude

*RobustScaler (For those features that seem to be skewed and have outliers)*
- minimum_nights
- number_of_reviews
- reviews_per_month


**Feature engineering**
- last_review (into year, month, day and dayofweek)
- latitude and longitude (into cluster number with K-mean clustering. Lat and lon have very small numerical changes and it makes mores sense to group them into cluster based on their lat lon positions)

### 1. Check if there are any missing values and perform imputation for those missing values

In [131]:
df = df_singa_airbnb.copy()

In [132]:
# dict_mat_null = {}
# dict_df_col_null = {}

# for col in df.columns:
#     dict_mat_null[col] = pd.isnull(df[col])
#     if not df[dict_mat_null[col]].empty:
#         dict_df_col_null[col] = df[dict_mat_null[col]]

# dict_df_col_null

In [133]:
null_value_stats = df.isnull().sum()
null_value_stats[null_value_stats != 0]

name                    2
last_review          2758
reviews_per_month    2758
dtype: int64

Based on the null dfs, there are 2 rows with missing **name**, 2758 rows with missing **last_review** and also 2758 with missing **reviews_per_month**.

Can also show that the indexes for rows with missing last_review and reviews_per_month are all the same (See below).

In [105]:
missing_last_review_index = dict_df_col_null['last_review'].index
missing_reviews_per_month_index = dict_df_col_null['reviews_per_month'].index

In [106]:
check_diff_index = [x for x in missing_last_review_index if x not in missing_reviews_per_month_index] + \
[x for x in missing_reviews_per_month_index if x not in  missing_last_review_index]

In [107]:
check_diff_index

[]

#### Imputation
Because name and last_review are going to be removed, imputation for them is not necessary

For rows with Null reviews_per_month, they will be imputed with 0 assuming that no value means no comments ever made

In [134]:
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

### Removing unnecessary features

### 2. Categorical encoding

In [112]:
np.array(df['neighbourhood_group']).reshape(-1,1)

array([['North Region'],
       ['Central Region'],
       ['North Region'],
       ...,
       ['Central Region'],
       ['Central Region'],
       ['Central Region']], dtype=object)

In [113]:
# df['availability_365'].unique()

In [114]:
#neighbourhood_group
#neighbourhood
#room_type

# print(df['neighbourhood_group'].unique())
# print(df['neighbourhood'].unique())
# print(df['room_type'].unique())

enc = OrdinalEncoder()
col_encode = ['neighbourhood_group', 'neighbourhood', 'room_type']

for col in col_encode:
    df[col] = enc.fit_transform(np.array(df_singa_airbnb[col]).reshape(-1,1))


In [115]:
df.head(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,last_review_date
0,49091,COZICOMFORT LONG TERM STAY ROOM 2,266763,Francesca,2.0,41.0,1.44255,103.7958,1.0,83,180,1,2013-10-21,0.01,2,365,2013-10-21
1,50646,Pleasant Room along Bukit Timah,227796,Sujatha,0.0,6.0,1.33235,103.78521,1.0,81,90,18,2014-12-26,0.28,1,365,2014-12-26
2,56334,COZICOMFORT,266763,Francesca,2.0,41.0,1.44246,103.79667,1.0,69,6,20,2015-10-01,0.2,2,365,2015-10-01
3,71609,Ensuite Room (Room 1 & 2) near EXPO,367042,Belinda,1.0,36.0,1.34541,103.95712,1.0,206,1,14,2019-08-11,0.15,9,353,2019-08-11
4,71896,B&B Room 1 near Airport & EXPO,367042,Belinda,1.0,36.0,1.34567,103.95963,1.0,94,1,22,2019-07-28,0.22,9,355,2019-07-28


### 3. Scaling

**MinMaxScaler**

In [116]:
col_mm_scale = ['calculated_host_listings_count', 'availability_365', 'latitude', 'longitude']

mm_scaler = MinMaxScaler()

for col in col_mm_scale:
    df[col] = mm_scaler.fit_transform(np.array(df_singa_airbnb[col]).reshape(-1,1))

In [117]:
col_ro_scale = ['minimum_nights', 'number_of_reviews', 'reviews_per_month']

ro_scaler = RobustScaler()

for col in col_ro_scale:
    df[col] = ro_scaler.fit_transform(np.array(df_singa_airbnb[col]).reshape(-1,1))

In [118]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,last_review_date
0,49091,COZICOMFORT LONG TERM STAY ROOM 2,266763,Francesca,2.0,41.0,0.942863,0.456587,1.0,83,19.666667,-0.1,2013-10-21,-0.453782,0.003663,1.0,2013-10-21
1,50646,Pleasant Room along Bukit Timah,227796,Sujatha,0.0,6.0,0.419894,0.424188,1.0,81,9.666667,1.6,2014-12-26,-0.226891,0.0,1.0,2014-12-26
2,56334,COZICOMFORT,266763,Francesca,2.0,41.0,0.942435,0.459249,1.0,69,0.333333,1.8,2015-10-01,-0.294118,0.003663,1.0,2015-10-01
3,71609,Ensuite Room (Room 1 & 2) near EXPO,367042,Belinda,1.0,36.0,0.481872,0.950132,1.0,206,-0.222222,1.2,2019-08-11,-0.336134,0.029304,0.967123,2019-08-11
4,71896,B&B Room 1 near Airport & EXPO,367042,Belinda,1.0,36.0,0.483106,0.957811,1.0,94,-0.222222,2.0,2019-07-28,-0.277311,0.029304,0.972603,2019-07-28


### 4. Feature engineering

In [119]:
# https://towardsdatascience.com/machine-learning-with-datetime-feature-engineering-predicting-healthcare-appointment-no-shows-5e4ca3a85f96

df['last_review_year'] = df['last_review_date'].dt.year
df['last_review_month'] = df['last_review_date'].dt.month
df['last_review_week'] = df['last_review_date'].dt.week
df['last_review_day'] = df['last_review_date'].dt.day
df['last_review_dayofweek'] = df['last_review_date'].dt.dayofweek

  df['last_review_week'] = df['last_review_date'].dt.week


In [120]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,last_review,reviews_per_month,calculated_host_listings_count,availability_365,last_review_date,last_review_year,last_review_month,last_review_week,last_review_day,last_review_dayofweek
0,49091,COZICOMFORT LONG TERM STAY ROOM 2,266763,Francesca,2.0,41.0,0.942863,0.456587,1.0,83,...,2013-10-21,-0.453782,0.003663,1.0,2013-10-21,2013,10,43,21,0
1,50646,Pleasant Room along Bukit Timah,227796,Sujatha,0.0,6.0,0.419894,0.424188,1.0,81,...,2014-12-26,-0.226891,0.0,1.0,2014-12-26,2014,12,52,26,4
2,56334,COZICOMFORT,266763,Francesca,2.0,41.0,0.942435,0.459249,1.0,69,...,2015-10-01,-0.294118,0.003663,1.0,2015-10-01,2015,10,40,1,3
3,71609,Ensuite Room (Room 1 & 2) near EXPO,367042,Belinda,1.0,36.0,0.481872,0.950132,1.0,206,...,2019-08-11,-0.336134,0.029304,0.967123,2019-08-11,2019,8,32,11,6
4,71896,B&B Room 1 near Airport & EXPO,367042,Belinda,1.0,36.0,0.483106,0.957811,1.0,94,...,2019-07-28,-0.277311,0.029304,0.972603,2019-07-28,2019,7,30,28,6


#### 5. Final wrap up and saving processed data for ML training

In [127]:
df.to_pickle('./processed_data.p')

In [126]:
df

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,last_review,reviews_per_month,calculated_host_listings_count,availability_365,last_review_date,last_review_year,last_review_month,last_review_week,last_review_day,last_review_dayofweek
0,49091,COZICOMFORT LONG TERM STAY ROOM 2,266763,Francesca,2.0,41.0,0.942863,0.456587,1.0,83,...,2013-10-21,-0.453782,0.003663,1.000000,2013-10-21,2013,10,43,21,0
1,50646,Pleasant Room along Bukit Timah,227796,Sujatha,0.0,6.0,0.419894,0.424188,1.0,81,...,2014-12-26,-0.226891,0.000000,1.000000,2014-12-26,2014,12,52,26,4
2,56334,COZICOMFORT,266763,Francesca,2.0,41.0,0.942435,0.459249,1.0,69,...,2015-10-01,-0.294118,0.003663,1.000000,2015-10-01,2015,10,40,1,3
3,71609,Ensuite Room (Room 1 & 2) near EXPO,367042,Belinda,1.0,36.0,0.481872,0.950132,1.0,206,...,2019-08-11,-0.336134,0.029304,0.967123,2019-08-11,2019,8,32,11,6
4,71896,B&B Room 1 near Airport & EXPO,367042,Belinda,1.0,36.0,0.483106,0.957811,1.0,94,...,2019-07-28,-0.277311,0.029304,0.972603,2019-07-28,2019,7,30,28,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7902,38105126,Loft 2 pax near Haw Par / Pasir Panjang. Free ...,278109833,Belle,0.0,27.0,0.170178,0.431224,0.0,100,...,,0.550000,0.109890,0.167123,2019-06-27,2019,6,26,27,3
7903,38108273,3bedroom luxury at Orchard,238891646,Neha,0.0,37.0,0.231682,0.549685,0.0,550,...,,0.550000,0.120879,1.000000,2019-06-27,2019,6,26,27,3
7904,38109336,[ Farrer Park ] New City Fringe CBD Mins to MRT,281448565,Mindy,0.0,15.0,0.327401,0.652879,1.0,58,...,,0.550000,0.007326,0.473973,2019-06-27,2019,6,26,27,3
7905,38110493,Cheap Master Room in Central of Singapore,243835202,Huang,0.0,28.0,0.244685,0.585725,1.0,56,...,,0.550000,0.003663,0.082192,2019-06-27,2019,6,26,27,3
