In [1]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load data
# The CSV path is relative, so it is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer='airbnb_dataset_ml_process.csv')
# Preview the first three rows.
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,expensive
0,0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365,non-expensive
1,1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,expensive
2,2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365,non-expensive


In [3]:
# Drop nulls
# Infinite values are converted to NaN first so that dropna() discards them too;
# every row with at least one missing field is then removed.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

In [4]:
# Check the price of expensive vs. non-expensive
# 'price' is selected before aggregating: averaging the whole frame would also
# try to average the text columns (name, host_name, room_type, ...), which
# raises TypeError on pandas >= 2.0 and is wasted work on older versions.
df.groupby('expensive')['price'].mean()

expensive
expensive        311.277369
non-expensive     92.569628
Name: price, dtype: float64

# Categorical Variables

In [5]:
# Method 1.1: One Hot Encoding (use dummy variables) - Single Binary Category
# get_dummies creates one indicator column per distinct value of 'expensive'.
dummies = pd.get_dummies(df['expensive'])
# Concat the dummies to the main df. The source column is dropped first because
# one of the dummy columns is itself named 'expensive'; keeping both would leave
# two columns with the same label in the result.
hot_encoded_single_df = pd.concat([df.drop(columns='expensive'), dummies], axis=1)
dummies.head()

Unnamed: 0,expensive,non-expensive
0,0,1
1,1,0
3,0,1
4,0,1
5,1,0


In [6]:
# Method 1.2: One Hot Encoding (use dummy variables) - Multiple Categories
# get_dummies creates one indicator column per distinct neighbourhood.
mult_dummies = pd.get_dummies(df['neighbourhood'])
# Concat the neighbourhood dummies built on the line above to the main df.
hot_encoded_mult_df = pd.concat([df, mult_dummies], axis=1)
mult_dummies.head()

# High cardinality: one column per distinct neighbourhood yields too many
# columns, which slows the ML training process

Unnamed: 0,Allerton,Arden Heights,Arrochar,Arverne,Astoria,Bath Beach,Battery Park City,Bay Ridge,Bay Terrace,"Bay Terrace, Staten Island",...,Westchester Square,Westerleigh,Whitestone,Williamsbridge,Williamsburg,Willowbrook,Windsor Terrace,Woodhaven,Woodlawn,Woodside
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Method 2: Ordinal Encoding
# Can be used when the categories have a natural order.
# Can cope with high cardinality.

# Each room type is encoded as its position in this tuple (0, 1, 2).
ord_encoding = {
    room_type: rank
    for rank, room_type in enumerate(('Entire home/apt', 'Private room', 'Shared room'))
}
# assign() returns a new frame carrying the extra column; df itself is not modified.
df_ord = df.assign(room_type_ord_enc=df['room_type'].map(ord_encoding))

In [8]:
# Method 3: Frequency Encoding
# Replace each category with the number of rows in which it occurs.
# Can cope with high cardinality.

# Series indexed by neighbourhood name, holding the row count of each group.
frequency_encoder = df.groupby('neighbourhood').size()
frequency_encoder

neighbourhood
Allerton            37
Arden Heights        4
Arrochar            20
Arverne             66
Astoria            709
                  ... 
Willowbrook          1
Windsor Terrace    128
Woodhaven           72
Woodlawn            11
Woodside           170
Length: 218, dtype: int64

In [9]:
# New frame in which each neighbourhood name is replaced by its row count
# looked up in frequency_encoder; df itself is not modified.
freq_enc_df = df.assign(neighbourhood=df['neighbourhood'].map(frequency_encoder))

In [10]:
# Method 4: Target Encoding
# Replace each category with a statistic of the target within that category
# (here: the mean price per neighbourhood).
# Can cope with high cardinality.

# 'price' is selected before aggregating: averaging the whole frame would also
# try to average the text columns, which raises TypeError on pandas >= 2.0.
# NOTE(review): the means are computed over every row of df; if a train/test
# split is made later, the encoder should presumably be fitted on the training
# rows only to avoid leaking the target - confirm.
target_encoder = df.groupby('neighbourhood')['price'].mean()
target_encoder

neighbourhood
Allerton            90.594595
Arden Heights       67.250000
Arrochar           118.250000
Arverne            158.515152
Astoria            116.018336
                      ...    
Willowbrook        249.000000
Windsor Terrace    130.687500
Woodhaven           62.722222
Woodlawn            60.090909
Woodside            89.852941
Name: price, Length: 218, dtype: float64

In [11]:
# New frame in which each neighbourhood name is replaced by the value looked up
# in target_encoder; df itself is not modified.
target_enc_df = df.assign(neighbourhood=df['neighbourhood'].map(target_encoder))