In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt 
from scipy.stats import norm
from scipy import stats

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

import warnings
pd.set_option('max_columns', None)
pd.set_option('display.float_format', str)
warnings.filterwarnings('ignore')
%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = [8,4]
plt.rcParams["axes.edgecolor"] = "black"

# Feature engineering notebook

We don't really need to normalize the values for the XGB model we will be using, as it's a tree-based model, but if we plan to use a regression-based model like a NN then encoding will be necessary.

1. Because the neighborhood column is nominal, it will need to be one-hot encoded.

2. The numerical columns will be normalized. 

3. GPS coordinates can be encoded using radian conversion.

4. Ordinal values can be ordinally encoded.

## Load dataset and info for use later 

In [2]:
# Read the cleaned dataset; the path is relative to the notebook's working directory
df = pd.read_csv(filepath_or_buffer='datasets/cleaned.csv')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,neighborhood,latitude,longitude,price,size_in_sqft,price_per_sqft,no_of_bedrooms,no_of_bathrooms,quality,maid_room,unfurnished,balcony,barbecue_area,built_in_wardrobes,central_ac,childrens_play_area,childrens_pool,concierge,covered_parking,kitchen_appliances,lobby_in_building,maid_service,networked,pets_allowed,private_garden,private_gym,private_jacuzzi,private_pool,security,shared_gym,shared_pool,shared_spa,study,vastu_compliant,view_of_landmark,view_of_water,walk_in_closet,num_venues,venue_senti,population_density,num_stations
0,NAKHLAT JUMEIRA,25.113208,55.138932,2700000,1079,2502.32,1,2,1,0,0,1,1,0,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,99,8.488888888888889,914.2330830451072,0
1,NAKHLAT JUMEIRA,25.106809,55.151201,2850000,1582,1801.52,2,2,1,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,99,8.488888888888889,914.2330830451072,0
2,AL THANYAH FIFTH,25.063302,55.137728,1150000,1951,589.44,3,5,1,1,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,1,1,1,51,8.21764705882353,4178.052207612604,2
3,AL JADAF,25.227295,55.341761,2850000,2020,1410.89,2,3,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,18,8.0,704.9373846101159,1
4,NAKHLAT JUMEIRA,25.114275,55.139764,1729200,507,3410.65,0,1,1,0,0,0,0,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,1,0,0,1,1,0,99,8.488888888888889,914.2330830451072,0


In [3]:
# Summarize the frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1844 entries, 0 to 1843
Data columns (total 41 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   neighborhood         1844 non-null   object 
 1   latitude             1844 non-null   float64
 2   longitude            1844 non-null   float64
 3   price                1844 non-null   int64  
 4   size_in_sqft         1844 non-null   int64  
 5   price_per_sqft       1844 non-null   float64
 6   no_of_bedrooms       1844 non-null   int64  
 7   no_of_bathrooms      1844 non-null   int64  
 8   quality              1844 non-null   int64  
 9   maid_room            1844 non-null   int64  
 10  unfurnished          1844 non-null   int64  
 11  balcony              1844 non-null   int64  
 12  barbecue_area        1844 non-null   int64  
 13  built_in_wardrobes   1844 non-null   int64  
 14  central_ac           1844 non-null   int64  
 15  childrens_play_area  1844 non-null   i

In [4]:
# List the column labels, for reference when picking feature groups in later cells
df.columns

Index(['neighborhood', 'latitude', 'longitude', 'price', 'size_in_sqft',
       'price_per_sqft', 'no_of_bedrooms', 'no_of_bathrooms', 'quality',
       'maid_room', 'unfurnished', 'balcony', 'barbecue_area',
       'built_in_wardrobes', 'central_ac', 'childrens_play_area',
       'childrens_pool', 'concierge', 'covered_parking', 'kitchen_appliances',
       'lobby_in_building', 'maid_service', 'networked', 'pets_allowed',
       'private_garden', 'private_gym', 'private_jacuzzi', 'private_pool',
       'security', 'shared_gym', 'shared_pool', 'shared_spa', 'study',
       'vastu_compliant', 'view_of_landmark', 'view_of_water',
       'walk_in_closet', 'num_venues', 'venue_senti', 'population_density',
       'num_stations'],
      dtype='object')

## Convert latitude and longitude to radians

In [5]:
# Re-express both GPS coordinate columns in radians instead of degrees.
# NOTE: the columns are overwritten in place, so re-running this cell without
# reloading `df` converts the values a second time.
for gps_col in ('latitude', 'longitude'):
    df[gps_col] = np.radians(df[gps_col])

# Preview the converted coordinates
df.head(n=5)

Unnamed: 0,neighborhood,latitude,longitude,price,size_in_sqft,price_per_sqft,no_of_bedrooms,no_of_bathrooms,quality,maid_room,unfurnished,balcony,barbecue_area,built_in_wardrobes,central_ac,childrens_play_area,childrens_pool,concierge,covered_parking,kitchen_appliances,lobby_in_building,maid_service,networked,pets_allowed,private_garden,private_gym,private_jacuzzi,private_pool,security,shared_gym,shared_pool,shared_spa,study,vastu_compliant,view_of_landmark,view_of_water,walk_in_closet,num_venues,venue_senti,population_density,num_stations
0,NAKHLAT JUMEIRA,0.4383081653381801,0.962355909433262,2700000,1079,2502.32,1,2,1,0,0,1,1,0,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,99,8.488888888888889,914.2330830451072,0
1,NAKHLAT JUMEIRA,0.438196481719345,0.9625700438791892,2850000,1582,1801.52,2,2,1,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,99,8.488888888888889,914.2330830451072,0
2,AL THANYAH FIFTH,0.4374371413216798,0.962334895669068,1150000,1951,589.44,3,5,1,1,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,1,1,1,51,8.21764705882353,4178.052207612604,2
3,AL JADAF,0.4402993591219029,0.9658959433017896,2850000,2020,1410.89,2,3,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,18,8.0,704.9373846101159,1
4,NAKHLAT JUMEIRA,0.4383267880012989,0.9623704305726386,1729200,507,3410.65,0,1,1,0,0,0,0,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,1,0,0,1,1,0,99,8.488888888888889,914.2330830451072,0


## Normalize numerical values

In [6]:
# Columns to rescale with min-max normalization
num_cols = [
    'price', 'size_in_sqft', 'price_per_sqft',
    'no_of_bedrooms', 'no_of_bathrooms', 'quality',
    'num_venues', 'venue_senti', 'population_density', 'num_stations',
]

# Fit a MinMaxScaler on those columns and overwrite them with the scaled values.
# The fitted scaler stays bound to `norm` afterwards.
# NOTE(review): this rebinds the `norm` imported from scipy.stats in the first
# cell; the name is kept because later cells may rely on it - consider renaming.
norm = MinMaxScaler()
df[num_cols] = norm.fit_transform(df[num_cols])

# Preview the scaled columns
df.head(n=5)

Unnamed: 0,neighborhood,latitude,longitude,price,size_in_sqft,price_per_sqft,no_of_bedrooms,no_of_bathrooms,quality,maid_room,unfurnished,balcony,barbecue_area,built_in_wardrobes,central_ac,childrens_play_area,childrens_pool,concierge,covered_parking,kitchen_appliances,lobby_in_building,maid_service,networked,pets_allowed,private_garden,private_gym,private_jacuzzi,private_pool,security,shared_gym,shared_pool,shared_spa,study,vastu_compliant,view_of_landmark,view_of_water,walk_in_closet,num_venues,venue_senti,population_density,num_stations
0,NAKHLAT JUMEIRA,0.4383081653381801,0.962355909433262,0.0713053479010925,0.0845722904546434,0.4816494149414941,0.2,0.2,0.3333333333333333,0,0,1,1,0,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0.98989898989899,0.8520710059171606,0.039276146494397,0.0
1,NAKHLAT JUMEIRA,0.438196481719345,0.9625700438791892,0.0756181713628522,0.138763197586727,0.3239536453645364,0.4,0.2,0.3333333333333333,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0.98989898989899,0.8520710059171606,0.039276146494397,0.0
2,AL THANYAH FIFTH,0.4374371413216798,0.962334895669068,0.0267395054629097,0.178517560870502,0.0512083708370837,0.6000000000000001,0.8,0.3333333333333333,1,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,1,1,1,0.5050505050505052,0.7076226940480348,0.1798788778884154,1.0
3,AL JADAF,0.4402993591219029,0.9658959433017896,0.0756181713628522,0.1859513035983624,0.236053105310531,0.4,0.4000000000000001,0.0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.1717171717171717,0.5917159763313613,0.0302598543085124,0.5
4,NAKHLAT JUMEIRA,0.4383267880012989,0.9623704305726386,0.0433927544565842,0.0229476405946994,0.6860441044104411,0.0,0.0,0.3333333333333333,0,0,0,0,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,1,1,1,1,0,0,1,1,0,0.98989898989899,0.8520710059171606,0.039276146494397,0.0


## One hot encode neighborhood column

In [None]:
# One-hot encode the nominal `neighborhood` column.
# This notebook has no train/validation split (no X_train / X_valid frames are
# defined), so the encoder is fitted on, and applied to, the whole `df`.
object_cols = ['neighborhood']

# Keep the encoder's default sparse output and densify it here: the keyword that
# requests dense output was renamed across scikit-learn releases
# (`sparse` -> `sparse_output`), whereas `.toarray()` works on all of them.
OH_encoder = OneHotEncoder(handle_unknown='ignore')
OH_array = OH_encoder.fit_transform(df[object_cols]).toarray()

# Name each indicator column after its category and reuse df's index so the
# concat below lines up row-for-row.
OH_col_names = [f'neighborhood_{category}' for category in OH_encoder.categories_[0]]
OH_cols = pd.DataFrame(OH_array, columns=OH_col_names, index=df.index)

# Swap the original text column for its one-hot indicator columns
df_encoded = pd.concat([df.drop(columns=object_cols), OH_cols], axis=1)

df_encoded.head()