In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd

In [2]:
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is stable across runs and platforms.
sorted(os.listdir('datasets/neighbourhood_data'))


['dubai.geojson',
 'dubai_metro_stations.geojson',
 'dubai_pop_2018.csv',
 'dubai_pop_2019.csv',
 'dubai_venues.csv',
 'metro_venues_total.csv']

In [3]:
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is stable across runs and platforms.
sorted(os.listdir('datasets'))

['neighbourhood_data', 'prices_main.csv']

# Load datasets

## Main dataset

In [4]:
# Read the main price listings table and preview its first rows.
main_csv_path = os.path.join('datasets', 'prices_main.csv')
df_main = pd.read_csv(main_csv_path)
df_main.head()

Unnamed: 0,id,neighborhood,latitude,longitude,price,size_in_sqft,price_per_sqft,no_of_bedrooms,no_of_bathrooms,quality,...,private_pool,security,shared_gym,shared_pool,shared_spa,study,vastu_compliant,view_of_landmark,view_of_water,walk_in_closet
0,5528049,Palm Jumeirah,25.113208,55.138932,2700000,1079,2502.32,1,2,Medium,...,False,False,True,False,False,False,False,False,True,False
1,6008529,Palm Jumeirah,25.106809,55.151201,2850000,1582,1801.52,2,2,Medium,...,False,False,True,True,False,False,False,False,True,False
2,6034542,Jumeirah Lake Towers,25.063302,55.137728,1150000,1951,589.44,3,5,Medium,...,False,True,True,True,False,False,False,True,True,True
3,6326063,Culture Village,25.227295,55.341761,2850000,2020,1410.89,2,3,Low,...,False,False,False,False,False,False,False,False,False,False
4,6356778,Palm Jumeirah,25.114275,55.139764,1729200,507,3410.65,0,1,Medium,...,False,True,True,True,True,False,False,True,True,False


## GeoJSON file

In [5]:
# Read the community polygons into a GeoDataFrame and preview the first rows.
geojson_path = os.path.join('datasets', 'neighbourhood_data', 'dubai.geojson')
df_geo = gpd.read_file(geojson_path)
df_geo.head()

Unnamed: 0,CNAME_E,CNAME_A,COMMUNITY_E,COMMUNITY_A,COMM_NUM,SHAPE_AREA,SHAPE_LEN,Sector,Population 2018,Population 2019,Area Sq Km,Latitude,Longitude,geometry
0,HEFAIR,حفير,HEFAIR - 991,حفير - 991,991,143566502.005,51721.2111825,9,0,0,143.565131,24.693237,55.24914,"POLYGON ((55.24246 24.74379, 55.26266 24.74935..."
1,AL QUSAIS IND. SECOND,القصيص الصناعية الثانية,AL QUSAIS IND. SECOND - 243,القصيص الصناعية الثانية - 243,243,1728332.8155,6052.3652815,2,8063,8834,1.728331,25.281654,55.393184,"POLYGON ((55.39830 25.29000, 55.39917 25.28948..."
2,AL JAFILIYA,الجافلية,AL JAFILIYA - 323,الجافلية - 323,323,1669479.4215,5220.96603607,3,21957,23963,1.669479,25.237635,55.286851,"POLYGON ((55.28809 25.24352, 55.28829 25.24341..."
3,JABAL ALI SECOND,جبل علي الثانية,JABAL ALI SECOND - 592,جبل علي الثانية - 592,592,5059596.0639,10664.7225181,5,1561,1570,5.059545,25.053979,55.115282,"POLYGON ((55.13269 25.06475, 55.12714 25.05840..."
4,WADI AL SAFA 5,وادي الصفا 5,WADI AL SAFA 5 - 648,وادي الصفا 5 - 648,648,16319666.8572,22350.6627618,6,19550,21241,16.31881,25.077814,55.352154,"POLYGON ((55.37906 25.10325, 55.37908 25.10322..."


In [6]:
# Print the column dtypes, non-null counts and memory usage of the geo frame.
df_geo.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   CNAME_E          226 non-null    object  
 1   CNAME_A          226 non-null    object  
 2   COMMUNITY_E      226 non-null    object  
 3   COMMUNITY_A      226 non-null    object  
 4   COMM_NUM         226 non-null    object  
 5   SHAPE_AREA       226 non-null    object  
 6   SHAPE_LEN        226 non-null    object  
 7   Sector           226 non-null    object  
 8   Population 2018  226 non-null    int64   
 9   Population 2019  226 non-null    int64   
 10  Area Sq Km       226 non-null    float64 
 11  Latitude         226 non-null    float64 
 12  Longitude        226 non-null    float64 
 13  geometry         226 non-null    geometry
dtypes: float64(3), geometry(1), int64(2), object(8)
memory usage: 24.8+ KB


In [7]:
# List the geo frame's column labels for reference when joining with other tables.
df_geo.columns

Index(['CNAME_E', 'CNAME_A', 'COMMUNITY_E', 'COMMUNITY_A', 'COMM_NUM',
       'SHAPE_AREA', 'SHAPE_LEN', 'Sector', 'Population 2018',
       'Population 2019', 'Area Sq Km', 'Latitude', 'Longitude', 'geometry'],
      dtype='object')

## Mapping dictionary

In [8]:
# Maps each neighbourhood label used in df_main to a df_geo community name,
# so that both datasets share a common key to join on.
# The mapping was assembled by hand, using judgement about which community each
# listing neighbourhood belongs to; several labels map to the same community.
# Keys are matched exactly (case-sensitive) when the mapping is applied with
# Series.replace, so they must be spelled as they appear in df_main (e.g. 'wasl gate').
# NOTE(review): values are presumably expected to exist in df_geo['CNAME_E'] — TODO confirm
# each one (e.g. 'AL KHEERAN' vs 'AL KHEERAN FIRST').
map_dict = {'Palm Jumeirah':'NAKHLAT JUMEIRA',
           'Jumeirah Lake Towers':'AL THANYAH FIFTH',
           'Culture Village':'AL JADAF',
           'Downtown Dubai':'BURJ KHALIFA',
           'Dubai Marina':'MARSA DUBAI',
           'Business Bay':'BUSINESS BAY',
           'Old Town':'BURJ KHALIFA',
           'Al Kifaf':'AL KIFAF',
           'Meydan':'NADD AL SHIBA FIRST',
           'Arjan':'AL BARSHA SOUTH THIRD',
           'Jumeirah Beach Residence':'MARSA DUBAI',
           'Dubai Creek Harbour (The Lagoons)':'AL KHEERAN FIRST',
           'Greens':'AL THANYAH THIRD',
           'City Walk':'AL WASL',
           'Al Furjan':'JABAL ALI FIRST',
           'DAMAC Hills':'AL HEBIAH THIRD',
           'Jumeirah Golf Estates':'MEAISEM FIRST',
           'Jumeirah':'JUMEIRA FIRST',
           'Dubai Hills Estate':'HADAEQ SHEIKH MOHAMMED BIN RASHID',
           'Umm Suqeim':'UMM SUQEIM FIRST',
           'Motor City':'AL HEBIAH FIRST',
           'DIFC':'ZAABEEL SECOND',
           'Jumeirah Village Circle':'AL BARSHA SOUTH FOURTH',
           'Barsha Heights (Tecom)':'AL THANYAH FIRST',
           'Al Barari':'WADI AL SAFA 3',
           'Dubai Production City (IMPZ)':'MEAISEM FIRST',
           'The Hills':'AL THANYAH THIRD',
           'The Views':'AL THANYAH THIRD',
           'Dubai Sports City':'AL HEBIAH FOURTH',
           'Dubai Silicon Oasis':'NADD HESSA',
           'Jumeirah Village Triangle':'AL BARSHA SOUTH FIFTH',   
           'Mohammed Bin Rashid City':'WADI AL SAFA 3',
           'Dubai Harbour':'MARSA DUBAI',
           'Bluewaters':'MARSA DUBAI',
           'International City':'WARSAN FIRST',
           'Falcon City of Wonders':'WADI AL SAFA 2',
           'Mina Rashid':'MADINAT DUBAI AL MELAHEYAH',
           'Town Square':'AL YALAYIS 2',
           'Green Community':'DUBAI INVESTMENT PARK FIRST',
           'Al Barsha':'AL BARSHA FIRST',
           'Al Sufouh':'AL SAFOUH FIRST',
           'Dubai Festival City':'AL KHEERAN',
           'Jebel Ali':'JABAL ALI FIRST',
           'World Trade Center':'TRADE CENTER SECOND',
           'Mudon':'AL HEBIAH SIXTH',
           'Discovery Gardens':'JABAL ALI FIRST',
           'Remraam':'AL HEBIAH FIFTH',
           'Mirdif':'MIRDIF',
           'Dubai South (Dubai World Central)':'MADINAT AL MATAAR',
           'Dubai Healthcare City':'UMM HURAIR SECOND',
           'wasl gate':'JABAL ALI FIRST',
           'Dubai Residence Complex':'WADI AL SAFA 5',
           'Al Quoz':'AL QOUZ FIRST'}

___________________________________________________________

## Venues

In [9]:
# Read the per-community venue table and preview its first rows.
venues_csv_path = os.path.join('datasets', 'neighbourhood_data', 'dubai_venues.csv')
df_venues = pd.read_csv(venues_csv_path)
df_venues.head()

Unnamed: 0,COMM_NUM,CNAME_E,Sector,Community Latitude,Community Longitude,Venue Id,Venue,Venue Latitude,Venue Longitude,Venue Category,Stats TipsCount,Stats UsersCount,Stats CheckinsCount,Rating,Rating Color,Rating Signals,Url,Price Tier,Price Message,Price Currency
0,111,AL CORNICHE,1,25.277837,55.302557,4ccafdcaaa25a35d0e100b0f,Miyako 京,25.278925,55.304343,Japanese Restaurant,11,213,406,8.7,73CF42,42.0,http://dubai.regency.hyatt.com/en/hotel/dining...,2.0,Moderate,$
1,111,AL CORNICHE,1,25.277837,55.302557,4d76451348b7f04dabadf2f5,Al Dawaar,25.279111,55.304414,Restaurant,19,533,651,8.5,73CF42,72.0,http://dubai.regency.hyatt.com/en/hotel/dining...,2.0,Moderate,$
2,111,AL CORNICHE,1,25.277837,55.302557,4c0190f4b58376b05062443c,The Kitchen,25.279047,55.304267,Restaurant,3,242,437,8.3,73CF42,26.0,http://dubai.regency.hyatt.com/en/hotel/dining...,2.0,Moderate,$
3,111,AL CORNICHE,1,25.277837,55.302557,4df4e8d9aeb7170aa2f5e7c5,Club Olympus,25.278362,55.304438,Gym,2,167,749,8.2,73CF42,18.0,http://dubai.regency.hyatt.com/en/hotel/dining...,,,
4,111,AL CORNICHE,1,25.277837,55.302557,4bb0bf0ef964a52055593ce3,Hyatt Regency Dubai (حياة ريجنسي),25.27924,55.304364,Hotel,55,4498,8660,8.1,73CF42,366.0,http://dubai.regency.hyatt.com,,,


In [10]:
# Count of non-null venue names recorded for each community.
df_venues.groupby('CNAME_E')['Venue'].count()

CNAME_E
ABU HAIL            1
AL AWIR FIRST       7
AL AWIR SECOND      6
AL BADA'            2
AL BARAHA           5
                   ..
WARSAN FIRST       46
WARSAN SECOND       2
WARSAN THIRD        2
ZAA'BEEL FIRST      2
ZAA'BEEL SECOND    32
Name: Venue, Length: 178, dtype: int64

In [11]:
# Average venue rating per community.
# Select 'Rating' before aggregating: calling .mean() on the whole grouped frame
# raises TypeError on pandas >= 2.0 because of the text columns (Venue, Url, ...),
# and would otherwise average every numeric column only to discard the rest.
df_venues.groupby('CNAME_E')['Rating'].mean()

CNAME_E
ABU HAIL           7.500000
AL AWIR FIRST      8.320000
AL AWIR SECOND     7.733333
AL BADA'           8.300000
AL BARAHA          7.360000
                     ...   
WARSAN FIRST       7.639130
WARSAN SECOND      8.250000
WARSAN THIRD       8.250000
ZAA'BEEL FIRST     8.200000
ZAA'BEEL SECOND    8.903125
Name: Rating, Length: 178, dtype: float64

In [12]:
# First polygon recorded for each community name; preview the first five.
df_geo.groupby('CNAME_E').first()['geometry'].head()

CNAME_E
ABU HAIL          POLYGON ((55.33669 25.28525, 55.33752 25.28458...
AL AWIR FIRST     POLYGON ((55.63303 25.19255, 55.61132 25.18878...
AL AWIR SECOND    POLYGON ((55.63307 25.19236, 55.63440 25.18606...
AL BADA'          POLYGON ((55.27568 25.23722, 55.27591 25.23708...
AL BARAHA         POLYGON ((55.32648 25.27985, 55.32700 25.27888...
Name: geometry, dtype: geometry

## Metros

In [13]:
# Read the metro-station venue counts and preview the first rows.
metro_csv_path = os.path.join('datasets', 'neighbourhood_data', 'metro_venues_total.csv')
df_metro = pd.read_csv(metro_csv_path)
df_metro.head()

Unnamed: 0,station_no,station_name,latitude,longitude,route,category_name,venues
0,E07,Expo Station,24.963368,55.146201,Expo,Arts & Entertainment,1
1,E07,Expo Station,24.963368,55.146201,Expo,College & University,0
2,E07,Expo Station,24.963368,55.146201,Expo,Event,0
3,E07,Expo Station,24.963368,55.146201,Expo,Food,0
4,E07,Expo Station,24.963368,55.146201,Expo,Nightlife Spot,0


In [14]:
# Number of distinct station names in the metro table.
df_metro['station_name'].nunique()

54

In [15]:
# One coordinate pair per station (first value seen for each), first ten stations.
df_metro.groupby('station_name')[['latitude', 'longitude']].first().head(10)

Unnamed: 0_level_0,latitude,longitude
station_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Abu Baker Al Siddique,25.270904,55.332983
Abu Dhabi Commercial Bank,25.244494,55.298196
Abu Hail,25.275242,55.346268
Airport Terminal 1,25.248428,55.352474
Airport Terminal 3,25.245013,55.359526
Al Fahidi,25.258301,55.297559
Al Ghubaiba,25.265085,55.288954
Al Jadaf,25.224978,55.333674
Al Jafiliya,25.233497,55.292132
Al Nahda,25.273274,55.369341


# Let the cleaning begin 

### 1. Replace the names in the training dataset with the geospatial neighborhood names

In [16]:
# Swap each listing neighbourhood label for its mapped community name;
# labels without an entry in map_dict are left unchanged by replace().
df_main['neighborhood'] = df_main['neighborhood'].replace(to_replace=map_dict)
df_main.head()

Unnamed: 0,id,neighborhood,latitude,longitude,price,size_in_sqft,price_per_sqft,no_of_bedrooms,no_of_bathrooms,quality,...,private_pool,security,shared_gym,shared_pool,shared_spa,study,vastu_compliant,view_of_landmark,view_of_water,walk_in_closet
0,5528049,NAKHLAT JUMEIRA,25.113208,55.138932,2700000,1079,2502.32,1,2,Medium,...,False,False,True,False,False,False,False,False,True,False
1,6008529,NAKHLAT JUMEIRA,25.106809,55.151201,2850000,1582,1801.52,2,2,Medium,...,False,False,True,True,False,False,False,False,True,False
2,6034542,AL THANYAH FIFTH,25.063302,55.137728,1150000,1951,589.44,3,5,Medium,...,False,True,True,True,False,False,False,True,True,True
3,6326063,AL JADAF,25.227295,55.341761,2850000,2020,1410.89,2,3,Low,...,False,False,False,False,False,False,False,False,False,False
4,6356778,NAKHLAT JUMEIRA,25.114275,55.139764,1729200,507,3410.65,0,1,Medium,...,False,True,True,True,True,False,False,True,True,False


In [17]:
# List every column label in the main dataset (the head() preview elides the middle ones).
df_main.columns

Index(['id', 'neighborhood', 'latitude', 'longitude', 'price', 'size_in_sqft',
       'price_per_sqft', 'no_of_bedrooms', 'no_of_bathrooms', 'quality',
       'maid_room', 'unfurnished', 'balcony', 'barbecue_area',
       'built_in_wardrobes', 'central_ac', 'childrens_play_area',
       'childrens_pool', 'concierge', 'covered_parking', 'kitchen_appliances',
       'lobby_in_building', 'maid_service', 'networked', 'pets_allowed',
       'private_garden', 'private_gym', 'private_jacuzzi', 'private_pool',
       'security', 'shared_gym', 'shared_pool', 'shared_spa', 'study',
       'vastu_compliant', 'view_of_landmark', 'view_of_water',
       'walk_in_closet'],
      dtype='object')

df_main