In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import geopandas
from branca.colormap import linear
import branca.colormap as cmp
import folium
from shapely.ops import nearest_points
from shapely.geometry import *


In [2]:
# Load the four input datasets: the two GeoJSON files are read with geopandas,
# the two CSV files with pandas.
neighbour_df = geopandas.read_file('../input/dubai-neighborhoods/dubai.geojson')
stations_df = geopandas.read_file('../input/dubai-neighborhoods/dubai_metro_stations.geojson')
bus_df = pd.read_csv('../input/dubai-bus-stops/Bus_Stop_Details (1).csv')
venues_df = pd.read_csv('../input/dubai-neighborhoods/dubai_venues.csv')

In [3]:
# Preview the first rows of the raw bus-stop table
bus_df.head()

# Data Exploration and Visualisation

In [4]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
neighbour_df.info()
stations_df.info()

In [5]:
# Dataset information
# DataFrame.info() prints its own report and returns None, so no print() wrapper
# is used (it would only add a "None" line after each report).
bus_df.info()
venues_df.info()

In [6]:
# Let's look at the missing values
def missing_data_info(df, size=None):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    size : int, optional
        Denominator used for the percentage, normally the number of rows in
        ``df``. Defaults to ``len(df)`` when omitted, which avoids accidentally
        passing another frame's row count.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns 'Total Missing' (null count)
        and 'Percentage' (null count / size * 100). Only columns that have at
        least one missing value are kept.
    """
    if size is None:
        size = len(df)
    # Count the nulls once and reuse the result for both output columns
    missing_counts = df.isnull().sum()
    missing_df = pd.DataFrame({'Total Missing': missing_counts,
                               'Percentage': (missing_counts / size) * 100})
    return missing_df[missing_df['Total Missing'] > 0]

In [7]:
# Missing-value summary of the bus table, as a share of its own row count
bus_row_count = len(bus_df)
print(missing_data_info(bus_df, bus_row_count))

In [8]:
# The percentages for the venues table must be taken over the venues row count;
# dividing by the bus table's row count scales them by the wrong denominator.
print(missing_data_info(venues_df, venues_df.shape[0]))

## Observation
Both the bus and venues datasets contain null values.
1. In the bus dataset, *street_name, bus_stop_type, mupi_available, rtpi_available, last_survey_date* have null values
2. In the venues dataset, *Rating, Rating Color, Rating Signals, Url, Price Tier, Price Message, and Price currency* have null values.

The first cleaning included dropping columns with null values based on two conditions:
1. They do not provide any valuable data for our objective
2. There aren't enough non null values to extrapolate information for the remaining records, eg. more than 75% of the data is missing

Based on condition 1, *street_name, bus_stop_type, mupi_available, rtpi_available, last_survey_date* were deleted from bus dataset

Based on condition 2, *Rating Color, Rating Signals, Price Message,* and *Price Currency* were deleted from the venues dataset. These columns repeat information already held in other columns and do not affect the objective of the application.


In [9]:
# Remove, in place, the columns that were ruled out for each table
unused_bus_columns = ['street_name', 'last_survey_date', 'bus_stop_type', 'mupi_available', 'rtpi_available']
unused_venue_columns = ['Rating Color', 'Rating Signals', 'Price Message', 'Price Currency']
bus_df.drop(columns=unused_bus_columns, inplace=True)
venues_df.drop(columns=unused_venue_columns, inplace=True)

In [10]:
# How many rows of the neighbourhood table fall in each sector
sector_counts = neighbour_df['Sector'].value_counts()
print(sector_counts)

In [11]:
# How many metro stations are listed for each ROUTE value
route_counts = stations_df['ROUTE'].value_counts()
print(route_counts)

## Observing Value Datatypes

The following feature columns seemed to have mismatched datatypes
1. neighbour_df = *Sector* column can be better expressed as category representing the sector the community area belongs to
2. stations_df = The *ROUTE* column represents which metro line the metro station is at, which is better as a category. 
3. bus_df = *valid_from* and *valid_until* columns represent dates, but are stored as objects
4. venues_df = *Sector, Venue Category, Rating,* and *Price Tier* are all categorical data as well

In [12]:
# Label-like columns are converted to pandas categoricals
categorical_columns = [
    (neighbour_df, 'Sector'),
    (stations_df, 'ROUTE'),
    (venues_df, 'Sector'),
    (venues_df, 'Venue Category'),
    (venues_df, 'Rating'),
    (venues_df, 'Price Tier'),
]
for frame, column_name in categorical_columns:
    frame[column_name] = frame[column_name].astype('category')

# The validity columns of the bus table are parsed into datetimes
for date_column in ('valid_from', 'valid_until'):
    bus_df[date_column] = pd.to_datetime(bus_df[date_column])

## Visualising Geometric Data

In [13]:
# Promote the bus table to a GeoDataFrame: one point per row built from the
# stop longitude (x) and latitude (y), tagged with EPSG:4326.
bus_stop_points = geopandas.points_from_xy(x=bus_df['stop_location_longitude'],
                                           y=bus_df['stop_location_latitude'])
bus_df = geopandas.GeoDataFrame(bus_df, geometry=bus_stop_points, crs="EPSG:4326")

In [14]:
# Promote the venues table to a GeoDataFrame: one point per row built from the
# venue longitude (x) and latitude (y), tagged with EPSG:4326.
venue_points = geopandas.points_from_xy(x=venues_df['Venue Longitude'],
                                        y=venues_df['Venue Latitude'])
venues_df = geopandas.GeoDataFrame(venues_df, geometry=venue_points, crs="EPSG:4326")

In [15]:
# Keep only the stop-level columns and a single row per stop_id
bus_stop_columns = ['stop_id', 'stop_name', 'stop_location_latitude',
                    'stop_location_longitude', 'route_name', 'geometry']
bus_stations_df = bus_df[bus_stop_columns].drop_duplicates(subset='stop_id')

In [16]:
def _add_circle_markers(points_df, layer, lat_col, lon_col, name_col, area_col, color):
    """Add one small circle marker per row of ``points_df`` to ``layer``.

    Parameters
    ----------
    points_df : DataFrame
        Rows to plot.
    layer : folium.FeatureGroup
        Layer that receives the markers.
    lat_col, lon_col : str
        Columns holding the marker latitude and longitude.
    name_col, area_col : str
        Columns combined into the popup text as '<name>, <area>'.
    color : str
        Outline colour of the marker.
    """
    for _, point in points_df.iterrows():
        label = '{}, {}'.format(point[name_col], point[area_col])
        # parse_html belongs to the Popup; CircleMarker has no such option,
        # so it is no longer passed to the marker itself.
        folium.CircleMarker(location=(point[lat_col], point[lon_col]),
                            radius=1,
                            color=color,
                            popup=folium.Popup(label, parse_html=True),
                            fill=False).add_to(layer)


#setup map
dubai_geo = r'../input/dubai-neighborhoods/dubai.geojson'
dubai_lat = 25.05
dubai_long = 55.11
map1 = folium.Map([dubai_lat, dubai_long], tiles='CartoDB positron', zoom_start=10.5)

# One toggleable layer per kind of point
fg_stations = folium.FeatureGroup(name='Metro Stations', overlay=True, control=True, show=True ).add_to(map1)
fg_venues = folium.FeatureGroup(name='Venues', overlay=True, control=True, show=True ).add_to(map1)
fg_bus_stations = folium.FeatureGroup(name='Bus Stations', overlay=True, control=True, show=True ).add_to(map1)

# Venues in blue, metro stations in red, bus stops in green
_add_circle_markers(venues_df, fg_venues,
                    'Venue Latitude', 'Venue Longitude',
                    'Venue', 'CNAME_E', '#0000ff')
_add_circle_markers(stations_df, fg_stations,
                    'LAT_STATION', 'LON_STATION',
                    'STATION_NAME', 'COMMUNITY_NAME', '#ff0000')
_add_circle_markers(bus_stations_df, fg_bus_stations,
                    'stop_location_latitude', 'stop_location_longitude',
                    'stop_name', 'route_name', '#00ff00')

folium.LayerControl().add_to(map1)
map1.save('PointMap.html')
map1

In [17]:
# Display the neighbourhood GeoDataFrame as the cell output
neighbour_df

In [18]:
# 2019 population divided by the area in square kilometres, row by row
neighbour_df['population_density'] = neighbour_df['Population 2019'].div(neighbour_df['Area Sq Km'])

# Data Science Application
## Building communities
From the data visualisation, we can see that we have many bus stops and metro stations near venues, with venues having multiple stations in close proximity.

Our first approach was to match each venue to a bus stop or station, however from our preliminary visualisation we can see a very obvious problem, as illustrated below.

In [20]:
# Attach to every bus stop the name of the community polygon that contains it
community_polygons = neighbour_df[['CNAME_E', 'geometry']]
try:
    # `predicate` is the current name of the spatial-relation keyword
    bus_communities_df = geopandas.sjoin(bus_stations_df, community_polygons, predicate='within')
except TypeError:
    # Older geopandas releases only know the keyword under its former name `op`
    bus_communities_df = geopandas.sjoin(bus_stations_df, community_polygons, op='within')

# Drop the join index, then rename by explicit mapping (not by position, which
# silently mislabels columns if their order ever changes) so the bus stops use
# the same column names as the metro stations table.
bus_communities_df = (
    bus_communities_df
    .drop(columns='index_right')
    .rename(columns={
        'stop_id': 'STATION_NO',
        'stop_name': 'STATION_NAME',
        'stop_location_latitude': 'LAT_STATION',
        'stop_location_longitude': 'LON_STATION',
        'route_name': 'ROUTE',
        'CNAME_E': 'COMMUNITY_NAME',
    })
    [['STATION_NO', 'COMMUNITY_NAME', 'STATION_NAME', 'LAT_STATION', 'LON_STATION', 'ROUTE', 'geometry']]
)
bus_communities_df

In [27]:
# Stack metro stations and bus stops into one table. DataFrame.append was
# deprecated and then removed in pandas 2.0; pd.concat produces the same
# row-wise concatenation (original index labels are kept) on every version.
all_stations_df = pd.concat([stations_df, bus_communities_df])
all_stations_df.to_csv('stations.csv')

In [24]:
# Mirror the community name under the COMMUNITY_NAME header that the station
# tables use (the previous 'COMMUINITY_NAME' spelling was a typo).
venues_df['COMMUNITY_NAME'] = venues_df['CNAME_E']
# Display-only preview without the original column: the result is deliberately
# not assigned, because the community map cell further down still reads
# venues_df['CNAME_E'].
venues_df.drop(columns='CNAME_E')

In [26]:
# Export the current venues table to venues.csv in the working directory
venues_df.to_csv('./venues.csv')

In [None]:
# NOTE(review): in notebook order the nearest_geometry / nearest_station columns
# are only added in a later cell, which is followed by an identical export to
# this same path. This earlier write is presumably a leftover; confirm before
# removing it.
venues_df.to_csv('./venues_station_nearest.csv')  

In [22]:
def calculate_nearest(row, destination, val, col="geometry", dest_unary=None):
    """Return the ``val`` value of the ``destination`` row nearest to ``row[col]``.

    Parameters
    ----------
    row : mapping-like
        A single record (e.g. as passed by ``DataFrame.apply(axis=1)``) holding
        the origin geometry under ``col``.
    destination : GeoDataFrame
        Candidate rows; must have a 'geometry' column and a ``val`` column.
    val : str
        Column of ``destination`` whose value is returned.
    col : str, default "geometry"
        Key of the origin geometry in ``row``.
    dest_unary : geometry, optional
        Pre-computed ``destination['geometry'].unary_union``. The union does not
        depend on ``row``, so a caller that applies this function to many rows
        can build it once and pass it in. When omitted it is rebuilt on every
        call, exactly as before.

    Returns
    -------
    The ``val`` entry of the first destination row whose geometry compares
    equal to the nearest point found on the union.

    Raises
    ------
    IndexError
        If no destination geometry compares equal to that nearest point
        (presumably possible when the destination geometries are not points).
    """
    # 1 - union of all destination geometries (reused when supplied by the caller)
    if dest_unary is None:
        dest_unary = destination['geometry'].unary_union
    # 2 - nearest_points returns a pair; the second item lies on the union
    _, nearest_on_destination = nearest_points(row[col], dest_unary)
    # 3 - destination rows whose geometry equals that nearest point
    match_geom = destination.loc[destination.geometry == nearest_on_destination]
    # 4 - value of the requested column for the first match
    return match_geom[val].to_numpy()[0]

In [None]:
# For every venue, look up its nearest station twice: once for the station's
# geometry and once for the station's name.
nearest_lookups = (
    ("nearest_geometry", "geometry"),
    ("nearest_station", "STATION_NAME"),
)
for new_column, station_field in nearest_lookups:
    venues_df[new_column] = venues_df.apply(calculate_nearest, axis=1,
                                            destination=all_stations_df, val=station_field)

In [None]:
# Export the venues table, which at this point in the notebook also carries the
# nearest_geometry and nearest_station columns.
venues_df.to_csv('./venues_station_nearest.csv')  

## Visualise the nearest stations

In [None]:
def _venue_to_station_line(venue_row):
    """Build the segment joining a venue's geometry to its nearest_geometry."""
    return LineString([venue_row['geometry'], venue_row['nearest_geometry']])


# One connecting line per venue
venues_df['line'] = venues_df.apply(_venue_to_station_line, axis=1)
# GeoDataFrame whose active geometry is the connecting line
line_columns = ["Venue", "nearest_station", "line"]
line_df = venues_df[line_columns].set_geometry('line')
# Declare the coordinate reference system of the lines
line_df.crs = "epsg:4326"

In [None]:
map2 = folium.Map([dubai_lat, dubai_long],zoom_start = 12,  tiles="CartoDb dark_matter")

# folium marker locations are (latitude, longitude). The pairs used to be
# zipped as (longitude, latitude), which put every marker far away from Dubai
# and away from the connecting lines drawn below.
locs_stations = zip(stations_df.LAT_STATION, stations_df.LON_STATION)
locs_points = zip(venues_df['Venue Latitude'], venues_df['Venue Longitude'])

# Metro stations: larger red circles
for location in locs_stations:
    folium.CircleMarker(location=location, 
              color="red",  radius=4).add_to(map2)
    
# Venues: smaller white circles
for location in locs_points:
    folium.CircleMarker(location=location, 
              color="white", radius=2).add_to(map2)
    
# Venue-to-nearest-station segments
folium.GeoJson(line_df).add_to(map2)

map2.save("line_map2.html")
map2

## Observations
As we can see, this is a limiting approach: in dense areas, each venue is linked to only one bus stop or metro station. This is an inaccurate representation, because venues in the same locality will not be considered a nearby spot for the other stations in the vicinity.

Hence, we adopted another approach: grouping venues and stations by community. We first use an inner join to find the metro and bus stations that lie in each community, and then map them accordingly.

In [None]:
m = folium.Map(location=[dubai_lat, dubai_long], zoom_start=10, tiles='CartoDB dark_matter')

# Community outlines, each with a popup showing its name
for _, r in neighbour_df.iterrows():
    #without simplifying the representation of each borough, the map might not be displayed
    sim_geo = geopandas.GeoSeries(r['geometry']).simplify(tolerance=0.001)
    geo_j = folium.GeoJson(data=sim_geo.to_json(),
                           style_function=lambda x: {'fillColor': 'white'})
    folium.Popup(r['CNAME_E']).add_to(geo_j)
    geo_j.add_to(m)
    
fg_venues = folium.FeatureGroup(name='Venues', overlay=True, control=True, show=True ).add_to(m)
fg_stations = folium.FeatureGroup(name='Stations', overlay=True, control=True, show=True ).add_to(m)

# (frame, layer, latitude column, longitude column, name column, community column, colour)
# The two marker layers only differ in these values, so one loop draws both.
marker_layers = [
    (venues_df, fg_venues, 'Venue Latitude', 'Venue Longitude', 'Venue', 'CNAME_E', '#0000ff'),
    (all_stations_df, fg_stations, 'LAT_STATION', 'LON_STATION', 'STATION_NAME', 'COMMUNITY_NAME', '#ff0000'),
]
for frame, layer, lat_col, lon_col, name_col, community_col, colour in marker_layers:
    for index, row in frame.iterrows():
        popup_text = '{}, {}'.format(row[name_col], row[community_col])
        popup_text = folium.Popup(popup_text, parse_html=True)
        # parse_html is a Popup option; CircleMarker has no such option, so it
        # is no longer passed to the marker.
        folium.CircleMarker(location=(row[lat_col], row[lon_col]),
                            radius=1,
                            color=colour,
                            popup=popup_text,
                            fill=False).add_to(layer)

# The feature groups are created with control=True, which only has an effect
# once a layer control is attached to the map.
folium.LayerControl().add_to(m)
m