# First phase notebook: Segmenting and Clustering Neighborhoods in Toronto
TOC to be completed later

In [None]:
# importing libraries
import os
import random
from pathlib import Path
from time import sleep

import numpy as np
import pandas as pd
import requests
from dotenv import load_dotenv
from pandas.io.json import json_normalize

## 1st step, importing the dataset
In this step the dataset is read with the pandas library and its first 5 rows are printed. The desired table is the first table on the page at `url`.

In [None]:
# importing dataset
# pd.read_html returns every table it parses from the page; the postal-code
# table is taken as the first one, and its first 5 rows are displayed as a check.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df = pd.read_html(url)[0]
df.head()

## 2nd step, cleaning and forming the dataset
According to the provided instructions, the unique postal codes are analysed. Because the number of unique codes equals the number of records, there is no need to merge or combine rows. In the next stage, rows whose Borough is not assigned are deleted. Then neighbourhoods without an assigned value are investigated. Because there are no such rows, no cell is replaced with its borough. Finally, the shape of the dataset is printed and the last 5 rows are shown.

In [None]:
# Clean the raw table: report its size, drop rows without a borough, and
# check whether any neighbourhood is still unassigned.
n_records = len(df)
n_postal_codes = len(df['Postal Code'].unique())
print('The dataset includes {} records with {} unique postal codes \n'.format(n_records, n_postal_codes))

# Keep only the rows whose Borough is assigned and renumber the index from 0.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough].reset_index(drop=True)
print('Aftering deleting rows without assigned boroughs, the number of records reduced to {} \n'.format(len(df)))

# Count the neighbourhoods still marked 'Not assigned'. Nothing is replaced
# here; the count only shows whether a Borough fallback would be needed.
unassigned_neighbourhoods = df.loc[df['Neighbourhood'] == 'Not assigned', 'Neighbourhood']
n_na_neighbour = unassigned_neighbourhoods.count()
print('After correcting NA boroughs, {} neighbourhoods found without assigned value \n'.format(n_na_neighbour))

print('the final shape of the dataset is {} \n'.format(df.shape))
df.tail()

## 3rd step, transforming the database
In the next phase of the project, each neighbourhood's coordinates should be found. So, having several neighbourhood names stored in a single cell is not desirable. The ideal form of the dataset has one neighbourhood name per cell, preferably set as the index.

In [None]:
# Create a dataset with exactly one neighbourhood per row.
# Every row of `df` is processed, including the last one, and each
# comma-separated neighbourhood name becomes its own record. The records are
# collected in a list and turned into a DataFrame once, instead of appending
# one-row frames inside the loop.
neighbourhood_columns = ['Postal Code', 'Borough', 'Neighbourhood']
neighbourhood_records = [
    {'Postal Code': post_code, 'Borough': borough, 'Neighbourhood': neighbourhood}
    for post_code, borough, names in zip(df['Postal Code'], df['Borough'], df['Neighbourhood'])
    for neighbourhood in names.split(', ')
]
# Passing `columns` keeps the expected layout even if there are no records.
dfn = pd.DataFrame(neighbourhood_records, columns=neighbourhood_columns)
print('the dataset includes {} neighbourhoods \n'.format(len(dfn)))
dfn.tail()

## 4th step, finding coordinates
According to the assignment's instructions, geocoder was used in a while loop to find the longitude/latitude of each row in the newly transformed dataset. Unfortunately, it did not produce any plausible result, so I used geopy instead, which made it possible. Two columns have been added to the new dataset.
Several differences made the code work:
1. using geopy, Nominatim
2. catching `GeocoderTimedOut` to avoid time-out errors
3. setting a limit on the number of search attempts per neighbourhood
4. sleeping for 1 second before each request to avoid being blocked by the server's rate limit
5. passing a random string of symbols (a "password") as part of the address
6. random ordering of the address parts
<br>

Finally, the geocoder fails to locate some neighbourhoods. These records should be handled manually.

In [None]:
!pip install geopy
import geopy
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

In [None]:
def do_geocode(address, max_retries=3, delay=1, geocoder=None):
    """Geocode ``address`` with Nominatim, retrying when the request times out.

    Parameters
    ----------
    address : str
        Free-text address to look up.
    max_retries : int, optional
        Number of extra attempts made after a ``GeocoderTimedOut``. The retry
        is a bounded loop rather than recursion, so a persistently slow
        server cannot exhaust the call stack.
    delay : float, optional
        Seconds to sleep before every request, to stay under the server's
        rate limit.
    geocoder : object, optional
        Any object exposing ``geocode(address)``. When omitted, a
        ``Nominatim`` client is created with the notebook's user agent.
        Passing one in lets callers reuse a single client.

    Returns
    -------
    The value returned by ``geocoder.geocode(address)``, or ``None`` when
    every attempt timed out.
    """
    if geocoder is None:
        geocoder = Nominatim(user_agent="aron.shirazi@gmail.com")
    for _ in range(max_retries + 1):
        sleep(delay)
        try:
            return geocoder.geocode(address)
        except GeocoderTimedOut:
            # Timed out: fall through to the next attempt, if any remain.
            continue
    return None

# Geocode every neighbourhood in `dfn`.
# Neighbourhoods that cannot be located keep the 'NA' placeholder so they can
# be identified and filled in afterwards.
max_try = 10  # maximum number of differently phrased queries per neighbourhood
noise_symbols = ['#', '$', '%', '@', '*', '-', '&', '~', '!']
found_latitudes = []
found_longitudes = []
for nn, neighbourhood in enumerate(dfn['Neighbourhood']):
    location = None
    count = 0
    while location is None and count < max_try:
        # Each attempt sends the address parts in a random order together
        # with a random 8-symbol token; the notebook's notes report that
        # varying the query this way was needed to get results.
        password = ''.join(random.choice(noise_symbols) for _ in range(8))
        address_parts = [neighbourhood, 'Toronto', 'Ontario', password]
        random.shuffle(address_parts)
        address = ', '.join(address_parts)
        location = do_geocode(address)
        count += 1
    if location is not None:
        print('{}, coordinates found for {}'.format(nn, neighbourhood))
        found_latitudes.append(location.latitude)
        found_longitudes.append(location.longitude)
    else:
        print('{}, coordinates not found for {}'.format(nn, neighbourhood))
        found_latitudes.append('NA')
        found_longitudes.append('NA')

# Assign whole columns at once; writing through dfn['latitude'].iloc[nn] is
# chained assignment and is not guaranteed to modify `dfn`.
dfn['latitude'] = found_latitudes
dfn['longitude'] = found_longitudes

In [None]:
# Mark unlocated neighbourhoods with 0 so they can be set manually.
# pd.to_numeric(errors='coerce') turns every non-numeric placeholder (such as
# 'NA') into NaN, which is then replaced by 0.0. Assigning the whole column
# avoids chained assignment, and the result is always a float column, so a
# later `== 0` comparison is reliable.
dfn['latitude'] = pd.to_numeric(dfn['latitude'], errors='coerce').fillna(0.0).astype('float')
dfn['longitude'] = pd.to_numeric(dfn['longitude'], errors='coerce').fillna(0.0).astype('float')
dfn.dtypes

In [None]:
# Fill the coordinates of unlocated neighbourhoods (longitude == 0) from the
# postal-code coordinate file.
dfo = pd.read_csv('Geospatial_Coordinates.csv')
# One row per postal code, indexed by the code, for direct label lookup.
postal_coordinates = dfo.drop_duplicates('Postal Code').set_index('Postal Code')
unlocated = dfn['longitude'] == 0
if unlocated.any():
    unlocated_codes = dfn.loc[unlocated, 'Postal Code'].tolist()
    # .loc with a list of labels raises KeyError when a postal code is absent
    # from the file, so a missing code fails loudly instead of leaving a 0.
    dfn.loc[unlocated, 'latitude'] = postal_coordinates.loc[unlocated_codes, 'Latitude'].to_numpy()
    dfn.loc[unlocated, 'longitude'] = postal_coordinates.loc[unlocated_codes, 'Longitude'].to_numpy()

In [None]:
# preview the first 10 rows of the per-neighbourhood dataset
dfn.head(10)

In [None]:
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [None]:
# Centre point and bounding box used to frame the map.
# 'South Niagara' is left out because it is far away from the other neighbourhoods.
df_loc = dfn.loc[dfn['Neighbourhood'].ne('South Niagara')]
loc_lat, loc_lon = df_loc['latitude'], df_loc['longitude']
center_lat, center_lon = loc_lat.mean(), loc_lon.mean()
# extreme coordinates, used as the folium bounds
lat_min, lat_max = loc_lat.min(), loc_lat.max()
lon_min, lon_max = loc_lon.min(), loc_lon.max()

In [None]:
# Base map centred on the mean coordinates and fitted to the bounding box.
map_toronto = folium.Map(location=[center_lat, center_lon], width=750, height=500)
south_west_corner = [lat_min, lon_min]
north_east_corner = [lat_max, lon_max]
map_toronto.fit_bounds([south_west_corner, north_east_corner])

# One circle marker per neighbourhood, with the neighbourhood name as its popup.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)
for lat, lng, label in zip(df_loc['latitude'], df_loc['longitude'], df_loc['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=label, **marker_style)
    marker.add_to(map_toronto)
map_toronto

In [None]:
# Foursquare credentials and API version.
# The credentials are never hard-coded: they are read from environment
# variables after loading any .env file (default search first, then ./.env).
load_dotenv()
env_path = Path('.', '.env')
load_dotenv(dotenv_path=env_path)
CLIENT_ID = os.environ.get("Foursquare_CLIENT_ID")
CLIENT_SECRET = os.environ.get("Foursquare_CLIENT_SECRET")
VERSION = '20180605'  # Foursquare API version

In [None]:
def get_category_type(row):
    """Return the name of the first category of a venue row.

    Parameters
    ----------
    row : mapping-like
        A venue record (for example a dict or a pandas Series) holding the
        category list under ``'categories'`` or, failing that, under
        ``'venue.categories'``.

    Returns
    -------
    The ``'name'`` of the first category, or ``None`` when the category list
    is empty or missing (``None``).

    Raises
    ------
    KeyError
        If the row has neither ``'categories'`` nor ``'venue.categories'``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors propagate.
        categories_list = row['venue.categories']

    if not categories_list:
        return None
    return categories_list[0]['name']

def find_venue(lat, lon, limit = 100, radius = 500, timeout = 10):
    """Query the Foursquare ``venues/explore`` endpoint around a point.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the search centre.
    limit : int, optional
        Value sent as the ``limit`` query parameter.
    radius : int, optional
        Value sent as the ``radius`` query parameter.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    pandas.DataFrame or None
        A frame with columns ``name``, ``categories``, ``lat`` and ``lng``
        (one row per venue), or ``None`` when the response has no venues.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. bad credentials).
    """
    # Let requests build and escape the query string instead of formatting
    # the credentials into the URL by hand.
    response = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lon),
            'radius': radius,
            'limit': limit,
        },
        timeout=timeout,
    )
    response.raise_for_status()
    results = response.json()

    # A response without groups or items is treated as "no venues".
    groups = results.get('response', {}).get('groups', [])
    venues = groups[0].get('items', []) if groups else []
    if not venues:
        return None

    nearby_venues = pd.json_normalize(venues)  # flatten JSON
    # keep only the columns of interest; copy so the edits below act on an independent frame
    filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
    nearby_venues = nearby_venues.loc[:, filtered_columns].copy()
    # reduce each category list to the name of its first category
    nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
    # strip the dotted prefixes: 'venue.location.lat' -> 'lat'
    nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
    return nearby_venues

In [None]:
# Explore the venues around every neighbourhood in `df_loc`.
# The per-neighbourhood frames are collected in a list and concatenated once.
venue_frames = []
for nn, (name, lat, lng) in enumerate(zip(df_loc['Neighbourhood'], df_loc['latitude'], df_loc['longitude'])):
    # Pass the loop's own longitude (`lng`) to the lookup.
    df_tr = find_venue(lat, lng)
    if df_tr is None:
        len_found = 0
    else:
        len_found = len(df_tr)
        df_tr['neighbourhood'] = name
        venue_frames.append(df_tr)
    print('{}, venues of {} explored at lat: {} and long: {}, with {} venues'.format(nn, name, lat, lng, len_found))

if venue_frames:
    df_venues = pd.concat(venue_frames, ignore_index=True)
else:
    # No venue found anywhere: keep an empty frame with the expected columns.
    df_venues = pd.DataFrame(columns=['name', 'categories', 'lat', 'lng', 'neighbourhood'])
print('venues of Toronto are explored, the dataset shape is {} \n'.format(df_venues.shape))
df_venues.head()

In [None]:
# number of non-null entries in each column, grouped by neighbourhood
df_venues.groupby('neighbourhood').count()

In [None]:
# Report how many distinct venue categories and venue names were found.
n_unique_categories = len(df_venues['categories'].unique())
n_unique_venues = len(df_venues['name'].unique())
print('There are {} uniques categories \n'.format(n_unique_categories))
print('There are {} uniques venues \n'.format(n_unique_venues))