#  Agrupación y Segmentación de Vecindarios en la Ciudad de Toronto

## Librerias a Emplear

In [142]:
import numpy as np # librería para manejar datos vectorizados

import pandas as pd # librería para análisis de datos
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # librería para manejar archivos JSON 

#!conda install -c conda-forge geopy --yes # retirar el comentario de esta línea si no ha completado el laboratorio de la API de FourSquare 
from geopy.geocoders import Nominatim # convertir una dirección en valores de latitud y longitud

import requests # librería para manejar solicitudes
from pandas.io.json import json_normalize # librería para convertir un archivo json en un dataframe pandas

# Matplotlib y módulos asociados para graficar
import matplotlib.cm as cm
import matplotlib.colors as colors

# importar k-means desde la fase de agrupación
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # retirar el comentario de esta línea si no ha completado el laboratorio de la API de FourSquare
import folium # librería para graficar mapas 

print('Libraries imported.')

Libraries imported.


In [143]:
from bs4 import BeautifulSoup
import matplotlib.cm as cm
import matplotlib.colors as colors

## Lectura de Datos a Emplear

In [144]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
req = requests.get(url)
soup = BeautifulSoup(req.content, "html.parser")

In [145]:
tabla = soup.find('table')

In [146]:
tabla_final = []

for i in range(0,9):
    for j in tabla.find_all('tr'):
        cols = j.find_all('p')
        tabla_final.append(cols[i].getText())

In [147]:
df = pd.DataFrame(tabla_final)
df.head()

Unnamed: 0,0
0,M1ANot assigned\n
1,M1BScarborough(Malvern / Rouge)\n
2,M1CScarborough(Rouge Hill / Port Union / Highl...
3,M1EScarborough(Guildwood / Morningside / West ...
4,M1GScarborough(Woburn)\n


Le damos un formato a la tabla, para obtener la tabla solicitada

In [148]:
df['Postal Code'] = df[0].str[0:3]
df["Borough"]=df[0].str[3:].str.split("(",n=1,expand=True)[0]
df["Neigh"]=df[0].str.rsplit("(",n=2,expand=True)[1].str.split(")",n=1,expand=True)[0]
df["Neighborhood"]=df["Neigh"].str.replace(" / ",",")

In [149]:
df.head()

Unnamed: 0,0,Postal Code,Borough,Neigh,Neighborhood
0,M1ANot assigned\n,M1A,Not assigned\n,,
1,M1BScarborough(Malvern / Rouge)\n,M1B,Scarborough,Malvern / Rouge,"Malvern,Rouge"
2,M1CScarborough(Rouge Hill / Port Union / Highl...,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,"Rouge Hill,Port Union,Highland Creek"
3,M1EScarborough(Guildwood / Morningside / West ...,M1E,Scarborough,Guildwood / Morningside / West Hill,"Guildwood,Morningside,West Hill"
4,M1GScarborough(Woburn)\n,M1G,Scarborough,Woburn,Woburn


In [150]:
df_final=df[(df["Borough"]!="Not assigned\n")]
df_final=df_final.drop([0,"Neigh"],axis=1)

In [151]:
df_final

Unnamed: 0,Postal Code,Borough,Neighborhood
1,M1B,Scarborough,"Malvern,Rouge"
2,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek"
3,M1E,Scarborough,"Guildwood,Morningside,West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
6,M1J,Scarborough,Scarborough Village
7,M1K,Scarborough,"Kennedy Park,Ionview,East Birchmount Park"
8,M1L,Scarborough,"Golden Mile,Clairlea,Oakridge"
9,M1M,Scarborough,"Cliffside,Cliffcrest,Scarborough Village West"
10,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [152]:
df_final.shape

(103, 3)

## Obtención de las coordenadas de Latitud y Longitud

In [153]:
file_name ='https://cocl.us/Geospatial_data'
Geoconder = pd.read_csv(file_name)

In [154]:
neighborhoods=pd.merge(df_final,Geoconder,on="Postal Code")

In [155]:
neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [156]:
type(neighborhoods)

pandas.core.frame.DataFrame

### Observamos cuantos barrios unicos hay en Toronto

In [160]:
print('There are {} uniques Borough.'.format(len(neighborhoods['Borough'].unique())))

There are 15 uniques Borough.


In [161]:
neighborhoods['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'East YorkEast Toronto', 'Central Toronto', 'Downtown Toronto',
       'Downtown TorontoStn A PO Boxes25 The Esplanade', 'York',
       'West Toronto', "Queen's Park",
       'MississaugaCanada Post Gateway Processing Centre',
       'East TorontoBusiness reply mail Processing Centre969 Eastern',
       'Etobicoke', 'EtobicokeNorthwest'], dtype=object)

## Agrupación y segmentación de los vecindarios

#### Utilice la librería geopy para obtener la latitud y la longitud de la Ciudad de Toronto

Para poder definir una instancia del geocoder necesitaremos definir un user_agent. Nombraremos a nuestro agente ny_explorer, como se muestra a continuación.

In [162]:
neighborhoods = neighborhoods[neighborhoods['Borough'].str.contains('Toronto')].reset_index(drop=True)
neighborhoods.groupby("Borough").size().sort_values(ascending=False)

Borough
Downtown Toronto                                                17
Central Toronto                                                  9
West Toronto                                                     6
East Toronto                                                     4
East YorkEast Toronto                                            1
East TorontoBusiness reply mail Processing Centre969 Eastern     1
Downtown TorontoStn A PO Boxes25 The Esplanade                   1
dtype: int64

In [163]:
address = 'TORONTO,CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Genere un mapa de Toronto con los barrios super puestos por encima.


In [164]:
# crear un mapa de Nueva York utilizando los valores de latitud y longitud
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# añadir marcadores al mapa
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

## Explorar Barrios en Toronto

In [166]:
neighborhood_latitude = neighborhoods.loc[0, 'Latitude'] # latitud del barrio 
neighborhood_longitude = neighborhoods.loc[0, 'Longitude'] # longitud del barrio

neighborhood_name = neighborhoods.loc[0, 'Neighborhood'] # nombre del barrio

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


In [167]:
headers = {
        'Accept': 'application/json',
        'Authorization': 'fsq3z8M+K9/afb++fE1PLmG6xOKTPAj/SsruzHI0bYoklOk='
    }
url = 'https://api.foursquare.com/v3/places/search?ll={},{}&radius={}&limit={}'
url = url.format(neighborhood_latitude,neighborhood_longitude, 500, 50)
url

'https://api.foursquare.com/v3/places/search?ll=43.67635739999999,-79.2930312&radius=500&limit=50'

In [168]:
results = requests.get(url,headers = headers).json()
results

{'results': [{'fsq_id': '4bd461bc77b29c74a07d9282',
   'categories': [{'id': 16019,
     'name': 'Hiking Trail',
     'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/hikingtrail_',
      'suffix': '.png'}}],
   'chains': [],
   'distance': 89,
   'geocodes': {'main': {'latitude': 43.676676, 'longitude': -79.294126},
    'roof': {'latitude': 43.676676, 'longitude': -79.294126}},
   'link': '/v3/places/4bd461bc77b29c74a07d9282',
   'location': {'country': 'CA',
    'cross_street': 'Queen St.',
    'formatted_address': 'Queen St., Toronto ON',
    'locality': 'Toronto',
    'region': 'ON'},
   'name': 'Glen Manor Ravine',
   'related_places': {},
   'timezone': 'America/Toronto'},
  {'fsq_id': '4dbc8fe96a23e294ba3237bd',
   'categories': [{'id': 16032,
     'name': 'Park',
     'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/park_',
      'suffix': '.png'}}],
   'chains': [],
   'distance': 176,
   'geocodes': {'main': {'latitude': 43.67

In [169]:
def getNearbyVenues(names, latitudes, longitudes, radius=500,limit=5):

    URL='https://api.foursquare.com/v3/places/search?ll={},{}&radius={}&limit={}'
    headers = {
        'Accept': 'application/json',
        'Authorization': 'fsq3z8M+K9/afb++fE1PLmG6xOKTPAj/SsruzHI0bYoklOk='
    }
    df_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = URL.format(lat, lng, radius, limit)
        results = requests.get(url,headers = headers).json()
        
        for each_result in results['results']: # filter the result based on JSON identification
            result={}
            result['Neighborhood']=name
            result['Neighborhood Latitude']=lat
            result['Neighborhood Longitude']=lng
            result['Name']=each_result['name']
            result['Venue Latitude']=each_result['geocodes']['main']['latitude']
            result['Venue Longitude']=each_result['geocodes']['main']['longitude']
            result['Locality']=each_result['location']['locality']
            result['Category_Names']=each_result['categories'][0]['name']
            df_list.append(result.copy())
    return pd.DataFrame(df_list)

#### Ahora ejecutamos la función anterior para cada categoria y creamos un nuevo dataframe llamado Toronto_venues


In [170]:
Toronto_venues = getNearbyVenues(names=neighborhoods['Neighborhood'],
                                   latitudes=neighborhoods['Latitude'],
                                   longitudes=neighborhoods['Longitude'])

IndexError: list index out of range

#### Revisemos el tamaño del dataframe resultante

In [None]:
print(Toronto_venues.shape)
Toronto_venues.head()

Revisemos cuantos sitios se regresarón para cada barrio

In [None]:
Toronto_venues.groupby('Neighborhood').count()

Encontremos cuantas categorías únicas se pueden conservar de todos los sitios regresados

In [None]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Category_Names'].unique())))

## Analizar Cada Barrio

In [None]:
# codificación
Toronto_onehot = pd.get_dummies(Toronto_venues[['Category_Names']], prefix="", prefix_sep="")

# añadir la columna de barrio de regreso al dataframe
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood'] 

# mover la columna de barrio a la primer columna
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Examinemos el tamaño del nuevo dataframe

In [None]:
Toronto_onehot.shape

#### Agrupemos las filas por barrios tomando la média de la frecuancia de la ocurrencia de cada categoría

In [None]:
Toronto_grouped = Toronto_onehot.groupby('Neighborhood').mean().reset_index()
Toronto_grouped

#### Confirmemos el nuevo tamaño

In [None]:
Toronto_grouped.shape

#### Imprimamos cada barrio junto con los 5 sitios mas comunes

In [None]:
num_top_venues = 5

for hood in Toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Toronto_grouped[Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

#### Pongamos eso en el dataframe

Primero escribamos una función para ordenar los sitios en orden descendente.

In [171]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Generemos el nuevo dataframe y mostremos los primeros 10 sitios de cada barrio.

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# crear las columnas acorde al numero de sitios populares
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# crear un nuevo dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

## 4. Barrios Agrupados


Ejecutemos k-means para agrupar los barrios en 5 agrupaciones.

In [None]:
# establecer el número de agrupaciones
kclusters = 5

Toronto_grouped_clustering = Toronto_grouped.drop('Neighborhood', 1)

# ejecutar k-means
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# revisar las etiquetas de las agrupaciones generadas para cada fila del dataframe
kmeans.labels_[0:10] 

Generemos un nuevo dataframe que incluya la agrupación asi como los 10 sitios mas populares de cada barrio.

In [None]:
# añadir etiquetas
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Toronto_merged = Toronto_data

# juntar manhattan_grouped con manhattan_data 
Toronto_merged = Toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto_merged.head() # revisar las ultimas columnas

Finalmente visualicemos las agrupaciones resultantes

In [None]:
# crear mapa
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# establecer el esquema de color para las agrupaciones
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# añadir marcadores al mapa
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examinar Agrupaciones

Ahora puede examinar cada agrupación y determinar las categorias del sitio que distingue a cada agrupación. En base a las categorias definidas usted puede asignar un nombre a cada agrupación. Dejaré este ejercicio para usted.

#### Agrupación 1

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 0, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

#### Agrupación 2


In [172]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 1, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

NameError: name 'Toronto_merged' is not defined

### Agrupación 3

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 2, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

### Agrupación 4

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 3, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

### Agrupación 5

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 4, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]