In [1]:
# Importando librerías esenciales
import pandas as pd
import numpy as np

In [2]:
# Importamos librerías necesarias para obtener información 
import requests
from bs4 import BeautifulSoup

# Scrape the table of Toronto ("M") postal codes from Wikipedia with
# Beautiful Soup and load it into the `df` DataFrame.
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = requests.get(URL, timeout=30)
page.raise_for_status()  # fail loudly instead of parsing an HTTP error page
soup = BeautifulSoup(page.content, "html.parser")

contentTable = soup.find('table')
if contentTable is None:
    raise ValueError("No <table> element found at {}".format(URL))

# Postal codes are read from the <b> tags and the "Borough(Neighborhood)"
# text from the <span> tags; the two lists are paired by position.
codes = contentTable.find_all('b')
boroughs_neighborhoods = contentTable.find_all('span')
if len(boroughs_neighborhoods) < len(codes):
    raise ValueError(
        "Found {} postal codes but only {} borough/neighborhood cells".format(
            len(codes), len(boroughs_neighborhoods)))

# Collect one record per assigned postal code, then build the frame once
# (avoids repeated np.append calls, which copy the whole array each time).
records = []
for code_tag, cell_tag in zip(codes, boroughs_neighborhoods):
    parts = cell_tag.get_text().split("(")
    borough = parts[0].strip()
    if borough == "Not assigned":  # only keep valid entries
        continue
    if len(parts) > 1:
        # Drop the closing parenthesis and turn " /" separators into commas
        neighborhood = parts[1].replace(")", "").replace(" /", ",").strip()
    else:
        # No "(...)" part in the cell: reuse the borough name so the row is
        # kept instead of failing with an IndexError.
        neighborhood = borough
    records.append((code_tag.get_text().strip(), borough, neighborhood))

df = pd.DataFrame(records, columns=['Postal Code', 'Borough', 'Neighborhood'])

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [3]:
# Sanity check: list any rows whose neighborhood is still "Not assigned"
df.loc[df["Neighborhood"].eq("Not assigned")]

Unnamed: 0,Postal Code,Borough,Neighborhood


In [4]:
# Dimensions of the scraped table as (rows, columns)
df.shape

(103, 3)

In [5]:
# Load the geospatial coordinates from the .csv file.
# The path is relative, so the file must be in the current working directory.
df_geo = pd.read_csv('Geospatial_Coordinates.csv')

df_geo.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Geospatial_Coordinates.csv'

In [13]:
# Join the scraped postal-code table with the coordinates on the postal code.
# Only the three scraped columns are taken from `df`, so re-running this cell
# (when `df` already carries Latitude/Longitude) does not produce suffixed
# Latitude_x/Latitude_y columns.
df = pd.merge(
    df[['Postal Code', 'Borough', 'Neighborhood']],
    df_geo[['Postal Code', 'Latitude', 'Longitude']],
    on='Postal Code',
)

df.head()

NameError: name 'df_geo' is not defined

In [14]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Importando Matplotlib y otros módulos
import matplotlib.cm as cm
import matplotlib.colors as colors

# Importando k-Means 
from sklearn.cluster import KMeans

# Importando folium para gráficos
import folium


In [None]:
# Keep only the rows whose borough name contains "Toronto"
df_TO = (
    df
    .loc[lambda frame: frame['Borough'].str.contains("Toronto")]
    .reset_index(drop=True)
)

df_TO.head()

In [None]:
# Look up the coordinates of the city of Toronto
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when the service finds no match; stop here with a
    # clear message instead of an AttributeError on `.latitude` below.
    raise ValueError("Nominatim returned no result for address {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print('Las coordenadas geográficas de Toronto son {}, {}.'.format(latitude, longitude))

In [None]:
# Build a folium map centred on the latitude/longitude found for Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Drop one circle marker per neighborhood onto the map
marker_rows = zip(
    df_TO['Latitude'],
    df_TO['Longitude'],
    df_TO['Borough'],
    df_TO['Neighborhood'],
)
for row_lat, row_lng, row_borough, row_neighborhood in marker_rows:
    popup_text = '{0}, {1}'.format(row_neighborhood, row_borough)
    marker = folium.CircleMarker(
        [row_lat, row_lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# folium maps do not render on GitHub, so a screenshot of the map is used instead
# map_toronto

In [None]:
# Latitude and longitude as the feature matrix X.
# Columns are selected by name rather than by position, so X stays a two-column
# float array even after later cells add columns (e.g. 'Cluster') to df_TO.
X = df_TO[['Latitude', 'Longitude']].to_numpy(dtype=float)
X

In [None]:
# How many clusters to look for
kclusters = 4

# Configure the k-means estimator, then fit it on the coordinates
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(X)

# Peek at the labels assigned to the first ten rows
kmeans.labels_[:10]

In [None]:
# Attach the k-means label of each row to the Toronto frame
df_TO = df_TO.assign(Cluster=kmeans.labels_)

df_TO.head()

In [None]:
# Marker colour for each cluster label (indexed by label)
rainbow = ['blue', 'green', 'yellow', 'red']

# Add one cluster-coloured marker per neighborhood to the existing map
markers_colors = []
cluster_rows = zip(
    df_TO['Latitude'],
    df_TO['Longitude'],
    df_TO['Neighborhood'],
    df_TO['Cluster'],
)
for lat, lon, poi, cluster in cluster_rows:
    cluster_color = rainbow[cluster]
    popup_text = ' Cluster '.join([str(poi), str(cluster)])
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)

# folium maps do not render on GitHub, so a screenshot of the map is used instead
# map_toronto