# Assignment for Segmenting and Clustering Neighborhoods in Toronto by Anagha Kulkarni

## Problem 2: Scraping the source Wikipedia page to build a dataframe and getting the latitude and the longitude co-ordinates of each neighborhood.

###### Importing required libraries

In [25]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

from bs4 import BeautifulSoup

!pip install folium
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


###### Scraping the raw data (Wikipedia page) to build a dataframe



In [28]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(wiki_url, timeout=30)  # bounded wait instead of hanging forever
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
raw_data = response.text

# The page is HTML, so use an HTML parser rather than the XML one.
soup = BeautifulSoup(raw_data, 'html.parser')

# The postal-code table is taken to be the first <table> on the page.
table = soup.find("table")
if table is None:
    raise ValueError("No <table> element found at {}".format(wiki_url))
table_body = table.tbody if table.tbody is not None else table
table_rows = table_body.find_all("tr")

res = []
for tr in table_rows:
    td = tr.find_all("td")
    row = [cell.text for cell in td]

    # Rows without any <td> (e.g. the header row) carry no data.
    if not row:
        continue

    # Cleaning up the data by ignoring cells with a borough that is Not assigned.
    # Compare on the stripped text so the check does not depend on trailing whitespace.
    if row[1].strip() == "Not assigned":
        continue

    # Assigning neighborhood to be the same as the borough, when a cell has a
    # borough but a "Not assigned" neighborhood.
    if "Not assigned" in row[2]:
        row[2] = row[1]
    res.append(row)

# Creating a Dataframe with 3 columns
df = pd.DataFrame(res, columns = ["PostalCode", "Borough", "Neighborhood"])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods\n
1,M4A\n,North York\n,Victoria Village\n
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"


###### Removing "\n" from all columns

In [29]:
# Drop the newline characters that the scraped table cells carry,
# one text column at a time.
for column_name in ("PostalCode", "Borough", "Neighborhood"):
    df[column_name] = df[column_name].str.replace("\n", "")
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


###### Printing the shape (number of rows and columns) of the dataframe

In [32]:
# Report the (rows, columns) shape of the cleaned dataframe.
print("Shape:  {}".format(df.shape))

Shape:  (103, 3)


###### Getting the latitude and the longitude co-ordinates of each neighborhood

In [33]:
def get_geocode(postal_code, max_attempts=10):
    """Look up the latitude and longitude of a Toronto postal code.

    Queries ``geocoder.google`` with ``'<postal_code>, Toronto, Ontario'`` and
    retries while the response carries no coordinates.

    Parameters
    ----------
    postal_code : str
        Postal code to geocode; it is interpolated into the query string.
    max_attempts : int, optional
        Maximum number of lookups before giving up (default 10). The bound
        prevents an endless loop when the service never returns coordinates.

    Returns
    -------
    tuple
        ``(latitude, longitude)``, the first two items of the response's
        ``latlng`` attribute.

    Raises
    ------
    RuntimeError
        If none of the ``max_attempts`` lookups returned coordinates.
    """
    # Imported here because the notebook's import cell never imports geocoder;
    # without this the first call fails with NameError.
    import geocoder

    query = '{}, Toronto, Ontario'.format(postal_code)
    for _ in range(max_attempts):
        g = geocoder.google(query)
        lat_lng_coords = g.latlng
        # Treat both None and an empty result as "no coordinates yet".
        if lat_lng_coords:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude

    raise RuntimeError(
        "No coordinates returned for {!r} after {} attempt(s)".format(
            postal_code, max_attempts
        )
    )

###### Since we are not able to get the geographical coordinates of the neighborhoods using the Geocoder package, we use a CSV file that has the geographical coordinates of each postal code

In [34]:
# Read the CSV of postal-code coordinates straight from its URL.
geo_csv_url = 'http://cocl.us/Geospatial_data'
df_geo_coordinates = pd.read_csv(geo_csv_url)
df_geo_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


###### Combining the dataframe of neighborhood data with the dataframe of geographical coordinates of each postal code to get the final dataframe

In [37]:
# Left-join the coordinates onto the neighborhood rows by postal code, then
# discard the right-hand key column, which only repeats "PostalCode".
df_neighb_geo = df.merge(
    df_geo_coordinates,
    how='left',
    left_on='PostalCode',
    right_on='Postal Code',
).drop("Postal Code", axis=1)
df_neighb_geo.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
