In [10]:
#Prepare dependencies
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

In [301]:
# Silence version/deprecation and other warnings for the rest of the session.
# This only hides warnings; exceptions are still raised as usual.
# Nothing is changed when warning options were supplied on the command line (-W).
import sys

if len(sys.warnoptions) == 0:
    import warnings

    warnings.simplefilter(action="ignore")

In [304]:
# Scrape the Toronto postal-code list from Wikipedia.
# pd.read_html returns a list of DataFrames, one per HTML table it finds on
# the page; the first table is the one used from here on.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki = pd.read_html(url)

# Work on an independent copy instead of a `[:]` slice of the scraped table,
# so that later clean-up steps neither modify `wiki[0]` nor trigger
# SettingWithCopyWarning.
neighborhoods = wiki[0].copy()


In [305]:
# Remove every record whose Borough is 'Not assigned'.
cond = neighborhoods['Borough'] == 'Not assigned'

# Select the remaining rows into a new frame rather than dropping in place:
# the cell stays safe to re-run and no slice of the scraped table is mutated.
# The original row index is kept as-is.
neighborhoods = neighborhoods.loc[~cond].copy()

#neighborhoods

In [309]:
# Count the records per postcode; a count above 1 means several
# neighbourhoods share that postcode.
nh = neighborhoods.groupby(by='Postcode').count()
nh.head(n=5)

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,2,2
M1C,3,3
M1E,3,3
M1G,1,1
M1H,1,1


In [271]:
# Collect the distinct postal codes, in order of first appearance.
# The column is selected by name (as the other cells do) rather than by
# position, so the result does not depend on the column order of the
# scraped table.
nharr = neighborhoods['Postcode'].unique()
nharr

array(['M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M9A', 'M1B', 'M3B', 'M4B',
       'M5B', 'M6B', 'M9B', 'M1C', 'M3C', 'M4C', 'M5C', 'M6C', 'M9C',
       'M1E', 'M4E', 'M5E', 'M6E', 'M1G', 'M4G', 'M5G', 'M6G', 'M1H',
       'M2H', 'M3H', 'M4H', 'M5H', 'M6H', 'M1J', 'M2J', 'M3J', 'M4J',
       'M5J', 'M6J', 'M1K', 'M2K', 'M3K', 'M4K', 'M5K', 'M6K', 'M1L',
       'M2L', 'M3L', 'M4L', 'M5L', 'M6L', 'M9L', 'M1M', 'M2M', 'M3M',
       'M4M', 'M5M', 'M6M', 'M9M', 'M1N', 'M2N', 'M3N', 'M4N', 'M5N',
       'M6N', 'M9N', 'M1P', 'M2P', 'M4P', 'M5P', 'M6P', 'M9P', 'M1R',
       'M2R', 'M4R', 'M5R', 'M6R', 'M7R', 'M9R', 'M1S', 'M4S', 'M5S',
       'M6S', 'M1T', 'M4T', 'M5T', 'M1V', 'M4V', 'M5V', 'M8V', 'M9V',
       'M1W', 'M4W', 'M5W', 'M8W', 'M9W', 'M1X', 'M4X', 'M5X', 'M8X',
       'M4Y', 'M7Y', 'M8Y', 'M8Z'], dtype=object)

In [316]:
# Build the final dataframe: one row per postal code, with the names of all
# neighbourhoods that share the code joined into one ", "-separated string.
#
# groupby(..., sort=False) keeps the postal codes in order of first
# appearance, i.e. the order obtained by iterating over the unique codes.
# 'last' takes the borough from the last record of each postal code.
#
# This replaces a row-by-row loop that used DataFrame.ix (deprecated and
# removed in pandas 1.0), rebound the built-in name `str` for every later
# cell, and grew the frame one row at a time.
#
# NOTE(review): a Neighbourhood value of 'Not assigned' is carried over
# unchanged (see M7A in the preview) - confirm whether it should fall back to
# the borough name instead.
df = (
    neighborhoods
    .groupby('Postcode', sort=False)
    .agg({'Borough': 'last', 'Neighbourhood': ', '.join})
    .reset_index()
    .rename(columns={'Postcode': 'Postalcode', 'Neighbourhood': 'Neighborhood'})
    .loc[:, ['Postalcode', 'Borough', 'Neighborhood']]
)

# Each postal code must now appear exactly once.
assert df['Postalcode'].is_unique

print(df.head(10))
      

  Postalcode           Borough                      Neighborhood
0        M3A        North York                         Parkwoods
1        M4A        North York                  Victoria Village
2        M5A  Downtown Toronto         Harbourfront, Regent Park
3        M6A        North York  Lawrence Heights, Lawrence Manor
4        M7A      Queen's Park                      Not assigned
5        M9A         Etobicoke                  Islington Avenue
6        M1B       Scarborough                    Rouge, Malvern
7        M3B        North York                   Don Mills North
8        M4B         East York   Woodbine Gardens, Parkview Hill
9        M5B  Downtown Toronto          Ryerson, Garden District


In [317]:
# Show the (rows, columns) size of the combined dataframe.
print(df.shape)

(103, 3)


# Joining the geospatial data to the dataframe
## Using the Geospatial_data.csv provided

In [318]:
# Read the provided geospatial CSV into its own dataframe.
geospatial_url = 'https://cocl.us/Geospatial_data'
df1 = pd.read_csv(geospatial_url)

In [326]:
# Inner-join the postal-code table with the geospatial table;
# the key is 'Postalcode' on the left and 'Postal Code' on the right.
df_merge_col = df.merge(
    df1,
    how='inner',
    left_on='Postalcode',
    right_on='Postal Code',
)

In [329]:
# The merge keeps both key columns; remove the redundant 'Postal Code' one.
# `axis` is passed by keyword because the positional form is deprecated and
# no longer accepted by recent pandas releases. errors='ignore' keeps this
# self-assigning cell re-runnable once the column is already gone.
df_merge_col = df_merge_col.drop('Postal Code', axis=1, errors='ignore')

In [330]:
# Preview the first five rows of the merged dataframe.
df_merge_col.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Not assigned,43.662301,-79.389494
