### Notebook for scrapping Postal_Code , Borough and Neighborhood from given Wikipedia page.

#### Import Libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# get response from the given wikipedia page

website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [3]:
# convert above response in HTML format using BeautifulSoup.

soup = BeautifulSoup(website_url,'lxml')

In [4]:
# In the above HTML page table of interest is "wikitable sortable".
# We can access the table using below code.
my_table = soup.find('table',{'class':'wikitable sortable'})

In [5]:
columns = []           # list for header / columns of the table.
all_data = []          # list for all <td> values. 
postal_codes = []      # list for all postal codes i.e. every third item of all_data[] list starting from 0 index.
borough = []           # list for all borough i.e. every third item of all_data[] list starting from 1 index.
neighborhood = []      # list for all neighnorhood i.e. every third item of all_data[] list starting from 2 index.

In [6]:
# get all <th> values from the table i.e. column names for our dataframe.
# removing 'newline' from last column.

cols = my_table.find_all('th')
for c in cols:
    columns.append(c.text)
columns[2] = columns[2].replace('\n','')
columns

['Postcode', 'Borough', 'Neighbourhood']

In [7]:
#populate all_data[] with all <td> values and removing 'newline' from last column.

all_d = my_table.find_all('td')
for a in all_d:
    all_data.append(a.text)
    
for i in range(len(all_data)):
    all_data[i] = all_data[i].replace('\n','')


In [8]:
#polulate Postal_Codes[] , Borough[] and Neighborhood[].

postal_codes = all_data[::3]
borough = all_data[1::3]
neighborhood = all_data[2::3]


In [10]:
# Creating Dataframe from above data.
df = pd.DataFrame() #creating empty dataframe.

#Populating columns
df[columns[0]] = postal_codes
df[columns[1]] = borough
df[columns[2]] = neighborhood

### Now we need to clean the above dataframe using below conditions:
1.Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

2.If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park

In [11]:
df = df[df.Borough != 'Not assigned'] #satisfies first condition.
df.Neighbourhood.replace('Not assigned' , df.Borough , inplace = True)  #satisfies second condition.
df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [12]:
# shape of the resulting dataframe
df.shape

(211, 3)

In [13]:
#import geospatial file into dataframe
df_geo = pd.read_csv(r'C:\Users\aksha\Desktop\projects\Coursera_Capstone\Geospatial_Coordinates.csv')

In [14]:
# lets compare columns of both dataframe in order to join and get latitude and longitudes

print(df.columns)
print(df_geo.columns)

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')
Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')


In [15]:
#change column name of first dataframe from 'Postcode' to 'Postal Code'

df.rename(columns={'Postcode' : 'Postal Code'}, inplace=True)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [16]:
# Now lets merge both dataframes on 'Postal code'

df = df.merge(df_geo , on='Postal Code')
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.654260,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763
5,M6A,North York,Lawrence Manor,43.718518,-79.464763
6,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
7,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
8,M1B,Scarborough,Rouge,43.806686,-79.194353
9,M1B,Scarborough,Malvern,43.806686,-79.194353


## Now lets get toronto map and place all borough and neighborhood

In [17]:
## install geopy

!conda install -c conda-forge geopy --yes

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/win-64::anaconda==2018.12=py37_0
  - defaults/win-64::blaze==0.11.3=py37_0
  - defaults/win-64::numba==0.41.0=py37hf9181ef_0
done

## Package Plan ##

  environment location: C:\Users\aksha\Anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _anaconda_depends-2019.03  |           py37_0           5 KB
    anaconda-custom            |           py37_1           2 KB
    ca-certificates-2019.9.11  |       hecc5488_0         181 KB  conda-forge
    certifi-2019.6.16          |           py37_1         149 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1

DEBUG menuinst_win32:__init__(196): Menu: name: 'Anaconda${PY_VER} ${PLATFORM}', prefix: 'C:\Users\aksha\Anaconda3', env_name: 'None', mode: 'user', used_mode: 'user'
DEBUG menuinst_win32:create(320): Shortcut cmd is %windir%\System32\WindowsPowerShell\v1.0\powershell.exe, args are ['-ExecutionPolicy', 'ByPass', '-NoExit', '-Command', '"& \'C:\\Users\\aksha\\Anaconda3\\shell\\condabin\\conda-hook.ps1\' ; conda activate \'C:\\Users\\aksha\\Anaconda3\' "']


In [18]:
from geopy.geocoders import Nominatim
import folium
import matplotlib.pyplot as plt

In [19]:
#Get toronto co-ordinates

address = 'Toronto , Canada'
geolocator = Nominatim()
location = geolocator.geocode(address)

latitude = location.latitude
longitude = location.longitude

print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  after removing the cwd from sys.path.


The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [21]:
# plot map of toronto

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(df['Latitude'] , df['Longitude'] , df['Borough'] , df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto

## Now lets make cluster of all borough that contains 'toronto' in their names.

In [22]:
df_borough_toronto = df[df['Borough'].str.contains('Toronto')].reset_index(drop = True)
df_borough_toronto

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
1,M5A,Downtown Toronto,Regent Park,43.654260,-79.360636
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
5,M4E,East Toronto,The Beaches,43.676357,-79.293031
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M6G,Downtown Toronto,Christie,43.669542,-79.422564
9,M5H,Downtown Toronto,Adelaide,43.650571,-79.384568


In [34]:
# group above data by 'Borough'

map_toronto_borough = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(df_borough_toronto['Latitude'] , df_borough_toronto['Longitude'] , df_borough_toronto['Borough'] , df_borough_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_borough)
    
map_toronto_borough

In [33]:
# Applying K means to create 4 clusters

from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.preprocessing import LabelEncoder

toronto_grouped_clustering = df_borough_toronto.drop(columns=['Postal Code'] , axis=1)

le = LabelEncoder()
le.fit_transform(df_borough_toronto)



kclusters = 4

kmeans = KMeans(n_clusters=kclusters, random_state=1).fit(toronto_grouped_clustering)

df_borough_toronto['Cluster Labels'] = KMeans.labels_

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_borough_toronto['Latitude'], df_borough_toronto['Longitude'], df_borough_toronto['Neighbourhood'],kmeans.labels_):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters


ValueError: bad input shape (74, 5)