# ___This Notebook Contains all 3 Parts of the Assignment___


## ___Part 1 of 3 : Working on Postal Data and Dataframe___

## ___Import libary pandas, request and BeautifulSoup___

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Define URL of wikipedia site to scrape, Request site download and check status (response)

In [2]:
wikipage = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [3]:
wikipage.status_code

200

In [4]:
wikipage

<Response [200]>

### Call and assign wikipage Content and parser

In [5]:
content = BeautifulSoup(wikipage.content, 'lxml')

In [6]:

#list = content.find(id="mw-parser-output")
postal_table = content.find_all("table")

#print(content.prettify())
print(content.title.text)


List of postal codes of Canada: M - Wikipedia


### Specifying The Table of Interest

In [7]:
postal_table = content.find("table", attrs={"class": "wikitable"})
postal_table_data = postal_table.find_all("tr")  # contains 2 rows

### Grab the Cell "td" from TableHeader (indentified by "th")  and Table Content (identified by "tr") and Clear unwanted html tag

In [8]:
data = {}
# Get all the headings of Lists

t_headers = []
for th in postal_table_data[0].find_all("th"):
    # remove any newlines and extra spaces from left and right
    t_headers.append(th.text.replace('\n', ' ').strip())
#    print(t_headers)

t_data = []    
for tr in postal_table.find_all("tr") :
    t_row = {}
    for thd, head in zip(tr.find_all("td"),t_headers):               
        t_row[head]=thd.text.replace('\n', ' ').strip()
    t_data.append(t_row)


#t_headers.append[t_data]
type(t_data)
data=t_headers + t_data
newData=pd.DataFrame(t_data)

### Get Description of dataframe and Understanding Data Types

In [9]:
newData.describe()

Unnamed: 0,Borough,Neighborhood,Postal Code
count,180,180.0,180
unique,11,99.0,180
top,Not assigned,,M9X
freq,77,77.0,1


In [10]:
newData.dtypes

Borough         object
Neighborhood    object
Postal Code     object
dtype: object

### Reordering Columns to match instruction , Drop empty Cell, Drop "Not assigned" Borough and Reset Index

In [144]:
df = newData[['Postal Code','Borough', 'Neighborhood']]
#df.dropna()
indexNames = df[ df['Borough'] =="Not assigned" ].index 

# Delete these row indexes from dataFrame
Cleaned_df=df.drop(indexNames)
Cleaned_df.dropna(how='any', axis=0, inplace=True)
Cleaned_df.head(20)
Postal_Data = Cleaned_df.reset_index(drop=True)
Postal_Data

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Inspecting Dataframe Dimension

In [145]:
Postal_Data.shape

(103, 3)

# Part 2 of 3 : Adding Longitude and Latitude Data

In [13]:
#Attempting to use geocoder
import sys
!{sys.executable} -m pip install geocoder



In [38]:
#installation of geopy as alternative geocoder
import sys
!{sys.executable} -m pip install geopy
!{sys.executable} -m pip install numpy



### Using geopy Nominatim package for Geocoding 

In [146]:
import numpy as np
#function to get longitude and latitude data from address
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="my-application")
def geolocate(postal_address):
    try:
        # Geolocate the center of the country
        loc = geolocator.geocode(postal_address)
        # And return latitude and longitude
        return (loc.latitude, loc.longitude)
    except:
        # Return missing value
        return np.nan

In [227]:
#Result Sample from Geopy 
geolocate('Downtown Toronto, Canada')

(43.6563221, -79.3809161)

In [226]:
#Geocoder attempt was not very successful
import geocoder
geocoder.google('Toronto, Ontario')

<[REQUEST_DENIED] Google - Geocode [empty]>

In [147]:
Postal_Data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Iterating Postal data to find corresponding Coordinate and add new LatLong  Columns

In [224]:
# Declare a list that is to be converted into a column 
latlist=[]
lonlist=[]


for row in Postal_Data.iterrows():
    address = row[1][1]
    
    # couple of special cases where Borough return coordinate outside Canada,hence a fix is required for Scarborough & York
    if (address == "Scarborough") :
        lat,long = geolocate("Scarborough, Canada")  
    elif (address == "York"):
        lat,long = geolocate("York, Canada")
    else :  
        lat,long = geolocate(address)
        
    latlist.append(lat)
    lonlist.append(long)
    #print(row[1][1])
    
    #Add new Lat and Long Columnn to dataframe
Postal_Data['Latitude'] = latlist
Postal_Data['Longitude'] = lonlist
# Observe the result 
Postal_Data.head(20)

Unnamed: 0,Postal Code,Borough,Neighborhood,Longitude,Latitude
0,M3A,North York,Parkwoods,-79.449117,43.754326
1,M4A,North York,Victoria Village,-79.449117,43.754326
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",-79.380812,43.654174
3,M6A,North York,"Lawrence Manor, Lawrence Heights",-79.449117,43.754326
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",-79.380812,43.654174
5,M9A,Etobicoke,Islington Avenue,-79.552492,43.671459
6,M1B,Scarborough,"Malvern, Rouge",-79.257774,43.773077
7,M3B,North York,Don Mills,-79.449117,43.754326
8,M4B,East York,"Parkview Hill, Woodbine Gardens",-76.686355,39.973709
9,M5B,Downtown Toronto,"Garden District, Ryerson",-79.380812,43.654174


In [225]:
Postal_Data.head(20)

Unnamed: 0,Postal Code,Borough,Neighborhood,Longitude,Latitude
0,M3A,North York,Parkwoods,-79.449117,43.754326
1,M4A,North York,Victoria Village,-79.449117,43.754326
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",-79.380812,43.654174
3,M6A,North York,"Lawrence Manor, Lawrence Heights",-79.449117,43.754326
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",-79.380812,43.654174
5,M9A,Etobicoke,Islington Avenue,-79.552492,43.671459
6,M1B,Scarborough,"Malvern, Rouge",-79.257774,43.773077
7,M3B,North York,Don Mills,-79.449117,43.754326
8,M4B,East York,"Parkview Hill, Woodbine Gardens",-76.686355,39.973709
9,M5B,Downtown Toronto,"Garden District, Ryerson",-79.380812,43.654174


# Part 3/3 Exploring using map

In [233]:
print('The Postal dataframe has {} boroughs and {} neighborhoods.'.format(
        len(Postal_Data['Borough'].unique()),
        Postal_Data.shape[0]
    )
)

The Postal dataframe has 10 boroughs and 103 neighborhoods.


In [234]:
address = 'Toronto, Canada '

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [235]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Postal_Data['Latitude'], Postal_Data['Longitude'], Postal_Data['Borough'], Postal_Data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto