# Applied Data Science Capstone Project
## Final Project of [IBM's Data Science Professional Certificate Course](https://www.coursera.org/professional-certificates/ibm-data-science)

First, let's import all the libraries we need.

In [1]:
import pandas as pd
import numpy as np 
import geocoder

from pandas.io.html import read_html

### Downloading Data
Let's scrape the table of Toronto neighborhoods from Wikipedia into a dataframe.

In [2]:
# Wikipedia article whose table lists the Canadian postal codes starting with "M"
page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Parse every table carrying the "wikitable" CSS class into a list of dataframes
wikitable = read_html(
    page,
    attrs={"class": "wikitable"},
)

# The postal-code table is the first match on the page
df_toronto = wikitable[0]

### Pre-processing

In [3]:
# Clean the scraped table as one chained expression that builds a new dataframe
# instead of mutating the scraped one in place, so re-running this cell always
# produces the same result.
df_toronto = (
    df_toronto
    # Drop all rows where borough is not assigned
    .loc[lambda df: df['Borough'] != 'Not assigned']
    # As neighborhoods are already grouped by postal codes, only replace the
    # slashes with commas. The vectorized literal (non-regex) replacement also
    # leaves missing values untouched instead of raising on a non-string entry.
    .assign(
        Neighborhood=lambda df: df['Neighborhood'].str.replace(" /", ",", regex=False)
    )
    # Reset the index, as some rows were dropped
    .reset_index(drop=True)
)

# Also, there are no "Not assigned" neighborhoods, so there is no need to correct it

# Show the first rows of the cleaned dataframe
df_toronto.head(20)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Now, let's add the latitude and longitude for each location.

In [4]:
# Upper bound on geocoding attempts per postal code, so that a lookup which
# never succeeds raises an error instead of looping forever
MAX_GEOCODE_ATTEMPTS = 10


def _geocode_postal_code(postal_code, max_attempts=MAX_GEOCODE_ATTEMPTS):
    """Look up the coordinates of a Toronto postal code with the ArcGIS geocoder.

    The query is repeated while the geocoder reports no coordinates
    (``latlng`` is ``None``), up to ``max_attempts`` times.

    Parameters
    ----------
    postal_code : str
        Postal code to look up; it is queried as "<postal_code>, Toronto, Ontario".
    max_attempts : int
        Maximum number of queries sent before giving up.

    Returns
    -------
    The geocoder's ``latlng`` value, with the latitude at index 0 and the
    longitude at index 1.

    Raises
    ------
    RuntimeError
        If no coordinates were obtained after ``max_attempts`` queries.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)

    for _ in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords

    raise RuntimeError(
        'Could not geocode postal code {!r} after {} attempts'.format(
            postal_code, max_attempts
        )
    )


# For each postal code, find its coordinates (one entry per row, in row order)
coordinates = [
    _geocode_postal_code(postal_code) for postal_code in df_toronto['Postal code']
]
latitude = [coords[0] for coords in coordinates]
longitude = [coords[1] for coords in coordinates]

# Create new columns with the latitude and longitude lists
df_toronto['Latitude'] = latitude
df_toronto['Longitude'] = longitude

# Show the first rows of the dataframe
df_toronto.head(20)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939
5,M9A,Etobicoke,Islington Avenue,43.667481,-79.528953
6,M1B,Scarborough,"Malvern, Rouge",43.808626,-79.189913
7,M3B,North York,Don Mills,43.7489,-79.35722
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.707193,-79.311529
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529
