# Welcome to my capstone project
#### By Annie Vega

I'll start the project by setting up my environment, installing all that I need and keep going

In [None]:
#Make sure all the pip are installed in the terminal
#pip install jupyter
#pip install lxml

# specify which URL/web page we are going to be scraping
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from unicodedata import normalize

### I'm reading the table from Wikipedia using the function read_html, since the Neighbourhood table is the first one, we just call the table 0

In [9]:
table_CA = pd.read_html(url)
df = table_CA[0]
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


Ignore cells with a borough that is Not assigned

In [12]:
# get names of indexes for which column Borough has value "Not assigned" 
borough = df[ df['Borough'] == 'Not assigned' ].index 
  
# drop these row indexes from dataFrame 
df.drop(borough, inplace = True) 
  
df 

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.
### To prove if there's any Neighbourhood defined as "Not assigned", if all the values are 

In [17]:
(df['Neighbourhood'] == 'Not assigned').sum()

0

In [18]:
df.shape

(103, 3)

In [105]:
#Geospatial coordinates
filename = "https://cocl.us/Geospatial_data"
df1 = pd.read_csv(filename)
df1.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
df1.shape

(103, 3)

In [108]:
df1.sort_values('Postal Code')
df1.tail()

Unnamed: 0,Postal Code,Latitude,Longitude
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437
102,M9W,43.706748,-79.594054


In [109]:
df2 = df1.drop(['Postal Code'], axis = 1) 
df2

Unnamed: 0,Latitude,Longitude
0,43.806686,-79.194353
1,43.784535,-79.160497
2,43.763573,-79.188711
3,43.770992,-79.216917
4,43.773136,-79.239476
...,...,...
98,43.706876,-79.518188
99,43.696319,-79.532242
100,43.688905,-79.554724
101,43.739416,-79.588437


In [43]:
df.sort_values('Postal Code')
df.tail()

Unnamed: 0,Postal Code,Borough,Neighbourhood
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Concatenate the data frames and include the postal codes

In [110]:
frames = [df, df2] 

result = pd.concat([df2, df], axis=1, join='outer') 
display(result)

Unnamed: 0,Latitude,Longitude,Postal Code,Borough,Neighbourhood
0,43.806686,-79.194353,,,
1,43.784535,-79.160497,,,
2,43.763573,-79.188711,M3A,North York,Parkwoods
3,43.770992,-79.216917,M4A,North York,Victoria Village
4,43.773136,-79.239476,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...,...,...
160,,,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,,,M4Y,Downtown Toronto,Church and Wellesley
168,,,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,,,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [111]:
# get names of indexes for which column Borough has value "Not assigned" 
result = result.dropna() 

result

Unnamed: 0,Latitude,Longitude,Postal Code,Borough,Neighbourhood
2,43.763573,-79.188711,M3A,North York,Parkwoods
3,43.770992,-79.216917,M4A,North York,Victoria Village
4,43.773136,-79.239476,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,43.744734,-79.239476,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,43.727929,-79.262029,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...,...,...
95,43.643515,-79.577201,M6N,York,"Runnymede, The Junction North"
98,43.706876,-79.518188,M9N,York,Weston
99,43.696319,-79.532242,M1P,Scarborough,"Dorset Park, Wexford Heights, Scarborough Town..."
100,43.688905,-79.554724,M2P,North York,York Mills West


In [113]:
#Reorganize columns
result1=result.reindex(columns= ['Postal Code', 'Borough', 'Neighbourhood','Latitude','Longitude'])
result1


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.763573,-79.188711
3,M4A,North York,Victoria Village,43.770992,-79.216917
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.773136,-79.239476
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.744734,-79.239476
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.727929,-79.262029
...,...,...,...,...,...
95,M6N,York,"Runnymede, The Junction North",43.643515,-79.577201
98,M9N,York,Weston,43.706876,-79.518188
99,M1P,Scarborough,"Dorset Park, Wexford Heights, Scarborough Town...",43.696319,-79.532242
100,M2P,North York,York Mills West,43.688905,-79.554724
