## WEEK 3 - Toronto - Segmenting and Clustering

#### Part 1 - Get Data and Output Shape

In [None]:
#Import Libraries 
import pandas as pd
import numpy as np

In [2]:
#Getting data from wikipedia website
Neigh = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [3]:
#Cleaning Data
Neigh_df = pd.DataFrame(Neigh[0])

In [4]:
#Removing all cells that don't have a value of Borough
Neigh_df = Neigh_df[Neigh_df["Borough"] != "Not assigned"]
print(Neigh_df)

    Postal Code           Borough  \
2           M3A        North York   
3           M4A        North York   
4           M5A  Downtown Toronto   
5           M6A        North York   
6           M7A  Downtown Toronto   
..          ...               ...   
160         M8X         Etobicoke   
165         M4Y  Downtown Toronto   
168         M7Y      East Toronto   
169         M8Y         Etobicoke   
178         M8Z         Etobicoke   

                                         Neighbourhood  
2                                            Parkwoods  
3                                     Victoria Village  
4                            Regent Park, Harbourfront  
5                     Lawrence Manor, Lawrence Heights  
6          Queen's Park, Ontario Provincial Government  
..                                                 ...  
160      The Kingsway, Montgomery Road, Old Mill North  
165                               Church and Wellesley  
168  Business reply mail Processing Centre

In [5]:
#Checking if there are any not assigned values in Neighbourhoods
sum = Neigh_df.Neighbourhood.str.count("Not assigned").sum()
print(sum)

0


In [24]:
# Resetting the index
Neigh_df = Neigh_df.reset_index()
Neigh_df.drop(['index'], axis = 'columns', inplace = True)
Neigh_df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [6]:
#Printing the shape of the data frame
Neigh_df_shape = Neigh_df.shape
print(Neigh_df_shape)

(103, 3)


#### Part 2 - Get Latitude and Longitude

##### Installing Geocoder 

In [9]:
pip install geocoder

Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 10.2 MB/s eta 0:00:01
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6
Note: you may need to restart the kernel to use updated packages.


In [10]:
#import library
import geocoder

In [None]:
# initialize your variable to None
lat_lng_coords = None

postal_code = 'M3A'

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

##### Tried the approach above but ran for 25 mins, then killed it. Using alternative method given suggested in the assignment

In [17]:
data = pd.read_csv("https://cocl.us/Geospatial_data")
data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [14]:
#Checking the shape of data for longitude and latitude

data.shape

(103, 3)

###### The shape of the longitude and latitude file retrieved above is the same as the our data that was retrieved from wikipedia. We can now check the data types and if everything matches for the postal code we can join the two data frames and have our data set.

In [15]:
data.dtypes

Postal Code     object
Latitude       float64
Longitude      float64
dtype: object

In [16]:
Neigh_df.dtypes

Postal Code      object
Borough          object
Neighbourhood    object
dtype: object

##### Data type for postal code matches so now we can combine our data

In [20]:
#Combining data frames

Toronto_df = Neigh_df.join(data.set_index('Postal Code'), on = 'Postal Code', how = 'inner')
Toronto_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
165,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
