# Applied Data Science Capstone -- Week 3
## Segmenting and Clustering Neighborhoods in Toronto -- Part 1

First, import the data table into a dataframe:

In [1]:
import pandas as pd

# Parse every HTML table on the Wikipedia page; the postcode table is the
# first one returned, so keep only that element.
wiki_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
data1 = wiki_tables[0]
data1.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


Remove records where a Borough is not assigned to a Postcode:

In [2]:
# Index labels of the rows whose Borough is the 'Not assigned' placeholder.
unassigned_rows = data1.index[data1['Borough'] == 'Not assigned']

# Remove those rows, then renumber the remaining rows from 0 without keeping
# the old index as an extra column.
data1.drop(index=unassigned_rows, inplace=True)
data1.reset_index(drop=True, inplace=True)
data1.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


If a Neighbourhood is not assigned, set it to the name of its Borough:

In [3]:
# Index labels of the rows whose Neighbourhood is the 'Not assigned' placeholder.
renamelist = data1.index[data1['Neighbourhood'] == 'Not assigned']

# Copy the Borough name into Neighbourhood for those rows.
# Rows are selected by label and columns by name (.loc), so this stays correct
# even if the index is not 0..n-1 or the column order changes.
data1.loc[renamelist, 'Neighbourhood'] = data1.loc[renamelist, 'Borough']
data1.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


If a Postcode contains multiple neighborhoods, combine them into the first postcode entry and then remove duplicate postcode entries

In [4]:
# Collapse the table to one row per Postcode:
#   * Borough       -> value from the first row seen for that Postcode
#   * Neighbourhood -> all values for that Postcode joined with ', '
# sort=False keeps postcodes in order of first appearance, and as_index=False
# returns Postcode as a regular column with a fresh 0..n-1 index.
# Grouping by Postcode (rather than appending each duplicate to the previous
# non-duplicate row) gives the right result even when the rows of one postcode
# are not adjacent in the table.
data1 = (
    data1
    .groupby('Postcode', sort=False, as_index=False)
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)

# Sanity check: every Postcode must now appear exactly once.
assert data1['Postcode'].is_unique, "duplicate Postcode rows remain after combining"
data1.head(20)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


Print number of rows in dataframe

In [5]:
# Report the row count of data1.
print("Number of rows in Postcode dataframe:  " + str(len(data1)))

Number of rows in Postcode dataframe:  103


## Segmenting and Clustering Neighborhoods in Toronto -- Part 2

Try to get the latitude and the longitude coordinates of each Postcode using geocoder

In [38]:
import geocoder   # import the geocoder module

# Upper bound on geocoding requests made for a single postcode.
MAX_GEOCODE_ATTEMPTS = 20

# initialize collectors for postcodes, latitudes and longitudes
codes = []
lats = []
longs = []
for i, postcode in enumerate(data1['Postcode']):
    latlong_coords = None
    g = None
    k = 0    # number of requests made so far for this postcode
    # Retry until coordinates come back or the attempt limit is reached.
    # A falsy result (None, or an empty list) is treated as "no coordinates".
    while (not latlong_coords) and (k < MAX_GEOCODE_ATTEMPTS):
        k += 1
        # use the geocode farm instead of google, since google is denying requests
        g = geocoder.geocodefarm('{}, Toronto, Ontario'.format(postcode))
        latlong_coords = g.latlng

    # progress line: row number, attempts used, postcode, last geocoder response
    print(i, k, postcode, g)
    codes.append(postcode)
    if latlong_coords:
        lats.append(latlong_coords[0])
        longs.append(latlong_coords[1])
    else:
        # Every attempt failed: record missing coordinates instead of raising
        # on latlong_coords[0], so one bad postcode does not abort the run.
        lats.append(float('nan'))
        longs.append(float('nan'))

# create a new dataframe with coordinates and merge with neighborhoods dataframe
d = {'Postcode': codes, 'Latitude': lats, 'Longitude': longs}
pcodes2 = pd.DataFrame(data=d)
gdata2 = data1.merge(pcodes2, on="Postcode")
gdata2.head(20)


0 3 M3A <[OK] Geocodefarm - Geocode [M3A, ON, Canada]>
1 1 M4A <[OK] Geocodefarm - Geocode [M4A, ON, Canada]>
2 3 M5A <[OK] Geocodefarm - Geocode [M5A, ON, Canada]>
3 2 M6A <[OK] Geocodefarm - Geocode [M6A, ON, Canada]>
4 1 M7A <[OK] Geocodefarm - Geocode [M7A, ON, Canada]>
5 3 M9A <[OK] Geocodefarm - Geocode [M9A, ON, Canada]>
6 1 M1B <[OK] Geocodefarm - Geocode [M1B, ON, Canada]>
7 1 M3B <[OK] Geocodefarm - Geocode [M3B, ON, Canada]>
8 4 M4B <[OK] Geocodefarm - Geocode [M4B, ON, Canada]>
9 5 M5B <[OK] Geocodefarm - Geocode [M5B, ON, Canada]>
10 1 M6B <[OK] Geocodefarm - Geocode [M6B, ON, Canada]>
11 3 M9B <[OK] Geocodefarm - Geocode [M9B, ON, Canada]>
12 4 M1C <[OK] Geocodefarm - Geocode [M1C, ON, Canada]>
13 1 M3C <[OK] Geocodefarm - Geocode [M3C, ON, Canada]>
14 3 M4C <[OK] Geocodefarm - Geocode [M4C, ON, Canada]>
15 1 M5C <[OK] Geocodefarm - Geocode [M5C, ON, Canada]>
16 1 M6C <[OK] Geocodefarm - Geocode [M6C, ON, Canada]>
17 1 M9C <[OK] Geocodefarm - Geocode [M9C, ON, Canada]>
18

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.751255,-79.329895
1,M4A,North York,Victoria Village,43.729958,-79.314201
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65522,-79.361969
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.722801,-79.450691
4,M7A,Queen's Park,Queen's Park,43.664486,-79.393021
5,M9A,Etobicoke,Islington Avenue,43.662743,-79.528427
6,M1B,Scarborough,"Rouge, Malvern",43.810154,-79.194603
7,M3B,North York,Don Mills North,43.749134,-79.362007
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.707577,-79.310913
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657467,-79.377708


Backup plan -- import the latitude and the longitude coordinates of the Postcodes from the `Geospatial_Coordinates.csv` file:

In [7]:
# Load the postcode coordinates from the local CSV and rename its key column
# to 'Postcode' so it matches the key column of data1.
pcodes = pd.read_csv('Geospatial_Coordinates.csv').rename(columns={"Postal Code": "Postcode"})

# Join the coordinates onto the neighbourhood table by Postcode.
gdata = pd.merge(data1, pcodes, on="Postcode")
gdata.head(20)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
