# Week 3
## 1. Data acquisition and cleaning
#### The dataframe will consist of three columns: Postal Code, Borough, and Neighbourhood. Rows whose borough is "Not assigned" will be ignored.
#### More than one neighborhood can exist in one postal code area. Here, we treat each postal code as one unique neighbourhood for convenience. 
#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.


In [1]:
# import libraries that will be used later
import pandas as pd
import numpy as np
import json

# Read the Foursquare credentials that are stored in a local JSON file
with open('foursquareCredentials.json') as credentials_file:
    fscredential = json.load(credentials_file)

# Fetch the postal code table from Wikipedia; read_html returns a list of
# tables and only the first one is used
dflink = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df_raw = pd.read_html(dflink)[0]

# Keep only the rows whose borough is assigned
has_borough = df_raw['Borough'] != 'Not assigned'
df_raw = df_raw[has_borough]

# Collapse the raw table to one row per postal code: the first row of each
# group supplies the postal code and borough, and all neighbourhoods of the
# group are joined into a single comma-separated string.
postcode = df_raw['Postal Code'].unique()

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, so the rows are collected first and the frame is built once.
# sort=False keeps the postal codes in order of first appearance (the same
# order as `postcode`). Columns are addressed by position (0, 1, 2) so the
# spelling of the raw neighbourhood column header does not matter.
records = [
    {
        'Postal Code': group.iloc[0, 0],
        'Borough': group.iloc[0, 1],
        'Neighbourhood': ', '.join(group.iloc[:, 2]),
    }
    for _, group in df_raw.groupby('Postal Code', sort=False)
]

# Passing `columns` explicitly keeps the expected schema even when there are
# no rows at all.
df_new = pd.DataFrame(records, columns=['Postal Code', 'Borough', 'Neighbourhood'])


# Per the rule stated at the top of this section, a row that has a borough but
# a 'Not assigned' neighbourhood takes the borough name as its neighbourhood.
# The previous loop tested the Borough column instead; rows with an unassigned
# borough are already dropped when df_raw is filtered, so it never did anything.
unassigned = df_new['Neighbourhood'] == 'Not assigned'
df_new.loc[unassigned, 'Neighbourhood'] = df_new.loc[unassigned, 'Borough']
        
# Report how many rows the cleaned dataframe has, then signal completion
row_count = len(df_new)
print('The rows of the dataframe = ' + str(row_count))
print('Data cleaning is done!')

The rows of the dataframe = 103
Data cleaning is done!


## 2. Adding geolocation data to the neighbourhoods


In [4]:
# Attach latitude/longitude to every postal code in the main dataframe.

# Table of coordinates. As in the original per-row lookup, the 2nd and 3rd
# columns are taken as latitude and longitude respectively.
df_coord = pd.read_csv('http://cocl.us/Geospatial_data')
lat_col, lon_col = df_coord.columns[1], df_coord.columns[2]

# Index the coordinates by postal code, keeping the first row per code so the
# lookup is unambiguous (this mirrors the previous `.iloc[0, ...]` choice).
coord_by_code = df_coord.drop_duplicates('Postal Code').set_index('Postal Code')

# Map each postal code to its coordinates in a single pass instead of
# re-scanning both frames for every row. Creating the columns directly from
# the lookup also avoids initialising them as integer 0 and then upcasting
# to float. A postal code that is absent from the coordinates table gets NaN
# rather than raising an IndexError.
df_new['Latitude'] = df_new['Postal Code'].map(coord_by_code[lat_col])
df_new['Longitude'] = df_new['Postal Code'].map(coord_by_code[lon_col])

df_new.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
