# Section 1 - Scrape Web Page

Import the libraries, then scrape the HTML table into a DataFrame.

In [1]:
import lxml.html as lh
import pandas as pd

# Wikipedia list of Canadian postal codes beginning with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns a list of DataFrames parsed from the page's tables;
# the first entry is used as the postal-code table.
scrape_source = pd.read_html(url)
df_temp = scrape_source[0]

# Order the rows by postal code. The sort is in place, so the table held in
# scrape_source is reordered as well (df_temp is the same object, not a copy).
df_temp.sort_values("Postal Code", inplace=True)


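Now clean the DataFrame: keep only the needed columns, rename "Postal Code" to "PostalCode", remove rows whose Borough is "Not assigned", and combine the neighbourhoods of rows that share the same PostalCode and Borough into a single comma-separated value.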

In [2]:
# Build the cleaned table as one non-mutating chain. The original ran
# rename/drop with inplace=True on a column selection of df_temp, which
# triggers pandas' SettingWithCopyWarning; chaining yields the same result
# without mutating a possible view of df_temp.
df_subs = (
    df_temp[["Postal Code", "Borough", "Neighbourhood"]]
    .rename(columns={"Postal Code": "PostalCode"})
    # discard rows whose borough was never assigned
    .loc[lambda frame: frame["Borough"] != "Not assigned"]
    # one row per (PostalCode, Borough); neighbourhood names joined with ", "
    .groupby(["PostalCode", "Borough"])["Neighbourhood"]
    .apply(", ".join)
    .reset_index()
)

Make sure there are no instances of "Not assigned" in the Neighbourhood column

In [3]:
# Count the rows whose Neighbourhood is exactly "Not assigned" (expected: 0).
int((df_subs["Neighbourhood"] == "Not assigned").sum())

0

Report the shape (number of rows, number of columns) of the final table

In [4]:
# (rows, columns) of the cleaned table
df_subs.shape

(103, 3)

Show the first 5 rows of the final table

In [5]:
# Preview the first five rows of the cleaned table.
df_subs.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Section 2 - Merge Geospatial Data

In [6]:
# Load the geospatial CSV into a DataFrame.
df_geo = pd.read_csv("http://cocl.us/Geospatial_data")

In [7]:
# Use the same key name as df_subs ("PostalCode") so the two frames can be merged.
# Reassigning instead of inplace=True keeps the cell free of hidden mutation.
df_geo = df_geo.rename(columns={"Postal Code": "PostalCode"})


In [8]:
# Inner-join the geospatial rows with the cleaned postal-code table on PostalCode;
# df_geo's columns come first in the result.
df_subs_geo = df_geo.merge(df_subs, on="PostalCode")

# Section 3 - Cluster and generate Map

In [None]:
!conda install -c conda-forge folium=0.5.0

from sklearn.cluster import KMeans
import folium
print('Libraries imported.')

Solving environment: / 

In [None]:
# set number of clusters
kclusters = 5

# k-means needs purely numeric features. The original dropped a column named
# 'Neighborhood', which does not exist (the frame's column is 'Neighbourhood'),
# and would still have passed the text columns PostalCode and Borough to KMeans.
# Keep only the numeric columns instead.
# NOTE(review): presumably these are the coordinate columns from the geospatial CSV — confirm.
tor_clust = df_subs_geo.select_dtypes(include='number')

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_clust)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # ranks 1-3 take their own suffix ("1st", "2nd", "3rd"); later ranks use "th".
    # An explicit bounds check replaces the original bare `except:`.
    if ind < len(indicators):
        columns.append('{}{} Most Common Venue'.format(ind + 1, indicators[ind]))
    else:
        columns.append('{}th Most Common Venue'.format(ind + 1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
# df_subs_geo spells its column 'Neighbourhood'; the output column keeps the
# 'Neighborhood' label used by the column list above.
neighborhoods_venues_sorted['Neighborhood'] = df_subs_geo['Neighbourhood']

# NOTE(review): return_most_common_venues is not defined in any cell visible here,
# and the cells shown never add venue data to df_subs_geo — confirm both before running.
# Plain range() is used because numpy is not imported in this notebook.
for ind in range(df_subs_geo.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(df_subs_geo.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

In [None]:
# add clustering labels; overwrite on re-run, because insert() raises if the
# column already exists
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# `manhattan_data` is never defined in this notebook; df_subs_geo is the frame that
# holds one row per postal code. The variable name `manhattan_merged` is kept so any
# later cell that refers to it still works.
manhattan_merged = df_subs_geo

# Attach the cluster label and venue columns to each row. The join is on the row
# index: neighborhoods_venues_sorted is built row-for-row from df_subs_geo, so the
# two share an index, and this stays one-to-one even if a neighbourhood label repeats.
manhattan_merged = manhattan_merged.join(neighborhoods_venues_sorted.drop(columns='Neighborhood'))

manhattan_merged.head() # check the last columns!