In [3]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from bs4 import BeautifulSoup

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

import requests #Use to retrieve the data from the website

from sklearn.cluster import KMeans # import k-means from clustering stage



# Part 1: Cleaning of data with neighborhoods and boroughs

Retrieving data from website

In [4]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
url_source = requests.get(url).text
print("Data on postal codes downloaded!")

Data on postal codes downloaded!


Loading the downloaded data

In [5]:
#!conda install -c conda-forge BeautifulSoup --yes
xml_soup = BeautifulSoup(url_source, 'xml')

In [68]:
soup_table=xml_soup.find('table')

#Each tr represents a row  
soup_table.find_all('tr')[i]  
#Each td element in tr is a column  
soup_table.find_all('tr')[i].find_all('td')  


Putting the data into a dataframe

In [9]:
df=pd.DataFrame(columns=["Postcode","Borough","Neighborhood"])
# Searching through the weppage for postcode, borough, neighborhood 
for tr in soup_table.find_all('tr'):
    row_data=[]
    for td in tr.find_all('td'):
        row_data.append(td.text.strip())
        if len(row_data)==3:
            df.loc[len(df)] = row_data
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Extracting data with assigned Boroghs 

In [10]:
df_Br=df[df["Borough"]!="Not assigned"]
df_Br=df_Br.reset_index(drop=True)
df_Br.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


Replacing the Nieghborhoods having "/" with ","

In [11]:
for n_row in range(0,len(df_Br)):
    df_Br.iloc[n_row,2]=df_Br.iloc[n_row,2].replace("/",",")
df_Br.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


No neighborhoods with "not assigned"

In [12]:
len(df_Br[df_Br["Neighborhood"]=="Not assigned"])

0

In [13]:
print("The dimensions of the dataframe are ",df_Br.shape[0]," rows and",df_Br.shape[1], " columns")

The dimensions of the dataframe are  103  rows and 3  columns


# Part 2: Obtaining geolocation of neighborhoods

Obtaining the data having longitudes and latitudes

In [14]:
url_csv='http://cocl.us/Geospatial_data'
!wget -q -O 'geo_data.csv' http://cocl.us/Geospatial_data

In [15]:
geo_data=pd.read_csv("geo_data.csv")
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Combining the 2 data sets to have postal codes with geolocation, Borough and neighborhoods

In [16]:
merged_df=pd.merge(geo_data,df_Br,left_on="Postal Code",right_on="Postcode",how="left")
merged_df=merged_df.drop(columns="Postcode")
merged_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Malvern , Rouge"
1,M1C,43.784535,-79.160497,Scarborough,"Rouge Hill , Port Union , Highland Creek"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [19]:
print("Number of rows: ",len(merged_df))

Number of rows:  103


# Part 3: Explore and cluster

Boroughs with Toronto in their names

In [25]:
df_tr=merged_df[merged_df["Borough"].str.contains("Toronto",regex=True)]
df_tr.head()

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood
37,M4E,43.676357,-79.293031,East Toronto,The Beaches
41,M4K,43.679557,-79.352188,East Toronto,"The Danforth West , Riverdale"
42,M4L,43.668999,-79.315572,East Toronto,"India Bazaar , The Beaches West"
43,M4M,43.659526,-79.340923,East Toronto,Studio District
44,M4N,43.72802,-79.38879,Central Toronto,Lawrence Park


In [30]:
print("Number of Boroughs: ",len(list(set(df_tr["Borough"]))))
print("Number of Postal codes: ",len(list(set(df_tr.iloc[:,0]))))
print("Number of neighborhoods: ",len(list(set(df_tr["Neighborhood"]))))
print("Number of rows: ",len(df_tr))

Number of Boroughs:  4
Number of Postal codes:  39
Number of neighborhoods:  39
Number of rows:  39


Preparation of variables to be used for clustering

In [47]:
# Creating dummies out of the Neighborhoods
Toronto_Br_Dummies = pd.get_dummies(df_tr[['Borough']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_Br_Dummies['Neighborhood'] = df_tr['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_Br_Dummies.columns[-1]] + list(Toronto_Br_Dummies.columns[:-1])
Toronto_Br_Dummies = Toronto_Br_Dummies[fixed_columns]

Toronto_Br_Dummies.head()

Unnamed: 0,Neighborhood,Central Toronto,Downtown Toronto,East Toronto,West Toronto
37,The Beaches,0,0,1,0
41,"The Danforth West , Riverdale",0,0,1,0
42,"India Bazaar , The Beaches West",0,0,1,0
43,Studio District,0,0,1,0
44,Lawrence Park,1,0,0,0


Begining of k-means clustering

In [67]:
#Initialise with 5 clusters
kclusters = 5

#Dropping of the neighborhood column
Toronto_clustering = Toronto_Br_Dummies.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_clustering)

#Checking the lables
print("Cluster labels: ",list(set(kmeans.labels_)))

Cluster labels:  [0, 1, 2, 3]


  return_n_iter=True)


In [50]:
#Adding the cluster labels to the dataframe
df_tr["Clusters"]=kmeans.labels_
df_tr.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood,Clusters
37,M4E,43.676357,-79.293031,East Toronto,The Beaches,0
41,M4K,43.679557,-79.352188,East Toronto,"The Danforth West , Riverdale",0
42,M4L,43.668999,-79.315572,East Toronto,"India Bazaar , The Beaches West",0
43,M4M,43.659526,-79.340923,East Toronto,Studio District,0
44,M4N,43.72802,-79.38879,Central Toronto,Lawrence Park,2


In [40]:
!conda install -c conda-forge folium=0.5.0 --yes # installation of folium
import folium # map rendering library


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    openssl-1.1.1f             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                       

In github, the map doesn't appear. Therefore, see the uploaded image file "Clusters of neighborhoods in toronto.JPG"

In [57]:
#Geolocation of Toronto
latitude=43.6532
longitude=-79.3832

#Initialising the Map object
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, nh, cluster in zip(df_tr['Latitude'], df_tr['Longitude'], df_tr['Neighborhood'], df_tr['Clusters']):
    label = folium.Popup(str(nh) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

The clusters were labelled using indeces hence, 1 will be added to the cluster column to display the cluster number

Cluster 1

In [63]:
cluster1=df_tr[df_tr["Clusters"]==0]
cluster1["Clusters"]=cluster1["Clusters"]+1
cluster1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood,Clusters
37,M4E,43.676357,-79.293031,East Toronto,The Beaches,1
41,M4K,43.679557,-79.352188,East Toronto,"The Danforth West , Riverdale",1
42,M4L,43.668999,-79.315572,East Toronto,"India Bazaar , The Beaches West",1
43,M4M,43.659526,-79.340923,East Toronto,Studio District,1
87,M7Y,43.662744,-79.321558,East Toronto,Business reply mail Processing CentrE,1


Cluster 2

In [64]:
cluster2=df_tr[df_tr["Clusters"]==1]
cluster2["Clusters"]=cluster2["Clusters"]+1
cluster2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood,Clusters
50,M4W,43.679563,-79.377529,Downtown Toronto,Rosedale,2
51,M4X,43.667967,-79.367675,Downtown Toronto,"St. James Town , Cabbagetown",2
52,M4Y,43.66586,-79.38316,Downtown Toronto,Church and Wellesley,2
53,M5A,43.65426,-79.360636,Downtown Toronto,"Regent Park , Harbourfront",2
54,M5B,43.657162,-79.378937,Downtown Toronto,"Garden District, Ryerson",2
55,M5C,43.651494,-79.375418,Downtown Toronto,St. James Town,2
56,M5E,43.644771,-79.373306,Downtown Toronto,Berczy Park,2
57,M5G,43.657952,-79.387383,Downtown Toronto,Central Bay Street,2
58,M5H,43.650571,-79.384568,Downtown Toronto,"Richmond , Adelaide , King",2
59,M5J,43.640816,-79.381752,Downtown Toronto,"Harbourfront East , Union Station , Toronto Is...",2


Cluster 3

In [65]:
cluster3=df_tr[df_tr["Clusters"]==2]
cluster3["Clusters"]=cluster3["Clusters"]+1
cluster3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood,Clusters
44,M4N,43.72802,-79.38879,Central Toronto,Lawrence Park,3
45,M4P,43.712751,-79.390197,Central Toronto,Davisville North,3
46,M4R,43.715383,-79.405678,Central Toronto,North Toronto West,3
47,M4S,43.704324,-79.38879,Central Toronto,Davisville,3
48,M4T,43.689574,-79.38316,Central Toronto,"Moore Park , Summerhill East",3
49,M4V,43.686412,-79.400049,Central Toronto,"Summerhill West , Rathnelly , South Hill , For...",3
63,M5N,43.711695,-79.416936,Central Toronto,Roselawn,3
64,M5P,43.696948,-79.411307,Central Toronto,Forest Hill North & West,3
65,M5R,43.67271,-79.405678,Central Toronto,"The Annex , North Midtown , Yorkville",3


In [66]:
cluster4=df_tr[df_tr["Clusters"]==3]
cluster4["Clusters"]=cluster4["Clusters"]+1
cluster4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood,Clusters
76,M6H,43.669005,-79.442259,West Toronto,"Dufferin , Dovercourt Village",4
77,M6J,43.647927,-79.41975,West Toronto,"Little Portugal , Trinity",4
78,M6K,43.636847,-79.428191,West Toronto,"Brockton , Parkdale Village , Exhibition Place",4
82,M6P,43.661608,-79.464763,West Toronto,"High Park , The Junction South",4
83,M6R,43.64896,-79.456325,West Toronto,"Parkdale , Roncesvalles",4
84,M6S,43.651571,-79.48445,West Toronto,"Runnymede , Swansea",4
