In [1]:
!pip install bs4

Collecting bs4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
[?25l  Downloading https://files.pythonhosted.org/packages/66/25/ff030e2437265616a1e9b25ccc864e0371a0bc3adb7c5a404fd661c6f4f6/beautifulsoup4-4.9.1-py3-none-any.whl (115kB)
[K     |████████████████████████████████| 122kB 7.4MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jupyterlab/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.9.1 bs4-0.0.

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [3]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
table = soup.find('table', {'class':'wikitable sortable'}).tbody
#print(table)

In [5]:
rows = table.find_all('tr')
columns = [v.text.replace('\n','') for v in rows[0].find_all('th')]
print(columns)

['Postal Code', 'Borough', 'Neighbourhood']


In [6]:
print(rows[0:5])

[<tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighbourhood
</th></tr>, <tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>, <tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>, <tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>, <tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>]


In [7]:
df = pd.DataFrame(columns=columns)
for i in range(1,len(rows)):
    tds = rows[i].find_all('td')
    
    if len(tds) < 3:
        values = [tds[0].text.replace('\n',''), tds[1].text.replace('\n',''), tds[2].text.replace('\n','')]
    else:
        values = [td.text.replace('\n','') for td in tds]
    df = df.append(pd.Series(values, index=columns), ignore_index=True) 

In [8]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [9]:
import numpy as np

In [10]:
# Remove rows with 'Not assigned' in their Borough attribute
df = df[df['Borough'].replace('Not assigned',np.nan).notnull()]

In [11]:
# Reset the index after removing some rows
df = df.reset_index(drop=True)

In [12]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Let's check if there is any repitition in the postal code:
#### If the 

In [13]:
pd.set_option("display.max_rows", None, "display.max_columns", None) 
# Show the counts of each unique postal code
df.groupby(['Postal Code'])['Postal Code'].value_counts()

Postal Code  Postal Code
M1B          M1B            1
M1C          M1C            1
M1E          M1E            1
M1G          M1G            1
M1H          M1H            1
M1J          M1J            1
M1K          M1K            1
M1L          M1L            1
M1M          M1M            1
M1N          M1N            1
M1P          M1P            1
M1R          M1R            1
M1S          M1S            1
M1T          M1T            1
M1V          M1V            1
M1W          M1W            1
M1X          M1X            1
M2H          M2H            1
M2J          M2J            1
M2K          M2K            1
M2L          M2L            1
M2M          M2M            1
M2N          M2N            1
M2P          M2P            1
M2R          M2R            1
M3A          M3A            1
M3B          M3B            1
M3C          M3C            1
M3H          M3H            1
M3J          M3J            1
M3K          M3K            1
M3L          M3L            1
M3M          M3

In [14]:
# If the number of the unique postal codes is equal to postal codes in the original dataframe, then all postal codes are unique
if df.shape[0] == df.groupby(['Postal Code'])['Postal Code'].value_counts().shape[0]:
    print('Every postal code in the data frame is unique')

Every postal code in the data frame is unique


### The dataframe shape (rows, columns) is:

In [15]:
# Show the shape of the cleaned dataframe
print(df.shape)

(103, 3)


### Adding latitudes and longitudes

In [16]:
import pandas as pd
lat_long_df = pd.read_csv('https://cocl.us/Geospatial_data')
lat_long_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
# let's take the values of latitude and longitudes from the lat_long_df dataframe
df_new = df.merge(lat_long_df, on='Postal Code', how='left')
df_new.head(30)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [24]:
# let's study Toronto neighborhoods only:
import re

# new dataframe for Toronto 
toronto_df = pd.DataFrame(columns=['Postal Code', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude'])
toronto_df

#extract Toronto areas from the table
for i in range(1,len(df_new)):
    if bool(re.search('Toronto',df_new.iloc[i,1])):
        toronto_df =  toronto_df.append(df_new.iloc[i,:])
toronto_df = toronto_df.reset_index(drop=True)

In [19]:
# intall and import folium library to visualize the neighborhoods
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: done


  current version: 4.8.3
  latest version: 4.8.4

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-4.1.0               |             py_1         614 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    brotlipy-0.7.0             |py36h8c4c3a4_1000         346 KB  conda-forge
    chardet-3.0.4              |py36h9f0ad1d_1006         188 KB  conda-forge
    cryptography-3.0           |   py36h45558ae_0         640 KB  c

In [22]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[43.6532, 79.3832], zoom_start=5)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [26]:
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [27]:
toronto_df.dtypes

Postal Code       object
Borough           object
Neighbourhood     object
Latitude         float64
Longitude        float64
dtype: object

In [31]:
toronto_df['Borough'].value_counts()

Downtown Toronto    19
Central Toronto      9
West Toronto         6
East Toronto         5
Name: Borough, dtype: int64

In [47]:
toronto_cluster_df = toronto_df[['Borough','Latitude','Longitude']]
toronto_cluster_df.head()

Unnamed: 0,Borough,Latitude,Longitude
0,Downtown Toronto,43.65426,-79.360636
1,Downtown Toronto,43.662301,-79.389494
2,Downtown Toronto,43.657162,-79.378937
3,Downtown Toronto,43.651494,-79.375418
4,East Toronto,43.676357,-79.293031


In [48]:
toronto_cluster_df2 = pd.concat([toronto_cluster_df, pd.get_dummies(toronto_cluster_df['Borough'])], axis=1)
toronto_cluster_df2.drop('Borough',axis=1,inplace=True)
toronto_cluster_df2

Unnamed: 0,Latitude,Longitude,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,43.65426,-79.360636,0,1,0,0
1,43.662301,-79.389494,0,1,0,0
2,43.657162,-79.378937,0,1,0,0
3,43.651494,-79.375418,0,1,0,0
4,43.676357,-79.293031,0,0,1,0
5,43.644771,-79.373306,0,1,0,0
6,43.657952,-79.387383,0,1,0,0
7,43.669542,-79.422564,0,1,0,0
8,43.650571,-79.384568,0,1,0,0
9,43.669005,-79.442259,0,0,0,1


## Clustering

In [49]:
from sklearn.cluster import KMeans
k_means = KMeans(init='k-means++',n_clusters=4,n_init=12)
k_means.fit(toronto_cluster_df2)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=12, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [50]:
k_means_labels = k_means.labels_
k_means_cluster_centeres = k_means.cluster_centers_

In [51]:
toronto_cluster_df['Labels'] = k_means_labels
toronto_cluster_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Borough,Latitude,Longitude,Labels
0,Downtown Toronto,43.65426,-79.360636,1
1,Downtown Toronto,43.662301,-79.389494,1
2,Downtown Toronto,43.657162,-79.378937,1
3,Downtown Toronto,43.651494,-79.375418,1
4,East Toronto,43.676357,-79.293031,3
5,Downtown Toronto,43.644771,-79.373306,1
6,Downtown Toronto,43.657952,-79.387383,1
7,Downtown Toronto,43.669542,-79.422564,1
8,Downtown Toronto,43.650571,-79.384568,1
9,West Toronto,43.669005,-79.442259,2
