<h3>Importing packages</h3>

In [2]:
import os
import time
import urllib.request

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

<h3>Importing data</h3>

In [3]:
# Wikipedia page listing the postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Fail fast on a hung connection or an HTTP error instead of silently
# treating an error page as the data to parse.
response = requests.get(url, timeout=30)
response.raise_for_status()
request = response.text  # raw HTML of the page


In [4]:
# Parse the downloaded HTML held in `request` using the lxml parser.
soup = BeautifulSoup(request, 'lxml') # initialize object from the BeautifulSoup class

<h3>Define the scraping function</h3>

In [5]:
class Scrap_wiki:
    """Scrape the ``wikitable sortable`` tables of a web page into DataFrames."""

    def parse_url(self, url):
        """Download ``url`` and convert each matching table to a DataFrame.

        Parameters
        ----------
        url : str
            Address of the page to download.

        Returns
        -------
        list of pandas.DataFrame
            One frame per ``<table class="wikitable sortable">`` element, in
            the order ``find_all`` returns them.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status.
        """
        # A timeout keeps the notebook from hanging forever; the status check
        # stops an error page from being parsed as if it were the data.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        tables = soup.find_all('table', class_="wikitable sortable")
        return [self.parse_html_table(table) for table in tables]

    def parse_html_table(self, table):
        """Convert one parsed ``<table>`` element into a DataFrame.

        Column names are the texts of the ``<th>`` cells of the first row that
        has any; without a header the columns are numbered from 0.  The number
        of columns is the number of ``<td>`` cells in the first data row.
        Shorter rows are padded with NaN.  Cell texts are stored exactly as
        ``get_text()`` returns them; each column is then converted to float
        when every value in it can be parsed as a number.

        Raises
        ------
        Exception
            If a header exists but its length differs from the column count.
        IndexError
            If a data row has more cells than the first data row.
        """
        column_names = []
        data_rows = []
        # Single pass over the rows: collect cell texts and pick up the header.
        for row in table.find_all('tr'):
            cells = [td.get_text() for td in row.find_all('td')]
            if cells:
                data_rows.append(cells)
            if not column_names:
                column_names = [th.get_text() for th in row.find_all('th')]

        n_columns = len(data_rows[0]) if data_rows else 0
        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        padded_rows = []
        for cells in data_rows:
            if len(cells) > n_columns:
                raise IndexError(
                    "Table row has {} cells but only {} columns were expected".format(
                        len(cells), n_columns))
            # Missing trailing cells stay NaN, as in a pre-allocated empty frame.
            padded_rows.append(cells + [np.nan] * (n_columns - len(cells)))

        columns = column_names if len(column_names) > 0 else range(0, n_columns)
        # dtype=object keeps the raw strings untouched until the float conversion.
        df = pd.DataFrame(padded_rows,
                          columns=columns,
                          index=range(0, len(padded_rows)),
                          dtype=object)

        # Best-effort numeric conversion: text columns are left as they are.
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass

        return df

<h3>Create the DataFrame and clean the data</h3>

In [6]:
# Scrape the page and keep the first table the scraper returns; the scraped
# header texts carry a trailing newline, so map them to clean names.
scrap = Scrap_wiki()
df_initial = scrap.parse_url(url)[0].rename(columns={
    "Postal Code\n": "Postal Code",
    "Borough\n": "Borough",
    "Neighborhood\n": "Neighborhood",
})

# Remove newline characters from the values of the three text columns.
for text_column in ('Postal Code', 'Borough', 'Neighborhood'):
    df_initial[text_column] = df_initial[text_column].str.replace("\n", "", case = False)

df_initial.head(20)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [7]:
# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = df_initial['Borough'] != 'Not assigned'
df_without_NA = df_initial[has_borough]
df_without_NA.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
# Merge rows that share a postal code and borough into a single row whose
# Neighborhood is the comma-separated list of all their neighborhoods.
df_grouped = (
    df_without_NA
    .groupby(['Postal Code', 'Borough'])['Neighborhood']
    .apply(lambda x: ", ".join(x.astype(str)))
    .reset_index()
)

# Shuffle the *grouped* frame. The previous code shuffled df_without_NA here,
# which overwrote `df` and silently threw the grouping away. A fixed seed
# keeps the row order identical across re-runs.
df = df_grouped.sample(frac=1, random_state=0).reset_index(drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M9M,North York,"Humberlea, Emery"
1,M1N,Scarborough,"Birch Cliff, Cliffside West"
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M1P,Scarborough,"Dorset Park, Wexford Heights, Scarborough Town..."
4,M9L,North York,Humber Summit


In [9]:
# Sanity check: (number of rows, number of columns) of the cleaned frame.
df.shape 

(103, 3)

<h3>
Obtaining geospatial data</h3>

In [10]:

# CSV read straight from the URL; the displayed preview shows the columns
# Postal Code, Latitude and Longitude.
url_geo="http://cocl.us/Geospatial_data"
geo=pd.read_csv(url_geo)
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Attach the coordinates to every row of `df` by matching on 'Postal Code',
# then turn the key back into a regular column named 'Postcode'.
join = (
    df.set_index('Postal Code')
    .join(geo.set_index('Postal Code'))
    .rename_axis('Postcode')
    .reset_index()
)
join.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M9M,North York,"Humberlea, Emery",43.724766,-79.532242
1,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M1P,Scarborough,"Dorset Park, Wexford Heights, Scarborough Town...",43.75741,-79.273304
4,M9L,North York,Humber Summit,43.756303,-79.565963


In [17]:
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
import folium

<h3>Create the map and display Toronto's coordinates</h3>

In [18]:
address = 'Toronto'

# Nominatim requires an application-specific user agent: geopy emits a
# deprecation warning when it is omitted and newer releases refuse to run.
geolocator = Nominatim(user_agent="toronto_neighborhood_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# rather than an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [21]:
# Base map centred on the geocoded Toronto coordinates.
map_geo = folium.Map(location=[latitude, longitude], zoom_start=11)

# Appearance shared by every marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per row of `join`, with the neighborhood text as popup.
for lat, lng, neighborhood in zip(join['Latitude'], join['Longitude'], join['Neighborhood']):
    popup = folium.Popup(neighborhood, parse_html=True)
    folium.CircleMarker([lat, lng], popup=popup, **marker_style).add_to(map_geo)

map_geo

<h3>Getting Foursquare data and analyzing it</h3>

In [27]:
# Foursquare credentials are read from the environment so that no secret is
# stored in the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel. Keys that were
# previously committed in this cell should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail.')
VERSION = '20180605' # Foursquare API version
LIMIT = 50  # value sent as the `limit` query parameter of each request

In [28]:
def getNearbyVenues(postcode, names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint once per location.

    Parameters
    ----------
    postcode, names, latitudes, longitudes : iterables
        Parallel sequences describing the locations; they are walked together
        with ``zip``, so iteration stops at the shortest one.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns Postcode, Neighborhood,
        Neighborhood Latitude, Neighborhood Longitude, Venue, Venue Latitude,
        Venue Longitude and Venue Category.  The frame is empty (but has these
        columns) when no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Postcode',
               'Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    # `name` (singular) avoids rebinding the `names` argument inside the loop.
    for pc, name, lat, lng in zip(postcode, names, latitudes, longitudes):
        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            # A venue without any category must not abort the whole download.
            category = categories[0]['name'] if categories else None
            rows.append((
                pc,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the column names here also works when `rows` is empty.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [29]:

# Download the nearby venues for every row of `join` (default radius).
toronto_venues = getNearbyVenues(
    join['Postcode'],
    join['Neighborhood'],
    join['Latitude'],
    join['Longitude'],
)
toronto_venues.head()

Unnamed: 0,Postcode,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M9M,"Humberlea, Emery",43.724766,-79.532242,Strathburn Park,43.721765,-79.532854,Baseball Field
1,M1N,"Birch Cliff, Cliffside West",43.692657,-79.264848,The Birchcliff,43.691666,-79.264532,Café
2,M1N,"Birch Cliff, Cliffside West",43.692657,-79.264848,Birchmount Community Centre,43.695175,-79.262161,General Entertainment
3,M1N,"Birch Cliff, Cliffside West",43.692657,-79.264848,Scarborough Gardens,43.694647,-79.26223,Skating Rink
4,M1N,"Birch Cliff, Cliffside West",43.692657,-79.264848,Birchmount Stadium,43.695323,-79.261293,College Stadium


In [30]:

# One indicator column per venue category (no prefix on the column names).
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Append the postcode of each venue as the last column ...
toronto_onehot['Postcode'] = toronto_venues['Postcode']

# ... and rotate that last column to the front.
*leading_columns, last_column = toronto_onehot.columns
fixed_columns = [last_column, *leading_columns]
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Postcode,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M9M,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Average the indicator columns per postcode, i.e. the share of each venue
# category among the venues of that postcode.
postcode_groups = toronto_onehot.groupby('Postcode')
toronto_grouped = postcode_groups.mean().reset_index()

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first element of ``row`` is skipped; the remaining values are sorted
    in descending order and the index labels of the first ``num_top_venues``
    of them are returned as an array (fewer if the row is shorter).
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: '1st', '2nd', '3rd', then '4th', '5th', ...
# An explicit bounds check picks the suffix; the previous bare `except:`
# would also have swallowed any unrelated error raised inside the `try`.
columns = ['Postcode']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# Compute the top categories of every postcode first, then build the frame in
# one step instead of assigning into it row by row.
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
postcode_venues_sorted = pd.DataFrame(top_venues,
                                      columns=columns[1:],
                                      index=toronto_grouped.index)
postcode_venues_sorted.insert(0, 'Postcode', toronto_grouped['Postcode'])

postcode_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,College Rec Center,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Doner Restaurant
1,M1C,Construction & Landscaping,Bar,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore
2,M1E,Medical Center,Intersection,Electronics Store,Breakfast Spot,Rental Car Location,Mexican Restaurant,Bank,Discount Store,Distribution Center,Yoga Studio
3,M1G,Coffee Shop,Indian Restaurant,Korean Restaurant,Yoga Studio,Doner Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Drugstore
4,M1H,Fried Chicken Joint,Lounge,Caribbean Restaurant,Hakka Restaurant,Bank,Athletics & Sports,Thai Restaurant,Bakery,Gas Station,Discount Store


Clustering and showing clusters in the Map

<h3>Clustering and showing clusters on the map</h3>

In [32]:
# set number of clusters
kclusters = 3

# Feature matrix for clustering: every column except the postcode key.
# `drop('Postcode', 1)` passed `axis` positionally, which pandas 2.0 no longer
# accepts; the `columns=` keyword works on old and new versions alike.
toronto_grouped_clustering = toronto_grouped.drop(columns='Postcode')

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

In [33]:
# Store the k-means label of each row next to its top venue categories.
postcode_venues_sorted['Cluster label'] = kmeans.labels_

# Bring in the columns of `join` for each postcode, matching on 'Postcode',
# and turn the key back into a regular column.
join2 = (
    postcode_venues_sorted.set_index('Postcode')
    .join(join.set_index('Postcode'))
    .rename_axis('Postcode')
    .reset_index()
)
join2.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster label,Borough,Neighborhood,Latitude,Longitude
0,M1B,Fast Food Restaurant,College Rec Center,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Drugstore,Donut Shop,Doner Restaurant,0,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Construction & Landscaping,Bar,Yoga Studio,Donut Shop,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Drugstore,0,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Medical Center,Intersection,Electronics Store,Breakfast Spot,Rental Car Location,Mexican Restaurant,Bank,Discount Store,Distribution Center,Yoga Studio,0,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Coffee Shop,Indian Restaurant,Korean Restaurant,Yoga Studio,Doner Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Drugstore,0,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Fried Chicken Joint,Lounge,Caribbean Restaurant,Hakka Restaurant,Bank,Athletics & Sports,Thai Restaurant,Bakery,Gas Station,Discount Store,0,Scarborough,Cedarbrae,43.773136,-79.239476


In [36]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, neigh, pc, cluster in zip(join2['Latitude'], join2['Longitude'], join2['Neighborhood'], join2['Postcode'], join2['Cluster label']):
    label = folium.Popup(str(neigh) + '(' + str(pc) + '): Cluster ' + str(cluster), parse_html=True)
    # Labels start at 0, so index the palette directly; `cluster-1` mapped
    # label 0 to the last colour through negative indexing.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters