In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import folium
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

Below we get the HTML with requests, and cut all of the HTML around the first table out of the variable.
We're assuming that the table we want is the first table on the page

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
#print(soup.prettify())
soup = str(soup)
start = soup.find('<tbody')
end = soup.find('</tbody>')
table = soup[start:end]

We split the rows of the table text into a list and coerce this into a dataframe.
We're assuming the table has a header row, and that each table row is denoted by a <tr> tag, as well as each cell in a row being denoted by a <td> tag.

In [3]:
dataframe = []
table = table[table.find('</th>'):]
table = table[table.find('</tr>')+4:]
append = table.split('<tr>')
dataframe = [i.split('<td>') for i in append]



df = pd.DataFrame(dataframe)        

We need to clean up the data frame a little. We're just finding/replacing and slicing out a bunch of the HTML surrounding dataframe values

In [4]:
df.rename(columns={1: 'PostalCode', 2:'Borough', 3:'Neighborhood'}, inplace = True)
df.drop(columns=0, inplace=True)

df['PostalCode'] = df['PostalCode'].str[:3]

df.drop([0], inplace = True)

for i, j in df.iterrows():
    j.Borough = j.Borough.replace('</a>', '')
    j.Borough = j.Borough.replace('</td>', '')
    j.Borough = j.Borough.replace('\n', '')
    if j.Borough.find('<a') > -1:
        j.Borough = j.Borough[j.Borough.find('>')+1:]
    j.Neighborhood = j.Neighborhood.replace('</td></tr>', '')
    j.Neighborhood = j.Neighborhood.replace('</a>', '')
    j.Neighborhood = j.Neighborhood.replace('\n', '')
    if j.Neighborhood.find('<a') > -1:
        j.Neighborhood = j.Neighborhood[j.Neighborhood.find('>')+1:]
    
df = df.loc[df['Borough'] != 'Not assigned']
df.reset_index()

Unnamed: 0,index,PostalCode,Borough,Neighborhood
0,3,M3A,North York,Parkwoods
1,4,M4A,North York,Victoria Village
2,5,M5A,Downtown Toronto,Harbourfront
3,6,M5A,Downtown Toronto,Regent Park
4,7,M6A,North York,Lawrence Heights
5,8,M6A,North York,Lawrence Manor
6,9,M7A,Queen's Park,Not assigned
7,11,M9A,Etobicoke,Islington Avenue
8,12,M1B,Scarborough,Rouge
9,13,M1B,Scarborough,Malvern


To consolidate duplicates, we'll group by postal code and borough, and aggregate the neighborhood values.

In [5]:
df_grouped = pd.DataFrame(df.groupby(['PostalCode', 'Borough'])['Neighborhood'].aggregate(lambda x: ', '.join(x)))
df_grouped.reset_index(inplace=True)
print(df_grouped.shape)
df_grouped[:50]

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [6]:
lat_lng_csv = requests.get('http://cocl.us/Geospatial_data').text
lat_lng = lat_lng_csv.split('\r')
lat_lng = [i.split(',') for i in lat_lng]
postal_codes = pd.DataFrame(lat_lng)
postal_codes.rename(columns={0:'PostalCode', 1:'Latitude', 2:'Longitude'}, inplace = True)
postal_codes.drop([0], inplace=True)
for i, j in postal_codes.iterrows():
    j.PostalCode = j.PostalCode[1:]

df_grouped = df_grouped.merge(postal_codes, how = 'left', on = 'PostalCode')
df_grouped.Latitude = df_grouped.Latitude.astype('float')
df_grouped.Longitude = df_grouped.Longitude.astype('float')
df_grouped[:5]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [7]:
import folium
toronto_map = folium.Map(location=[df_grouped.Latitude.mean(), df_grouped.Longitude.mean()], zoom_start = 12)

for i, j in df_grouped.iterrows():
    folium.Marker(location = [j.Latitude, j.Longitude]).add_to(toronto_map)

toronto_map

In [8]:
clientid = #your Client ID
secret = #your ID Secret
version = '20180605'

In [9]:
def get_venues(lat, lng, radius, limit):
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{},&v={}&radius = {}&limit = {}'.format(
        clientid,
        secret,
        lat,
        lng,
        version,
        radius,
        limit)
    
    results = requests.get(url).json()
    items = results['response']['groups'][0]['items']
    results_df = json_normalize(items)
    return results_df[['venue.id', 'venue.location.lat', 'venue.location.lng', 'venue.categories']]


In [10]:
cat_url = 'https://api.foursquare.com/v2/venues/categories?client_id={}&client_secret={}&v={}'.format(
    clientid,
    secret,
    version)
venue_categories = requests.get(cat_url).json()
cat_df = pd.DataFrame(columns=['Group', 'Name', 'Detail'])
for i in venue_categories['response']['categories']:
    group = i['name']
    for j in i['categories']:
        name = j['name']
        for k in j['categories']:
            detail = k['name']
            entry = {'Group':group, 'Name':name, 'Detail': detail}
            cat_df = cat_df.append(entry, ignore_index = True)

In [11]:
radius = 1000 #we'll travel up to 1km for our daily activities?
limit = 500 #let's only retrieve 500 results
df2 = None
df2 = pd.DataFrame(columns = ['PostalCode','Borough', 'Neighborhood','Latitude', 'Longitude',
                                 'venue.location.lat', 'venue.location.lng','venue.categories'])
for i, j in df_grouped.iterrows():
        results = get_venues(j.Latitude, j.Longitude, radius, limit)
        results['venue.categories'] = [i[0].get('name') for i in results['venue.categories']]
        results['PostalCode'] = j.PostalCode
        results['Borough'] = j.Borough
        results['Neighborhood'] = j.Neighborhood
        results['Latitude'] = j.Latitude
        results['Longitude'] = j.Longitude
        df2 = df2.append(results, ignore_index = True)




In [16]:
df2 = df2.merge(cat_df, how = 'left', left_on = 'venue.categories', right_on = 'Name')
df2['venue_type'] = df2['Group']
df2.drop(columns=['Detail', 'Group', 'Name'], inplace = True)


df2 = df2.merge(cat_df, how = 'left', left_on = 'venue.categories',
               right_on = 'Detail')
for i, j in df2.iterrows():
    if pd.isnull(j.venue_type) == True:
        df2.set_value(i, 'venue_type', j.Group)
df2.drop(columns=['Group','Detail', 'Name'], inplace = True)


for i, j in df2.iterrows():
    if pd.isnull(j.venue_type) == True:
        df2.set_value(i, 'venue_type', 'Other')
    


  # Remove the CWD from sys.path while we load stuff.
  app.launch_new_instance()


In [20]:
#df3 = df2.pivot_table(index = ['Neighborhood', 'venue.categories'], aggfunc = 'size')
#df3 = df3.reset_index()
#df3 = df3.groupby('Neighborhood').head()
df3.sort_values(by=['Neighborhood', 0], ascending = [True, False], inplace = True)
onehot = pd.get_dummies(df3[['venue.categories']])
onehot['Neighborhood'] = df3['Neighborhood']
onehot = onehot.groupby('Neighborhood').mean().reset_index()

toronto_cluster_data = onehot.drop(columns=['Neighborhood'])
clusters = 5
for i in onehot['Neighborhood']:
    print(i)
    temp = onehot[onehot['Neighborhood'] == i].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq':2})
    
toronto_grouped_clustering = onehot.drop(columns=['Neighborhood'])
kmeans = KMeans(n_clusters = clusters, random_state = 123).fit(toronto_grouped_clustering)

toronto_merged = df_grouped
toronto_merged['Cluster'] = kmeans.labels_

map_clusters = folium.Map(location = [df_grouped.Latitude.mean(), df_grouped.Longitude.mean()], zoom_start = 11)

x = np.arange(clusters)
y = [i+x+(i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0,1,len(y)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
marker_colors =[]

for latitude, longitude, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Cluster']):
    label = folium.Popup(str(cluster), parse_html=True)
    folium.CircleMarker([latitude, longitude], radius = 5, popup = label,
                        color = rainbow[cluster-1], fill = True, fill_color = rainbow[cluster-1],
                       fill_opacity = 0.9).add_to(map_clusters)
    


Adelaide, King, Richmond
                                  venue  freq
0  venue.categories_American Restaurant   0.2
1                 venue.categories_Café   0.2
2     venue.categories_Asian Restaurant   0.2
3       venue.categories_Breakfast Spot   0.2
4                  venue.categories_Bar   0.2


Agincourt
                                   venue  freq
0                venue.categories_Bakery   0.2
1   venue.categories_Arts & Crafts Store   0.2
2      venue.categories_Asian Restaurant   0.2
3  venue.categories_Cantonese Restaurant   0.2
4        venue.categories_Breakfast Spot   0.2


Agincourt North, L'Amoreaux East, Milliken, Steeles East
                                   venue  freq
0    venue.categories_Chinese Restaurant   0.2
1       venue.categories_Bubble Tea Shop   0.2
2      venue.categories_Asian Restaurant   0.2
3  venue.categories_Caribbean Restaurant   0.2
4                venue.categories_Bakery   0.2


Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount

                                  venue  freq
0               venue.categories_Bakery   0.2
1  venue.categories_American Restaurant   0.2
2          venue.categories_Art Gallery   0.2
3          venue.categories_Coffee Shop   0.2
4                 venue.categories_Café   0.2


Davisville
                           venue  freq
0     venue.categories_Bookstore   0.2
1       venue.categories_Brewery   0.2
2   venue.categories_Coffee Shop   0.2
3  venue.categories_Churrascaria   0.2
4          venue.categories_Café   0.2


Davisville North
                             venue  freq
0       venue.categories_Bookstore   0.2
1            venue.categories_Café   0.2
2    venue.categories_Churrascaria   0.2
3  venue.categories_Baseball Field   0.2
4          venue.categories_Bakery   0.2


Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
                                  venue  freq
0            venue.categories_BBQ Joint   0.2
1  venue.categories_American Restaurant   0.2
2     

                                  venue  freq
0            venue.categories_BBQ Joint   0.2
1            venue.categories_Bookstore   0.2
2  venue.categories_American Restaurant   0.2
3         venue.categories_Burger Joint   0.2
4     venue.categories_Asian Restaurant   0.2


Humewood-Cedarvale
                             venue  freq
0       venue.categories_BBQ Joint   0.2
1      venue.categories_Bagel Shop   0.2
2            venue.categories_Café   0.2
3          venue.categories_Bakery   0.2
4  venue.categories_Breakfast Spot   0.2


Islington Avenue
                             venue  freq
0  venue.categories_Breakfast Spot   0.2
1     venue.categories_Coffee Shop   0.2
2    venue.categories_Burger Joint   0.2
3            venue.categories_Café   0.2
4          venue.categories_Bakery   0.2


Kingsview Village, Martin Grove Gardens, Richview Gardens, St. Phillips
                                  venue  freq
0              venue.categories_Brewery   0.2
1               venue.cate

                                  venue  freq
0            venue.categories_BBQ Joint   0.2
1            venue.categories_Bookstore   0.2
2  venue.categories_American Restaurant   0.2
3         venue.categories_Burger Joint   0.2
4                 venue.categories_Café   0.2


The Beaches
                             venue  freq
0      venue.categories_Bagel Shop   0.2
1       venue.categories_BBQ Joint   0.2
2           venue.categories_Beach   0.2
3          venue.categories_Bakery   0.2
4  venue.categories_Breakfast Spot   0.2


The Beaches West, India Bazaar
                               venue  freq
0           venue.categories_Brewery   0.2
1      venue.categories_Burger Joint   0.2
2     venue.categories_Burrito Place   0.2
3             venue.categories_Beach   0.2
4  venue.categories_Asian Restaurant   0.2


The Danforth West, Riverdale
                        venue  freq
0  venue.categories_BBQ Joint   0.2
1     venue.categories_Bakery   0.2
2  venue.categories_Bookstore   0.

In [21]:
map_clusters