In [1]:


import json
import folium
import requests
import numpy as np
import pandas as pd
import matplotlib.cm as cm
from sklearn.cluster import KMeans
import matplotlib.colors as colors


# Show every column and every row when a frame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# CSV with one row per postal code: PostalCode, Borough, Neighborhood, Latitude, Longitude.
url = "https://raw.githubusercontent.com/AAEECARUSO/Coursera_Capstone/master/DATA.csv"

df = pd.read_csv(url)
# The file carries a leftover index column named 'Unnamed: 0'. Drop it by keyword:
# passing the axis positionally (``drop(label, 1)``) is rejected by pandas 2.x.
df = df.drop(columns='Unnamed: 0')
df.head(12)



Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [2]:
# The code was removed by Watson Studio for sharing.

## Working with only boroughs that contain the word Toronto to create a map

In [5]:


def get_nearby(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare's ``venues/explore`` endpoint returns around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel iterables; element ``i`` of each describes one location.
        They are consumed with ``zip``, so the shortest one sets the count.
    radius : int, default 500
        Forwarded unchanged as the ``radius`` query parameter of the request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        The frame is empty (but still has these columns) when no location
        is given or no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Reads the module-level names ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION``
    and ``LIMIT``; they must be defined before the first request is made
    (presumably in the credentials cell that was stripped for sharing).
    """
    columns = [
        'Neighborhood',
        'Neighborhood Latitude',
        'Neighborhood Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category',
    ]

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and encode the query string instead of formatting it by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not hang the notebook forever on a stalled connection
        )
        # Fail with the HTTP error itself rather than a KeyError on the payload below.
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        for item in items:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue may come back without any category; keep the row and leave the field empty.
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor keeps the schema intact even with zero rows.
    return pd.DataFrame(rows, columns=columns)


def most_common(row, top_lim):
    """Return the labels of the ``top_lim`` largest values in ``row``.

    The first element of ``row`` is skipped (the caller passes a row whose
    first entry is the neighborhood name). The remaining entries are ordered
    from largest to smallest and the index labels of the leading ``top_lim``
    entries are returned as a NumPy array; fewer labels come back when the
    row has fewer entries than ``top_lim``.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:top_lim]


# Keep only the rows whose borough name contains 'Toronto'; rows with a missing borough are excluded.
toronto_data = df[df['Borough'].str.contains('Toronto', na=False)].reset_index(drop=True)

# Centre point used when creating the folium maps.
latitude = 43.6532
longitude = -79.3832

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

########################################
# adding markers
########################################
# One circle marker per row, with the neighborhood name as its popup text.
for lat, lng, neighborhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    popup = folium.Popup(neighborhood, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='red',  # a named colour takes no leading '#'
        fill_opacity=0.7,
    ).add_to(map_toronto)

# map_toronto is built but not displayed here; make it the last expression of a cell to render it.

toronto_venues = get_nearby(names=toronto_data['Neighborhood'], latitudes=toronto_data['Latitude'], longitudes=toronto_data['Longitude'])

########################################
# one hot encoding
########################################
# One indicator column per venue category, named exactly like the category (no prefix).
one_hot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A category that is itself called 'Neighborhood' would clash with the key column added
# below, so it is dropped explicitly instead of being overwritten silently.
one_hot = one_hot.drop(columns='Neighborhood', errors='ignore')
cols = list(one_hot.columns)

################################################################################
# putting the column 'Neighborhood' in front of the category columns
################################################################################
one_hot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

# Share of each venue category among the venues of a neighborhood (one row per neighborhood).
data_grouped = one_hot.groupby('Neighborhood').mean().reset_index()

top_lim = 10

indicators = ['st', 'nd', 'rd']

################################################################################
# making columns for the top 10 venues
################################################################################
# Headers: 'Neighborhood', then '1st Most Common Venue', '2nd ...', '3rd ...', '4th ...', ...
columns = ['Neighborhood']
for idx in range(top_lim):
    # Only the first three ranks have their own suffix; every later rank uses 'th'.
    suffix = indicators[idx] if idx < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(idx + 1, suffix))

########################################
# new dataframe for sorted
########################################
# Build the table in one go: one row per neighborhood (same row order as data_grouped),
# holding the names of its top_lim most frequent venue categories.
df_venues_sorted = pd.DataFrame(
    [most_common(row, top_lim) for _, row in data_grouped.iterrows()],
    columns=columns[1:],
)
df_venues_sorted.insert(0, 'Neighborhood', data_grouped['Neighborhood'].values)

kclusters = 5

# Feature matrix for clustering: the per-neighborhood category shares without the name column.
# (Keyword form, because pandas 2.x no longer accepts the axis as a positional argument.)
data_grouped = data_grouped.drop(columns='Neighborhood')

########################################
# running k-means clustering
########################################
# n_init is spelled out because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(data_grouped)

########################################
# adding labels
########################################
# kmeans.labels_ follows the row order of data_grouped. df_venues_sorted has that same
# order (its 'Neighborhood' column was taken from data_grouped), whereas toronto_data is
# in file order and may contain neighborhoods without any venue. The labels are therefore
# attached to a copy of df_venues_sorted and matched to toronto_data by name below.
venues_with_labels = df_venues_sorted.copy()
venues_with_labels.insert(1, 'Cluster Labels', kmeans.labels_)

########################################################################################################################
# toronto_data (latitude/longitude) gets joined with the cluster label and top venues of each neighborhood
########################################################################################################################
data_merged = toronto_data.join(venues_with_labels.set_index('Neighborhood'), on='Neighborhood')

# Neighborhoods that were not clustered (no venues returned) have no label; drop them so
# the label column can stay an integer and be used as a list index.
data_merged = data_merged.dropna(subset=['Cluster Labels']).reset_index(drop=True)
data_merged['Cluster Labels'] = data_merged['Cluster Labels'].astype(int)

########################################
# making map clusters
########################################
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced colour of the rainbow colormap per cluster, converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
multi_color = [colors.rgb2hex(rgba) for rgba in colors_array]

# One circle marker per neighborhood, coloured by its cluster label.
for lat, lon, poi, cluster in zip(data_merged['Latitude'], data_merged['Longitude'], data_merged['Neighborhood'], data_merged['Cluster Labels']):
    cluster = int(cluster)  # plain int so it can index the colour list
    popup = folium.Popup('{} Cluster {}'.format(poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=popup,
        color=multi_color[cluster],
        fill=True,
        fill_color=multi_color[cluster],
        fill_opacity=0.7,
    ).add_to(map_clusters)

################################
# display the map with clusters
################################
map_clusters