In [None]:
# import necessary packages
import json
import os

import folium # map rendering library
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import requests 
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim 
from pandas.io.json import json_normalize

print('Libraries imported.')

In [None]:
# obtain New York neighborhood data
with open('new_york.json') as json_data:
    newyork_data = json.load(json_data)

nyneighborhoods = newyork_data['features']

# define the dataframe with five columns: City, Borough, Neighborhood, Latitude, Longitude
column_names = ['City', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']

# Collect one record per feature and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and re-copying the frame for
# every row makes the loop quadratic.
ny_records = []
for data in nyneighborhoods:
    # coordinates are stored as [longitude, latitude]
    neighborhood_latlon = data['geometry']['coordinates']
    ny_records.append({'City': 'New York',
                       'Borough': data['properties']['borough'],
                       'Neighborhood': data['properties']['name'],
                       'Latitude': neighborhood_latlon[1],
                       'Longitude': neighborhood_latlon[0]})

ny_neighborhoods = pd.DataFrame(ny_records, columns=column_names)
ny_neighborhoods.head()

In [None]:
#Define a function to plot the city map with different neighborhood labels based on Borough
def citymap(cityname,countryname,dataframe):
    """Draw a folium map of a city with one circle marker per neighborhood.

    Parameters
    ----------
    cityname, countryname : str
        Joined as "cityname,countryname" and geocoded with Nominatim to
        find the map centre.
    dataframe : pandas.DataFrame
        Must contain 'Latitude', 'Longitude', 'Neighborhood' and 'Borough'
        columns. Every borough gets its own colour from the rainbow colormap.

    Returns
    -------
    folium.Map
        The map with all markers added.

    Raises
    ------
    ValueError
        If the geocoder cannot resolve the city address.
    """
    # create map
    address = cityname + ',' + countryname

    geolocator = Nominatim(user_agent="my-application")
    location = geolocator.geocode(address)
    if location is None:
        # geocode() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on `location.latitude`.
        raise ValueError("Could not geocode '{}'".format(address))
    my_map = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)

    # set color scheme for the Borough: one evenly spaced rainbow colour per borough
    borough_name = dataframe['Borough'].unique().tolist()
    colors_array = cm.rainbow(np.linspace(0, 1, len(borough_name)))
    rainbow = [colors.rgb2hex(i) for i in colors_array]
    # direct borough -> colour lookup avoids a list search for every marker
    borough_color = dict(zip(borough_name, rainbow))

    # add markers to the map
    for lat, lon, neighborhood, borough in zip(dataframe['Latitude'], dataframe['Longitude'],
                                               dataframe['Neighborhood'], dataframe['Borough']):
        label = '{}, {}'.format(neighborhood, borough)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=borough_color[borough],
            fill=True,
            fill_color=borough_color[borough],
            fill_opacity=0.7).add_to(my_map)

    return my_map

In [None]:
citymap('New York','USA',ny_neighborhoods)

We see that all the neighborhoods in the same Borough share the same color and are well separated on the map, indicating good data quality.
---

<H3>Let's look into the neighborhoods of TORONTO</H3>

In [None]:
# obtain Toronto neighborhood data
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wikipage = requests.get(url, timeout=15)
# stop here on an HTTP error instead of trying to parse an error page
wikipage.raise_for_status()

#use beautifulsoup to read the wikipage
soup = BeautifulSoup(wikipage.text, 'lxml')
wikitable = soup.find_all('table')[0] 

# One list of cell texts per table row. Rows without <td> cells (such as a
# header row made of <th> cells) contribute nothing, as before.
table_rows = []
for row in wikitable.find_all('tr'):
    cells = [column.get_text() for column in row.find_all('td')]
    if cells:
        table_rows.append(cells)

#Create a new pd DataFrame in one step instead of assigning cell by cell
toronto = pd.DataFrame(table_rows)

#rename column names
toronto = toronto.rename(columns={0:'PostalCode',1:'Borough',2:'Neighborhood'})
#drop all rows with unassigned borough
toronto = toronto[toronto.Borough!='Not assigned'].reset_index(drop=True)

#create a new dataframe toronto_neighbor to find all neighborhoods with the location information
# The geocoder does not depend on the row, so it is created once.
geolocator = Nominatim(user_agent="mycapstoneproject")

# Records are collected in a list and turned into a frame once at the end;
# DataFrame.append was removed in pandas 2.0.
toronto_records = []
for borough, neighborhood in zip(toronto['Borough'], toronto['Neighborhood']):
    # remove only a trailing newline; slicing with [:-1] would cut a real
    # character whenever the cell text does not end with '\n'
    neighborhood = neighborhood.rstrip('\n')

    #if the neighborhood name is not assigned, then the neighborhood name is same as borough
    if neighborhood == 'Not assigned':
        neighborhood = borough

    #find the location data, ignore the neighborhoods that are unable to be located by Nominatim
    location = geolocator.geocode("{},{},Toronto,Ontario,Canada".format(neighborhood,borough),timeout=15)

    #try one more searching
    if location is None: 
        location = geolocator.geocode("{},Toronto,Ontario,Canada".format(neighborhood),timeout=15)

    if location is None: 
        print("The location data of {} in {} is not available!".format(neighborhood,borough))
        continue

    toronto_records.append({'City': 'Toronto',
                            'Borough': borough,
                            'Neighborhood': neighborhood,
                            'Latitude': location.latitude,
                            'Longitude': location.longitude})

toronto_neighborhoods = pd.DataFrame(
    toronto_records, columns=['City', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'])
toronto_neighborhoods.head()

In [None]:
toronto_neighborhoods.shape

In [None]:
citymap('Toronto','Canada',toronto_neighborhoods)

<H3>Let's look at Beijing and Shanghai</H3>

In [None]:
# read the csv file extracted from Wikipedia including the Districts and Subdistricts
beijing = pd.read_csv('Beijing.csv')
# Rename to the 'Borough' / 'Neighborhood' column names that the rest of the notebook reads.
# NOTE(review): the key 'Subdisctricts' looks misspelled; rename() ignores keys that are
# not present by default, so confirm it matches the header in Beijing.csv.
beijing.rename(columns={'District':'Borough','Subdisctricts':'Neighborhood'}, inplace=True)
beijing.head()

In [None]:
#create a new dataframe bj_neighbor to find all neighborhoods with the location information
# The geocoder does not depend on the row, so it is created once.
geolocator = Nominatim(user_agent="my-application")

# Records are collected in a list and turned into a frame once at the end;
# DataFrame.append was removed in pandas 2.0.
bj_records = []
for borough, neighborhood in zip(beijing['Borough'], beijing['Neighborhood']):
    #find the location data, ignore the neighborhoods that are unable to be located by Nominatim
    location = geolocator.geocode("{},{},Beijing,China".format(neighborhood,borough),timeout=15)

    #try one more searching
    if location is None: 
        location = geolocator.geocode("{},Beijing,China".format(neighborhood),timeout=15)

    if location is None: 
        print("The location data of {} in {} is not available!".format(neighborhood,borough))
        continue

    bj_records.append({'City': 'Beijing',
                       'Borough': borough,
                       'Neighborhood': neighborhood,
                       'Latitude': location.latitude,
                       'Longitude': location.longitude})

bj_neighborhoods = pd.DataFrame(
    bj_records, columns=['City', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'])
bj_neighborhoods.head()

<b>A number of neighborhoods don't have location data. Nevertheless, there are more than 200 neighborhoods with valid location data, which is enough for my analysis. Show these neighborhoods on the map:</b>

In [None]:
bj_neighborhoods.shape

In [None]:
citymap('Beijing','China',bj_neighborhoods)

<b>Some neighborhoods don't lie within their boroughs (wrong location data). Let's clean them out.</b>

In [None]:
# Neighborhoods whose geocoded position is wrong and which are therefore removed.
drop = ['Xiangyang','Shilipu','Jiuxian','Binhe','Xincun','Yongding','Shiyuan','Dayu','Shuguang','Guangning','Beiwu',
       'Dongfeng','Guangming','Liucun','Nankou']

# Remove every listed name in one vectorized pass and renumber the index once,
# instead of filtering and resetting the index once per name.
bj_neighborhoods = bj_neighborhoods[~bj_neighborhoods.Neighborhood.isin(drop)].reset_index(drop=True)

bj_neighborhoods.head()

In [None]:
citymap('Beijing','China',bj_neighborhoods)

<b>Repeat the same process for Shanghai</b>

In [None]:
# read the csv file extracted from Wikipedia including the Districts and Subdistricts
sh = pd.read_csv('Shanghai.csv')
# Rename to the 'Borough' / 'Neighborhood' column names that the rest of the notebook reads.
sh.rename(columns={'District':'Borough','Subdistrict':'Neighborhood'}, inplace=True)
sh.head()

In [None]:
#create a new dataframe sh_neighbor to find all neighborhoods with the location information
# The geocoder does not depend on the row, so it is created once.
geolocator = Nominatim(user_agent="my-application")

# Records are collected in a list and turned into a frame once at the end;
# DataFrame.append was removed in pandas 2.0.
sh_records = []
for borough, neighborhood in zip(sh['Borough'],sh['Neighborhood']):   
    #find the location data, ignore the neighborhoods that are unable to be located by Nominatim
    location = geolocator.geocode("{},{},Shanghai,China".format(neighborhood,borough),timeout=15)

    #try one more searching
    if location is None: 
        location = geolocator.geocode("{},Shanghai,China".format(neighborhood),timeout=15)

    if location is None: 
        print("The location data of {} in {} is not available!".format(neighborhood,borough))
        continue

    sh_records.append({'City': 'Shanghai',
                       'Borough': borough,
                       'Neighborhood': neighborhood,
                       'Latitude': location.latitude,
                       'Longitude': location.longitude})

sh_neighborhoods = pd.DataFrame(
    sh_records, columns=['City', 'Borough', 'Neighborhood', 'Latitude', 'Longitude'])
sh_neighborhoods.head()

In [None]:

citymap('Shanghai','China',sh_neighborhoods)

In [None]:
# Neighborhoods whose geocoded position is wrong and which are therefore removed.
drop = ['Chenjia','Xinhe','Xincun','Zhongxing','Changzheng','Beizhan','Chaoyang Farm']

# Remove every listed name in one vectorized pass and renumber the index once,
# instead of filtering and resetting the index once per name.
sh_neighborhoods = sh_neighborhoods[~sh_neighborhoods.Neighborhood.isin(drop)].reset_index(drop=True)
    
citymap('Shanghai','China',sh_neighborhoods)

In [None]:
# save the data
# Each neighborhood frame is written as a tab-separated file, index column included.
for frame, filename in [
    (ny_neighborhoods, 'ny_neighbor.csv'),
    (toronto_neighborhoods, 'toronto_neighbor.csv'),
    (bj_neighborhoods, 'bj_neighbor.csv'),
    (sh_neighborhoods, 'sh_neighbor.csv'),
]:
    frame.to_csv(filename, sep='\t')

In [None]:
#Use Foursquare to explore the neighborhoods
# Foursquare credentials are read from the environment so that no secret is
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel. Keys that were previously committed in this
# notebook should be treated as leaked and revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('Warning: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')
VERSION = '20200229' # Foursquare API version
LIMIT = 50 # only return the top 50 venues

# define a function to explore each neighborhood within a radius of 1000 meters
def getNearbyVenues(nborhood, radius=1000):
    """Query the Foursquare "explore" endpoint once per neighborhood.

    Parameters
    ----------
    nborhood : pandas.DataFrame
        Must contain 'City', 'Borough', 'Neighborhood', 'Latitude' and
        'Longitude' columns; one request is sent per row.
    radius : int, optional
        Search radius passed to the API (default 1000).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'City', 'Borough',
        'Neighborhood', 'VenueName' and 'VenueCategory'. 'VenueCategory'
        is None for a venue that has no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    venue_columns = ['City','Borough','Neighborhood','VenueName','VenueCategory']

    # Records are collected in a list and turned into a frame once at the end;
    # DataFrame.append was removed in pandas 2.0.
    records = []
    for city, borough, neighborhood, lat, lng in zip(nborhood['City'], nborhood['Borough'],
                                                     nborhood['Neighborhood'],
                                                     nborhood['Latitude'], nborhood['Longitude']):
        # let requests build and encode the query string; the timeout keeps an
        # unresponsive server from hanging the notebook
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        # a response without groups/items is treated as "no venues found"
        groups = response.json().get('response', {}).get('groups') or []
        items = (groups[0].get('items') or []) if groups else []

        # keep only the relevant information for each nearby venue
        for v in items:
            categories = v['venue'].get('categories') or []
            records.append({'City': city,
                            'Borough': borough,
                            'Neighborhood': neighborhood,
                            'VenueName': v['venue']['name'],
                            'VenueCategory': categories[0]['name'] if categories else None})

    return pd.DataFrame(records, columns=venue_columns)

In [None]:
# obtain the nearby venues of each neighborhood in New York
ny_venues = getNearbyVenues(ny_neighborhoods)
ny_venues.head()

In [None]:
#save the data
ny_venues.to_csv('ny_venues.csv', sep='\t')
ny_venues.shape

In [None]:
#  to avoid multi-index problem
ny_venues = ny_venues.rename(columns={'Neighborhood': 'NeighborhoodName'})

<H3> Let's repeat the process for Toronto, Beijing and Shanghai.</H3><br>
<H3>Toronto</H3>

In [None]:
# repeat on Toronto
toronto_venues = getNearbyVenues(toronto_neighborhoods)
toronto_venues.head()

In [None]:
#save the data
toronto_venues.to_csv('toronto_venues.csv', sep='\t')

#  to avoid multi-index problem
toronto_venues = toronto_venues.rename(columns={'Neighborhood': 'NeighborhoodName'})
toronto_venues.shape

<H3>Beijing</H3>

In [None]:
# repeat on Beijing
bj_venues = getNearbyVenues(bj_neighborhoods)
bj_venues.head()

In [None]:
#save the data
bj_venues.to_csv('bj_venues.csv', sep='\t')

#  to avoid multi-index problem
bj_venues = bj_venues.rename(columns={'Neighborhood': 'NeighborhoodName'})
bj_venues.shape

<H3>Shanghai</H3>

In [None]:
# repeat on Shanghai
sh_venues = getNearbyVenues(sh_neighborhoods)
sh_venues.head()

In [None]:
#save the data
sh_venues.to_csv('sh_venues.csv', sep='\t')

#  to avoid multi-index problem
sh_venues = sh_venues.rename(columns={'Neighborhood': 'NeighborhoodName'})
sh_venues.shape

<H3>Now let's combine all the data into single dataframe and use machine learning model for clustering.</H3>

In [None]:
#combine venues 
allvenues =  pd.concat([ny_venues,toronto_venues,bj_venues,sh_venues])
allvenues = allvenues.rename(columns={'Neighborhood': 'NeighborhoodName'})
allvenues.shape

In [None]:
allvenues.head()

In [None]:
#combine neighborhoods
allneighborhoods =  pd.concat([ny_neighborhoods,toronto_neighborhoods,bj_neighborhoods,sh_neighborhoods])
allneighborhoods = allneighborhoods.rename(columns={'Neighborhood': 'NeighborhoodName'})
allneighborhoods.shape

In [None]:
allneighborhoods.head()

<H3>Group the venues by category and calculate the total number of venues in each category.</H3>

In [None]:
# get dummies
allvenues_onehot = pd.get_dummies(allvenues, columns = ['VenueCategory'], prefix="", prefix_sep="")
allvenues_onehot = allvenues_onehot.drop('VenueName',axis = 1)
#allvenues_onehot.columns.values

<H3>We need to clean the columns. For example, 'Art Gallery', 'Art Museum', 'Arts & Crafts Store', and 'Arts & Entertainment' are basically in the same category. And 'Yunnan Restaurant', 'Zhejiang Restaurant', 'Hainan Restaurant' are all in the category of 'Chinese Restaurant'.</H3>

In [None]:
def clean_columns(columns,newcolumnname,dataframe):
    """Merge several category columns into a single summed column.

    Parameters
    ----------
    columns : list of str
        Names of the columns to merge. A name listed more than once is
        counted only once, and a name that is not a column of `dataframe`
        is skipped.
    newcolumnname : str
        Name of the merged column. It may be one of `columns`.
    dataframe : pandas.DataFrame
        Frame holding the category columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the merged source columns and with
        `newcolumnname` holding their row-wise sum as the last column.
    """
    # dict.fromkeys removes duplicates while keeping order, so a repeated name
    # cannot be added twice. Venue categories depend on what the API returned,
    # so a listed category may legitimately be absent from the data.
    present = [name for name in dict.fromkeys(columns) if name in dataframe.columns]

    value = 0
    for column in present:
        value = dataframe[column] + value

    dataframe = dataframe.drop(present, axis=1)
    dataframe[newcolumnname] = value

    return dataframe

# Merged category name -> the raw venue categories it absorbs.
# The groups are applied in this order, one clean_columns() call each.
# Changes from the previous hand-written calls: the merged bus column is named
# 'Bus Station' (was misspelled 'Bus Statiion'), and 'Guizhou Restaurant' is
# listed once (it was listed twice, which added that column twice to the sum).
category_groups = {
    'Art Museum': ['Art Gallery', 'Art Museum', 'Arts & Crafts Store', 'Arts & Entertainment',
                   'Museum', 'Street Art', 'Public Art'],
    'Auto Shop': ['Auto Dealership', 'Auto Garage', 'Auto Workshop', 'Automotive Shop'],
    'Baseball Field': ['Baseball Field', 'Baseball Stadium'],
    'Basketball Stadium': ['Basketball Court', 'Basketball Stadium'],
    'Tennis Court': ['Tennis Court', 'Tennis Stadium'],
    'Stadium': ['Soccer Field', 'Soccer Stadium', 'Stadium'],
    'Bar': ['Bar', 'Beer Bar', 'Beer Garden', 'Beer Store', 'Cocktail Bar', 'Whisky Bar',
            'Wine Bar', 'Sake Bar', 'Hotel Bar'],
    'Liquor': ['Wine Shop', 'Liquor Store'],
    'Bus Station': ['Bus Line', 'Bus Station', 'Bus Stop'],
    'Cafeteria': ['Cafeteria', 'Café', 'Coffee Shop', 'Gaming Cafe'],
    'Street Food': ['Food', 'Food & Drink Shop', 'Food Court', 'Food Service', 'Food Truck',
                    'Street Food Gathering'],
    'Shopping Mall': ['Shopping Mall', 'Shopping Plaza'],
    'Pharmacy': ['Drugstore', 'Pharmacy'],
    'Sports': ['Sporting Goods Shop', 'Sports Bar', 'Sports Club'],
    'Gym': ['Gym', 'Gym / Fitness Center', 'Gym Pool', 'Gymnastics Gym'],
    'Hotel': ['Hostel', 'Hotel', 'Motel'],
    'Japanese Restaurant': ['Japanese Curry Restaurant', 'Japanese Restaurant', 'Ramen Restaurant',
                            'Udon Restaurant', 'Soba Restaurant', 'Sushi Restaurant'],
    'Metro Station': ['Metro Station', 'Light Rail Station', 'Tram Station'],
    'Pet Service': ['Pet Café', 'Pet Service', 'Pet Store', 'Animal Shelter'],
    'Chinese Restaurant': ['Chinese Breakfast Place', 'Chinese Restaurant', 'Hotpot Restaurant',
                           'Hubei Restaurant', 'Hunan Restaurant', 'Tibetan Restaurant',
                           'Anhui Restaurant', 'Cantonese Restaurant', 'Dongbei Restaurant',
                           'Dim Sum Restaurant', 'Dumpling Restaurant', 'Szechuan Restaurant',
                           'Taiwanese Restaurant', 'Fujian Restaurant', 'Xinjiang Restaurant',
                           'Yunnan Restaurant', 'Zhejiang Restaurant', 'Shanghai Restaurant',
                           'Shanxi Restaurant', 'Shaanxi Restaurant', 'Guizhou Restaurant',
                           'Peking Duck Restaurant', 'Hainan Restaurant', 'Hong Kong Restaurant',
                           'Jiangsu Restaurant'],
}

for newcolumnname, columns in category_groups.items():
    allvenues_onehot = clean_columns(columns,newcolumnname,allvenues_onehot)
 
#allvenues_onehot.columns.values

In [None]:
#group the venues based on Neighborhoods
# A neighborhood name alone is not a unique key: the same name can occur in
# more than one borough or city. Grouping on the name only would add those
# venues together and attach the combined counts to every neighborhood with
# that name, so each neighborhood is identified by City + Borough + name.
neighborhood_keys = ['City', 'Borough', 'NeighborhoodName']
allvenues_grouped = allvenues_onehot.groupby(neighborhood_keys).sum().reset_index()
allvenues_grouped = allneighborhoods.merge(allvenues_grouped, on=neighborhood_keys, how='left')

#drop the rows with NaN (no venues information)
allvenues_grouped = allvenues_grouped.dropna().reset_index(drop=True)
allvenues_grouped.head()

In [None]:
# create a new dataframe with most common venue catrgories
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the `num_top_venues` largest values in `row`.

    `row` is a pandas Series. Its labels are returned as an array, ordered
    from the largest value to the smallest.
    """
    ranked_labels = row.sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]

num_top_venues = 10

columns = ['City','Borough','NeighborhoodName','Latitude','Longitude','Total Number of Venues']
indicators = ['st', 'nd', 'rd']
# create columns according to number of top venues
for ind in range(num_top_venues):
    # explicit bounds check instead of a bare `except:`, which would also hide
    # unrelated errors
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# The first five columns of allvenues_grouped are City, Borough,
# NeighborhoodName, Latitude and Longitude; everything after them is a
# per-category venue count.
category_counts = allvenues_grouped.iloc[:, 5:]

# Build one complete row per neighborhood and create the frame once, instead
# of assigning every cell of an empty frame individually.
sorted_rows = []
for ind in range(allvenues_grouped.shape[0]):
    neighborhood_row = allvenues_grouped.iloc[ind]
    counts = category_counts.iloc[ind]
    sorted_rows.append([neighborhood_row.City,
                        neighborhood_row.Borough,
                        neighborhood_row.NeighborhoodName,
                        neighborhood_row.Latitude,
                        neighborhood_row.Longitude,
                        counts.sum(),
                        *return_most_common_venues(counts, num_top_venues)])

# create a new dataframe
allvenues_sorted = pd.DataFrame(sorted_rows, columns=columns)

allvenues_sorted.head()

<H3>Use hierarchical agglomerative clustering method to compare neighborhoods among cities.<br>
First, find the number of clusters. Let's use scipy library to create the dendrograms for our dataset.</H3>

In [None]:
import scipy.cluster.hierarchy as shc

# Cluster on the venue-category counts only. The metadata columns are removed
# by name: allvenues_grouped has five of them, so the previous positional
# slice `iloc[:, 6:]` also discarded the first venue-category column.
data = allvenues_grouped.drop(columns=['City', 'Borough', 'NeighborhoodName', 'Latitude', 'Longitude'])
plt.figure(figsize=(10, 7))   
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Neighborhoods')
plt.ylabel('Distance')
# horizontal line marking the distance at which the tree is cut into clusters
plt.axhline(y=31, c='k')
dend = shc.dendrogram(shc.linkage(data, method='ward'))

<H3>According to the above graph, I decide to separate our neighborhoods into nine clusters (cut at distance of 31, horizontal black line). I will use the hierarchical agglomerative clustering of the sklearn.cluster library to cluster these neighborhoods.</H3>

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ward linkage works on Euclidean distances, which is also scikit-learn's
# default, so the distance argument is omitted. Its keyword was renamed from
# `affinity` to `metric` and `affinity` was later removed; leaving it out keeps
# this call valid on both old and new scikit-learn releases.
cluster = AgglomerativeClustering(n_clusters=9, linkage='ward')  
clusterresult = cluster.fit_predict(data)

In [None]:
allvenues_sorted['Cluster_Labels'] = clusterresult

In [None]:
# Define a function to show the neighborhoods with same cluster labels
def clustermap(cityname,countryname,dataframe,num_clusters=9):
    """Draw a folium map of a city with markers coloured by cluster label.

    Parameters
    ----------
    cityname, countryname : str
        Joined as "cityname,countryname" and geocoded with Nominatim to
        find the map centre.
    dataframe : pandas.DataFrame
        Must contain 'Latitude', 'Longitude', 'NeighborhoodName', 'Borough'
        and 'Cluster_Labels' columns. 'Cluster_Labels' is used as an index
        into the colour palette.
    num_clusters : int, optional
        Number of colours in the palette (default 9, the previously
        hard-coded value). Labels must lie in ``range(num_clusters)``.

    Returns
    -------
    folium.Map
        The map with all markers added.

    Raises
    ------
    ValueError
        If the geocoder cannot resolve the city address.
    """
    # create map
    address = cityname + ',' + countryname

    geolocator = Nominatim(user_agent="my-application")
    location = geolocator.geocode(address)
    if location is None:
        # geocode() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on `location.latitude`.
        raise ValueError("Could not geocode '{}'".format(address))
    my_map = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)

    # set color scheme for the Cluster_Labels: one evenly spaced rainbow colour per cluster
    colors_array = cm.rainbow(np.linspace(0, 1, num_clusters))
    rainbow = [colors.rgb2hex(i) for i in colors_array]

    # add markers to the map
    for lat, lon, neighborhood, borough, cluster_labels in zip(dataframe['Latitude'], 
                                                               dataframe['Longitude'], 
                                                               dataframe['NeighborhoodName'], 
                                                               dataframe['Borough'],
                                                               dataframe['Cluster_Labels']):
        # int() so that a label stored as float or object still indexes the list
        marker_color = rainbow[int(cluster_labels)]
        label = '{}, {},Cluster:{}'.format(neighborhood, borough,cluster_labels)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=marker_color,
            fill=True,
            fill_color=marker_color,
            fill_opacity=0.7).add_to(my_map)

    return my_map

<H3>Total number of neighborhoods in each cluster</H3>

In [None]:
# Count the members of every label that actually occurs, in ascending label
# order, rather than assuming the labels are exactly 0..8.
cluster_sizes = allvenues_sorted['Cluster_Labels'].value_counts().sort_index()
for ii, num in cluster_sizes.items():
    print('Total number of neighborhoods in cluster {} is {}'.format(ii, num))

<H3>Cluster 5 only has 3 neighborhoods and cluster 7 has only 2. These two clusters probably just contain the outliers. So let's look at cluster 5 and 7 first</H3>

In [None]:
# cluster 5
allvenues_sorted[allvenues_sorted['Cluster_Labels'] == 5]

In [None]:
# cluster 7
allvenues_sorted[allvenues_sorted['Cluster_Labels'] == 7]

<H3>
Cluster 5 is the historical area of Beijing, thus no similar area is found in other cities. Cluster 7 is the same neighborhood throughout two boroughs, which is also unique in New York. <br><br>
Let's look at other neighborhood clusters. <br><br>
Cluster 0</H3>

In [None]:
cluster0 = allvenues_sorted[allvenues_sorted['Cluster_Labels'] == 0]
cluster0

<H3>Cluster 0 contains the neighborhoods in New York with a lot of nearby pizza places, fast food restaurants and Caribbean restaurants.</H3>

In [None]:
clustermap('New York','USA',cluster0)

In [None]:
#Cluster 1
cluster1 = allvenues_sorted[allvenues_sorted['Cluster_Labels'] == 1]
cluster1

In [None]:
clustermap('New York','USA',cluster1[cluster1['City'] == 'New York'])

In [None]:
clustermap('Toronto','Canada',cluster1[cluster1['City'] == 'Toronto'])

In [None]:
clustermap('Shanghai','China',cluster1[cluster1['City'] == 'Shanghai'])

In [None]:
clustermap('Beijing','China',cluster1[cluster1['City'] == 'Beijing'])

<H3>Cluster 1 is basically the residential area of each city. Each neighborhood has access to fast food restaurants and outdoor parks. There are also a lot of restaurants, convenience stores and supermarkets.</H3>

In [None]:
#Cluster 2

cluster2 = allvenues_sorted[allvenues_sorted['Cluster_Labels'] == 2]
cluster2

In [None]:
clustermap('New York','USA',cluster2[cluster2['City'] == 'New York'])

In [None]:
clustermap('Toronto','Canada',cluster2[cluster2['City'] == 'Toronto'])

<H3>Cluster 2: New York and Toronto neighborhoods with pizza places, banks, mobile phone shops nearby.</H3>

In [None]:
#Cluster 3
cluster3 = allvenues_sorted[allvenues_sorted['Cluster_Labels'] == 3]
cluster3

In [None]:
clustermap('New York','USA',cluster3[cluster3['City'] == 'New York'])

In [None]:
clustermap('Toronto','Canada',cluster3[cluster3['City'] == 'Toronto'])

<H3>Cluster 3: Most are New York neighborhoods with a lot of Italian and pizza places.</H3>

In [None]:
## cluster 4
cluster4 = allvenues_sorted[allvenues_sorted['Cluster_Labels'] == 4]
cluster4

In [None]:
clustermap('New York','USA',cluster4[cluster4['City'] == 'New York'])

In [None]:
clustermap('Toronto','Canada',cluster4[cluster4['City'] == 'Toronto'])

<H3>Cluster 4 includes neighborhoods in New York and Toronto. These neighborhoods have many dining places serving Mexican and American cuisine.</H3>

In [None]:
## cluster 6
cluster6 = allvenues_sorted[allvenues_sorted['Cluster_Labels'] == 6]
cluster6

In [None]:
clustermap('New York','USA',cluster6[cluster6['City'] == 'New York'])

In [None]:
clustermap('Toronto','Canada',cluster6[cluster6['City'] == 'Toronto'])

In [None]:
clustermap('Beijing','China',cluster6[cluster6['City'] == 'Beijing'])

In [None]:
clustermap('Shanghai','China',cluster6[cluster6['City'] == 'Shanghai'])

<H3>Cluster 6 is basically the city center, surrounded by a variety of venues, including theaters, parks, restaurants and bars.</H3>

---

In [None]:
## cluster 8
cluster8 = allvenues_sorted[allvenues_sorted['Cluster_Labels'] == 8]
cluster8

In [None]:
clustermap('New York','USA',cluster8[cluster8['City'] == 'New York'])

In [None]:
clustermap('Toronto','Canada',cluster8[cluster8['City'] == 'Toronto'])

In [None]:
clustermap('Beijing','China',cluster8[cluster8['City'] == 'Beijing'])

In [None]:
clustermap('Shanghai','China',cluster8[cluster8['City'] == 'Shanghai'])

<H2>Cluster 8 contains the neighborhoods without many nearby venues, including the suburbs of New York and Toronto and most parts of Beijing and Shanghai. Please note that this doesn't mean Beijing and Shanghai are less bustling. In fact, Beijing and Shanghai have even higher population density. I think this is because the venue information provider, Foursquare, is located in the USA, so New York and Toronto have much more information than Beijing and Shanghai. Venue providers located in China may allow a more detailed analysis.<br><br>
Anyhow, I hope I have convinced you that despite the distinct cultural and geographical differences between New York, Toronto, Beijing and Shanghai, there are several similar neighborhoods in clusters 1, 2, 3, 4 and 6. I hope this provides some useful information for people who are considering moving between Canada, the USA and China.<br><br>
Thank you for reading!</H2>

---