# Goal : Establishing neighborhood mapping between Manhattan and Paris

## Importing relevant libraries

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

!conda install -c anaconda wget --yes

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: - 
  - anaconda/osx-64::ca-certificates-2020.1.1-0, anaconda/osx-64::certifi-2019.11.28-py37_0, anaconda/osx-64::openssl-1.1.1d-h1de35cc_4
  - anaconda/osx-64::ca-certificates-2020.1.1-0, anaconda/osx-64::certifi-2019.11.28-py37_0, defaults/osx-64::openssl-1.1.1d-h1de35cc_4
  - anaconda/osx-64::ca-certificates-2020.1.1-0, anaconda/osx-64::openssl-1.1.1d-h1de35cc_4, defaults/osx-64::certifi-2019.11.28-py37_0
  - anaconda/osx-64::ca-certificates-2020.1.1-0, defaults/osx-64::certifi-2019.11.28-py37_0, defaults/osx-64::openssl-1.1.1d-h1de35cc_4
  - anaconda/osx-64::openssl-1.1.1d-h1de35cc_4, defaults/osx-64::ca-certificates-2020.1.1-0, defaults/osx-64::certifi-2019.11.28-py37_0
  - defaults/osx-64::ca-certificates-2020.1.1-0, defaults/osx-64::certifi-2019.11.28-py37_0, defaults/osx-64::openssl-1.1.1d-h1de35cc_4
  - anaconda/osx-64::certifi-2019.11.28-py37_0, anaconda/osx-64::openssl-1.1.1d-h1de35cc_4, defaults/o

## Inputing Foursquare credentials

In [3]:
CLIENT_ID = 'N5SWU5EU5F0X0GM2D5KNOG52NTQC4IVKO2GM10RJQA2BMU3H' # your Foursquare ID
CLIENT_SECRET = 'CVSIFR0EDKGMQCQJYI02BY05SY220FWCO41IIOH3UGGWNDRD' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: N5SWU5EU5F0X0GM2D5KNOG52NTQC4IVKO2GM10RJQA2BMU3H
CLIENT_SECRET:CVSIFR0EDKGMQCQJYI02BY05SY220FWCO41IIOH3UGGWNDRD


## Loading NewYork Data

In [4]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('NewYork Data downloaded!')

NewYork Data downloaded!


In [5]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [6]:
neighborhoods_data = newyork_data['features']
neighborhoods_data[0]
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
neighborhoods.head()

manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
manhattan_data.head()


Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


## Visualizing NewYork Neighborhood

In [7]:
address = 'Manhattan, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 40.7896239, -73.9598939.


In [8]:
# create map of Manhattan using latitude and longitude values
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_manhattan)  
    
map_manhattan

## Downloading NewYork venue categories

In [9]:
manhattan_data.shape

(40, 4)

In [10]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [11]:
manhattan_venues = getNearbyVenues(names=manhattan_data['Neighborhood'],
                                   latitudes=manhattan_data['Latitude'],
                                   longitudes=manhattan_data['Longitude']
                                  )
manhattan_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,40.876551,-73.91066,Bikram Yoga,40.876844,-73.906204,Yoga Studio
2,Marble Hill,40.876551,-73.91066,Tibbett Diner,40.880404,-73.908937,Diner
3,Marble Hill,40.876551,-73.91066,Starbucks,40.877531,-73.905582,Coffee Shop
4,Marble Hill,40.876551,-73.91066,Dunkin',40.877136,-73.906666,Donut Shop


## Loading Paris Data

In [12]:
paris_data = pd.read_csv('https://raw.githubusercontent.com/Coursera-2020/Coursera_Capstone/master/ArrondissementsParis.csv', ',')
paris=pd.read_csv('https://raw.githubusercontent.com/Coursera-2020/Coursera_Capstone/master/ArrondissementsParis.csv', ',')
paris.drop(['N_SQ_AR', 'C_AR', 'C_ARINSEE', 'N_SQ_CO','SURFACE','PERIMETRE'], axis=1, inplace=True)
paris.rename(columns={'L_AR': 'Borough', 'L_AROFF': 'Neighborhood'}, inplace=True)
paris.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,15ème Ardt,Vaugirard,48.840085,2.292826
1,10ème Ardt,Entrepôt,48.87613,2.360728
2,16ème Ardt,Passy,48.860392,2.261971
3,9ème Ardt,Opéra,48.877164,2.337458
4,19ème Ardt,Buttes-Chaumont,48.887076,2.384821


## Visualizing Paris Neighborhood

In [13]:
address = 'Paris'

geolocator = Nominatim(user_agent="Paris_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Paris are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Paris are 48.8566969, 2.3514616.


In [14]:
# create map of Manhattan using latitude and longitude values
map_paris = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(paris['Latitude'], paris['Longitude'], paris['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_paris)  
    
map_paris

## Downloading Paris venue categories

In [15]:
paris_venues = getNearbyVenues(names=paris['Neighborhood'],
                                   latitudes=paris['Latitude'],
                                   longitudes=paris['Longitude']
                                  )
paris_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Vaugirard,48.840085,2.292826,Indian Villa,48.841116,2.291621,Indian Restaurant
1,Vaugirard,48.840085,2.292826,Le Grand Venise,48.838276,2.294484,Italian Restaurant
2,Vaugirard,48.840085,2.292826,La Table Libanaise,48.841766,2.288607,Lebanese Restaurant
3,Vaugirard,48.840085,2.292826,AlKaram,48.838379,2.297156,Lebanese Restaurant
4,Vaugirard,48.840085,2.292826,CrossFit Lutèce,48.840888,2.292199,Gym


## Preparing categories

### One hot encoding of categories

In [16]:
# one hot encoding
manhattan_onehot = pd.get_dummies(manhattan_venues[['Venue Category']], prefix="", prefix_sep="")
paris_onehot = pd.get_dummies(paris_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
paris_onehot['Neighborhood'] = paris_venues['Neighborhood'] 
manhattan_onehot['Neighborhood'] = manhattan_venues['Neighborhood'] 


### Droping not mutual categories

In [17]:
for col in paris_onehot.columns: 
    if col not in manhattan_onehot.columns:
        paris_onehot.drop([col], axis=1, inplace=True)

for col in manhattan_onehot.columns:    
    if col not in paris_onehot.columns:
        manhattan_onehot.drop([col], axis=1, inplace=True)

## Same categories and same order

In [18]:
paris_onehot.columns

Index(['Afghan Restaurant', 'African Restaurant', 'American Restaurant',
       'Antique Shop', 'Argentinian Restaurant', 'Art Gallery', 'Art Museum',
       'Arts & Crafts Store', 'Asian Restaurant', 'Athletics & Sports',
       ...
       'Trail', 'Turkish Restaurant', 'Udon Restaurant',
       'Vegetarian / Vegan Restaurant', 'Vietnamese Restaurant', 'Waterfront',
       'Wine Bar', 'Wine Shop', 'Women's Store', 'Neighborhood'],
      dtype='object', length=162)

In [19]:
manhattan_onehot.columns

Index(['Afghan Restaurant', 'African Restaurant', 'American Restaurant',
       'Antique Shop', 'Argentinian Restaurant', 'Art Gallery', 'Art Museum',
       'Arts & Crafts Store', 'Asian Restaurant', 'Athletics & Sports',
       ...
       'Trail', 'Turkish Restaurant', 'Udon Restaurant',
       'Vegetarian / Vegan Restaurant', 'Vietnamese Restaurant', 'Waterfront',
       'Wine Bar', 'Wine Shop', 'Women's Store', 'Neighborhood'],
      dtype='object', length=162)

In [20]:
list(manhattan_onehot.columns)==list(paris_onehot.columns)

True

## Classifying Manhattan top ranked categories by neighborhoods

In [21]:
# move neighborhood column to the first column
fixed_columns = [manhattan_onehot.columns[-1]] + list(manhattan_onehot.columns[:-1])
manhattan_onehot = manhattan_onehot[fixed_columns]

manhattan_onehot.head()
manhattan_grouped = manhattan_onehot.groupby('Neighborhood').mean().reset_index()

num_top_venues = 5

for hood in manhattan_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = manhattan_grouped[manhattan_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

manhattan_grouped.shape

----Battery Park City----
         venue  freq
0         Park  0.11
1        Hotel  0.06
2    Wine Shop  0.05
3  Coffee Shop  0.05
4          Gym  0.05


----Carnegie Hill----
                 venue  freq
0          Coffee Shop  0.09
1          Pizza Place  0.05
2                 Café  0.04
3  Japanese Restaurant  0.04
4            Wine Shop  0.04


----Central Harlem----
                 venue  freq
0   African Restaurant  0.07
1       Cosmetics Shop  0.04
2  American Restaurant  0.04
3          Art Gallery  0.04
4   Seafood Restaurant  0.04


----Chelsea----
                 venue  freq
0          Art Gallery  0.16
1          Coffee Shop  0.07
2                 Café  0.04
3  American Restaurant  0.03
4       Ice Cream Shop  0.02


----Chinatown----
                venue  freq
0  Chinese Restaurant  0.08
1        Cocktail Bar  0.04
2              Bakery  0.04
3                 Spa  0.03
4  Salon / Barbershop  0.03


----Civic Center----
                 venue  freq
0  American Restaur

(40, 162)

In [22]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = manhattan_grouped['Neighborhood']

for ind in np.arange(manhattan_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(manhattan_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Battery Park City,Park,Hotel,Gym,Wine Shop,Memorial Site,Coffee Shop,Shopping Mall,Plaza,Playground,Boat or Ferry
1,Carnegie Hill,Coffee Shop,Pizza Place,Bookstore,Grocery Store,Café,Gym,Japanese Restaurant,Wine Shop,Bar,Bakery
2,Central Harlem,African Restaurant,Chinese Restaurant,American Restaurant,Cosmetics Shop,French Restaurant,Art Gallery,Seafood Restaurant,Bar,Ethiopian Restaurant,Spa
3,Chelsea,Art Gallery,Coffee Shop,Café,American Restaurant,Ice Cream Shop,Cupcake Shop,Bakery,Italian Restaurant,Juice Bar,Park
4,Chinatown,Chinese Restaurant,Cocktail Bar,Bakery,Coffee Shop,Spa,Optical Shop,Salon / Barbershop,American Restaurant,Vietnamese Restaurant,Hotel


## Applying KMeans on Manhattan dataset

In [23]:
# set number of clusters
kclusters = 5
#manhattan_grouped.to_excel(r'good.xlsx', index = False)

manhattan_grouped_clustering = manhattan_grouped.drop('Neighborhood', 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=120).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 1, 3, 1, 3, 1, 1, 3, 3, 1], dtype=int32)

In [24]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

manhattan_merged = manhattan_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
manhattan_merged = manhattan_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

In [25]:
manhattan_merged.groupby('Cluster Labels').count() # check the last columns!

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,2,2,2,2,2,2,2,2,2,2,2,2,2,2
1,22,22,22,22,22,22,22,22,22,22,22,22,22,22
2,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,13,13,13,13,13,13,13,13,13,13,13,13,13,13
4,2,2,2,2,2,2,2,2,2,2,2,2,2,2


## Visualizing the results on Manhattan neighbordhood

In [26]:
# create map
map_clusters = map_manhattan
#folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Cluster 0

In [27]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 0, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Marble Hill,Sandwich Place,Coffee Shop,Gym,Ice Cream Shop,Miscellaneous Shop,Seafood Restaurant,Steakhouse,Donut Shop,Diner,Department Store
26,Morningside Heights,Park,Bookstore,American Restaurant,Coffee Shop,Deli / Bodega,Sandwich Place,Burger Joint,Pizza Place,Mediterranean Restaurant,Mexican Restaurant


## Cluster 1

In [28]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 1, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Upper East Side,Italian Restaurant,Bakery,Gym / Fitness Center,Juice Bar,Spa,Hotel,Coffee Shop,Wine Shop,Exhibit,Mexican Restaurant
9,Yorkville,Italian Restaurant,Coffee Shop,Gym,Deli / Bodega,Bar,Sushi Restaurant,Diner,Wine Shop,Mexican Restaurant,Japanese Restaurant
10,Lenox Hill,Italian Restaurant,Coffee Shop,Pizza Place,Cocktail Bar,Sushi Restaurant,Café,Gym,Gym / Fitness Center,Burger Joint,Sporting Goods Shop
13,Lincoln Square,Café,Plaza,Italian Restaurant,Gym / Fitness Center,Theater,Performing Arts Venue,Concert Hall,Indie Movie Theater,Wine Shop,American Restaurant
14,Clinton,Theater,Coffee Shop,Gym / Fitness Center,Hotel,Wine Shop,Gym,American Restaurant,Pizza Place,Sandwich Place,Spa
15,Midtown,Coffee Shop,Hotel,Theater,Pizza Place,Clothing Store,Bakery,Sandwich Place,Café,Sporting Goods Shop,Steakhouse
16,Murray Hill,Hotel,Coffee Shop,Sandwich Place,Pizza Place,Gym / Fitness Center,Steakhouse,Chinese Restaurant,Jewish Restaurant,Burger Joint,Mediterranean Restaurant
17,Chelsea,Art Gallery,Coffee Shop,Café,American Restaurant,Ice Cream Shop,Cupcake Shop,Bakery,Italian Restaurant,Juice Bar,Park
18,Greenwich Village,Italian Restaurant,Coffee Shop,Gym,Bakery,Ice Cream Shop,Vietnamese Restaurant,Pizza Place,Sandwich Place,Indian Restaurant,Comedy Club
21,Tribeca,Park,Italian Restaurant,Spa,Café,Wine Bar,American Restaurant,Greek Restaurant,Playground,Steakhouse,Hotel


## Cluster 2

In [29]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 2, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,Stuyvesant Town,Park,Fountain,Farmers Market,Cocktail Bar,Harbor / Marina,Gym / Fitness Center,Boat or Ferry,Bistro,Bar,Women's Store


## Cluster 3

In [30]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 3, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Chinatown,Chinese Restaurant,Cocktail Bar,Bakery,Coffee Shop,Spa,Optical Shop,Salon / Barbershop,American Restaurant,Vietnamese Restaurant,Hotel
2,Washington Heights,Café,Bakery,Chinese Restaurant,Mobile Phone Shop,Women's Store,Tapas Restaurant,Spanish Restaurant,Wine Shop,Supermarket,Mexican Restaurant
3,Inwood,Mexican Restaurant,Café,Restaurant,Pizza Place,Lounge,Bakery,Deli / Bodega,Wine Bar,American Restaurant,Spanish Restaurant
4,Hamilton Heights,Pizza Place,Deli / Bodega,Coffee Shop,Café,Mexican Restaurant,Cocktail Bar,Sushi Restaurant,Liquor Store,Bakery,Sandwich Place
5,Manhattanville,Coffee Shop,Chinese Restaurant,Seafood Restaurant,Park,Mexican Restaurant,Deli / Bodega,Italian Restaurant,Spanish Restaurant,Boutique,Lounge
6,Central Harlem,African Restaurant,Chinese Restaurant,American Restaurant,Cosmetics Shop,French Restaurant,Art Gallery,Seafood Restaurant,Bar,Ethiopian Restaurant,Spa
7,East Harlem,Mexican Restaurant,Bakery,Thai Restaurant,Latin American Restaurant,Deli / Bodega,Spa,Park,Restaurant,Café,Pharmacy
12,Upper West Side,Bakery,Coffee Shop,Italian Restaurant,Dessert Shop,Wine Bar,American Restaurant,Vegetarian / Vegan Restaurant,Seafood Restaurant,Pizza Place,Mediterranean Restaurant
19,East Village,Pizza Place,Cocktail Bar,Coffee Shop,Ice Cream Shop,Japanese Restaurant,Bar,Juice Bar,Mexican Restaurant,Ramen Restaurant,Vietnamese Restaurant
20,Lower East Side,Chinese Restaurant,Pharmacy,Art Gallery,Café,Cocktail Bar,French Restaurant,Performing Arts Venue,Park,Mediterranean Restaurant,Latin American Restaurant


## Cluster 4

In [31]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 4, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,Roosevelt Island,Park,Playground,Food & Drink Shop,Pizza Place,Plaza,Cosmetics Shop,Deli / Bodega,Sandwich Place,Monument / Landmark,Bubble Tea Shop
28,Battery Park City,Park,Hotel,Gym,Wine Shop,Memorial Site,Coffee Shop,Shopping Mall,Plaza,Playground,Boat or Ferry


## Classifying Paris top ranked categories by neighborhoods

In [32]:
fixed_columns = [paris_onehot.columns[-1]] + list(paris_onehot.columns[:-1])
paris_onehot = paris_onehot[fixed_columns]

paris_onehot.head()
paris_grouped = paris_onehot.groupby('Neighborhood').mean().reset_index()

num_top_venues = 5

for hood in paris_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = paris_grouped[paris_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

paris_grouped.shape

----Batignolles-Monceau----
                 venue  freq
0    French Restaurant  0.17
1                Hotel  0.17
2   Italian Restaurant  0.09
3  Japanese Restaurant  0.07
4               Bistro  0.05


----Bourse----
               venue  freq
0  French Restaurant  0.13
1           Wine Bar  0.06
2       Cocktail Bar  0.06
3              Hotel  0.05
4             Bistro  0.03


----Buttes-Chaumont----
                venue  freq
0   French Restaurant  0.11
1                 Bar  0.09
2              Bistro  0.05
3  Seafood Restaurant  0.05
4         Supermarket  0.05


----Buttes-Montmartre----
               venue  freq
0                Bar  0.18
1  French Restaurant  0.11
2  Convenience Store  0.05
3        Pizza Place  0.05
4        Coffee Shop  0.05


----Entrepôt----
               venue  freq
0  French Restaurant  0.13
1        Coffee Shop  0.05
2              Hotel  0.05
3               Café  0.04
4             Bistro  0.04


----Gobelins----
                   venue  freq
0  V

(20, 162)

In [33]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = paris_grouped['Neighborhood']

for ind in np.arange(paris_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(paris_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Batignolles-Monceau,French Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Bakery,Bistro,Plaza,Café,Bar,Restaurant
1,Bourse,French Restaurant,Wine Bar,Cocktail Bar,Hotel,Bakery,Bistro,Creperie,Italian Restaurant,Women's Store,Salad Place
2,Buttes-Chaumont,French Restaurant,Bar,Bistro,Beer Bar,Seafood Restaurant,Hotel,Supermarket,Pizza Place,Spa,Italian Restaurant
3,Buttes-Montmartre,Bar,French Restaurant,Restaurant,Hotel,Pizza Place,Coffee Shop,Convenience Store,Seafood Restaurant,Fast Food Restaurant,Beer Store
4,Entrepôt,French Restaurant,Coffee Shop,Hotel,Café,Indian Restaurant,Bistro,Pizza Place,Japanese Restaurant,African Restaurant,Wine Shop


## Finding Paris - Manhattan cluster mapping

In [34]:
paris_grouped_clustering = paris_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeansparis = kmeans.predict(paris_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeansparis

array([1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 4, 3, 4, 1, 1, 1],
      dtype=int32)

In [35]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeansparis)

paris_merged = paris

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
paris_merged = paris_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

## Final Results

In [36]:
neighborhoods_venues_sorted

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,Batignolles-Monceau,French Restaurant,Hotel,Italian Restaurant,Japanese Restaurant,Bakery,Bistro,Plaza,Café,Bar,Restaurant
1,1,Bourse,French Restaurant,Wine Bar,Cocktail Bar,Hotel,Bakery,Bistro,Creperie,Italian Restaurant,Women's Store,Salad Place
2,1,Buttes-Chaumont,French Restaurant,Bar,Bistro,Beer Bar,Seafood Restaurant,Hotel,Supermarket,Pizza Place,Spa,Italian Restaurant
3,1,Buttes-Montmartre,Bar,French Restaurant,Restaurant,Hotel,Pizza Place,Coffee Shop,Convenience Store,Seafood Restaurant,Fast Food Restaurant,Beer Store
4,1,Entrepôt,French Restaurant,Coffee Shop,Hotel,Café,Indian Restaurant,Bistro,Pizza Place,Japanese Restaurant,African Restaurant,Wine Shop
5,3,Gobelins,Vietnamese Restaurant,Asian Restaurant,Chinese Restaurant,Thai Restaurant,French Restaurant,Juice Bar,Japanese Restaurant,Bus Stop,Sandwich Place,Cambodian Restaurant
6,1,Hôtel-de-Ville,French Restaurant,Ice Cream Shop,Clothing Store,Hotel,Wine Bar,Plaza,Garden,Tea Room,Pedestrian Plaza,Italian Restaurant
7,1,Louvre,French Restaurant,Japanese Restaurant,Plaza,Hotel,Italian Restaurant,Art Museum,Garden,Coffee Shop,Historic Site,Udon Restaurant
8,1,Luxembourg,French Restaurant,Wine Bar,Bistro,Fountain,Plaza,Cocktail Bar,Ice Cream Shop,Deli / Bodega,Café,Pub
9,1,Ménilmontant,French Restaurant,Bakery,Bistro,Japanese Restaurant,Italian Restaurant,Plaza,Bus Stop,Sushi Restaurant,Bar,Café


## Conclusion

Reuilly - 12e arrondissements (neighborhood) fits best with places in Manhattan cluster 2.

# Cheers !

In [37]:
neighborhoods_venues_sorted.groupby(["Cluster Labels"]).groups


{1: Int64Index([0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 17, 18, 19], dtype='int64'),
 3: Int64Index([5, 15], dtype='int64'),
 4: Int64Index([14, 16], dtype='int64')}