# Capstone project

## Exploration of cities around the world

Let's print a map showing the cities we will use in our analysis

In [1]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import matplotlib.cm as cm
import matplotlib.colors as colors
import pandas as pd
import numpy as np
import folium

cities = ['Toronto, Canada', 'Paris, France', 'Manhattan, NY', 'London, England', 
          'Lisbon, Portugal', 'Sydney, Australia', 'Dublin, Ireland', 'Nice, France', 
          'Moscow, Russia', 'Rome, Italy', 'Madrid, Spain', 'Le Cap, South Africa',
          'New Delhi, India', 'Tokyo, Japan', 'Hong-Kong, China', 'Praia, Cape-Verde', 'Rio, Brazil', 'Bangalore, India',
          'Mexico City, Mexico', 'Berlin, Germany']

locations = {}
geolocator = Nominatim(user_agent="world_explorer")

for city in cities:
    location = geolocator.geocode(city)
    latitude = location.latitude
    longitude = location.longitude
    city_position = (latitude,longitude)
    locations[city] = city_position
    


# create map of the world using 0 for latitude and longitude values
world_map = folium.Map(location=[0, 0], zoom_start=2)

for city_name, latitude_longitude in locations.items():
    label = city_name
    label = folium.Popup(label, parse_html=True)
    latitude =latitude_longitude[0]
    longitude =latitude_longitude[1]
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(world_map) 
    
world_map

Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.
#### Define Foursquare Credentials and Version

In [2]:
CLIENT_ID = 'AR2LDST4MHLM5KFISIJGCISGINIYHNDZZ5VVP3Q14FMAWLBT' # your Foursquare ID
CLIENT_SECRET = 'DINBAUEQMLLBZQP1V1JSY3V4ZSWFS3FRODTJROZZZAHTRMAZ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 10 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: AR2LDST4MHLM5KFISIJGCISGINIYHNDZZ5VVP3Q14FMAWLBT
CLIENT_SECRET:DINBAUEQMLLBZQP1V1JSY3V4ZSWFS3FRODTJROZZZAHTRMAZ


### Explore venues in each city
#### Let's create a function to get all the most common venues for each city

In [3]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import requests # library to handle requests

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        # type your answer here
        LIMIT = 50
        radius = 500
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'City Latitude', 
                  'City Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return nearby_venues

city_names = list(locations.keys())
latitudes = [pos[0] for pos in list(locations.values())]
longitudes = [pos[1] for pos in list(locations.values())]

data = {'City':city_names,'Latitudes':latitudes,'Longitudes':longitudes}
cities_data = pd.DataFrame(data=data)
positions = list(locations.values())

city_venues = getNearbyVenues(names=city_names,
                              latitudes=latitudes,
                              longitudes=longitudes)

Toronto, Canada
Paris, France
Manhattan, NY
London, England
Lisbon, Portugal
Sydney, Australia
Dublin, Ireland
Nice, France
Moscow, Russia
Rome, Italy
Madrid, Spain
Le Cap, South Africa
New Delhi, India
Tokyo, Japan
Hong-Kong, China
Praia, Cape-Verde
Rio, Brazil
Bangalore, India
Mexico City, Mexico
Berlin, Germany


In [4]:
# one hot encoding
cities_onehot = pd.get_dummies(city_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
cities_onehot['City'] = city_venues['City'] 

# move neighborhood column to the first column
fixed_columns = [cities_onehot.columns[-1]] + list(cities_onehot.columns[:-1])
cities_onehot = cities_onehot[fixed_columns]

cities_onehot.head()# function that extracts the category of the venue

Unnamed: 0,City,Accessories Store,African Restaurant,Alsatian Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Train Station,Ukrainian Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Vineyard,Waterfront,Wine Bar,Women's Store,Yoga Studio,Zoo
0,"Toronto, Canada",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Toronto, Canada",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Toronto, Canada",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Toronto, Canada",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Toronto, Canada",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
cities_grouped = cities_onehot.groupby('City').mean().reset_index()
cities_grouped

Unnamed: 0,City,Accessories Store,African Restaurant,Alsatian Restaurant,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Train Station,Ukrainian Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Vineyard,Waterfront,Wine Bar,Women's Store,Yoga Studio,Zoo
0,"Bangalore, India",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
1,"Berlin, Germany",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.06,0.0,0.0,0.0
2,"Dublin, Ireland",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0
3,"Hong-Kong, China",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02
4,"Le Cap, South Africa",0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.02,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Lisbon, Portugal",0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,...,0.0,0.0,0.02,0.0,0.0,0.02,0.02,0.0,0.0,0.0
6,"London, England",0.0,0.0,0.0,0.0,0.06,0.04,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0
7,"Madrid, Spain",0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Manhattan, NY",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Mexico City, Mexico",0.0,0.0,0.0,0.0,0.0,0.04,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
num_top_venues = 20

for hood in cities_grouped['City']:
    print("----"+hood+"----")
    temp = cities_grouped[cities_grouped['City'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head())
    print('\n')

----Bangalore, India----
              venue  freq
0  Capitol Building  0.25
1             Hotel  0.25
2          Vineyard  0.25
3              Park  0.25
4            Palace  0.00


----Berlin, Germany----
            venue  freq
0           Hotel  0.12
1        Wine Bar  0.06
2      Restaurant  0.04
3  Clothing Store  0.04
4           Plaza  0.04


----Dublin, Ireland----
            venue  freq
0     Coffee Shop  0.12
1             Pub  0.12
2  Clothing Store  0.08
3       Bookstore  0.06
4            Café  0.06


----Hong-Kong, China----
        venue  freq
0       Hotel  0.12
1        Café  0.10
2        Park  0.06
3  Steakhouse  0.06
4   Bookstore  0.04


----Le Cap, South Africa----
                venue  freq
0                Café  0.08
1  Italian Restaurant  0.08
2                 Pub  0.06
3              Museum  0.06
4          Restaurant  0.04


----Lisbon, Portugal----
                   venue  freq
0  Portuguese Restaurant  0.18
1                    Bar  0.10
2            

#### Let's put that into a _pandas_ dataframe
First, let's write a function to sort the venues in descending order.

In [7]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each city.

In [8]:
num_top_venues = 20

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['City']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
cities_venues_sorted = pd.DataFrame(columns=columns)
cities_venues_sorted['City'] = cities_grouped['City']

for ind in np.arange(cities_grouped.shape[0]):
    cities_venues_sorted.iloc[ind, 1:] = return_most_common_venues(cities_grouped.iloc[ind, :], num_top_venues)

cities_venues_sorted.head()

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,...,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue
0,"Bangalore, India",Hotel,Vineyard,Park,Capitol Building,Zoo,Diner,Falafel Restaurant,Exhibit,Event Space,...,Electronics Store,Donut Shop,Dog Run,Discount Store,Department Store,Dim Sum Restaurant,Dessert Shop,Field,Deli / Bodega,Currywurst Joint
1,"Berlin, Germany",Hotel,Wine Bar,Plaza,Coffee Shop,Clothing Store,Opera House,Cosmetics Shop,Concert Hall,Restaurant,...,Café,Chocolate Shop,Gourmet Shop,Perfume Shop,Pub,Cocktail Bar,Church,Optical Shop,Outdoor Sculpture,Sandwich Place
2,"Dublin, Ireland",Coffee Shop,Pub,Clothing Store,Café,Bookstore,Discount Store,Hotel,Theater,Donut Shop,...,Sports Bar,Market,Outdoor Sculpture,Sushi Restaurant,Post Office,Brewery,Restaurant,Scenic Lookout,Supermarket,Taco Place
3,"Hong-Kong, China",Hotel,Café,Steakhouse,Park,Furniture / Home Store,Italian Restaurant,Lounge,Dim Sum Restaurant,Cantonese Restaurant,...,Coffee Shop,Zoo,Chinese Restaurant,Shopping Mall,Burger Joint,Plaza,Cupcake Shop,Cocktail Bar,Deli / Bodega,Non-Profit
4,"Le Cap, South Africa",Café,Italian Restaurant,Museum,Pub,Restaurant,Hotel,Pizza Place,Cuban Restaurant,Burger Joint,...,Photography Lab,Coffee Shop,Park,Nightclub,Ice Cream Shop,Diner,Movie Theater,Molecular Gastronomy Restaurant,Food,Laundry Service


#### Let's put that into a _pandas_ dataframe

First, let's write a function to sort the venues in descending order.

In [9]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.


## Cluster Cities by their 20 most commong venues
Run _k_-means to cluster the cities into 5 clusters.

In [10]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

cities_grouped_clustering = cities_grouped.drop('City', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cities_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 20 venues for each city.

In [11]:
# add clustering labels
cities_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

d = { 'latitudes':latitudes}
cities_merged = cities_data

# merge cities_grouped with cities_data to add latitude/longitude for each city
cities_merged = cities_merged.join(cities_venues_sorted.set_index('City'), on='City')

cities_merged.head() # check the last columns!

Unnamed: 0,City,Latitudes,Longitudes,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,...,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue
0,"Toronto, Canada",43.653482,-79.383935,1,Clothing Store,Theater,Electronics Store,Seafood Restaurant,Plaza,New American Restaurant,...,Monument / Landmark,Bookstore,Shopping Mall,Modern European Restaurant,Concert Hall,Breakfast Spot,Bubble Tea Shop,Gym / Fitness Center,Women's Store,Department Store
1,"Paris, France",48.856697,2.351462,1,Ice Cream Shop,French Restaurant,Plaza,Gay Bar,Art Gallery,Clothing Store,...,Café,Pub,Portuguese Restaurant,Pedestrian Plaza,Coffee Shop,Department Store,Deli / Bodega,Seafood Restaurant,Dessert Shop,Miscellaneous Shop
2,"Manhattan, NY",40.789624,-73.959894,1,Baseball Field,Park,Playground,Athletics & Sports,Bus Station,Food Truck,...,Tennis Court,Dog Run,Field,Event Space,Exhibit,Donut Shop,Escape Room,Electronics Store,Zoo,Discount Store
3,"London, England",51.507322,-0.127647,1,Hotel,Art Gallery,Garden,Theater,Plaza,Art Museum,...,Boutique,Sandwich Place,Burger Joint,Restaurant,Pharmacy,Piano Bar,Spanish Restaurant,Monument / Landmark,Currywurst Joint,Church
4,"Lisbon, Portugal",38.707751,-9.136592,1,Portuguese Restaurant,Bar,Plaza,Hostel,Ice Cream Shop,Hotel,...,Candy Store,Escape Room,Electronics Store,Peruvian Restaurant,Bookstore,Restaurant,Gastropub,Beer Garden,Beer Bar,Japanese Restaurant


Finally, let's visualize the resulting clusters

In [12]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[0, 0], zoom_start=2)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cities_merged['Latitudes'], cities_merged['Longitudes'], cities_merged['City'], cities_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters