<h1>Clustering Most Visited Cities</h1>

<h4>Import libraries</h4>

In [1]:
from bs4 import BeautifulSoup #to extract wikipedia tables

from geopy.geocoders import Nominatim #to fnd latitude and longitude of cities

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

import re

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.18.1-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  12.37 MB/s
geopy-1.18.1-p 100% |################################| Time: 0:00:00  18.65 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  45.42 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  34.24 MB/s
vincent-0.4.4- 100% |###################

In [2]:
#Other imports
#Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt

import matplotlib.pyplot as plt
%matplotlib inline

<h4>Get list of most visited cities from Wikipedia</h4>

In [3]:
#Extract the table from the website
url = "https://en.wikipedia.org/wiki/List_of_cities_by_international_visitors"
res = requests.get(url).text
soup = BeautifulSoup(res,'lxml')
table = soup.find('table', class_='wikitable')
#Extract the elements from the table
city = []

for row in table.findAll("tr"):
    cells = row.findAll("td")
    if len(cells) == 8:
        city.append(cells[2].find(text=True).rstrip('\n'))
        
#Write the elements in a dataframe
df = pd.DataFrame()
df['City'] = city
df.head()

Unnamed: 0,City
0,Hong Kong
1,Bangkok
2,London
3,Singapore
4,Macau


<h4>Obtain latitude and longitude for the list of cites</h4>

In [4]:
#Find latitude and longitude of cities
df['Latitude'] = ""
df['Longitude'] = ""
geolocator = Nominatim(user_agent="specify_your_app_name_here")
for i in range(len(df['City'])):
    location = geolocator.geocode(df['City'][i])
    df['Latitude'][i] = location.latitude
    df['Longitude'][i] = location.longitude
df.head()

Unnamed: 0,City,Latitude,Longitude
0,Hong Kong,22.2793,114.163
1,Bangkok,13.7539,100.816
2,London,51.5073,-0.127647
3,Singapore,1.29048,103.852
4,Macau,22.1758,113.551


<h4>Map those cities</h4>

In [5]:
# create world map

#Let's start with the average lat and long in data set
latitude = df['Latitude'].mean()
longitude = df['Longitude'].mean()
map = folium.Map(location=[latitude, longitude], zoom_start=1, width=800,height=400)

#Add markers to map
for lat, lng, city in zip(df['Latitude'], df['Longitude'], df['City']):
    label = '{}'.format(city)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=1,
        parse_html=False).add_to(map)  
    
map

<h4>Define function to get nearby venues for all cities</h4>

In [6]:
def getNearbyVenues(names, latitudes, longitudes):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],
            re.search('/categories_v2/(.+?)/', v['venue']['categories'][0]['icon']['prefix']).group(1)) for v in results]) #Retrieve broad categories

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'City Latitude', 
                  'City Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

<h4>Run the above function for each city and create a new dataframe called City_venues</h4>

In [7]:
# The code was removed by Watson Studio for sharing.

In [8]:
#First define input
#Credentials are in a hidden cell
VERSION = '20180605' # Foursquare API version

LIMIT = 50 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

#Then call the function
City_venues = getNearbyVenues(names=df['City'],latitudes=df['Latitude'],longitudes=df['Longitude'])

City_venues.head()
                                  

Unnamed: 0,City,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Hong Kong,22.279328,114.162813,Hong Kong Park (香港公園),22.2777,114.161854,parks_outdoors
1,Hong Kong,22.279328,114.162813,Pure Fitness,22.278475,114.161363,building
2,Hong Kong,22.279328,114.162813,Ruth's Chris Steak House (茹絲葵牛排餐廳),22.279474,114.163427,food
3,Hong Kong,22.279328,114.162813,Hong Kong Park Aviary (香港公園觀鳥園),22.27714,114.161399,arts_entertainment
4,Hong Kong,22.279328,114.162813,Pure Fitness,22.279925,114.163022,building


<h4>Analyze each neighborhood</h4>

In [81]:
# one hot encoding
City_onehot = pd.get_dummies(City_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
City_onehot['City'] = City_venues['City'] 

# move neighborhood column to the first column
fixed_columns = [City_onehot.columns[-1]] + list(City_onehot.columns[:-1])
City_onehot = City_onehot[fixed_columns]



#Let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
City_grouped = City_onehot.groupby('City').sum().reset_index()
City_grouped['sum_venues'] = City_grouped.sum(axis=1)
City_grouped = City_grouped.loc[City_grouped['sum_venues'] >= 50] #Exclude city where Foursquare might not be used a lot
City_grouped = City_grouped.drop(['sum_venues','building','education','event'], axis  = 1).reset_index(drop = True)#Drop building, education and event due to low occurence
City_grouped


Unnamed: 0,City,arts_entertainment,food,nightlife,parks_outdoors,shops,travel
0,Amman,2,38,5,2,2,1
1,Amsterdam,5,24,11,1,9,0
2,Athens,6,27,13,1,3,0
3,Auckland,8,30,0,2,7,2
4,Barcelona,4,26,6,7,4,3
5,Beirut,0,32,14,0,2,1
6,Berlin,5,18,2,2,11,9
7,Bogota,11,32,1,1,3,1
8,Boston,7,29,1,4,6,2
9,Brussels,4,19,7,1,17,1


<h4>Normalizing over the standard deviation</h4>

In [205]:
from sklearn.preprocessing import StandardScaler

City_grouped_standard = pd.DataFrame()
City_grouped_standard['arts_entertainment'] = pd.qcut(City_grouped['arts_entertainment'], 5, labels = [1,2,3,4,5])
City_grouped_standard['food'] = pd.qcut(City_grouped['food'], 5, labels = [1,2,3,4,5])
City_grouped_standard['nightlife'] = pd.qcut(City_grouped['nightlife'], 5, labels = [1,2,3,4,5])
City_grouped_standard['parks_outdoors'] = pd.qcut(City_grouped['parks_outdoors'], 5, labels = [1,2,3,4,5])
City_grouped_standard['shops'] = pd.qcut(City_grouped['shops'], 5, labels = [1,2,3,4,5])
City_grouped_standard['travel'] = pd.qcut(City_grouped['travel'], 5, labels = [1,2,3,4,5])

City_grouped_standard['City'] = City_grouped['City'] 
City_grouped_standard = City_grouped_standard[['City','arts_entertainment','food','nightlife','parks_outdoors','shops','travel']] #re-arrange order of columns
City_grouped_standard.head()


Unnamed: 0,City,arts_entertainment,food,nightlife,parks_outdoors,shops,travel
0,Amman,1,5,3,2,1,1
1,Amsterdam,3,2,5,1,4,1
2,Athens,4,4,5,1,1,1
3,Auckland,5,4,1,2,3,2
4,Barcelona,2,3,4,5,1,3


<h4>Cluster Neighborhoods</h4>

In [37]:
#Run k-means to cluster the neighborhoods
# set number of clusters
kclusters = 3

X = City_grouped_standard.drop(['City'], axis  = 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(X)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 2, 1, 1, 0, 0, 2, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1], dtype=int32)

In [38]:
#Add clustering labels
City_grouped['Cluster Labels'] = kmeans.labels_

City_merged = df

#Merge City_grouped with df to add latitude/longitude for each city
City_merged = City_merged.join(City_grouped.set_index('City'), on='City')

#Remove rows where Cluster is NaN
City_merged = City_merged[np.isfinite(City_merged['Cluster Labels'])]

#Cast cluster as int
City_merged["Cluster Labels"] = City_merged["Cluster Labels"].astype(int)

#Check the centroid values by averaging the features in each cluster
City_merged.groupby('Cluster Labels').mean()

Unnamed: 0_level_0,arts_entertainment,building,education,event,food,nightlife,parks_outdoors,shops,travel
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,3.709677,0.903226,0.129032,0.0,30.16129,5.032258,1.935484,6.193548,1.935484
1,5.166667,1.583333,0.055556,0.0,21.722222,3.611111,3.944444,8.444444,5.472222
2,15.0,6.0,0.0,0.5,14.5,3.5,4.5,2.0,4.0


In [41]:
#Finally, let's visualize the resulting clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=1, width=800,height=400)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(City_merged['Latitude'], City_merged['Longitude'], City_merged['City'], City_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=2,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<h4>Content-Based recommendation system</h4>
This technique attempts to figure out what a user's favourite aspects of an item is, and then recommends items that present those aspects. In our case, we're going to try to figure out the input's favorite aspects of a city from the cities and ratings given.

In [101]:
#Let's begin by creating an input user to recommend cities to:
userInput = [
            {'City':'Amsterdam', 'rating':5},
            {'City':'London', 'rating':4},
            {'City':'New York City', 'rating':2},
            {'City':'Paris', 'rating':4},
            {'City':'Toronto', 'rating':1}
         ] 
inputCities = pd.DataFrame(userInput)
inputCities

Unnamed: 0,City,rating
0,Amsterdam,5
1,London,4
2,New York City,3
3,Paris,4
4,Toronto,1


In [102]:
#We're going to start by learning the input's preferences, so let's get the subset of cities that the input has visited from the Dataframe containing characteristics
#Filtering out the movies from the input
CitiesWithVenues_df = City_grouped_standard

userCities = CitiesWithVenues_df[CitiesWithVenues['City'].isin(inputCities['City'].tolist())]
userCities

Unnamed: 0,City,arts_entertainment,food,nightlife,parks_outdoors,shops,travel
1,Amsterdam,3,2,5,1,4,1
32,London,5,1,2,5,2,4
42,New York City,1,5,2,1,5,1
46,Paris,2,2,3,5,4,1
62,Toronto,4,4,1,3,2,1


In [103]:
#We'll only need the actual genre table, so let's clean this up a bit by resetting the index and dropping the movieId, title, genres and year columns.

#Resetting the index to avoid future issues
userCities = userCities.reset_index(drop=True)
#Dropping unnecessary issues due to save memory and to avoid issues
userVenuesTable = userCities.drop('City', 1)
userVenuesTable

Unnamed: 0,arts_entertainment,food,nightlife,parks_outdoors,shops,travel
0,3,2,5,1,4,1
1,5,1,2,5,2,4
2,1,5,2,1,5,1
3,2,2,3,5,4,1
4,4,4,1,3,2,1


In [113]:
#Now we're ready to start learning the input's preferences! 
#To do this, we're going to turn each genre into weights. We can do this by using the input's reviews and multiplying them into the input's genre table and then summing up the resulting table by column. 
#This operation is actually a dot product between a matrix and a vector, so we can simply accomplish by calling Pandas's "dot" function.

#Dot produt to get weights
userProfile = userVenuesTable.transpose().dot(inputCities['rating'])
#The user profile
userProfile

arts_entertainment    50
food                  41
nightlife             52
parks_outdoors        51
shops                 61
travel                29
dtype: object

In [206]:
#Now, we have the weights for every of the user's preferences. This is known as the User Profile. Using this, we can recommend movies that satisfy the user's preferences.
#Let's format the data to obtain a VenueTatble
VenueTable = City_grouped_standard
VenueTable = VenueTable.set_index('City')
VenueTable.head()


Unnamed: 0_level_0,arts_entertainment,food,nightlife,parks_outdoors,shops,travel
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Amman,1,5,3,2,1,1
Amsterdam,3,2,5,1,4,1
Athens,4,4,5,1,1,1
Auckland,5,4,1,2,3,2
Barcelona,2,3,4,5,1,3


In [207]:
#Multiply the genres by the weights and then take the weighted average
Estimated_Rating = (np.dot(VenueTable,userProfile))/(userProfile.sum())
Estimated_Rating = pd.DataFrame(Estimated_Rating)
RecommendationTable_df = pd.DataFrame()
RecommendationTable_df = City_grouped_standard
RecommendationTable_df['Estimated_Ratings'] = Estimated_Rating[0]
RecommendationTable_df
#Sort our recommendations in descending order
RecommendationTable_df = RecommendationTable_df.sort_values(by = ['Estimated_Ratings'],ascending=False)
#Top10Recommendations
RecommendationTable_df.head(10)

Unnamed: 0,City,arts_entertainment,food,nightlife,parks_outdoors,shops,travel,Estimated_Ratings
67,Washington D.C.,4,1,4,5,3,4,3.53169
51,Saint Petersburg,5,1,2,5,4,3,3.45423
39,Milan,2,1,4,4,5,4,3.42958
56,Sofia,5,1,5,5,2,1,3.36972
66,Warsaw,5,2,3,3,3,3,3.20775
38,Mexico City,5,1,3,3,3,4,3.16549
15,Dublin,4,2,5,1,4,2,3.15141
32,London,5,1,2,5,2,4,3.12676
48,Prague,3,1,5,2,4,3,3.11268
10,Bucharest,3,2,2,3,5,3,3.10211
