<h1>Holiday Destination Recommender System</h1>

<h2>Import libraries</h2>

In [2]:
from bs4 import BeautifulSoup #to extract wikipedia tables

import numpy as np #library to handle data in a vectorized manner

import pandas as pd #library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json #library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from math import sqrt #Math functions, we'll only need the sqrt function so let's import only that

import matplotlib.pyplot as plt
%matplotlib inline

import re #to search specific strings

from sklearn.cluster import KMeans #import k-means from clustering stage

!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
geopy                     1.18.1                     py_0    conda-forge
Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge
Libraries imported.


<h2>Get a list of frequently visited cities from Wikipedia, obtain latitude and longitude of cites and show them on a map </h2>

In [3]:
#Extract the table from the website
url = "https://en.wikipedia.org/wiki/List_of_cities_by_international_visitors"
res = requests.get(url).text
soup = BeautifulSoup(res,'lxml')
table = soup.find('table', class_='wikitable')
#Extract the elements from the table
city = []

for row in table.findAll("tr"):
    cells = row.findAll("td")
    if len(cells) == 8:
        city.append(cells[2].find(text=True).rstrip('\n'))
        
#Write the elements in a dataframe
df = pd.DataFrame()
df['City'] = city
df.head()

Unnamed: 0,City
0,Hong Kong
1,Bangkok
2,London
3,Singapore
4,Macau


In [4]:
#Find latitude and longitude of cities
df['Latitude'] = ""
df['Longitude'] = ""
geolocator = Nominatim(user_agent="specify_your_app_name_here")
for i in range(len(df['City'])):
    location = geolocator.geocode(df['City'][i])
    df['Latitude'][i] = location.latitude
    df['Longitude'][i] = location.longitude
df.head()

Unnamed: 0,City,Latitude,Longitude
0,Hong Kong,22.2793,114.163
1,Bangkok,13.7539,100.816
2,London,51.5073,-0.127647
3,Singapore,1.29048,103.852
4,Macau,22.1899,113.538


In [5]:
#Create world map
#Let's start with the average lat and long in data set
latitude = df['Latitude'].mean()
longitude = df['Longitude'].mean()
map = folium.Map(location=[latitude, longitude], zoom_start=1, width=800,height=400)

#Add markers to map
for lat, lng, city in zip(df['Latitude'], df['Longitude'], df['City']):
    label = '{}'.format(city)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=1,
        parse_html=False).add_to(map)  
    
map

<h2>Obtain nearby venues for each city using the Foursquare API and prepare the data for analysis</h2>

In [7]:
#Define function to search venues
def getNearbyVenues(names, latitudes, longitudes):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],
            re.search('/categories_v2/(.+?)/', v['venue']['categories'][0]['icon']['prefix']).group(1)) for v in results]) #Retrieve broad categories

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'City Latitude', 
                  'City Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [8]:
#The following cell includes my Foursquare credentials and is therefore hidden

In [9]:
# The code was removed by Watson Studio for sharing.

In [10]:
#First define input
#Credentials are in a hidden cell
VERSION = '20180605' # Foursquare API version
LIMIT = 50 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

#Then call the function
City_venues = getNearbyVenues(names=df['City'],latitudes=df['Latitude'],longitudes=df['Longitude'])
City_venues.head()

Unnamed: 0,City,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Hong Kong,22.279328,114.162813,Hong Kong Park (香港公園),22.2777,114.161854,parks_outdoors
1,Hong Kong,22.279328,114.162813,Pure Fitness,22.278475,114.161363,building
2,Hong Kong,22.279328,114.162813,Ruth's Chris Steak House (茹絲葵牛排餐廳),22.279474,114.163427,food
3,Hong Kong,22.279328,114.162813,Hong Kong Park Aviary (香港公園觀鳥園),22.27714,114.161399,arts_entertainment
4,Hong Kong,22.279328,114.162813,Pure Fitness,22.279925,114.163022,building


In [11]:
#The following piece of code is used to find how many venues of different types we have for each city

City_onehot = pd.get_dummies(City_venues[['Venue Category']], prefix="", prefix_sep="") # one hot encoding

City_onehot['City'] = City_venues['City'] # add city column back to dataframe

fixed_columns = [City_onehot.columns[-1]] + list(City_onehot.columns[:-1]) # move city column to the first column
City_onehot = City_onehot[fixed_columns]

City_grouped = City_onehot.groupby('City').sum().reset_index() #Let's group rows by city and by taking the sum of occurrence of each category
City_grouped['sum_venues'] = City_grouped.sum(axis=1)
City_grouped = City_grouped.loc[City_grouped['sum_venues'] >= 50] #Exclude city where Foursquare might not be used a lot
City_grouped = City_grouped.drop(['sum_venues','building','education','event'], axis  = 1).reset_index(drop = True) #Drop building, education and event due to low occurence
City_grouped.head()

Unnamed: 0,City,arts_entertainment,food,nightlife,parks_outdoors,shops,travel
0,Amman,2,38,5,2,2,1
1,Amsterdam,5,24,11,1,9,0
2,Athens,6,27,13,1,3,0
3,Auckland,8,30,0,2,7,2
4,Barcelona,4,26,6,7,4,3


In [12]:
#The following piece of code is used to transform "occurence valiables" into quintile. This method of standardization has been chosen with regard to the ultimate goal of this project 
#which is to build a recommender system based on attributes of cities
City_grouped_standard = pd.DataFrame()
City_grouped_standard['arts_entertainment'] = pd.qcut(City_grouped['arts_entertainment'], 5, labels = [1,2,3,4,5])
City_grouped_standard['food'] = pd.qcut(City_grouped['food'], 5, labels = [1,2,3,4,5])
City_grouped_standard['nightlife'] = pd.qcut(City_grouped['nightlife'], 5, labels = [1,2,3,4,5])
City_grouped_standard['parks_outdoors'] = pd.qcut(City_grouped['parks_outdoors'], 5, labels = [1,2,3,4,5])
City_grouped_standard['shops'] = pd.qcut(City_grouped['shops'], 5, labels = [1,2,3,4,5])
City_grouped_standard['travel'] = pd.qcut(City_grouped['travel'], 5, labels = [1,2,3,4,5])

City_grouped_standard['City'] = City_grouped['City'] 
City_grouped_standard = City_grouped_standard[['City','arts_entertainment','food','nightlife','parks_outdoors','shops','travel']] #re-arrange order of columns
City_grouped_standard.head()

Unnamed: 0,City,arts_entertainment,food,nightlife,parks_outdoors,shops,travel
0,Amman,1,5,4,2,1,1
1,Amsterdam,3,2,5,1,4,1
2,Athens,4,4,5,1,1,1
3,Auckland,5,4,1,2,3,2
4,Barcelona,2,3,4,5,1,3


<h2>Preliminary analysis: clustering the cities</h2>

In [15]:
#Run k-means to cluster the neighborhoods

kclusters = 3 # set number of clusters

X = City_grouped_standard.drop(['City'], axis  = 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(X) # run k-means clustering

kmeans.labels_ # check cluster labels generated for each row in the dataframe

array([0, 1, 0, 2, 2, 0, 1, 0, 0, 1, 1, 0, 1, 2, 0, 1, 1, 2, 1, 2, 0, 1, 1,
       0, 0, 0, 0, 2, 1, 1, 1, 2, 0, 1, 0, 0, 1, 1, 2, 1, 1, 1, 0, 0, 2, 0,
       1, 2, 1, 2, 2, 2, 1, 0, 0, 2, 2, 0, 1, 1, 0, 2, 0, 0, 0, 0, 2, 2, 1], dtype=int32)

In [16]:
City_grouped['Cluster Labels'] = kmeans.labels_ #Add clustering labels

City_merged = df

City_merged = City_merged.join(City_grouped.set_index('City'), on='City') #Merge City_grouped with df to add latitude/longitude for each city

City_merged = City_merged[np.isfinite(City_merged['Cluster Labels'])] #Remove rows where Cluster is NaN

City_merged["Cluster Labels"] = City_merged["Cluster Labels"].astype(int) #Cast cluster as int

City_merged.groupby('Cluster Labels').mean() #Check the centroid values by averaging the features in each cluster

Unnamed: 0_level_0,arts_entertainment,food,nightlife,parks_outdoors,shops,travel
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.192308,31.192308,4.192308,2.384615,4.846154,3.192308
1,3.76,22.44,4.04,2.36,11.6,4.28
2,8.5,20.777778,4.388889,5.111111,4.666667,4.388889


In [17]:
#Finally, let's visualize the resulting clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=1, width=800,height=400)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(City_merged['Latitude'], City_merged['Longitude'], City_merged['City'], City_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=2,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

<h2>Main contribution: development of a content-based recommendation system</h2>

In [19]:
#Let's begin by creating an input user to recommend cities to:
userInput = [
            {'City':'Amsterdam', 'rating':5},
            {'City':'London', 'rating':4},
            {'City':'New York City', 'rating':2},
            {'City':'Paris', 'rating':4},
            {'City':'Toronto', 'rating':1}
         ] 
inputCities = pd.DataFrame(userInput)
inputCities

Unnamed: 0,City,rating
0,Amsterdam,5
1,London,4
2,New York City,2
3,Paris,4
4,Toronto,1


In [21]:
#We're going to start by learning the input's preferences, so let's get the subset of cities that the input has visited
CitiesWithVenues_df = City_grouped_standard
userCities = CitiesWithVenues_df[CitiesWithVenues_df['City'].isin(inputCities['City'].tolist())]
userCities

Unnamed: 0,City,arts_entertainment,food,nightlife,parks_outdoors,shops,travel
1,Amsterdam,3,2,5,1,4,1
31,London,5,1,2,5,2,4
42,New York City,1,5,2,1,5,1
46,Paris,2,2,4,5,4,1
62,Toronto,4,4,1,3,2,1


In [22]:
#We'll only need the actual venue table, so let's clean this up a bit by resetting the index and dropping the city colu;m

userCities = userCities.reset_index(drop=True) #Resetting the index to avoid future issues

userVenuesTable = userCities.drop('City', 1) #Dropping unnecessary issues due to save memory and to avoid issues
userVenuesTable

Unnamed: 0,arts_entertainment,food,nightlife,parks_outdoors,shops,travel
0,3,2,5,1,4,1
1,5,1,2,5,2,4
2,1,5,2,1,5,1
3,2,2,4,5,4,1
4,4,4,1,3,2,1


In [23]:
#Now we're ready to start learning the input's preferences! 
#To do this, we're going to turn each venue category into weights. We can do this by using the input's reviews and multiplying them into the input's venue table and then summing up the resulting table by column. 
#This operation is actually a dot product between a matrix and a vector, so we can simply accomplish by calling Pandas's "dot" function.

userProfile = userVenuesTable.transpose().dot(inputCities['rating']) #Dot produt to get weights
userProfile

arts_entertainment    49
food                  36
nightlife             54
parks_outdoors        50
shops                 56
travel                28
dtype: object

In [24]:
#Now, we have the weights for every of the user's preferences. This is known as the User Profile. Using this, we can recommend cities that satisfy the user's preferences.
#Let's format the data to obtain a Venue Table
VenueTable = City_grouped_standard
VenueTable = VenueTable.set_index('City')
VenueTable.head()

Unnamed: 0_level_0,arts_entertainment,food,nightlife,parks_outdoors,shops,travel
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Amman,1,5,4,2,1,1
Amsterdam,3,2,5,1,4,1
Athens,4,4,5,1,1,1
Auckland,5,4,1,2,3,2
Barcelona,2,3,4,5,1,3


In [25]:
#Multiply the genres by the weights and then take the weighted average
Estimated_Rating = (np.dot(VenueTable,userProfile))/(userProfile.sum())
Estimated_Rating = pd.DataFrame(Estimated_Rating)
RecommendationTable_df = pd.DataFrame()
RecommendationTable_df = City_grouped_standard
RecommendationTable_df['Estimated_Ratings'] = Estimated_Rating[0]
RecommendationTable_df
#Sort our recommendations in descending order
RecommendationTable_df = RecommendationTable_df.sort_values(by = ['Estimated_Ratings'],ascending=False)
#Top10Recommendations
RecommendationTable_df.head(10)

Unnamed: 0,City,arts_entertainment,food,nightlife,parks_outdoors,shops,travel,Estimated_Ratings
67,Washington D.C.,4,1,4,5,3,4,3.58242
51,Saint Petersburg,5,1,2,5,4,3,3.46886
39,Milan,2,1,4,4,5,4,3.45055
56,Sofia,5,1,5,5,2,1,3.44689
66,Warsaw,5,2,4,3,3,3,3.42491
46,Paris,2,2,4,5,4,1,3.25275
59,Taipei,1,2,4,3,5,4,3.21978
38,Mexico City,5,1,3,3,3,4,3.1978
15,Dublin,4,2,5,1,4,2,3.17949
31,London,5,1,2,5,2,4,3.16117
