# Section 1 - Data Aquisition

In [1]:
# Import needed libraries
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import io
import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim

First we will need to get the list of financial centers and their coordinates. We will get the list of centers from the wikipedia page: https://en.wikipedia.org/wiki/Global_Financial_Centres_Index

In [2]:
# Load article, turn into soup and get the <table>s.
website_url = requests.get('https://en.wikipedia.org/wiki/Global_Financial_Centres_Index').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find_all('table', class_='multicol')

In [3]:
# read the table into a list
df=pd.read_html(str(table))
# convert list to dataframe
df=pd.DataFrame(df[0])
# clean first rows of dataframe
df.drop(df.index[0], inplace=True)

# set the dataframe headers
new_header = df.iloc[0] 
df = df[1:] 
df.columns = new_header 

# drop rows and columns we don't want
df.drop(['Change', 'Rating', 'Change'], axis='columns', inplace=True)
df.drop( df[ df['Rank'] == 'Rank' ].index , inplace=True)
df['Rank'] = pd.to_numeric(df['Rank'])
df.drop(df[df['Rank'] > 50].index, inplace=True)
df.set_index('Rank')
# replac names of Washington DC and New York City so that they retrieve the correct coordinates
df.replace('Washington, D.C.', 'Washington', inplace=True)
df.replace('New York City', 'New York', inplace=True)
df

1,Rank,Centre
2,1,New York
3,2,London
4,3,Shanghai
5,4,Tokyo
6,5,Hong Kong
7,6,Singapore
8,7,Beijing
9,8,San Francisco
10,9,Shenzhen
11,10,Zurich


In [4]:
# Get lat, long coordinates for each city
lats = []
longs = []

for city in df['Centre']:
    geolocator = Nominatim(user_agent="my_user_agent")
    loc = geolocator.geocode(city)
    latitude = loc.latitude
    lats.append(latitude)
    longitude = loc.longitude
    longs.append(longitude)



In [43]:
df['Latitude'] = lats
df['Longitude'] = longs

df.loc[df.Centre == "Beijing", "Latitude"] = 39.914736
df.loc[df.Centre == "Beijing", "Longitude"] = 116.405894

In [44]:
worldMap = folium.Map(zoom_start=10)

# add markers to map
for lat, lng, City in zip(df['Latitude'], df['Longitude'], df['Centre']):
    label = '{}'.format(City)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(worldMap) 

worldMap

In [7]:
# Load Foursquare credentials
CLIENT_ID = '1UWPXEJJPVA3N424ACSVHJTOPARDZHAA51KZYMSKXKUFE1JG' # your Foursquare ID
CLIENT_SECRET = 'FNXFZRIGHXLLHD0C4E3ILHO1RILM24LA3R4WDRJV5NIDN3VX' # your Foursquare Secret
ACCESS_TOKEN = '3SE5USOAVDLOBV3RI3DIKXG05UPNX34HQOU5PR24LDVIA5YO' # your FourSquare Access Token
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 1UWPXEJJPVA3N424ACSVHJTOPARDZHAA51KZYMSKXKUFE1JG
CLIENT_SECRET:FNXFZRIGHXLLHD0C4E3ILHO1RILM24LA3R4WDRJV5NIDN3VX


In [50]:
# Function to get the nearby venues for each City
def getNearbyVenues(names, latitudes, longitudes, radius=12000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'City Latitude', 
                  'City Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [51]:
# Create a dataframe containing all of the venues by City
city_venues = getNearbyVenues(names=df['Centre'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

New York
London
Shanghai
Tokyo
Hong Kong
Singapore
Beijing
San Francisco
Shenzhen
Zurich
Los Angeles
Luxembourg
Edinburgh
Geneva
Boston
Frankfurt
Dubai
Paris
Washington
Chicago
Guangzhou
Amsterdam
Stockholm
Vancouver
Seoul
Montreal
Melbourne
Madrid
Hamburg
Brussels
Toronto
Sydney
Abu Dhabi
Dublin
Mumbai
Copenhagen
Stuttgart
Milan
Osaka
Busan
Kuala Lumpur
Taipei
Chengdu
Wellington
Tel Aviv
Casablanca
Qingdao
Munich
New Delhi
Oslo


In [52]:
city_venues.head()

Unnamed: 0,City,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,New York,40.712728,-74.006015,The Bar Room at Temple Court,40.711448,-74.006802,Hotel Bar
1,New York,40.712728,-74.006015,Los Tacos No. 1,40.714267,-74.008756,Taco Place
2,New York,40.712728,-74.006015,Korin,40.714824,-74.009404,Furniture / Home Store
3,New York,40.712728,-74.006015,Aire Ancient Baths,40.718141,-74.004941,Spa
4,New York,40.712728,-74.006015,9/11 Memorial North Pool,40.712077,-74.013187,Memorial Site


In [53]:
print(city_venues.shape)

(4930, 7)


In [54]:
city_venues.groupby('City').count()

Unnamed: 0_level_0,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abu Dhabi,100,100,100,100,100,100
Amsterdam,100,100,100,100,100,100
Beijing,100,100,100,100,100,100
Boston,100,100,100,100,100,100
Brussels,100,100,100,100,100,100
Busan,100,100,100,100,100,100
Casablanca,100,100,100,100,100,100
Chengdu,100,100,100,100,100,100
Chicago,100,100,100,100,100,100
Copenhagen,100,100,100,100,100,100


In [56]:
# Onehot encode each venue category to allow the data to be used by the k-means clustering algorithm 
df_onehot = pd.get_dummies(city_venues[['Venue Category']], prefix='',prefix_sep='')
df_onehot['City'] =  city_venues['City']
fixed_columns = [df_onehot.columns[-1]] + list(df_onehot.columns[:-1])
df_onehot = df_onehot[fixed_columns]
df_onehot.head()

Unnamed: 0,City,Abruzzo Restaurant,Accessories Store,Adult Boutique,Advertising Agency,African Restaurant,Airport Lounge,Airport Service,American Restaurant,Amphitheater,...,Wings Joint,Women's Store,Xinjiang Restaurant,Yakitori Restaurant,Yoga Studio,Yoshoku Restaurant,Yunnan Restaurant,Zhejiang Restaurant,Zoo,Zoo Exhibit
0,New York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,New York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,New York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,New York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,New York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [188]:
# Group the data by neighbourhood and use mean to produce weighted one hot encoded values based on venue category frequency
df_grouped = df_onehot.groupby('City').mean().reset_index()

In [219]:

cities = ['New York', 'London', 'Shanghai', 'Tokyo']

for city in cities:
    print("5 Most Common Venue Categories For: ",  city)
    city = df_grouped.loc[(df_grouped["City"] == city)]
    city = city.loc[:, (city != 0).any(axis=0)]
    city = city.T.reset_index()
    city.columns = ['Venue', 'Frequency']
    city = city.iloc[1:]
    city['Frequency'] = city['Frequency'].astype(float)
    city = city.round({'Frequency': 2})
    print(city.sort_values('Frequency', ascending=False).reset_index(drop=True).head(5))
    print('\n')

5 Most Common Venue Categories For:  New York
            Venue  Frequency
0            Park       0.13
1  Ice Cream Shop       0.05
2          Bakery       0.04
3  Scenic Lookout       0.04
4       Bookstore       0.04


5 Most Common Venue Categories For:  London
        Venue  Frequency
0       Hotel       0.17
1     Theater       0.05
2      Lounge       0.04
3  Art Museum       0.04
4        Park       0.04


5 Most Common Venue Categories For:  Shanghai
               Venue  Frequency
0              Hotel       0.18
1        Coffee Shop       0.05
2      Shopping Mall       0.05
3  French Restaurant       0.05
4             Bakery       0.04


5 Most Common Venue Categories For:  Tokyo
              Venue  Frequency
0             Hotel       0.09
1        Art Museum       0.05
2  Ramen Restaurant       0.04
3     Wagashi Place       0.04
4          Sake Bar       0.04




# Section 2 - Clustering Financial Centres

In [127]:
# Import needed libraries
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import requests
import json

In [115]:
# Now we will run the k-means clustering on our grouped one hot encoded data set
# set number of clusters
kclusters = 6

df_grouped_clustering = df_grouped.drop('City', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 0, 4, 3, 3, 0, 2, 4, 0, 2])

In [116]:
df_grouped.insert(0,'Cluster Labels', kmeans.labels_)

df_grouped.shape

(50, 392)

In [117]:
df_merged = df.merge(df_grouped[['City', 'Cluster Labels']], left_on='Centre', right_on='City')

In [118]:
df_merged.drop(['City'], axis='columns', inplace=True)

In [119]:
# create map
worldMap_clusters = folium.Map(zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Centre'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(worldMap_clusters)
       
worldMap_clusters

In [128]:
# Function to call API for each cities population
def get_city_opendata(city):
    tmp = 'https://public.opendatasoft.com/api/records/1.0/search/?dataset=worldcitiespop&q=%s&sort=population'
    cmd = tmp % (city)
    res = requests.get(cmd)
    dct = json.loads(res.content)
    out = dct['records'][0]['fields']
    return out

In [136]:
# Get population for each city
pop = []

for city in df_merged['Centre']:
    city_data = get_city_opendata(city)
    pop.append(city_data.get('population'))

df_merged['Population'] = pop

In [140]:
# Hardcoding populations for the below cities as the above api did not return population numbers
df_merged.loc[df_merged.Centre == "Hong Kong", "Population"] = 7451000
df_merged.loc[df_merged.Centre == "Beijing", "Population"] = 21540000
df_merged.loc[df_merged.Centre == "Luxembourg", "Population"] = 613894
df_merged.loc[df_merged.Centre == "Mumbai", "Population"] = 18410000
df_merged.loc[df_merged.Centre == "Busan", "Population"] = 3429000

In [173]:
df_grouped_clusters = df_grouped.drop(columns=['City']).groupby(by= 'Cluster Labels').mean()
df_grouped_clusters.reset_index(inplace=True)

num_top_venues = 5

for cluster in df_grouped_clusters['Cluster Labels']:
    in_cluster = []
    in_cluster = df_merged.loc[(df_merged["Cluster Labels"] == cluster), "Centre"].values

    print("---- Cluster: "+str(cluster)+"----")

    print("Cities in cluster: ", end = '')
    for city in in_cluster:
        print(city+ ", ", end = '')
    print()

    print("Cluster Mean Population: ", "{:,.0f}".format((df_merged[df_merged['Cluster Labels']==cluster])['Population'].mean()))
    temp = df_grouped_clusters[df_grouped_clusters['Cluster Labels'] == cluster].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- Cluster: 0----
Cities in cluster: London, Tokyo, Hong Kong, Singapore, Dubai, Washington, Chicago, Amsterdam, Vancouver, Seoul, Busan, Taipei, 
Cluster Mean Population:  6,106,943
                venue  freq
0               Hotel  0.11
1         Coffee Shop  0.05
2                Park  0.04
3              Bakery  0.03
4  Italian Restaurant  0.02


---- Cluster: 1----
Cities in cluster: Dublin, Wellington, 
Cluster Mean Population:  601,634
         venue  freq
0         Café  0.15
1  Coffee Shop  0.10
2          Pub  0.06
3   Restaurant  0.06
4          Bar  0.03


---- Cluster: 2----
Cities in cluster: San Francisco, Los Angeles, Edinburgh, Stockholm, Montreal, Melbourne, Hamburg, Toronto, Sydney, Abu Dhabi, Copenhagen, Osaka, Tel Aviv, Casablanca, Oslo, 
Cluster Mean Population:  2,038,489
         venue  freq
0  Coffee Shop  0.07
1         Café  0.07
2         Park  0.06
3       Bakery  0.05
4        Hotel  0.04


---- Cluster: 3----
Cities in cluster: New York, Zurich, Luxembo