## Imports and such things

In [1]:
import os
import re
import urllib.request

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Scrape list of largest cities from Wikipedia

The wikipedia page [List of largest cities](https://en.wikipedia.org/wiki/List_of_largest_cities) has a list of the largest cities in the world.  
We are interested in the table that lists the actual cities, and want each city's name, its nation, and its city-proper population.  
In addition, we grab the link to each city's own page so that we can scrape more information later.

We will first use urllib to fetch the page, then BeautifulSoup to parse it and find the first (and only) sortable table.

In [2]:
# location of the wikipedia article
url = "https://en.wikipedia.org/wiki/List_of_largest_cities"

# fetch the article; the context manager closes the HTTP response as soon as
# the body has been read instead of leaving the connection open
with urllib.request.urlopen(url) as req:
    article = req.read().decode()

# parse with BeautifulSoup and find the first sortable table
soup = BeautifulSoup(article, 'html.parser')
table = soup.find('table', class_='sortable')

# fail here with a clear message rather than later with an AttributeError on None
if table is None:
    raise RuntimeError("no sortable table found at " + url)

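Every row in the table also contains a link to the city's Wikipedia page. The following cell walks through the rows and collects each city's name, nation, population and page URL into a DataFrame; the URL is used later to scrape more information from the city page.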

In [3]:
# columns of the resulting DataFrame
cols=["City", "Nation", "Population", "URL"]

# strips bracketed footnote markers (e.g. "[12]") from the population cell;
# compiled once instead of once per row
footnote_pattern = re.compile(r'\[.*\]')

# collect one record per table row and build the DataFrame in a single step
# (growing a DataFrame row by row is quadratic, and DataFrame.append no longer
# exists in pandas 2.x)
city_records = []

# iterate through all the rows in the table:
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    if not tds:
        continue                            # skips first row with headings
    nation = tds[0].find('a').string        # first td column contains nation
    try:
        # rough but working way to parse the population count
        pop = int(footnote_pattern.sub("", tds[2].text).replace(',', ''))
    except ValueError:
        pop = 0                             # unparsable population is recorded as 0
    city_a = tr.find('th').find('a')        # the first column contains th tag and contains the <a> link to the city
    city = city_a.string
    # named city_url so the loop does not overwrite the article `url` defined earlier
    city_url = "https://en.wikipedia.org" + city_a['href']
    city_records.append({
        'City': city,
        'Nation': nation,
        'Population': pop,
        'URL': city_url
    })

df_cities = pd.DataFrame(city_records, columns=cols)
# astype returns a new Series, so the result has to be assigned back
df_cities['Population'] = df_cities['Population'].astype(int)

In [4]:
# preview the first rows of the scraped city table
df_cities.head()

Unnamed: 0,City,Nation,Population,URL
0,Chongqing,People's Republic of China,30751600,https://en.wikipedia.org/wiki/Chongqing
1,Shanghai,People's Republic of China,24256800,https://en.wikipedia.org/wiki/Shanghai
2,Delhi,India,11034555,https://en.wikipedia.org/wiki/Delhi
3,Beijing,People's Republic of China,21516000,https://en.wikipedia.org/wiki/Beijing
4,Dhaka,Bangladesh,14399000,https://en.wikipedia.org/wiki/Dhaka


## Adding geopositioning data

We can scrape the geographic coordinates of every city from its respective city page.  
The following function scrapes the city page and uses a simple regular expression to capture the coordinates.

In [5]:
# Scrape an individual city's page for its coordinates
def scrape_city_coords(url):
    """Fetch the page at ``url`` and return the first coordinates found in it.

    The page source is searched for the first occurrence of the pattern
    ``"lat":<value>,"lon":<value>}`` and both values are converted to float.

    Parameters:
        url: address of the page to download.

    Returns:
        A ``(lat, lon)`` tuple of floats.

    Raises:
        ValueError: if the page contains no such coordinate pair, or if the
            captured values cannot be converted to float.
    """
    # the context manager closes the HTTP response once the body has been read
    with urllib.request.urlopen(url) as response:
        article = response.read().decode()

    match = re.search(r'"lat":(.*?),"lon":(.*?)}', article)
    if match is None:
        # name the offending page instead of failing on `None.group`
        raise ValueError("no coordinates found on page: {}".format(url))

    return float(match.group(1)), float(match.group(2))

In [6]:
# scrape every city page once, then split the (lat, lon) pairs into two columns
coordinate_pairs = df_cities["URL"].map(scrape_city_coords)
latitudes, longitudes = zip(*coordinate_pairs)
df_cities["Latitude"] = latitudes
df_cities["Longitude"] = longitudes

In [7]:
# preview the city table again, now including the Latitude/Longitude columns
df_cities.head()

Unnamed: 0,City,Nation,Population,URL,Latitude,Longitude
0,Chongqing,People's Republic of China,30751600,https://en.wikipedia.org/wiki/Chongqing,29.558333,106.566667
1,Shanghai,People's Republic of China,24256800,https://en.wikipedia.org/wiki/Shanghai,31.228611,121.474722
2,Delhi,India,11034555,https://en.wikipedia.org/wiki/Delhi,28.61,77.23
3,Beijing,People's Republic of China,21516000,https://en.wikipedia.org/wiki/Beijing,39.916667,116.383333
4,Dhaka,Bangladesh,14399000,https://en.wikipedia.org/wiki/Dhaka,23.716111,90.396111


## Putting these cities on the map

In [8]:
# world map centred on (0, 0), zoomed out far enough to show every continent
map_world = folium.Map(location=[0, 0], zoom_start=2)

# one circle marker per city; the popup shows "city, nation: population"
city_marker_columns = ['Latitude', 'Longitude', 'City', 'Nation', 'Population']
for latitude, longitude, city_name, country, inhabitants in zip(
        *(df_cities[column] for column in city_marker_columns)):
    city_marker = folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=folium.Popup('{}, {}: {}'.format(city_name, country, inhabitants)),
        color='blue',
        fill=True,
        fill_color='#3186cc',
    )
    city_marker.add_to(map_world)

map_world

## Getting more information from Foursquare

The following credentials and settings are needed to connect to the Foursquare API.

In [9]:
# Foursquare credentials are read from the environment so that secrets are never
# stored in (or shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100

# make a missing credential visible right here instead of as a failed request later
if not (CLIENT_ID and CLIENT_SECRET):
    print("Warning: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; "
          "Foursquare requests will be rejected.")

In [35]:
def getRecommendedVenues(cities, latitudes, longitudes, limit=5, timeout=30):
    """Query the Foursquare ``venues/explore`` endpoint for top picks per city.

    One request is made per (city, latitude, longitude) triple. Cities whose
    request fails or returns a non-OK status are reported on stdout and
    skipped, so a single failure does not abort the whole run.

    Parameters:
        cities: iterable of city names (strings).
        latitudes: iterable of latitudes, parallel to ``cities``.
        longitudes: iterable of longitudes, parallel to ``cities``.
        limit: value sent as the ``limit`` query parameter (default 5, the
            value that used to be hard-coded).
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        A DataFrame with one row per venue and the columns City, Latitude,
        Longitude, Venue, Venue Latitude, Venue Longitude and Venue Category.
        The frame is empty (but still has these columns) when no venue was
        collected. Venue Category is None for venues without any category.
    """
    columns = ['City',
               'Latitude',
               'Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for city, lat, lon in zip(cities, latitudes, longitudes):

        print("city:" + city)

        # let requests build and escape the query string
        params = {
            'section': 'topPicks',
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lon),
            'limit': limit,
        }

        # make the GET request; a network problem skips this city only
        try:
            response = requests.get(
                'https://api.foursquare.com/v2/venues/explore',
                params=params,
                timeout=timeout)
        except requests.exceptions.RequestException as exc:
            print("request failed:" + str(exc))
            continue

        if response.status_code != requests.codes.ok:
            print ("status was:" + str(response.status_code))
            print (response)
            continue

        # a response without any group simply contributes no venues
        groups = response.json()["response"].get('groups') or []
        results = groups[0]['items'] if groups else []

        # keep only relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # some venues have no category at all; record None instead of crashing
            categories = venue.get('categories') or []
            venue_rows.append((
                city,
                lat,
                lon,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the columns to the constructor also works for an empty result,
    # where assigning 7 names to a column-less frame would raise
    return pd.DataFrame(venue_rows, columns=columns)

In [36]:
# fetch the recommended venues for every city, then preview the result
world_venues = getRecommendedVenues(
    cities=df_cities['City'],
    latitudes=df_cities['Latitude'],
    longitudes=df_cities['Longitude'],
)
world_venues.head()

city:Chongqing
city:Shanghai
city:Delhi
city:Beijing
city:Dhaka
city:Mumbai
city:Lagos
city:Chengdu
city:Karachi
city:Guangzhou
city:Istanbul
city:Tokyo
city:Tianjin
city:Moscow
city:São Paulo
city:Kinshasa
city:Baoding
city:Lahore
city:Cairo
city:Seoul
city:Jakarta
city:Wenzhou
city:Mexico City
city:Lima
city:London
city:Bangkok
city:Xi'an
city:Chennai
city:Bangalore
city:New York City
city:Ho Chi Minh City
city:Hyderabad
city:Shenzhen
city:Suzhou
city:Nanjing
city:Dongguan
city:Tehran
city:Quanzhou
city:Shenyang
city:Bogotá
city:Hong Kong
city:Baghdad
city:Fuzhou
city:Changsha
city:Wuhan
city:Hanoi
city:Rio de Janeiro
city:Qingdao
city:Foshan
city:Zunyi
city:Santiago
city:Riyadh
city:Ahmedabad
city:Singapore
city:Shantou
city:Ankara
city:Yangon
city:Saint Petersburg
city:Sydney
city:Casablanca
city:Melbourne
city:Abidjan
city:Alexandria
city:Kolkata
city:Surat
city:Johannesburg
city:Dar es Salaam
city:Shijiazhuang
city:Harbin
city:Giza
city:İzmir
city:Zhengzhou
city:New Taipei City
c

Unnamed: 0,City,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Chongqing,29.558333,106.566667,大队长知青火锅,29.581195,106.529219,Hotpot Restaurant
1,Chongqing,29.558333,106.566667,Apple Paradise Walk Chongqing (Apple 重庆北城天街),29.580996,106.529746,Electronics Store
2,Chongqing,29.558333,106.566667,Paulaner,29.561489,106.53228,German Restaurant
3,Chongqing,29.558333,106.566667,Blue Frog (蓝蛙),29.580209,106.528782,American Restaurant
4,Chongqing,29.558333,106.566667,Club Intercontinental,29.561557,106.520209,Lounge


That's looking pretty awesome. Now let's avoid having to scrape again by saving both dataframes to files.

In [18]:
# persist both tables as CSV so the scraping and API calls need not be repeated
df_cities.to_csv(path_or_buf='cities.csv')
world_venues.to_csv(path_or_buf='venues.csv')

In [37]:
# summary statistics of the numeric (coordinate) columns of the venue table
world_venues.describe()

Unnamed: 0,Latitude,Longitude,Venue Latitude,Venue Longitude
count,1225.0,1225.0,1225.0,1225.0
mean,21.892944,46.496361,21.894071,46.497047
std,22.451007,71.801368,22.450103,71.801005
min,-37.813611,-118.25,-37.815571,-118.250051
25%,10.8,7.483333,10.800865,7.475612
50%,27.683333,52.533333,27.690678,52.52082
75%,36.3,113.393,36.3005,113.380386
max,59.9375,174.74,59.939728,174.756709


## Putting all venues on a map

In [38]:
# world map centred on (0, 0), zoomed out far enough to show every continent
map_venues = folium.Map(location=[0, 0], zoom_start=2)

# one circle marker per venue; the popup shows "city: venue"
venue_marker_columns = ['Venue Latitude', 'Venue Longitude', 'City', 'Venue']
for venue_lat, venue_lng, city_name, venue_name in zip(
        *(world_venues[column] for column in venue_marker_columns)):
    venue_marker = folium.CircleMarker(
        [venue_lat, venue_lng],
        radius=5,
        popup=folium.Popup('{}: {}'.format(city_name, venue_name)),
        color='blue',
        fill=True,
        fill_color='#3186cc',
    )
    venue_marker.add_to(map_venues)

map_venues