# Battle of the Neighborhoods - DRAFT

In [6]:
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

Solving environment: done

# All requested packages already installed.



## Research: List top world cities by population

In [43]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize

In [69]:
# Parse the list of top world cities by population from Wikipedia and clean
# the data into a pandas DataFrame.
url = 'https://en.wikipedia.org/wiki/List_of_cities_proper_by_population'
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
sauce = response.text
soup = BeautifulSoup(sauce, 'lxml')
# tables[t][r][c] is the stripped text of <td> number c, in <tr> number r, of <table> number t.
tables = [[[td.get_text(strip=True)
           for td in tr.find_all('td')]
           for tr in table.find_all('tr')]
         for table in soup.find_all('table')]
# NOTE(review): the table is picked by position, so this breaks if the page
# layout changes; re-check the index (and the column list) when the cell fails.
world_cities = tables[1]
df = pd.DataFrame(world_cities,
                  columns=['Rank', 'City', 'Image', 'Population', 'Definition',
                           'Total area (km2)', 'Population density (/km2)', 'Country'])
# Remove the first row, which carries no values (presumably the header row,
# whose cells are not <td> elements -- verify against the page markup).
df = df.drop(df.index[0])
# Remove columns with no useful information.
df = df.drop(columns=['Image', 'Definition'])
# Remove [references to footnotes]. regex=True is explicit because newer pandas
# treats the pattern literally by default; the non-greedy ".*?" keeps any text
# that sits between two separate footnote markers.
for column in df.columns:
    df[column] = df[column].str.replace(r"\[.*?\]", "", regex=True)
# Remove thousands separators from the numeric columns and convert them to numbers.
NumberColumns = ['Rank', 'Population', 'Total area (km2)', 'Population density (/km2)']
for column in NumberColumns:
    df[column] = pd.to_numeric(df[column].str.replace(",", "", regex=False))
# Select the top 20 world cities by population.
df = df.nlargest(20, 'Population')
print('Dataframe shape: ',df.shape)
df.head()

Dataframe shape:  (20, 6)


Unnamed: 0,Rank,City,Population,Total area (km2),Population density (/km2),Country
1,1,Chongqing,30165500,82403.0,366,China
2,2,Shanghai,24183300,6340.5,3814,China
3,3,Beijing,21707000,16411.0,1267,China
4,4,Istanbul,15029231,5196.0,2893,Turkey
5,5,Karachi,14910352,3780.0,3944,Pakistan


In [70]:
# Get geographical coordinates for each city.
# Cities that cannot be geocoded keep Lat/Lng = None and are reported, so one
# failed lookup does not abort the whole loop.
geolocator = Nominatim(user_agent = 'foursquare_agent')
df['Lat'] = None
df['Lng'] = None
# Resolve column positions by name once instead of relying on hard-coded indices.
city_col = df.columns.get_loc('City')
lat_col = df.columns.get_loc('Lat')
lng_col = df.columns.get_loc('Lng')
for i in range(len(df)):
    city = df.iat[i, city_col]
    try:
        # The lookup itself can raise (network/service errors), so it belongs
        # inside the try block.
        location = geolocator.geocode(city)
    except Exception as exc:
        print('Geocoding failed for {}: {}'.format(city, exc))
        continue
    if location is None:
        # geocode() returns None when it finds no match for the query.
        print('No geocoding result for {}'.format(city))
        continue
    df.iat[i, lat_col] = location.latitude
    df.iat[i, lng_col] = location.longitude

df.head()

Unnamed: 0,Rank,City,Population,Total area (km2),Population density (/km2),Country,Lat,Lng
1,1,Chongqing,30165500,82403.0,366,China,29.5586,106.549
2,2,Shanghai,24183300,6340.5,3814,China,31.2253,121.489
3,3,Beijing,21707000,16411.0,1267,China,39.9062,116.391
4,4,Istanbul,15029231,5196.0,2893,Turkey,41.0096,28.9652
5,5,Karachi,14910352,3780.0,3944,Pakistan,25.1447,67.1848


In [76]:
# Radius (km, rounded to a whole number) of the circle whose area equals the
# city's total area: area = pi * r^2  =>  r = sqrt(area / pi).
area_over_pi = df['Total area (km2)'] / np.pi
df['Radius'] = np.sqrt(area_over_pi).round(0)
df.head()

Unnamed: 0,Rank,City,Population,Total area (km2),Population density (/km2),Country,Lat,Lng,Radius,Spas
1,1,Chongqing,30165500,82403.0,366,China,29.5586,106.549,162.0,8
2,2,Shanghai,24183300,6340.5,3814,China,31.2253,121.489,45.0,100
3,3,Beijing,21707000,16411.0,1267,China,39.9062,116.391,72.0,45
4,4,Istanbul,15029231,5196.0,2893,Turkey,41.0096,28.9652,41.0,100
5,5,Karachi,14910352,3780.0,3944,Pakistan,25.1447,67.1848,35.0,0


# Foursquare

In [72]:
# The code was removed by Watson Studio for sharing.
# NOTE(review): the Foursquare request cells in this notebook reference
# CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, none of which is assigned in any
# visible cell -- presumably they were defined here. They must be defined
# before those cells can run on a fresh kernel.

In [82]:
# Trial request: how many venues does Foursquare's explore endpoint return
# for one query around the geocoded position of New York?
search_query = 'Gastropub'
radius = 10000
location = geolocator.geocode('New York')
latitude, longitude = location.latitude, location.longitude
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)
results = requests.get(url).json()
# The venue entries are read from the first group of the response.
first_group = results['response']['groups'][0]
items = first_group['items']
print(len(items))

100


In [78]:
# Count the spa venues Foursquare returns around each city's coordinates.
# Cities without coordinates, and cities whose request or response is
# unusable, keep Spas = None and are reported instead of aborting the loop.
df['Spas'] = None
# Resolve column positions by name instead of hard-coded indices.
lat_col = df.columns.get_loc('Lat')
lng_col = df.columns.get_loc('Lng')
city_col = df.columns.get_loc('City')
spas_col = df.columns.get_loc('Spas')
# Loop-invariant query settings.
radius_query = 5000
search_query = 'Spa'
for i in range(len(df)):
    latitude = df.iat[i, lat_col]
    longitude = df.iat[i, lng_col]
    if pd.isnull(latitude) or pd.isnull(longitude):
        # No coordinates were stored for this row; a request with
        # ll=None,None would be meaningless.
        print('Skipping {}: no coordinates'.format(df.iat[i, city_col]))
        continue
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius_query, LIMIT)
    try:
        # Both the request and the nested lookup can fail, so both are guarded.
        results = requests.get(url).json()
        items = results['response']['groups'][0]['items']
    except (requests.RequestException, ValueError, KeyError, IndexError) as exc:
        print('Venue lookup failed for {}: {!r}'.format(df.iat[i, city_col], exc))
        continue
    df.iat[i, spas_col] = len(items)

df.head()

Unnamed: 0,Rank,City,Population,Total area (km2),Population density (/km2),Country,Lat,Lng,Radius,Spas
1,1,Chongqing,30165500,82403.0,366,China,29.5586,106.549,162.0,7
2,2,Shanghai,24183300,6340.5,3814,China,31.2253,121.489,45.0,84
3,3,Beijing,21707000,16411.0,1267,China,39.9062,116.391,72.0,18
4,4,Istanbul,15029231,5196.0,2893,Turkey,41.0096,28.9652,41.0,100
5,5,Karachi,14910352,3780.0,3944,Pakistan,25.1447,67.1848,35.0,0


In [79]:
# Display the whole table rather than only the first rows shown by df.head().
df

Unnamed: 0,Rank,City,Population,Total area (km2),Population density (/km2),Country,Lat,Lng,Radius,Spas
1,1,Chongqing,30165500,82403.0,366,China,29.5586,106.549,162.0,7
2,2,Shanghai,24183300,6340.5,3814,China,31.2253,121.489,45.0,84
3,3,Beijing,21707000,16411.0,1267,China,39.9062,116.391,72.0,18
4,4,Istanbul,15029231,5196.0,2893,Turkey,41.0096,28.9652,41.0,100
5,5,Karachi,14910352,3780.0,3944,Pakistan,25.1447,67.1848,35.0,0
6,6,Dhaka,14399000,337.54,42659,Bangladesh,23.7594,90.3788,10.0,1
7,7,Tokyo,13515271,626.99,21556,Japan,35.6828,139.759,14.0,95
8,8,Moscow,13200000,2511.0,5256,Russia,55.7504,37.6175,28.0,100
9,9,Guangzhou,13081000,7434.0,1760,China,23.1302,113.259,49.0,12
10,10,Shenzhen,12528300,1992.0,6889,China,22.5446,114.055,25.0,19
