# Capstone Project - The Battle of Neighborhoods

In [1]:
# Setting the environment: install the third-party packages this notebook uses from conda-forge
# HTML scraping (BeautifulSoup with the lxml / html5lib parsers)
!conda install -c conda-forge beautifulsoup4 --yes
!conda install -c conda-forge lxml --yes
!conda install -c conda-forge html5lib --yes
# requests is the HTTP client used for the Wikipedia, Nominatim and Foursquare calls
!conda install -c conda-forge requests --yes
# geopy provides the Nominatim geocoder; folium (pinned to 0.5.0) renders the maps
!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium=0.5.0 --yes

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following packages will be UPDATED:

    beautifulsoup4: 4.6.0-py35h442a8c9_1 --> 4.6.3-py35_0 conda-forge

beautifulsoup4 100% |################################| Time: 0:00:00  40.30 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following packages will be UPDATED:

    libxml2: 2.9.4-h6b072ca_5     --> 2.9.8-h422b904_2     conda-forge
    libxslt: 1.1.29-hcf9102b_5    --> 1.1.32-h88dbc4e_2    conda-forge
    lxml:    4.1.0-py35ha401a81_0 --> 4.2.5-py35hc9114bc_0 conda-forge

libxml2-2.9.8- 100% |################################| Time: 0:00:00  71.30 MB/s
libxslt-1.1.32 100% |################################| Time: 0:00:00  62.61 MB/s
lxml-4.2.5-py3 100% |################################| Time: 0:00:00  65.96 MB/s
Fetchin

In [2]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

# Scrape the district tables from the Spanish Wikipedia annex.
url = 'https://es.wikipedia.org/wiki/Anexo:Distritos_de_Lima'
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an error page
data = response.text

soup = BeautifulSoup(data, 'lxml')

match = soup.find_all("table")

# The first table is used for Lima, the second one for Callao.
lima = match[0]
callao = match[1]


def _district_fields(row):
    """Return (postcode, district, location code) taken from one table row.

    The row text is split on newlines once; positions 12, 0 and 2 of that
    split are the fields this notebook keeps.
    """
    cells = row.get_text().strip().split('\n')
    return cells[12], cells[0], cells[2]


# 0 -> postcode, 1 -> district, 2 -> location code
data_lima = {0: [], 1: [], 2: []}

# Only every other child (positions 0, 2, 4, ...) is parsed; the children in between
# are skipped (presumably whitespace text nodes -- verify against the page).
# For the Callao table the first parsed child (its header) is skipped as well, so
# only the Lima header remains at position 0 of each list.
rows = lima.contents[1].contents[::2] + callao.contents[3].contents[2::2]

for row in rows:
    postcode, district, location_code = _district_fields(row)
    data_lima[0].append(postcode)
    data_lima[1].append(district)
    data_lima[2].append(location_code)

# [1:] drops the Lima header row. The column order is stated explicitly so it matches
# the recorded output regardless of how the installed pandas orders dict keys.
df = pd.DataFrame(
    {'Postcode': data_lima[0][1:], 'District': data_lima[1][1:], 'LocationCode': data_lima[2][1:]},
    columns=['District', 'LocationCode', 'Postcode'])
df

Unnamed: 0,District,LocationCode,Postcode
0,Ancón,150102,02
1,Ate,150103,03
2,Barranco,150104,04
3,Breña,150105,05
4,Carabayllo,150106,06
5,Chaclacayo,150107,08
6,Chorrillos,150108,09
7,Cieneguilla,150109,40
8,Comas,150110,07
9,El Agustino,150111,10


In [3]:
# Replace the '-' placeholder in the 'Postcode' column with NaN values.
# The result is assigned back to the column: calling replace(..., inplace=True) on the
# df['Postcode'] selection is chained assignment and may only change a temporary copy.
df['Postcode'] = df['Postcode'].replace('-', np.nan)
df

Unnamed: 0,District,LocationCode,Postcode
0,Ancón,150102,02
1,Ate,150103,03
2,Barranco,150104,04
3,Breña,150105,05
4,Carabayllo,150106,06
5,Chaclacayo,150107,08
6,Chorrillos,150108,09
7,Cieneguilla,150109,40
8,Comas,150110,07
9,El Agustino,150111,10


In [4]:
# Keep only the rows that have a postcode (rows whose 'Postcode' is NaN are removed)
df = df.dropna(subset=['Postcode'])
df

Unnamed: 0,District,LocationCode,Postcode
0,Ancón,150102,02
1,Ate,150103,03
2,Barranco,150104,04
3,Breña,150105,05
4,Carabayllo,150106,06
5,Chaclacayo,150107,08
6,Chorrillos,150108,09
7,Cieneguilla,150109,40
8,Comas,150110,07
9,El Agustino,150111,10


In [5]:
# Renumber the rows from 0, discarding the previous index
df = df.reset_index(drop=True)
df

Unnamed: 0,District,LocationCode,Postcode
0,Ancón,150102,02
1,Ate,150103,03
2,Barranco,150104,04
3,Breña,150105,05
4,Carabayllo,150106,06
5,Chaclacayo,150107,08
6,Chorrillos,150108,09
7,Cieneguilla,150109,40
8,Comas,150110,07
9,El Agustino,150111,10


## Get the latitude and longitude of districts in Lima

In [6]:
import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Function in order to get the coordinates of an input Location

In [7]:
def get_coordinates(location, output_as='center'):
    """
    Look up a place with the Nominatim search API and return its coordinates.

    The query sent is ``'<location>, Lima, Peru'`` and only the first result
    returned by the service is used.

    Parameters
    ----------
    location : str
        name of the place (here: a district) to search for
    output_as : str
        choose from 'boundingbox' or 'center'.
         - 'boundingbox' for [latmin, latmax, lonmin, lonmax]
         - 'center' for [latcenter, loncenter]

    Returns
    -------
    output : list
        list with coordinates as float

    Raises
    ------
    ValueError
        if ``output_as`` is neither 'boundingbox' nor 'center'
    IndexError
        if the service returns no result for the query
    """
    # validate before any network traffic; an unknown value used to surface
    # only at the end as an unbound local variable
    if output_as not in ('boundingbox', 'center'):
        raise ValueError(
            "output_as must be 'boundingbox' or 'center', got {!r}".format(output_as))

    query = location + ', Lima, Peru'
    # passing the query through `params` lets requests URL-encode spaces and accents
    reply = requests.get(
        'https://nominatim.openstreetmap.org/search.php',
        params={'q': query, 'format': 'json', 'polygon': 0},
        # the Nominatim usage policy asks clients to identify themselves
        headers={'User-Agent': 'lima-battle-of-neighborhoods-notebook'},
        timeout=30)
    reply.raise_for_status()

    results = reply.json()
    if not results:
        raise IndexError('Nominatim returned no result for {!r}'.format(query))
    response = results[0]

    # parse response to list
    if output_as == 'boundingbox':
        return [float(i) for i in response['boundingbox']]
    return [float(response.get(key)) for key in ['lat', 'lon']]

In [8]:
df_loc = df.copy()

latitudes = []
longitudes = []

# Read the district by column name: positional access (row[0]) depends on the column
# order of the frame and silently geocodes a different column if that order changes.
for district in df_loc['District']:
    lat, lng = get_coordinates(location=district, output_as='center')
    latitudes.append(lat)
    longitudes.append(lng)

df_loc['Latitude'] = latitudes
df_loc['Longitude'] = longitudes

print(df_loc.shape)
df_loc.head()

(50, 5)


Unnamed: 0,District,LocationCode,Postcode,Latitude,Longitude
0,Ancón,150102,2,-11.696554,-77.111655
1,Ate,150103,3,-12.038728,-76.896873
2,Barranco,150104,4,-12.143959,-77.020268
3,Breña,150105,5,-12.0597,-77.050119
4,Carabayllo,150106,6,-11.794993,-76.989292


## Create a map of Lima Metropolitana

In [9]:
# get Lima latitude and longitude
address = 'Lima, Peru'

# The Nominatim usage policy asks for an identifying user agent, and current geopy
# releases refuse to build the geocoder without one.
geolocator = Nominatim(user_agent='lima-battle-of-neighborhoods-notebook')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Lima are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Lima are -12.0621065, -77.0365256.


In [10]:
# create map of Lima using latitude and longitude values
map_lima = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one marker per district; the loop variable is called location_code so that it
# does not overwrite `location`, the geocoding result of Lima defined earlier
for lat, lng, district, location_code in zip(df_loc['Latitude'], df_loc['Longitude'], df_loc['District'], df_loc['LocationCode']):
    label = '{}, {}'.format(location_code, district)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_lima)

map_lima

### Foursquare credentials and version

In [11]:
import os

# Foursquare credentials are read from the environment so that they never have to be
# written into (and shared with) the notebook; unset variables fall back to ''.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = os.environ.get('FOURSQUARE_VERSION', '') # Foursquare API

# Top 200 venues in a radius of 5 kilometers
LIMIT = 200
rad = 5000

### Function to process all the locations in Lima

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius, categoryIds=''):
    """
    Query the Foursquare ``venues/explore`` endpoint around every location and
    collect the returned venues in a single dataframe.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        label and coordinates of the locations; they are iterated in parallel
    radius : number
        value sent as the ``radius`` query parameter
    categoryIds : str
        value sent as the ``categoryId`` query parameter; with '' (the default)
        no category filter is sent

    Returns
    -------
    pandas.DataFrame
        one row per venue with the columns 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'; the frame is empty (but keeps these columns) when no
        venue is found

    Notes
    -----
    Reads the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # query parameters are handed to requests, which takes care of the encoding
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # filter by category
        if categoryIds != '':
            params['categoryId'] = categoryIds

        # make the GET request
        reply = requests.get('https://api.foursquare.com/v2/venues/explore',
                             params=params, timeout=30)
        reply.raise_for_status()
        results = reply.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories gets None instead of aborting the run
                categories[0]['name'] if categories else None))

    # naming the columns here keeps the call valid when venue_rows is empty
    return pd.DataFrame(venue_rows, columns=columns)

### We use a code from Foursquare to only get Fast Food Restaurants

In [13]:
# Foursquare category id used to keep only fast food restaurants
ID_FAST_FOOD = '4bf58dd8d48988d16e941735'

lima_fast_food = getNearbyVenues(
    names=df_loc['District'],
    latitudes=df_loc['Latitude'],
    longitudes=df_loc['Longitude'],
    radius=rad,
    categoryIds=ID_FAST_FOOD)

lima_fast_food.head()

Ancón
Ate
Barranco
Breña
Carabayllo
Chaclacayo
Chorrillos
Cieneguilla
Comas
El Agustino
Independencia
Jesús María
La Molina
La Victoria
Lima
Lince
Los Olivos
Lurigancho
Lurín
Magdalena del Mar
Miraflores
Pachacamac
Pucusana
Pueblo Libre
Puente Piedra
Punta Hermosa
Punta Negra
Rímac
San Bartolo
San Borja
San Isidro
San Juan de Lurigancho
San Juan de Miraflores
San Luis
San Martín de Porres
San Miguel
Santa Anita
Santa María del Mar
Santa Rosa
Santiago de Surco
Surquillo
Villa El Salvador
Villa María del Triunfo
Bellavista
Callao
Carmen de La Legua-Reynoso
La Perla
La Punta
Ventanilla
Mi Perú


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Ate,-12.038728,-76.896873,Patio de Comidas Real Plaza Santa Clara,-12.015311,-76.885459,Fast Food Restaurant
1,Ate,-12.038728,-76.896873,KFC,-12.077653,-76.918261,Fast Food Restaurant
2,Ate,-12.038728,-76.896873,Pios Chicken,-12.041984,-76.934752,Fast Food Restaurant
3,Ate,-12.038728,-76.896873,Roky's,-12.043467,-76.936234,Fast Food Restaurant
4,Barranco,-12.143959,-77.020268,Foodtruck El Gringo,-12.141336,-77.021359,Fast Food Restaurant


In [14]:
# (rows, columns) of the fast food venue table
lima_fast_food.shape

(1664, 7)

### Function to add markers on a map to display all of our features

In [15]:
# function to add markers for given venues to map
def addMarkerToMap(df, color, mapName):
    """Draw one circle marker on ``mapName`` for every venue row of ``df``.

    ``df`` has to provide the columns 'Venue Latitude', 'Venue Longitude',
    'Neighborhood', 'Venue' and 'Venue Category'. ``color`` is used for the
    circle outline, the fill colour is fixed. Every popup reads
    '<venue> (<category>) - <neighborhood>'.
    """
    needed = ['Venue Latitude', 'Venue Longitude', 'Neighborhood', 'Venue', 'Venue Category']
    for lat, lng, local, venue, category in zip(*(df[column] for column in needed)):
        popup = folium.Popup('{} ({}) - {}'.format(venue, category, local), parse_html=True)
        marker = folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=popup,
            color=color,
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=True)
        marker.add_to(mapName)

We plot only the first 1400 points, which is enough to get an illustrative view.

In [16]:
# Base map centred on the coordinates of Lima
map_lima_ff = folium.Map(zoom_start=12, location=[latitude, longitude])

# Only the first 1400 venues are drawn, in red
subset = lima_fast_food.iloc[:1400]
addMarkerToMap(df=subset, color='red', mapName=map_lima_ff)

map_lima_ff

### Find the highschools

In [17]:
# Foursquare category id used to keep only high schools
ID_HIGHSCHOOL = '4bf58dd8d48988d13d941735'

lima_highschools = getNearbyVenues(
    names=df_loc['District'],
    latitudes=df_loc['Latitude'],
    longitudes=df_loc['Longitude'],
    radius=rad,
    categoryIds=ID_HIGHSCHOOL)

lima_highschools.head()

Ancón
Ate
Barranco
Breña
Carabayllo
Chaclacayo
Chorrillos
Cieneguilla
Comas
El Agustino
Independencia
Jesús María
La Molina
La Victoria
Lima
Lince
Los Olivos
Lurigancho
Lurín
Magdalena del Mar
Miraflores
Pachacamac
Pucusana
Pueblo Libre
Puente Piedra
Punta Hermosa
Punta Negra
Rímac
San Bartolo
San Borja
San Isidro
San Juan de Lurigancho
San Juan de Miraflores
San Luis
San Martín de Porres
San Miguel
Santa Anita
Santa María del Mar
Santa Rosa
Santiago de Surco
Surquillo
Villa El Salvador
Villa María del Triunfo
Bellavista
Callao
Carmen de La Legua-Reynoso
La Perla
La Punta
Ventanilla
Mi Perú


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Ate,-12.038728,-76.896873,I.E. 0025 San Martin de Porres,-12.025601,-76.905968,High School
1,Ate,-12.038728,-76.896873,Colegio San Alfonso,-12.024123,-76.885875,High School
2,Ate,-12.038728,-76.896873,I.E.P. Sain Joseph School Vitarte,-12.025434,-76.927099,High School
3,Ate,-12.038728,-76.896873,Colegio Villa María La Planicie,-12.069716,-76.910333,High School
4,Ate,-12.038728,-76.896873,Colegio Julio C. Tello,-12.034389,-76.940217,High School


In [18]:
# (rows, columns) of the high school venue table
lima_highschools.shape

(597, 7)

Let's visualize the result on the map

In [19]:
# High schools are drawn in green on a map centred on Lima
map_lima_highschools = folium.Map(zoom_start=12, location=[latitude, longitude])
addMarkerToMap(df=lima_highschools, color='green', mapName=map_lima_highschools)

map_lima_highschools

### Find universities

In [20]:
# Foursquare category id used to keep only universities
ID_UNI = '4bf58dd8d48988d1ae941735'

lima_uni = getNearbyVenues(
    names=df_loc['District'],
    latitudes=df_loc['Latitude'],
    longitudes=df_loc['Longitude'],
    radius=rad,
    categoryIds=ID_UNI)

lima_uni.head()

Ancón
Ate
Barranco
Breña
Carabayllo
Chaclacayo
Chorrillos
Cieneguilla
Comas
El Agustino
Independencia
Jesús María
La Molina
La Victoria
Lima
Lince
Los Olivos
Lurigancho
Lurín
Magdalena del Mar
Miraflores
Pachacamac
Pucusana
Pueblo Libre
Puente Piedra
Punta Hermosa
Punta Negra
Rímac
San Bartolo
San Borja
San Isidro
San Juan de Lurigancho
San Juan de Miraflores
San Luis
San Martín de Porres
San Miguel
Santa Anita
Santa María del Mar
Santa Rosa
Santiago de Surco
Surquillo
Villa El Salvador
Villa María del Triunfo
Bellavista
Callao
Carmen de La Legua-Reynoso
La Perla
La Punta
Ventanilla
Mi Perú


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Ate,-12.038728,-76.896873,USIL - Campus Huachipa,-12.016187,-76.935202,University
1,Barranco,-12.143959,-77.020268,Facultad Arquitectura - UAP,-12.142593,-77.022237,University
2,Barranco,-12.143959,-77.020268,Universidad Alas Peruanas-Facultad de Ingenier...,-12.140184,-77.025217,University
3,Barranco,-12.143959,-77.020268,Universidad Marcelino Champagnat,-12.136654,-77.00622,University
4,Barranco,-12.143959,-77.020268,Universidad Alas Peruanas,-12.128782,-77.029065,University


In [21]:
# (rows, columns) of the university venue table
lima_uni.shape

(1024, 7)

Let's visualize the result on the map

In [22]:
# Universities are drawn in blue on a map centred on Lima
map_lima_uni = folium.Map(zoom_start=12, location=[latitude, longitude])
addMarkerToMap(df=lima_uni, color='blue', mapName=map_lima_uni)

map_lima_uni

### Find work offices

In [23]:
# Foursquare category id used to look for work offices
ID_OFFICE = '4d4b7105d754a06375d81259'

lima_office = getNearbyVenues(
    names=df_loc['District'],
    latitudes=df_loc['Latitude'],
    longitudes=df_loc['Longitude'],
    radius=rad,
    categoryIds=ID_OFFICE)

lima_office.head()

Ancón
Ate
Barranco
Breña
Carabayllo
Chaclacayo
Chorrillos
Cieneguilla
Comas
El Agustino
Independencia
Jesús María
La Molina
La Victoria
Lima
Lince
Los Olivos
Lurigancho
Lurín
Magdalena del Mar
Miraflores
Pachacamac
Pucusana
Pueblo Libre
Puente Piedra
Punta Hermosa
Punta Negra
Rímac
San Bartolo
San Borja
San Isidro
San Juan de Lurigancho
San Juan de Miraflores
San Luis
San Martín de Porres
San Miguel
Santa Anita
Santa María del Mar
Santa Rosa
Santiago de Surco
Surquillo
Villa El Salvador
Villa María del Triunfo
Bellavista
Callao
Carmen de La Legua-Reynoso
La Perla
La Punta
Ventanilla
Mi Perú


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Ate,-12.038728,-76.896873,Villa Santa Clara,-12.022495,-76.89869,Building
1,Ate,-12.038728,-76.896873,Telefonica vitarte,-12.029032,-76.924008,Office
2,Ate,-12.038728,-76.896873,Gloria S.A. Planta Huachipa,-12.007437,-76.906022,Building
3,Ate,-12.038728,-76.896873,AmBev Perú,-12.014358,-76.928595,Office
4,Ate,-12.038728,-76.896873,Electrovia SAC,-12.047325,-76.938265,Office


In [24]:
# (rows, columns) of the office venue table
lima_office.shape

(2949, 7)

Let's visualize the result on the map

In [25]:
# Base map centred on the coordinates of Lima
map_lima_office = folium.Map(zoom_start=12, location=[latitude, longitude])

# Only the first 1400 offices are drawn, in gold
sub_set_office = lima_office.iloc[:1400]
addMarkerToMap(df=sub_set_office, color='gold', mapName=map_lima_office)

map_lima_office

In [26]:
# Define a function to add the new columns to the dataframe
def addColumn(originDf, columnTitle, addDf):
    """
    Add to ``originDf`` a column holding the number of venues per district.

    Parameters
    ----------
    originDf : pandas.DataFrame
        frame with a 'District' column; it is modified in place
    columnTitle : str
        name of the column to create (or overwrite)
    addDf : pandas.DataFrame
        venue table with 'Neighborhood' and 'Venue' columns

    Returns
    -------
    None
    """
    # number of non-null 'Venue' entries for every neighborhood
    venue_counts = addDf.groupby('Neighborhood')['Venue'].count()

    # Districts that do not appear in the venue table get 0. Looking the counts up
    # with map() replaces the former bare `except:`, which turned any error into 0.
    # The cast keeps the column float whether or not a district is missing.
    originDf[columnTitle] = originDf['District'].map(venue_counts).fillna(0).astype(float)

In [27]:
df_data = df_loc.copy()

# (new column title, venue table) pairs, in the order the columns are appended
venue_tables = [
    ('Fast Food Restaurants', lima_fast_food),
    ('High Schools', lima_highschools),
    ('Universities', lima_uni),
    ('Offices', lima_office),
]
for column_title, venue_table in venue_tables:
    addColumn(df_data, column_title, venue_table)

df_data

Unnamed: 0,District,LocationCode,Postcode,Latitude,Longitude,Fast Food Restaurants,High Schools,Universities,Offices
0,Ancón,150102,02,-11.696554,-77.111655,0.0,0.0,0.0,0.0
1,Ate,150103,03,-12.038728,-76.896873,4.0,5.0,1.0,36.0
2,Barranco,150104,04,-12.143959,-77.020268,81.0,18.0,29.0,100.0
3,Breña,150105,05,-12.0597,-77.050119,100.0,38.0,76.0,100.0
4,Carabayllo,150106,06,-11.794993,-76.989292,0.0,0.0,0.0,7.0
5,Chaclacayo,150107,08,-11.992479,-76.776176,0.0,3.0,0.0,9.0
6,Chorrillos,150108,09,-12.19235,-77.008962,16.0,7.0,8.0,86.0
7,Cieneguilla,150109,40,-12.073167,-76.777071,0.0,0.0,0.0,5.0
8,Comas,150110,07,-11.932861,-77.040674,6.0,5.0,4.0,29.0
9,El Agustino,150111,10,-12.042052,-76.995714,40.0,8.0,14.0,100.0


To find the best place to open the new fast food restaurant, we have to consider several variables and give each one a weight that represents its impact on our choice. Thus, we have the following:
> 1. **weight_fast_food = -1** a negative value, because we want a place that does not already have many fast food restaurants
> 1. **weight_high = 1** a positive value, because high school students are often good customers
> 1. **weight_uni = 1.5** a more positive value, because university students are regularly good customers
> 1. **weight_office = 2** a very positive value, because employees are even better customers


In [28]:
# Setting the weights for each variable (each count is multiplied by its weight in the score)
weight_fast_food = -1  # negative: existing fast food restaurants lower a district's score
weight_high = 1  # positive: high schools raise the score
weight_uni = 1.5  # universities weigh more than high schools
weight_office = 2  # offices have the largest weight

We create a new dataframe with the final score of each district; the district with the highest score is the best place in Lima to open a new fast food restaurant.

In [29]:
# Weighted contribution of every venue type to the score
fast_food_term = df_data['Fast Food Restaurants'] * weight_fast_food
high_school_term = df_data['High Schools'] * weight_high
university_term = df_data['Universities'] * weight_uni
office_term = df_data['Offices'] * weight_office

# One row per district with the sum of the four weighted counts
df_score = df_loc[['District']].copy()
df_score['Score'] = fast_food_term + high_school_term + university_term + office_term
df_score

Unnamed: 0,District,Score
0,Ancón,0.0
1,Ate,74.5
2,Barranco,180.5
3,Breña,252.0
4,Carabayllo,14.0
5,Chaclacayo,21.0
6,Chorrillos,175.0
7,Cieneguilla,10.0
8,Comas,63.0
9,El Agustino,189.0


Sort the values of the scores

In [30]:
# Rank the districts, highest score first
df_final = df_score.sort_values('Score', ascending=False)
df_final.head()

Unnamed: 0,District,Score
15,Lince,256.5
14,Lima,255.0
11,Jesús María,253.5
3,Breña,252.0
19,Magdalena del Mar,238.0


## Visualize the district of our choice and all the places that it has

In [31]:
map_result = folium.Map(location=[latitude, longitude], zoom_start=13)

# Take the winner from the ranking instead of hard-coding its name, so that the map
# stays consistent with the scores when the venue data changes.
best_district = df_final.iloc[0]['District']
df_win = df_loc[df_loc['District'] == best_district]

# marker for the winning district itself
for lat, lng, local in zip(df_win['Latitude'], df_win['Longitude'], df_win['District']):
    label = '{}'.format(local)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='fuchsia',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_result)

# overlay the venues found around the winning district, one colour per venue type
for venues, color in [(lima_fast_food, 'red'),
                      (lima_highschools, 'green'),
                      (lima_uni, 'blue'),
                      (lima_office, 'gold')]:
    addMarkerToMap(venues[venues['Neighborhood'] == best_district], color, map_result)

map_result

Therefore, the best location to open Abigail's fast food restaurant is the district of **Lince**