# Applied Data Science Capstone Project
## Final Project of [IBM's Data Science Professional Certificate Course] (https://www.coursera.org/professional-certificates/ibm-data-science)
## Part 2:
## Recomending neighborhoods in Curitiba (Brasil) based on venues' data 

First, let's import all the libraries needed

In [1]:
import pandas as pd
import numpy as np 
import geocoder
import requests 
import folium

import matplotlib.cm as cm
import matplotlib.colors as colors

from pandas.io.html import read_html
from sklearn.cluster import KMeans

In [2]:
def getBoroughDataframe(wikitable):
    tableTitle = wikitable.iloc[0,0]
    boroughName = tableTitle[(tableTitle.index("- ") + 2):tableTitle.index("(IBGE-")]

    if 'Regional' in boroughName:
        boroughName = boroughName.replace('Regional ','')

    df = wikitable.drop([0,1,2]).reset_index(drop = True)
    
    df['Borough'] = boroughName
    df['Neighborhood'] = df[0]
    df['Area'] = pd.to_numeric(df[1], downcast="float")/100 

    return df[['Borough', 'Neighborhood', 'Area']]

In [3]:
# Get a list of wiki tables from the following link 
page = 'https://pt.wikipedia.org/wiki/Lista_de_bairros_de_Curitiba'
wikitables = read_html(page,  attrs = {"class":"wikitable"})

# Create empty dataframe to hold all sub dataframes
df_curitiba = pd.DataFrame(columns = ['Borough', 'Neighborhood', 'Area'])

# For each borough, get a sub dataframe of its neighborhoods and concat it to the main dataframe
for table in wikitables:
    df_curitiba = pd.concat([df_curitiba, getBoroughDataframe(table)])

# Remove any duplicate value 
df_curitiba.drop_duplicates(subset = 'Neighborhood', keep = 'last', inplace = True)

Unnamed: 0,Borough,Neighborhood,Area
0,Bairro Novo,Ganchinho,11.200000
1,Bairro Novo,Sitio Cercado,11.120000
2,Bairro Novo,Umbará,22.469999
3,Boa Vista,Abranches,4.320000
4,Boa Vista,Atuba,4.270000
...,...,...,...
70,Santa Felicidade,Santo Inácio,2.720000
71,Santa Felicidade,São Braz,5.010000
72,Santa Felicidade,São João,3.030000
73,Santa Felicidade,Seminário,2.130000


In [4]:
latitude = []
longitude = []

# For each neighborhood, find its coordinates and append it to the latitude and longitude lists
for neighborhood in df_curitiba['Neighborhood']:
    lat_lng_coords = None

    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Curitiba, Brasil'.format(neighborhood))
        lat_lng_coords = g.latlng

    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])

# Create new columns with the latitude and longitude lists
df_curitiba['Latitude'] = latitude
df_curitiba['Longitude'] = longitude

df_curitiba.reset_index(drop = True)

# Print the dataframe 
df_curitiba.head(20)

Unnamed: 0,Borough,Neighborhood,Area,Latitude,Longitude
0,Bairro Novo,Ganchinho,11.2,-25.57523,-49.25502
1,Bairro Novo,Sitio Cercado,11.12,-25.54155,-49.26651
2,Bairro Novo,Umbará,22.469999,-25.58153,-49.28313
0,Boa Vista,Abranches,4.32,-25.37028,-49.27007
1,Boa Vista,Atuba,4.27,-25.43333,-49.23333
2,Boa Vista,Bacacheri,6.98,-25.39847,-49.23038
3,Boa Vista,Bairro Alto,7.02,-25.41102,-49.20442
4,Boa Vista,Barreirinha,3.73,-25.37337,-49.25943
5,Boa Vista,Boa Vista,5.14,-25.38704,-49.24761
6,Boa Vista,Cachoeira,3.07,-25.35376,-49.26428
