# IBM Capstone
## Exploring Rio de Janeiro Touristic Neighborhoods

In [6]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.0

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          91 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0   conda-forge
    geopy:         1.20.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.20.0         | 57 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ###

In [None]:
#http://www.data.rio/datasets/limite-bairro/data?geometry=-44.313%2C-23.138%2C-42.579%2C-22.695
#https://www.feriasbrasil.com.br/rj/riodejaneiro/bairros.cfm

### Loading turistics neighborhood

In [10]:
address = 'Rio de Janeiro, RJ'

geolocator = Nominatim(user_agent="rj_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Rio de Janeiro are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Rio de Janeiro are -22.9110137, -43.2093727.


In [3]:
df = pd.read_csv('rio_turistics.csv', encoding = "ISO-8859-1")

In [4]:
df

Unnamed: 0,Neighborhood
0,Copacabana
1,Leme
2,Lagoa
3,Gávea
4,Leblon
5,Ipanema
6,Lapa
7,Botafogo
8,Urca
9,Glória


In [24]:
latitude = []
latitude.append('a')
latitude.append('b')
latitude

['a', 'b']

### Getting neighborhood latitude,longitude centroids

In [50]:
latitude = []
longitude = []
for ng in df.index: 
    address = df['Neighborhood'][ng] + ' , RJ'
    geolocator = Nominatim(user_agent="rj_explorer")
    location = geolocator.geocode(address)
    latitude.append(location.latitude)
    longitude.append(location.longitude)
df['Latitude'] = latitude
df['Longitude'] = longitude

    
    

In [51]:
df.to_csv('rio_tur_geo.csv', index = False)

In [5]:
df = pd.read_csv('rio_tur_geo.csv')
df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Copacabana,-22.971964,-43.184343
1,Leme,-22.961704,-43.166904
2,Lagoa,-22.962466,-43.202488
3,Gávea,-22.981424,-43.238324
4,Leblon,-22.983556,-43.224938


In [53]:
df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Copacabana,-22.971964,-43.184343
1,Leme,-22.961704,-43.166904
2,Lagoa,-22.962466,-43.202488
3,Gávea,-22.981424,-43.238324
4,Leblon,-22.983556,-43.224938


### Plot Folium map with neighborhoods limits and centroids
#### Loading GeoJSON

In [6]:
#22.9292055,-43.5945507
RIO_COORDINATES = (-22.94,-43.21)
map_rio = folium.Map(location=RIO_COORDINATES, zoom_start=12)
district_geo2 = r'Limite_Bairro_geo7.json'

map_rio.choropleth(geo_data = district_geo2, 
              #data_out = 'crimeagg.json', 
              #data = crimedata2,
              #columns = ['District2', 'Number'],
              key_on = 'feature.properties.BAIRRO',
              fill_color = 'YlOrRd', 
              fill_opacity = 0.1,
              line_opacity = 0.9,
              legend_name = 'Number of incidents per district',
              reset=True)

for lat, lng, neighborhood in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_rio)

map_rio


  

![alt text](https://github.com/Fredeinfo/Coursera_Capstone/blob/master/img/02_geo_centroide.JPG?raw=true "Folium map with neighborhoods limits and centroids")


In [101]:
CLIENT_ID = '05IER4UB5SNZC0QL3SH3NALXLZT5KY214JZJPU5Y5EVZGXGB' # your Foursquare ID
CLIENT_SECRET = '2NSEJXG1QNNV3X4X3PMFLGBDQSL1NO4Z0Y0LOK0ONRLOQDRB' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 50

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 05IER4UB5SNZC0QL3SH3NALXLZT5KY214JZJPU5Y5EVZGXGB
CLIENT_SECRET:2NSEJXG1QNNV3X4X3PMFLGBDQSL1NO4Z0Y0LOK0ONRLOQDRB


### Define Function for get all Neighbordhood

In [102]:
def getNearbyVenues(names, latitudes, longitudes, radius=1500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [103]:
rio_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Copacabana
Leme
Lagoa
Gávea
Leblon
Ipanema
Lapa
Botafogo
Urca
Glória
Centro
Flamengo


#### Save to CSV to avoid call API again

In [7]:
rio_venues = pd.read_csv('rio_venues.csv')
print(rio_venues.shape)
rio_venues.head()

(600, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Copacabana,-22.971964,-43.184343,Praia de Copacabana,-22.972441,-43.183436,Beach
1,Copacabana,-22.971964,-43.184343,Windsor California Hotel,-22.972704,-43.185707,Hotel
2,Copacabana,-22.971964,-43.184343,JW Marriott Hotel Rio de Janeiro,-22.972259,-43.185825,Hotel
3,Copacabana,-22.971964,-43.184343,Kopenhagen,-22.97068,-43.185617,Chocolate Shop
4,Copacabana,-22.971964,-43.184343,Santa Satisfação,-22.972035,-43.186719,Bistro


### Plot venues in MAP

In [13]:
#22.9292055,-43.5945507
RIO_COORDINATES = (-22.94,-43.21)
map_rio2 = folium.Map(location=RIO_COORDINATES, zoom_start=12)
district_geo2 = r'Limite_Bairro_geo7.json'

map_rio2.choropleth(geo_data = district_geo2, 
              #data_out = 'crimeagg.json', 
              #data = crimedata2,
              #columns = ['District2', 'Number'],
              key_on = 'feature.properties.BAIRRO',
              fill_color = 'YlOrRd', 
              fill_opacity = 0.1,
              line_opacity = 0.9,
              legend_name = 'Rio Venues',
              reset=True)

for lat, lng, neighborhood in zip(rio_venues['Venue Latitude'], rio_venues['Venue Longitude'], rio_venues['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_rio2)

map_rio2

![alt text](https://github.com/Fredeinfo/Coursera_Capstone/blob/master/img/04_venues.JPG?raw=true "Folium map with neighborhoods limits and centroids")


In [106]:
rio_venues.to_csv('rio_venues.csv', index = False)

#### Analyzing most common venues

In [12]:
rio_sort_df = rio_venues.groupby('Venue Category').count()
rio_sort_df.sort_values('Neighborhood', ascending=False)


Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Brazilian Restaurant,41,41,41,41,41,41
Bar,36,36,36,36,36,36
Hotel,25,25,25,25,25,25
Coffee Shop,17,17,17,17,17,17
Beach,17,17,17,17,17,17
Pizza Place,16,16,16,16,16,16
Gym / Fitness Center,14,14,14,14,14,14
Japanese Restaurant,14,14,14,14,14,14
Bookstore,13,13,13,13,13,13
Scenic Lookout,12,12,12,12,12,12


#### Build categories for k-means

In [14]:
# one hot encoding
rio_onehot = pd.get_dummies(rio_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
rio_onehot['Neighborhood'] = rio_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [rio_onehot.columns[-1]] + list(rio_onehot.columns[:-1])
rio_onehot = rio_onehot[fixed_columns]

rio_onehot.head()

Unnamed: 0,Neighborhood,Argentinian Restaurant,Art Gallery,Art Museum,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bar,Bathing Area,Beach,Beach Bar,Bed & Breakfast,Beer Bar,Beer Garden,Beer Store,Belgian Restaurant,Bistro,Bookstore,Brazilian Restaurant,Breakfast Spot,Brewery,Building,Burger Joint,Café,Chinese Restaurant,Chocolate Shop,Church,Churrascaria,Clothing Store,Club House,Cocktail Bar,Coffee Shop,College Quad,Comfort Food Restaurant,Concert Hall,Convenience Store,Cosmetics Shop,Creperie,Cultural Center,Dance Studio,Deli / Bodega,Design Studio,Dessert Shop,Dive Bar,Dog Run,Drugstore,Electronics Store,Farmers Market,Fast Food Restaurant,Flea Market,Food,Food Stand,Food Truck,French Restaurant,Fried Chicken Joint,Fruit & Vegetable Store,Garden,Gastropub,German Restaurant,Gourmet Shop,Greek Restaurant,Gym,Gym / Fitness Center,Health & Beauty Service,Heliport,Herbs & Spices Store,Historic Site,History Museum,Hostel,Hotel,Ice Cream Shop,Indie Movie Theater,Italian Restaurant,Japanese Restaurant,Jewelry Store,Jewish Restaurant,Juice Bar,Lingerie Store,Lounge,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Military Base,Miscellaneous Shop,Monument / Landmark,Mountain,Movie Theater,Multiplex,Museum,Music Venue,Nightclub,Northeastern Brazilian Restaurant,Other Great Outdoors,Paper / Office Supplies Store,Park,Pastelaria,Pastry Shop,Pedestrian Plaza,Peruvian Restaurant,Pet Store,Pharmacy,Pie Shop,Pizza Place,Planetarium,Playground,Plaza,Pool,Portuguese Restaurant,Pub,Resort,Restaurant,Rock Club,Sake Bar,Salad Place,Salon / Barbershop,Sandwich Place,Scenic Lookout,Seafood Restaurant,Shoe Store,Shopping Mall,Shopping Plaza,Smoke Shop,Snack Place,Soup Place,Souvenir Shop,Spa,Sports Bar,Steakhouse,Supermarket,Sushi Restaurant,Tapas Restaurant,Tapiocaria,Tattoo Parlor,Tea Room,Thai Restaurant,Theater,Track,Trail,Tram Station,Used Bookstore,Vegetarian / Vegan Restaurant,Video Store,Waterfront,Wine Bar,Women's Store
0,Copacabana,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Copacabana,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Copacabana,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Copacabana,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Copacabana,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Analyse mean data

In [15]:
rio_grouped = rio_onehot.groupby('Neighborhood').mean().reset_index()
rio_grouped.to_csv('rio_grouped.csv', index = False)
rio_grouped

Unnamed: 0,Neighborhood,Argentinian Restaurant,Art Gallery,Art Museum,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bar,Bathing Area,Beach,Beach Bar,Bed & Breakfast,Beer Bar,Beer Garden,Beer Store,Belgian Restaurant,Bistro,Bookstore,Brazilian Restaurant,Breakfast Spot,Brewery,Building,Burger Joint,Café,Chinese Restaurant,Chocolate Shop,Church,Churrascaria,Clothing Store,Club House,Cocktail Bar,Coffee Shop,College Quad,Comfort Food Restaurant,Concert Hall,Convenience Store,Cosmetics Shop,Creperie,Cultural Center,Dance Studio,Deli / Bodega,Design Studio,Dessert Shop,Dive Bar,Dog Run,Drugstore,Electronics Store,Farmers Market,Fast Food Restaurant,Flea Market,Food,Food Stand,Food Truck,French Restaurant,Fried Chicken Joint,Fruit & Vegetable Store,Garden,Gastropub,German Restaurant,Gourmet Shop,Greek Restaurant,Gym,Gym / Fitness Center,Health & Beauty Service,Heliport,Herbs & Spices Store,Historic Site,History Museum,Hostel,Hotel,Ice Cream Shop,Indie Movie Theater,Italian Restaurant,Japanese Restaurant,Jewelry Store,Jewish Restaurant,Juice Bar,Lingerie Store,Lounge,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Military Base,Miscellaneous Shop,Monument / Landmark,Mountain,Movie Theater,Multiplex,Museum,Music Venue,Nightclub,Northeastern Brazilian Restaurant,Other Great Outdoors,Paper / Office Supplies Store,Park,Pastelaria,Pastry Shop,Pedestrian Plaza,Peruvian Restaurant,Pet Store,Pharmacy,Pie Shop,Pizza Place,Planetarium,Playground,Plaza,Pool,Portuguese Restaurant,Pub,Resort,Restaurant,Rock Club,Sake Bar,Salad Place,Salon / Barbershop,Sandwich Place,Scenic Lookout,Seafood Restaurant,Shoe Store,Shopping Mall,Shopping Plaza,Smoke Shop,Snack Place,Soup Place,Souvenir Shop,Spa,Sports Bar,Steakhouse,Supermarket,Sushi Restaurant,Tapas Restaurant,Tapiocaria,Tattoo Parlor,Tea Room,Thai Restaurant,Theater,Track,Trail,Tram Station,Used Bookstore,Vegetarian / Vegan Restaurant,Video Store,Waterfront,Wine Bar,Women's Store
0,Botafogo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.04,0.0,0.0,0.0,0.04,0.0,0.0,0.02,0.08,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.02,0.02,0.0,0.02,0.0,0.06,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.06,0.08,0.02,0.02,0.02,0.04,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0
1,Centro,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.08,0.08,0.0,0.0,0.0,0.02,0.04,0.0,0.0,0.06,0.0,0.02,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.02,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0
2,Copacabana,0.0,0.0,0.0,0.0,0.0,0.02,0.06,0.06,0.02,0.02,0.04,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.04,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.16,0.02,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0
3,Flamengo,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.04,0.0,0.0,0.02,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.02,0.0,0.04,0.02,0.02,0.0,0.02,0.0,0.02,0.04,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.02,0.06,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.04,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0
4,Glória,0.02,0.0,0.02,0.02,0.02,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.04,0.04,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.04,0.0,0.0,0.06,0.04,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.04,0.0,0.02,0.0,0.02,0.0,0.0,0.0,0.0
5,Gávea,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.06,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.04,0.06,0.04,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.02,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.04,0.0,0.02,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.02,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02
6,Ipanema,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.04,0.04,0.0,0.02,0.0,0.0,0.02,0.0,0.02,0.0,0.06,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.04,0.02,0.0,0.04,0.02,0.0,0.0,0.04,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.02,0.0,0.02,0.02,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.02
7,Lagoa,0.0,0.0,0.0,0.0,0.0,0.02,0.04,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.02,0.0,0.0,0.02,0.04,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.1,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.02,0.02,0.0,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.04,0.0,0.0,0.02,0.06,0.0,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0
8,Lapa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12,0.0,0.0,0.02,0.02,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.02,0.04,0.0,0.0,0.0,0.06,0.0,0.02,0.04,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Leblon,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.06,0.02,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.1,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.02,0.0,0.02,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.02,0.0,0.04,0.02,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.06,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.02,0.02,0.02,0.02,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Group Neighborhood for top five venues

In [16]:
num_top_venues = 5

for hood in rio_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = rio_grouped[rio_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Botafogo----
         venue  freq
0        Hotel  0.08
1    Bookstore  0.08
2  Coffee Shop  0.06
3       Hostel  0.06
4  Beer Garden  0.04


----Centro----
                       venue  freq
0       Brazilian Restaurant  0.08
1                  Bookstore  0.08
2                     Church  0.06
3                Salad Place  0.06
4  Middle Eastern Restaurant  0.06


----Copacabana----
       venue  freq
0      Hotel  0.16
1     Bakery  0.06
2        Bar  0.06
3  Beach Bar  0.04
4        Gym  0.04


----Flamengo----
                  venue  freq
0  Brazilian Restaurant  0.06
1   Japanese Restaurant  0.06
2                  Park  0.04
3                 Track  0.04
4  Gym / Fitness Center  0.04


----Glória----
           venue  freq
0  Historic Site  0.06
1    Coffee Shop  0.06
2    Music Venue  0.06
3         Hostel  0.04
4        Theater  0.04


----Gávea----
                  venue  freq
0                   Bar  0.06
1  Brazilian Restaurant  0.06
2             Bookstore  0.04
3    

#### Define funcion for return top 10

In [17]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Run fuction to build dataframe with top 10

In [18]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
rio_venues_sorted = pd.DataFrame(columns=columns)
rio_venues_sorted['Neighborhood'] = rio_grouped['Neighborhood']

for ind in np.arange(rio_grouped.shape[0]):
    rio_venues_sorted.iloc[ind, 1:] = return_most_common_venues(rio_grouped.iloc[ind, :], num_top_venues)

rio_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Botafogo,Hotel,Bookstore,Hostel,Coffee Shop,Beer Garden,Japanese Restaurant,Dance Studio,Bar,Beach,Italian Restaurant
1,Centro,Brazilian Restaurant,Bookstore,Coffee Shop,Church,Middle Eastern Restaurant,Salad Place,Music Venue,Café,Italian Restaurant,Tram Station
2,Copacabana,Hotel,Bakery,Bar,Gym,Beach Bar,Lounge,Salad Place,Chocolate Shop,Churrascaria,Resort
3,Flamengo,Brazilian Restaurant,Japanese Restaurant,Coffee Shop,Churrascaria,Gym / Fitness Center,Track,Fruit & Vegetable Store,Park,Movie Theater,Cocktail Bar
4,Glória,Historic Site,Music Venue,Coffee Shop,Theater,Garden,Movie Theater,Hostel,History Museum,Bar,Brazilian Restaurant


In [112]:
rio_venues_sorted.to_csv('rio_venues_sorted.csv', index = False)

#### Model k-means with 5 clusters

In [19]:
# set number of clusters
kclusters = 5

rio_grouped_clustering = rio_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(rio_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 2, 3, 2, 2, 0, 0, 0, 4, 0], dtype=int32)

#### ADD clusters labels to dataframe

In [20]:
# add clustering labels
rio_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

rio_merged = df

rio_merged = rio_merged.join(rio_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

rio_merged.head() # check the last columns!

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Copacabana,-22.971964,-43.184343,3,Hotel,Bakery,Bar,Gym,Beach Bar,Lounge,Salad Place,Chocolate Shop,Churrascaria,Resort
1,Leme,-22.961704,-43.166904,1,Brazilian Restaurant,Beach,Hotel,Scenic Lookout,Bar,Seafood Restaurant,Pizza Place,Bookstore,Breakfast Spot,Deli / Bodega
2,Lagoa,-22.962466,-43.202488,0,Gym / Fitness Center,Bar,Pizza Place,Farmers Market,Park,Scenic Lookout,Bakery,Peruvian Restaurant,Brazilian Restaurant,Pie Shop
3,Gávea,-22.981424,-43.238324,0,Brazilian Restaurant,Bar,Scenic Lookout,Pizza Place,Park,Bookstore,Dive Bar,Breakfast Spot,Japanese Restaurant,Ice Cream Shop
4,Leblon,-22.983556,-43.224938,0,Brazilian Restaurant,Bar,Pizza Place,Italian Restaurant,Gym / Fitness Center,Juice Bar,Steakhouse,French Restaurant,Bakery,Japanese Restaurant


#### Show map with clusters

In [23]:
# create map
map_clusters = folium.Map(location=RIO_COORDINATES, zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


map_clusters.choropleth(geo_data = district_geo2, 
              #data_out = 'crimeagg.json', 
              #data = crimedata2,
              #columns = ['District2', 'Number'],
              key_on = 'feature.properties.BAIRRO',
              fill_color = 'YlOrRd', 
              fill_opacity = 0.1,
              line_opacity = 0.9,
              legend_name = 'Number of incidents per district',
              reset=True)  

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(rio_merged['Latitude'], rio_merged['Longitude'], rio_merged['Neighborhood'], rio_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
    
  
       
map_clusters

![alt text](https://github.com/Fredeinfo/Coursera_Capstone/blob/master/img/08_kmap.JPG?raw=true "Folium map with neighborhoods limits and centroids")

In [24]:
rio_merged.loc[rio_merged['Cluster Labels'] == 0, rio_merged.columns[[1] + list(range(4, rio_merged.shape[1]))]]

Unnamed: 0,Latitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,-22.962466,Gym / Fitness Center,Bar,Pizza Place,Farmers Market,Park,Scenic Lookout,Bakery,Peruvian Restaurant,Brazilian Restaurant,Pie Shop
3,-22.981424,Brazilian Restaurant,Bar,Scenic Lookout,Pizza Place,Park,Bookstore,Dive Bar,Breakfast Spot,Japanese Restaurant,Ice Cream Shop
4,-22.983556,Brazilian Restaurant,Bar,Pizza Place,Italian Restaurant,Gym / Fitness Center,Juice Bar,Steakhouse,French Restaurant,Bakery,Japanese Restaurant
5,-22.983956,Brazilian Restaurant,Bar,Italian Restaurant,Food Stand,Hotel,Coffee Shop,Juice Bar,Spa,Pizza Place,Beach
7,-22.948845,Hotel,Bookstore,Hostel,Coffee Shop,Beer Garden,Japanese Restaurant,Dance Studio,Bar,Beach,Italian Restaurant


In [25]:
rio_merged.loc[rio_merged['Cluster Labels'] == 1, rio_merged.columns[[1] + list(range(4, rio_merged.shape[1]))]]

Unnamed: 0,Latitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,-22.961704,Brazilian Restaurant,Beach,Hotel,Scenic Lookout,Bar,Seafood Restaurant,Pizza Place,Bookstore,Breakfast Spot,Deli / Bodega
8,-22.954074,Beach,Scenic Lookout,Brazilian Restaurant,Pizza Place,Mountain,Trail,Bar,Steakhouse,Hotel,Ice Cream Shop


In [26]:
rio_merged.loc[rio_merged['Cluster Labels'] == 2, rio_merged.columns[[1] + list(range(4, rio_merged.shape[1]))]]

Unnamed: 0,Latitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
9,-22.918323,Historic Site,Music Venue,Coffee Shop,Theater,Garden,Movie Theater,Hostel,History Museum,Bar,Brazilian Restaurant
10,-22.904393,Brazilian Restaurant,Bookstore,Coffee Shop,Church,Middle Eastern Restaurant,Salad Place,Music Venue,Café,Italian Restaurant,Tram Station
11,-22.933984,Brazilian Restaurant,Japanese Restaurant,Coffee Shop,Churrascaria,Gym / Fitness Center,Track,Fruit & Vegetable Store,Park,Movie Theater,Cocktail Bar


In [27]:
rio_merged.loc[rio_merged['Cluster Labels'] == 3, rio_merged.columns[[1] + list(range(4, rio_merged.shape[1]))]]

Unnamed: 0,Latitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,-22.971964,Hotel,Bakery,Bar,Gym,Beach Bar,Lounge,Salad Place,Chocolate Shop,Churrascaria,Resort


In [28]:
rio_merged.loc[rio_merged['Cluster Labels'] == 4, rio_merged.columns[[1] + list(range(4, rio_merged.shape[1]))]]

Unnamed: 0,Latitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,-22.913026,Bar,Brazilian Restaurant,Historic Site,Music Venue,Dive Bar,Hotel,Gym / Fitness Center,Theater,Nightclub,Salad Place


Analyzing clusters, let us focus only in differences between them:

Red(0): Park, gym, fitness center

Purple(1): Scenic Lookout, Mountain, Trail

Blue(2): Historic Site, Theater, History Museum

Cyan(3): Beach bar, Gym, Resorts

Brown(4): Music Venue, Nightclubs
