# IBM Applied Data Science Capstone Course by Coursera
### Week 5 Final Report
**_Opening a New Shopping Mall in Algiers, Algeria_**
- Build a dataframe of neighborhoods in Algiers, Algeria by web scraping the data from Wikipedia page
- Get the geographical coordinates of the neighborhoods
- Obtain the venue data for the neighborhoods from Foursquare API
- Explore and cluster the neighborhoods
- Select the best cluster to open a new shopping mall
***
### 1. Import libraries

In [1]:
!pip install geocoder 
!pip install folium


Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 10.7MB/s ta 0:00:01
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6
Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 13.3MB/s eta 0:00:01
[?25hCollecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any

In [10]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import geocoder # to get coordinates

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print("Libraries imported.")

Libraries imported.


### 2. Scrap data from Wikipedia page into a DataFrame

In [11]:
# send the GET request
data = requests.get("https://en.wikipedia.org/wiki/Category:Suburbs_of_Algiers").text

In [12]:
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

In [13]:
# create a list to store neighborhood data
neighborhoodList = []

In [14]:
# append the data into the list
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

In [15]:
# create a new DataFrame from the list
kl_df = pd.DataFrame({"Neighborhood": neighborhoodList})

kl_df.head()

Unnamed: 0,Neighborhood
0,"Aïn Bénian, Algiers"
1,Aïn Taya
2,Ain-bessem
3,Bab Ezzouar
4,Baba Hassen


In [16]:
# print the number of rows of the dataframe
kl_df.shape

(34, 1)

### 3. Get the geographical coordinates

In [17]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Algiers, Algeria'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [18]:
# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in kl_df["Neighborhood"].tolist() ]

In [19]:
coords

[[36.80095000000006, 2.9185600000000704],
 [36.792940000000044, 3.288880000000063],
 [36.293330000000026, 3.6731900000000337],
 [36.72538000000003, 3.1903500000000236],
 [36.69608000000005, 2.9724200000000565],
 [36.724740000000054, 3.112220000000036],
 [36.71110092500004, 3.1432204760000673],
 [36.75792000000007, 3.0140500000000543],
 [36.785750000000064, 3.012740000000065],
 [36.763960000000054, 2.926750000000027],
 [36.712600000000066, 3.2128100000000472],
 [36.754200000000026, 2.9804000000000315],
 [36.669170000000065, 2.9419900000000325],
 [36.714800000000025, 3.0027500000000487],
 [36.73927000000003, 2.9941700000000537],
 [36.76652000000007, 3.0302800000000616],
 [36.70816000000008, 3.1402500000000373],
 [36.73109000000005, 3.103650000000073],
 [36.691140000000075, 3.08224000000007],
 [36.738860000000045, 3.1090300000000752],
 [36.66593000000006, 2.983120000000042],
 [36.72691000000003, 3.0768200000000547],
 [36.65539000000007, 3.1468600000000606],
 [36.73124000000007, 3.15601000

In [20]:
# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

In [21]:
# merge the coordinates into the original dataframe
kl_df['Latitude'] = df_coords['Latitude']
kl_df['Longitude'] = df_coords['Longitude']

In [22]:
# check the neighborhoods and the coordinates
print(kl_df.shape)
kl_df

(34, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,"Aïn Bénian, Algiers",36.80095,2.91856
1,Aïn Taya,36.79294,3.28888
2,Ain-bessem,36.29333,3.67319
3,Bab Ezzouar,36.72538,3.19035
4,Baba Hassen,36.69608,2.97242
5,Bachdjerrah,36.72474,3.11222
6,"Baraki, Algiers",36.711101,3.14322
7,Ben Aknoun,36.75792,3.01405
8,Bouzaréah,36.78575,3.01274
9,Chéraga,36.76396,2.92675


In [23]:
# save the DataFrame as CSV file
kl_df.to_csv("kl_df.csv", index=False)

### 4. Create a map of Algiers with neighborhoods superimposed on top

In [24]:
# get the coordinates of Algiers
address = 'Algiers, Algeria'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Algiers, Algeria {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Algiers, Algeria 36.7753606, 3.0601882.


In [25]:
# create map of Algiers using latitude and longitude values
map_kl = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(kl_df['Latitude'], kl_df['Longitude'], kl_df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_kl)  
    
map_kl

In [26]:
# save the map as HTML file
map_kl.save('map_kl.html')

### 5. Use the Foursquare API to explore the neighborhoods

In [28]:
# define Foursquare Credentials and Version
CLIENT_ID = '1SJD355OM551LSZGZ03WMAMLDXFNEW2KITCAJHEAMHZJNBHR' # your Foursquare ID
CLIENT_SECRET = 'TIQEGTKXXBBTCGMRXEQHYC1TEVQDRPABR4NNIJPT3HZDQMVZ' # your Foursquare Secret
VERSION = '20200213' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 1SJD355OM551LSZGZ03WMAMLDXFNEW2KITCAJHEAMHZJNBHR
CLIENT_SECRET:TIQEGTKXXBBTCGMRXEQHYC1TEVQDRPABR4NNIJPT3HZDQMVZ


**Now, let's get the top 100 venues that are within a radius of 2000 meters.**

In [29]:
radius = 2000
LIMIT = 100

venues = []

for lat, long, neighborhood in zip(kl_df['Latitude'], kl_df['Longitude'], kl_df['Neighborhood']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [30]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(286, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,"Aïn Bénian, Algiers",36.80095,2.91856,Restaurant El Kahina,36.801243,2.905486,Seafood Restaurant
1,"Aïn Bénian, Algiers",36.80095,2.91856,La Paella Restaurante Y Tapas,36.802166,2.899236,Seafood Restaurant
2,"Aïn Bénian, Algiers",36.80095,2.91856,La Madrague,36.801747,2.899505,Beach
3,"Aïn Bénian, Algiers",36.80095,2.91856,le rancho,36.800127,2.900348,Mediterranean Restaurant
4,"Aïn Bénian, Algiers",36.80095,2.91856,Le Sauveur,36.802758,2.897456,Seafood Restaurant


**Let's check how many venues were returned for each neighorhood**

In [31]:
venues_df.groupby(["Neighborhood"]).count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Aïn Bénian, Algiers",6,6,6,6,6,6
Aïn Taya,7,7,7,7,7,7
Bab Ezzouar,24,24,24,24,24,24
Baba Hassen,5,5,5,5,5,5
Bachdjerrah,6,6,6,6,6,6
"Baraki, Algiers",4,4,4,4,4,4
Ben Aknoun,19,19,19,19,19,19
Bouzaréah,3,3,3,3,3,3
Chéraga,4,4,4,4,4,4
Dar El Beïda,19,19,19,19,19,19


**Let's find out how many unique categories can be curated from all the returned venues**

In [32]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

There are 78 uniques categories.


In [33]:
# print out the list of categories
venues_df['VenueCategory'].unique()[:50]

array(['Seafood Restaurant', 'Beach', 'Mediterranean Restaurant',
       'Fish & Chips Shop', 'Hotel', 'Coffee Shop', 'Pizza Place',
       'Plaza', 'Turkish Restaurant', 'Indian Restaurant',
       'Recreation Center', 'African Restaurant', 'Restaurant',
       'Shopping Mall', 'Steakhouse', 'Italian Restaurant', 'BBQ Joint',
       'Gym', 'Middle Eastern Restaurant', 'Dessert Shop',
       'Bowling Alley', 'Construction & Landscaping', 'Burger Joint',
       'Tennis Stadium', 'Train Station', 'Gym / Fitness Center',
       'Market', 'Stadium', 'Café', 'Light Rail Station', 'Tea Room',
       'Cocktail Bar', 'Diner', 'Bus Station', 'Sporting Goods Shop',
       'Food Court', 'Spa', 'Flower Shop', 'Airport Terminal',
       'Airport Lounge', 'Comfort Food Restaurant', 'Sandwich Place',
       'Camera Store', 'Gaming Cafe', 'Falafel Restaurant',
       'Clothing Store', 'French Restaurant', 'History Museum',
       'Video Game Store', 'Pharmacy'], dtype=object)

In [34]:
# check if the results contain "Shopping Mall"
"Neighborhood" in venues_df['VenueCategory'].unique()

False

### 6. Analyze Each Neighborhood

In [35]:
# one hot encoding
kl_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
kl_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [kl_onehot.columns[-1]] + list(kl_onehot.columns[:-1])
kl_onehot = kl_onehot[fixed_columns]

print(kl_onehot.shape)
kl_onehot.head()

(286, 79)


Unnamed: 0,Neighborhoods,African Restaurant,Airport,Airport Lounge,Airport Terminal,Art Museum,BBQ Joint,Bar,Beach,Bowling Alley,Burger Joint,Bus Station,Café,Camera Store,Church,Clothing Store,Cocktail Bar,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cruise,Department Store,Dessert Shop,Diner,Falafel Restaurant,Farm,Fast Food Restaurant,Fish & Chips Shop,Flower Shop,Food Court,French Restaurant,Furniture / Home Store,Gaming Cafe,Garden,Grocery Store,Gym,Gym / Fitness Center,Harbor / Marina,History Museum,Hotel,Indian Restaurant,Italian Restaurant,Kids Store,Light Rail Station,Lighthouse,Lounge,Market,Mediterranean Restaurant,Metro Station,Middle Eastern Restaurant,Motel,Nightclub,Nudist Beach,Office,Opera House,Park,Pharmacy,Pizza Place,Plaza,Racetrack,Recreation Center,Resort,Restaurant,Salad Place,Sandwich Place,Seafood Restaurant,Shop & Service,Shopping Mall,Spa,Sporting Goods Shop,Stadium,Steakhouse,Tea Room,Tennis Stadium,Theater,Train Station,Turkish Restaurant,Video Game Store
0,"Aïn Bénian, Algiers",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,"Aïn Bénian, Algiers",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,"Aïn Bénian, Algiers",0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"Aïn Bénian, Algiers",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"Aïn Bénian, Algiers",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


**Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category**

In [39]:
kl_grouped = kl_onehot.groupby(["Neighborhoods"]).mean().reset_index()

print(kl_grouped.shape)
kl_grouped

(29, 79)


Unnamed: 0,Neighborhoods,African Restaurant,Airport,Airport Lounge,Airport Terminal,Art Museum,BBQ Joint,Bar,Beach,Bowling Alley,Burger Joint,Bus Station,Café,Camera Store,Church,Clothing Store,Cocktail Bar,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cruise,Department Store,Dessert Shop,Diner,Falafel Restaurant,Farm,Fast Food Restaurant,Fish & Chips Shop,Flower Shop,Food Court,French Restaurant,Furniture / Home Store,Gaming Cafe,Garden,Grocery Store,Gym,Gym / Fitness Center,Harbor / Marina,History Museum,Hotel,Indian Restaurant,Italian Restaurant,Kids Store,Light Rail Station,Lighthouse,Lounge,Market,Mediterranean Restaurant,Metro Station,Middle Eastern Restaurant,Motel,Nightclub,Nudist Beach,Office,Opera House,Park,Pharmacy,Pizza Place,Plaza,Racetrack,Recreation Center,Resort,Restaurant,Salad Place,Sandwich Place,Seafood Restaurant,Shop & Service,Shopping Mall,Spa,Sporting Goods Shop,Stadium,Steakhouse,Tea Room,Tennis Stadium,Theater,Train Station,Turkish Restaurant,Video Game Store
0,"Aïn Bénian, Algiers",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Aïn Taya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bab Ezzouar,0.041667,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.25,0.041667,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.125,0.0,0.0,0.041667,0.0,0.083333,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.041667,0.0
3,Baba Hassen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0
4,Bachdjerrah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.166667,0.0,0.333333,0.0,0.0
5,"Baraki, Algiers",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Ben Aknoun,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.052632,0.105263,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.210526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.105263,0.052632,0.0,0.0,0.0,0.052632,0.0
7,Bouzaréah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Chéraga,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Dar El Beïda,0.0,0.0,0.052632,0.105263,0.0,0.052632,0.0,0.0,0.052632,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.105263,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.052632,0.052632,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.052632,0.0,0.052632,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.052632,0.0


In [41]:
len(kl_grouped[kl_grouped["Shopping Mall"] > 0])

5

**Create a new DataFrame for Shopping Mall data only**

In [42]:
kl_mall = kl_grouped[["Neighborhoods","Shopping Mall"]]

In [43]:
kl_mall.head()

Unnamed: 0,Neighborhoods,Shopping Mall
0,"Aïn Bénian, Algiers",0.0
1,Aïn Taya,0.0
2,Bab Ezzouar,0.083333
3,Baba Hassen,0.0
4,Bachdjerrah,0.0


### 7. Cluster Neighborhoods
Run k-means to cluster the neighborhoods in Algiers into 3 clusters.

In [44]:
# set number of clusters
kclusters = 3

kl_clustering = kl_mall.drop(["Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(kl_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 2], dtype=int32)

In [45]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
kl_merged = kl_mall.copy()

# add clustering labels
kl_merged["Cluster Labels"] = kmeans.labels_

In [49]:
kl_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
kl_merged.head()

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
0,"Aïn Bénian, Algiers",0.0,0,36.80095,2.91856
1,Aïn Taya,0.0,0,36.79294,3.28888
2,Bab Ezzouar,0.083333,1,36.72538,3.19035
3,Baba Hassen,0.0,0,36.69608,2.97242
4,Bachdjerrah,0.0,0,36.72474,3.11222


In [50]:
# merge algiers_grouped with algiers_data to add latitude/longitude for each neighborhood
kl_merged = kl_merged.join(kl_df.set_index("Neighborhood"), on="Neighborhood")

print(kl_merged.shape)
kl_merged.head() # check the last columns!

ValueError: columns overlap but no suffix specified: Index(['Latitude', 'Longitude'], dtype='object')

In [83]:
# sort the results by Cluster Labels
print(kl_merged.shape)
kl_merged.sort_values(["Cluster Labels"], inplace=True)
kl_merged

(70, 5)


Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
19,Chow Kit,0.03,0,3.16359,101.69811
37,Medan Tuanku,0.02,0,3.15926,101.69834
44,Segambut,0.02,0,3.18639,101.6681
31,"Kampung Baru, Kuala Lumpur",0.03,0,3.16546,101.71028
48,Setiawangsa,0.02,0,3.178434,101.737013
49,Shamelin,0.02,0,3.12458,101.73597
28,Jalan Duta,0.02,0,3.179388,101.677454
27,"Jalan Cochrane, Kuala Lumpur",0.02,0,3.132962,101.724703
23,Dang Wangi,0.02,0,3.155238,101.70152
22,"Damansara, Kuala Lumpur",0.03,0,3.141316,101.62608


**Finally, let's visualize the resulting clusters**

In [91]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(kl_merged['Latitude'], kl_merged['Longitude'], kl_merged['Neighborhood'], kl_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [92]:
# save the map as HTML file
map_clusters.save('map_clusters.html')

### 8. Examine Clusters

#### Cluster 0

In [93]:
kl_merged.loc[kl_merged['Cluster Labels'] == 0]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
19,Chow Kit,0.03,0,3.16359,101.69811
37,Medan Tuanku,0.02,0,3.15926,101.69834
44,Segambut,0.02,0,3.18639,101.6681
31,"Kampung Baru, Kuala Lumpur",0.03,0,3.16546,101.71028
48,Setiawangsa,0.02,0,3.178434,101.737013
49,Shamelin,0.02,0,3.12458,101.73597
28,Jalan Duta,0.02,0,3.179388,101.677454
27,"Jalan Cochrane, Kuala Lumpur",0.02,0,3.132962,101.724703
23,Dang Wangi,0.02,0,3.155238,101.70152
22,"Damansara, Kuala Lumpur",0.03,0,3.141316,101.62608


#### Cluster 1

In [94]:
kl_merged.loc[kl_merged['Cluster Labels'] == 1]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
67,Taman Wahyu,0.0,1,3.2224,101.67173
43,Salak South,0.0,1,3.08102,101.69724
45,Semarak,0.0,1,3.180393,101.723414
46,Sentul Raya,0.01,1,3.175375,101.693034
47,Setapak,0.0,1,3.18816,101.70415
64,Taman Taynton View,0.0,1,3.08707,101.73681
63,Taman Sri Sinar,0.01,1,3.19007,101.65293
62,Taman P. Ramlee,0.0,1,3.1936,101.70598
61,Taman OUG,0.0,1,3.210037,101.634498
60,Taman Midah,0.0,1,3.09359,101.72837


#### Cluster 2

In [95]:
kl_merged.loc[kl_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Latitude,Longitude
11,Brickfields,0.05,2,3.12916,101.68406
30,KL Eco City,0.05,2,3.11711,101.67349
7,Bangsar Park,0.04,2,3.13478,101.67262
6,Bangsar,0.05,2,3.1292,101.67844
35,Lembah Pantai,0.05,2,3.121216,101.663897
66,Taman U-Thant,0.05,2,3.15765,101.72502
41,"Pudu, Kuala Lumpur",0.05,2,3.13354,101.71307
