# IBM Data Science Professional Certificate
https://www.coursera.org/specializations/ibm-data-science-professional-certificate

## COURSE 9 - Applied Data Science Capstone
https://www.coursera.org/learn/applied-data-science-capstone

### Week 05 - Capstone Project - The Battle of Neighborhoods (Week 2)

### Opening a New Bakery in Berlin - Germany:

* [001 - Library import](#library_import)
* [002 - Build a dataframe (web scraping from Wikipedia)](#wikipedia)
* [003 - Get the geographical coordinates (geocoder)](#geocoder)
* [004 - Obtain the venue data (Foursquare API)](#foursquare)
* [005 - Explore and cluster the borough](#explore)
* [006 - Select the best cluster to open a new backery](#cluster)

### 001 - Library import <a id='library_import'></a>

In [1]:
import numpy as np
print ('Numpy: ', np.__version__)
import pandas as pd
print ('Pandas: ', pd.__version__)

from bs4 import BeautifulSoup

%matplotlib inline

import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
print ('Matplotlib: ', mpl.__version__)

import sklearn as sk
from sklearn.cluster import KMeans
print ('scikit-learn: ', sk.__version__)

import requests
print ('requests: ', requests.__version__)
import folium
print ('folium: ', folium.__version__)

from geopy.geocoders import Nominatim
import geocoder
print ('geocoder: ', geocoder.__version__)

Numpy:  1.16.3
Pandas:  0.23.4
Matplotlib:  3.0.3
scikit-learn:  0.21.0
requests:  2.21.0
folium:  0.8.3
geocoder:  1.38.1


### 002 - Build a dataframe (web scraping - Wikipedia)  <a id='wikipedia'></a>

In [2]:
# request link to wikipedia page in text format (boroughs and neighborhoods)
districts_BLN = requests.get('https://en.wikipedia.org/wiki/Boroughs_and_neighborhoods_of_Berlin').text

In [3]:
# beautifulsoup function
soupBLN = BeautifulSoup(districts_BLN, 'html.parser')
# print(soupBLN.prettify())

In [4]:
# which line of html code contains the table details
wiki_table = soupBLN.find('table', {'class':'sortable wikitable'})

In [5]:
# create a empty variables
Borough = []
Population = []
Area = []
Density = []

In [6]:
# append data into a created variables
for row in wiki_table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) == 4:
        Borough.append(cells[0].find(text = True))
        Population.append(cells[1].find(text = True))
        Area.append(cells[2].find(text = True))
        Density.append(cells[2].find(text = True))

In [7]:
# creating a empty dataframe
BLN_df = pd.DataFrame(columns = ['Borough', 'Population', 'Area', 'Density'])

In [8]:
# insert data into the empty dataframe
BLN_df['Borough'] = Borough
BLN_df['Population'] = Population
BLN_df['Area'] = Area
BLN_df['Density'] = Density
BLN_df

Unnamed: 0,Borough,Population,Area,Density
0,Friedrichshain-Kreuzberg,268225,20.16,20.16
1,Lichtenberg,259881,52.29,52.29
2,Marzahn-Hellersdorf,248264,61.74,61.74
3,Mitte,332919,39.47,39.47
4,Neukölln,310283,44.93,44.93
5,Pankow,366441,103.01,103.01
6,Reinickendorf,240454,89.46,89.46
7,Spandau,223962,91.91,91.91
8,Steglitz-Zehlendorf,293989,102.5,102.5
9,Tempelhof-Schöneberg,335060,53.09,53.09


In [9]:
print ('dimensions (rows, column)')
print ('BLN_df:', BLN_df.shape)

dimensions (rows, column)
BLN_df: (11, 4)


### 003 - Get the geographical coordinates (geocoder) <a id='geocoder'></a>

In [10]:
# define a function to get coordinates
def get_latlng(borough):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Berlin, Germany'.format(borough))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [11]:
# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(borough) for borough in BLN_df['Borough'].tolist()]
coords

[[52.500530036643774, 13.418679956142979],
 [52.51618000000008, 13.480870000000039],
 [52.51667000000003, 13.583330000000046],
 [52.52121000000005, 13.424150000000054],
 [52.480770000000064, 13.435410000000047],
 [52.56925000000007, 13.402480000000025],
 [52.575450000000046, 13.349700000000041],
 [52.53487000000007, 13.202160000000049],
 [52.43485000000004, 13.24183000000005],
 [52.477706405047485, 13.358656086540078],
 [52.43333000000007, 13.600000000000023]]

In [12]:
# create temporary dataframe to populate the coordinates into Latitude and Longitude
BLN_df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

In [13]:
# merge the coordinates with BLN_df
BLN_df['Latitude'] = BLN_df_coords['Latitude']
BLN_df['Longitude'] = BLN_df_coords['Longitude']

In [14]:
# export BLN_df
BLN_df.to_csv('BLN_df.csv', index=False)
BLN_df

Unnamed: 0,Borough,Population,Area,Density,Latitude,Longitude
0,Friedrichshain-Kreuzberg,268225,20.16,20.16,52.50053,13.41868
1,Lichtenberg,259881,52.29,52.29,52.51618,13.48087
2,Marzahn-Hellersdorf,248264,61.74,61.74,52.51667,13.58333
3,Mitte,332919,39.47,39.47,52.52121,13.42415
4,Neukölln,310283,44.93,44.93,52.48077,13.43541
5,Pankow,366441,103.01,103.01,52.56925,13.40248
6,Reinickendorf,240454,89.46,89.46,52.57545,13.3497
7,Spandau,223962,91.91,91.91,52.53487,13.20216
8,Steglitz-Zehlendorf,293989,102.5,102.5,52.43485,13.24183
9,Tempelhof-Schöneberg,335060,53.09,53.09,52.477706,13.358656


In [15]:
# get the coordinates of Berlin
address = 'Berlin, Germany'

geolocator = Nominatim(user_agent='berlin_location')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Berlin, Germany  - LAT: {}, LON: {}'.format(latitude, longitude))

Berlin, Germany  - LAT: 52.5170365, LON: 13.3888599


In [16]:
# Berlin - Map
BLN_map = folium.Map(location=[latitude, longitude], zoom_start=12)


# add markers to map
for lat, lng, Borough in zip(BLN_df['Latitude'], BLN_df['Longitude'], BLN_df['Borough']):
    label = '{}'.format(Borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(BLN_map)

BLN_map

In [17]:
BLN_map.save('./images/001_BLN_map.html')

### 004 - Obtain the venue data (Foursquare API) <a id='foursquare'></a>

In [18]:
# define Foursquare Credentials and Version
CLIENT_ID = 'HCBFUXIPNH1OLER4D4VXYZTNVIIGJXWBKIM3CWOXCCF3ZSLL' 
CLIENT_SECRET = 'IIXXXOZ24X51NPFZSWGKGWQMGJMRDENRAYJ1PAGB2OQ3UOHB' 
VERSION = '20190610' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: HCBFUXIPNH1OLER4D4VXYZTNVIIGJXWBKIM3CWOXCCF3ZSLL
CLIENT_SECRET:IIXXXOZ24X51NPFZSWGKGWQMGJMRDENRAYJ1PAGB2OQ3UOHB


In [19]:
#### top 200 venues that are within a radius of 4000 meters
LIMIT = 200
radius = 4000

venues = []

for lat, long, Borough in zip(BLN_df['Latitude'], BLN_df['Longitude'], BLN_df['Borough']):
    
    # create the API request URL
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            Borough,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [20]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)
venues_df.columns = ['Borough', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print ('dimensions (rows, column)')
print ('venues_df:', venues_df.shape)
venues_df.head()

dimensions (rows, column)
venues_df: (1091, 7)


Unnamed: 0,Borough,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Friedrichshain-Kreuzberg,52.50053,13.41868,Nano Kaffee,52.500831,13.417041,Coffee Shop
1,Friedrichshain-Kreuzberg,52.50053,13.41868,Biererei Bar,52.500775,13.42121,Beer Bar
2,Friedrichshain-Kreuzberg,52.50053,13.41868,Modern Graphics,52.500748,13.421465,Comic Shop
3,Friedrichshain-Kreuzberg,52.50053,13.41868,Basmah,52.498266,13.421194,African Restaurant
4,Friedrichshain-Kreuzberg,52.50053,13.41868,Kaffeekirsche,52.503057,13.420549,Coffee Shop


In [21]:
#### how many venues are in each neighorhood
venues_df.groupby(['Borough']).count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Friedrichshain-Kreuzberg,100,100,100,100,100,100
Lichtenberg,100,100,100,100,100,100
Marzahn-Hellersdorf,91,91,91,91,91,91
Mitte,100,100,100,100,100,100
Neukölln,100,100,100,100,100,100
Pankow,100,100,100,100,100,100
Reinickendorf,100,100,100,100,100,100
Spandau,100,100,100,100,100,100
Steglitz-Zehlendorf,100,100,100,100,100,100
Tempelhof-Schöneberg,100,100,100,100,100,100


In [22]:
#### how many unique categories exist
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

There are 221 uniques categories.


In [23]:
#### list of exist categories
venues_df['VenueCategory'].unique()[:50]

array(['Coffee Shop', 'Beer Bar', 'Comic Shop', 'African Restaurant',
       'Bar', 'Yoga Studio', 'Spanish Restaurant', 'Record Shop',
       'Garden', 'Pizza Place', 'Ramen Restaurant', 'Arts & Crafts Store',
       'Farmers Market', 'Bakery', 'Canal', 'Falafel Restaurant', 'Café',
       'Taverna', 'Food Court', 'BBQ Joint', 'Brewery',
       'Lebanese Restaurant', 'Cocktail Bar', 'Wine Bar', 'Event Space',
       'Beer Store', 'Ice Cream Shop', 'Art Gallery', 'Breakfast Spot',
       'Vegetarian / Vegan Restaurant', 'Plaza', 'Performing Arts Venue',
       'Hostel', 'Indie Movie Theater', 'Italian Restaurant',
       'Salon / Barbershop', 'Beach Bar', 'Hotel', 'Art Museum',
       'Vietnamese Restaurant', 'Tea Room', 'Burger Joint', 'Wine Shop',
       'Pie Shop', 'Greek Restaurant', 'Monument / Landmark', 'Bike Shop',
       'Park', 'Cycle Studio', 'Dumpling Restaurant'], dtype=object)

In [24]:
print ('Exist a Bakery in the categories:', 'Bakery' in venues_df['VenueCategory'].unique())

Exist a Bakery in the categories: True


### 005 - Explore and cluster the borough  <a id='explore'></a>

In [25]:
# encoding
BLN_df_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add Borough column back to dataframe
BLN_df_onehot['Borough'] = venues_df['Borough'] 

# move Borough column to the first column
fixed_columns = [BLN_df_onehot.columns[-1]] + list(BLN_df_onehot.columns[:-1])
table_df_onehot = BLN_df_onehot[fixed_columns]

print ('dimensions (rows, column)')
print ('BLN_df_onehot:', BLN_df_onehot.shape)
BLN_df_onehot.head()

dimensions (rows, column)
BLN_df_onehot: (1091, 222)


Unnamed: 0,African Restaurant,Amphitheater,Arcade,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Austrian Restaurant,...,Waterfall,Whisky Bar,Windmill,Wine Bar,Wine Shop,Winery,Yoga Studio,Zoo,Zoo Exhibit,Borough
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Friedrichshain-Kreuzberg
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Friedrichshain-Kreuzberg
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Friedrichshain-Kreuzberg
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Friedrichshain-Kreuzberg
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Friedrichshain-Kreuzberg


In [26]:
### group rows by Borough
BLN_df_grouped = BLN_df_onehot.groupby(['Borough']).mean().reset_index()
print ('dimensions (rows, column)')
print ('BLN_df_onehot:', BLN_df_grouped.shape)
BLN_df_grouped

dimensions (rows, column)
BLN_df_onehot: (11, 222)


Unnamed: 0,Borough,African Restaurant,Amphitheater,Arcade,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Volleyball Court,Waterfall,Whisky Bar,Windmill,Wine Bar,Wine Shop,Winery,Yoga Studio,Zoo,Zoo Exhibit
0,Friedrichshain-Kreuzberg,0.01,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.02,0.0,0.01,0.0,0.0
1,Lichtenberg,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.01,...,0.0,0.0,0.0,0.0,0.02,0.01,0.01,0.02,0.0,0.0
2,Marzahn-Hellersdorf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010989,0.0,...,0.0,0.0,0.0,0.010989,0.0,0.0,0.0,0.0,0.010989,0.032967
3,Mitte,0.0,0.0,0.0,0.01,0.0,0.0,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.02,0.0,0.0
4,Neukölln,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.0
5,Pankow,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.01,0.0,0.0
6,Reinickendorf,0.01,0.0,0.0,0.02,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Spandau,0.0,0.01,0.0,0.02,0.0,0.0,0.0,0.01,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Steglitz-Zehlendorf,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0
9,Tempelhof-Schöneberg,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
len(BLN_df_grouped[BLN_df_grouped["Bakery"] > 0])

11

In [28]:
### create a new DataFrame
BLN_df_bakery = BLN_df_grouped[['Borough','Bakery']]
BLN_df_bakery

Unnamed: 0,Borough,Bakery
0,Friedrichshain-Kreuzberg,0.02
1,Lichtenberg,0.02
2,Marzahn-Hellersdorf,0.010989
3,Mitte,0.01
4,Neukölln,0.01
5,Pankow,0.04
6,Reinickendorf,0.01
7,Spandau,0.01
8,Steglitz-Zehlendorf,0.02
9,Tempelhof-Schöneberg,0.02


In [29]:
### k-means cluster (Berlin)
# set number of clusters
kclusters = 3

BLN_df_bakery_clustering = BLN_df_bakery.drop(['Borough'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(BLN_df_bakery_clustering)

In [30]:
# cluster with the top 10 venues
BLN_df_bakery_merged = BLN_df_bakery.copy()

# clustering label
BLN_df_bakery_merged['Cluster Labels'] = kmeans.labels_

In [31]:
# merge BLN_df_bakery_merged with BLN_df
BLN_df_bakery_merged = BLN_df_bakery_merged.join(BLN_df.set_index('Borough'), on='Borough')

print ('dimensions (rows, column)')
print ('BLN_df_bakery_merged:', BLN_df_bakery_merged.shape)
BLN_df_bakery_merged.head()

dimensions (rows, column)
BLN_df_bakery_merged: (11, 8)


Unnamed: 0,Borough,Bakery,Cluster Labels,Population,Area,Density,Latitude,Longitude
0,Friedrichshain-Kreuzberg,0.02,1,268225,20.16,20.16,52.50053,13.41868
1,Lichtenberg,0.02,1,259881,52.29,52.29,52.51618,13.48087
2,Marzahn-Hellersdorf,0.010989,2,248264,61.74,61.74,52.51667,13.58333
3,Mitte,0.01,2,332919,39.47,39.47,52.52121,13.42415
4,Neukölln,0.01,2,310283,44.93,44.93,52.48077,13.43541


In [32]:
# sort the results
print ('dimensions (rows, column)')
print ('BLN_df_bakery_merged:', BLN_df_bakery_merged.shape)
BLN_df_bakery_merged.sort_values(['Cluster Labels'], inplace=True)
BLN_df_bakery_merged

dimensions (rows, column)
BLN_df_bakery_merged: (11, 8)


Unnamed: 0,Borough,Bakery,Cluster Labels,Population,Area,Density,Latitude,Longitude
5,Pankow,0.04,0,366441,103.01,103.01,52.56925,13.40248
0,Friedrichshain-Kreuzberg,0.02,1,268225,20.16,20.16,52.50053,13.41868
1,Lichtenberg,0.02,1,259881,52.29,52.29,52.51618,13.48087
8,Steglitz-Zehlendorf,0.02,1,293989,102.5,102.5,52.43485,13.24183
9,Tempelhof-Schöneberg,0.02,1,335060,53.09,53.09,52.477706,13.358656
10,Treptow-Köpenick,0.02,1,241335,168.42,168.42,52.43333,13.6
2,Marzahn-Hellersdorf,0.010989,2,248264,61.74,61.74,52.51667,13.58333
3,Mitte,0.01,2,332919,39.47,39.47,52.52121,13.42415
4,Neukölln,0.01,2,310283,44.93,44.93,52.48077,13.43541
6,Reinickendorf,0.01,2,240454,89.46,89.46,52.57545,13.3497


In [33]:
# create map
BLN_map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(BLN_df_bakery_merged['Latitude'], BLN_df_bakery_merged['Longitude'], BLN_df_bakery_merged['Borough'], BLN_df_bakery_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(BLN_map_clusters)
       
BLN_map_clusters

In [34]:
BLN_map_clusters.save('./images/002_BLN_map_clusters.html')

### 006 - Select the best cluster to open a new backery  <a id='cluster'></a>

#### Cluster 0 - with low number of bakery

In [35]:
BLN_df_cluster_0 = BLN_df_bakery_merged.loc[BLN_df_bakery_merged['Cluster Labels'] == 0]
# export BLN_df_cluster
BLN_df_cluster_0.to_csv('BLN_df_cluster_0.csv', index=False)

#### Cluster 1 - with moderate number of bakery

In [36]:
BLN_df_cluster_1 = BLN_df_bakery_merged.loc[BLN_df_bakery_merged['Cluster Labels'] == 1]
# export BLN_df_cluster
BLN_df_cluster_1.to_csv('BLN_df_cluster_1.csv', index=False)

#### Cluster 2 - with high number of bakery

In [37]:
BLN_df_cluster_2 = BLN_df_bakery_merged.loc[BLN_df_bakery_merged['Cluster Labels'] == 2]
# export BLN_df_cluster
BLN_df_cluster_2.to_csv('BLN_df_cluster_2.csv', index=False)