### Coursera Capstone Final Project 

#### Importing all the required packages

In [1]:
import geocoder

In [2]:
from pandas.io.json import json_normalize 
from geopy.geocoders import Nominatim 
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
import matplotlib.cm as cm
import pandas as pd 
import numpy as np
import geocoder
import requests 
import folium 
import json 
print("All libraries imported successfully")

All libraries imported successfully


### Scraping data from Wikipedia

In [3]:
data = requests.get("https://commons.wikimedia.org/wiki/Category:Suburbs_of_Bangalore").text
soup = BeautifulSoup(data, 'html.parser')
neighborhoodList = []
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)
banglore_df = pd.DataFrame({"Neighborhood": neighborhoodList})
banglore_df.head()

Unnamed: 0,Neighborhood
0,"► Agara, Bangalore‎ (2 C, 6 F)"
1,► Arekere‎ (5 F)
2,"► Banashankari‎ (1 C, 5 F)"
3,► Banaswadi‎ (2 F)
4,"► Basavanagudi‎ (5 C, 11 F)"


In [4]:
new = banglore_df['Neighborhood'].str.split(' ', expand = True)

In [5]:
banglore_df['Neighborhood'] = new[2]

In [6]:
new = banglore_df['Neighborhood'].str.split(',',expand = True)

In [7]:
banglore_df['Neighborhood'] = new[0]

In [8]:
banglore_df.head()

Unnamed: 0,Neighborhood
0,Agara
1,Arekere‎
2,Banashankari‎
3,Banaswadi‎
4,Basavanagudi‎


In [9]:
banglore_df.shape

(59, 1)

### Getting coordinates for the cities

In [10]:
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Banglore, India'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [11]:
coordinates = [ get_latlng(neighborhood) for neighborhood in banglore_df["Neighborhood"].tolist() ]

In [12]:
df_coords = pd.DataFrame(coordinates, columns=['Latitude', 'Longitude'])

In [13]:
banglore_df['Latitude'] = df_coords['Latitude']
banglore_df['Longitude'] = df_coords['Longitude']

In [14]:
# save the DataFrame as CSV file
banglore_df.to_csv("banglore_df.csv", index=False)

In [15]:
address = 'Banglore'
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Banglore {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Banglore 18.0056901, 79.5715088.


In [16]:
map_banglore = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lng, neighborhood in zip(banglore_df['Latitude'], banglore_df['Longitude'], banglore_df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_banglore)  
    
map_banglore

In [17]:
CLIENT_ID = '' 
CLIENT_SECRET = ''
VERSION = '20180605' 


In [18]:
radius = 2000
LIMIT = 100

venues = []

for lat, long, neighborhood in zip(banglore_df['Latitude'], banglore_df['Longitude'], banglore_df['Neighborhood']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [19]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(3159, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Arekere‎,12.88568,77.59668,Decathlon Sports India Pvt Ltd,12.887513,77.597712,Sporting Goods Shop
1,Arekere‎,12.88568,77.59668,Natural Ice Cream,12.892188,77.598222,Ice Cream Shop
2,Arekere‎,12.88568,77.59668,Swensens,12.876071,77.595542,Ice Cream Shop
3,Arekere‎,12.88568,77.59668,Cinepolis,12.876119,77.595455,Multiplex
4,Arekere‎,12.88568,77.59668,Chavadi,12.892199,77.602538,BBQ Joint


In [20]:
banglore_df.head(10)

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Agara,12.84283,77.48759
1,Arekere‎,12.88568,77.59668
2,Banashankari‎,12.873412,77.538013
3,Banaswadi‎,13.019644,77.654692
4,Basavanagudi‎,12.93898,77.57137
5,Begur,12.88245,77.62475
6,Bellandur‎,12.92735,77.67185
7,BEML‎,12.9649,77.5924
8,Bengaluru,12.97194,77.59369
9,Bidadi‎,12.83019,77.86623


In [21]:
venues_df.head(20)

Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Arekere‎,12.88568,77.59668,Decathlon Sports India Pvt Ltd,12.887513,77.597712,Sporting Goods Shop
1,Arekere‎,12.88568,77.59668,Natural Ice Cream,12.892188,77.598222,Ice Cream Shop
2,Arekere‎,12.88568,77.59668,Swensens,12.876071,77.595542,Ice Cream Shop
3,Arekere‎,12.88568,77.59668,Cinepolis,12.876119,77.595455,Multiplex
4,Arekere‎,12.88568,77.59668,Chavadi,12.892199,77.602538,BBQ Joint
5,Arekere‎,12.88568,77.59668,Ingu Tengu,12.883268,77.607514,South Indian Restaurant
6,Arekere‎,12.88568,77.59668,Hakuna Matata,12.894028,77.586509,Lounge
7,Arekere‎,12.88568,77.59668,The Yellow Submarine,12.897626,77.599696,Indian Restaurant
8,Arekere‎,12.88568,77.59668,SLV Refreshments,12.890905,77.581927,Indian Restaurant
9,Arekere‎,12.88568,77.59668,The Pint Room,12.875989,77.595602,Beer Garden


#### Let's check how many venues were returned for each neighorhood

In [22]:
venues_df.groupby(["Neighborhood"]).count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Arekere‎,74,74,74,74,74,74
BEML‎,100,100,100,100,100,100
Banashankari‎,8,8,8,8,8,8
Banaswadi‎,49,49,49,49,49,49
Basavanagudi‎,100,100,100,100,100,100
Begur,16,16,16,16,16,16
Bellandur‎,86,86,86,86,86,86
Bengaluru,100,100,100,100,100,100
Bidadi‎,1,1,1,1,1,1
Bommasandra‎,7,7,7,7,7,7


#### Finding unique categories in the venues_df

In [23]:
print("There are {} unique categories from venues".format(len(venues_df['VenueCategory'].unique())))

There are 204 unique categories from venues


#### Exploring the unique categories from venues_df

In [24]:
print(venues_df['VenueCategory'].unique())

['Sporting Goods Shop' 'Ice Cream Shop' 'Multiplex' 'BBQ Joint'
 'South Indian Restaurant' 'Lounge' 'Indian Restaurant' 'Beer Garden'
 'Bowling Alley' 'Café' 'Pizza Place' 'Brewery' 'Restaurant'
 'Chinese Restaurant' 'Liquor Store' 'Office' 'Sandwich Place'
 'Shopping Mall' 'Fast Food Restaurant' 'Department Store'
 'Eastern European Restaurant' 'Middle Eastern Restaurant'
 'American Restaurant' 'General Entertainment' 'Burger Joint'
 'Coffee Shop' 'Rajasthani Restaurant' 'Italian Restaurant' 'Supermarket'
 'Dive Bar' 'Dumpling Restaurant' 'Clothing Store' 'Electronics Store'
 'Breakfast Spot' 'Diner' 'Boutique' 'Food Court' 'Movie Theater'
 'Vegetarian / Vegan Restaurant' 'Badminton Court' 'Park' 'Bakery'
 'Wine Shop' 'Pharmacy' 'Asian Restaurant' 'Trail' 'Bistro' 'Pub'
 'Korean Restaurant' 'Falafel Restaurant' 'Andhra Restaurant'
 'Arts & Crafts Store' 'Donut Shop' 'Indian Chinese Restaurant'
 'Snack Place' 'Kerala Restaurant' 'Convenience Store'
 'Athletics & Sports' 'Seafood Restau

In [25]:
venues_df['VenueCategory'].value_counts()[:50]

Indian Restaurant                452
Café                             198
Ice Cream Shop                   143
Fast Food Restaurant             111
Pizza Place                      106
Coffee Shop                       90
Hotel                             89
Chinese Restaurant                79
Italian Restaurant                65
Bakery                            64
Lounge                            63
Restaurant                        58
Pub                               57
Department Store                  52
Asian Restaurant                  51
Breakfast Spot                    49
Brewery                           44
Vegetarian / Vegan Restaurant     44
Clothing Store                    44
Bar                               38
Sandwich Place                    37
Burger Joint                      37
Gym                               37
Shopping Mall                     34
BBQ Joint                         32
Snack Place                       32
Juice Bar                         31
M

### Finding Best place to open a pub in Banglore

### Analyzing the neighborhoods information

#### One hot encoding for the neighboring 

In [26]:
# one hot encoding
banglore_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
banglore_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [banglore_onehot.columns[-1]] + list(banglore_onehot.columns[:-1])
banglore_onehot = banglore_onehot[fixed_columns]

print(banglore_onehot.shape)
banglore_onehot.head()

(3159, 205)


Unnamed: 0,Neighborhoods,ATM,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Andhra Restaurant,Arcade,Art Gallery,Art Museum,...,Travel & Transport,Travel Agency,Turkish Restaurant,Udupi Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Arekere‎,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Arekere‎,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Arekere‎,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Arekere‎,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Arekere‎,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Grouping categories based on frequency

In [27]:
banglore_grouped = banglore_onehot.groupby(["Neighborhoods"]).mean().reset_index()

print(banglore_grouped.shape)
banglore_grouped.head(10)

(56, 205)


Unnamed: 0,Neighborhoods,ATM,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Andhra Restaurant,Arcade,Art Gallery,Art Museum,...,Travel & Transport,Travel Agency,Turkish Restaurant,Udupi Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Arekere‎,0.0,0.0,0.0,0.0,0.013514,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.013514,0.0,0.0,0.013514,0.0,0.0
1,BEML‎,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.01,0.0,0.0,0.0
2,Banashankari‎,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Banaswadi‎,0.0,0.0,0.0,0.0,0.0,0.020408,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.020408,0.0,0.0,0.0,0.0,0.0
4,Basavanagudi‎,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0
5,Begur,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Bellandur‎,0.0,0.0,0.011628,0.0,0.011628,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Bengaluru,0.0,0.0,0.01,0.0,0.01,0.01,0.01,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.0,0.01,0.0,0.0,0.0
8,Bidadi‎,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Bommasandra‎,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Checking number of places having pubs as VenueCategory

In [28]:
len(banglore_grouped[banglore_grouped["Pub"] > 0])

19

### Creating dataframe with pub data only

In [29]:
banglore_pub = banglore_grouped[["Neighborhoods","Pub"]]

In [30]:
banglore_pub.head()

Unnamed: 0,Neighborhoods,Pub
0,Arekere‎,0.0
1,BEML‎,0.03
2,Banashankari‎,0.0
3,Banaswadi‎,0.020408
4,Basavanagudi‎,0.0


### Creating clusters for the neighborhoods using Kmeans

In [31]:
# set number of clusters
kclusters = 5
banglore_clustering = banglore_pub.drop(["Neighborhoods"], 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(banglore_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 3, 0, 2, 0, 0, 2, 3, 0, 0], dtype=int32)

In [32]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
banglore_merged = banglore_pub.copy()

# add clustering labels
banglore_merged["Cluster Labels"] = kmeans.labels_

In [33]:
banglore_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
banglore_merged.head()

Unnamed: 0,Neighborhood,Pub,Cluster Labels
0,Arekere‎,0.0,0
1,BEML‎,0.03,3
2,Banashankari‎,0.0,0
3,Banaswadi‎,0.020408,2
4,Basavanagudi‎,0.0,0


### Joinging dataframes to get the latitude and longitude for corresponding neighborhood

In [34]:


banglore_merged = banglore_merged.join(banglore_df.set_index("Neighborhood"), on="Neighborhood")

print(banglore_merged.shape)
banglore_merged.head(20) 

(56, 5)


Unnamed: 0,Neighborhood,Pub,Cluster Labels,Latitude,Longitude
0,Arekere‎,0.0,0,12.88568,77.59668
1,BEML‎,0.03,3,12.9649,77.5924
2,Banashankari‎,0.0,0,12.873412,77.538013
3,Banaswadi‎,0.020408,2,13.019644,77.654692
4,Basavanagudi‎,0.0,0,12.93898,77.57137
5,Begur,0.0,0,12.88245,77.62475
6,Bellandur‎,0.011628,2,12.92735,77.67185
7,Bengaluru,0.04,3,12.97194,77.59369
8,Bidadi‎,0.0,0,12.83019,77.86623
9,Bommasandra‎,0.0,0,12.81753,77.67879


### Sorting dataframe based on cluster labels

In [35]:
print(banglore_merged.shape)
banglore_merged.sort_values(["Cluster Labels"], inplace=True)
banglore_merged

(56, 5)


Unnamed: 0,Neighborhood,Pub,Cluster Labels,Latitude,Longitude
0,Arekere‎,0.0,0,12.88568,77.59668
24,Jakkur‎,0.0,0,13.07564,77.60394
25,Jayanagar,0.0,0,12.92872,77.58281
54,Yeshwantpur‎,0.0,0,13.03912,77.57797
29,Konanakunte‎,0.0,0,12.88233,77.56926
31,Krishnarajapura‎,0.0,0,13.00039,77.68368
32,Kundalahalli‎,0.0,0,12.96752,77.715
33,Madiwala‎,0.0,0,12.92052,77.6209
34,Magadi‎,0.0,0,12.986273,77.488591
35,Mahadevapura‎,0.0,0,12.99409,77.66633


### Mapping Clusters

In [36]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(banglore_merged['Latitude'], banglore_merged['Longitude'], banglore_merged['Neighborhood'], banglore_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examine Clusters

#### Cluster 1

In [37]:
banglore_merged.loc[banglore_merged['Cluster Labels'] == 0]

Unnamed: 0,Neighborhood,Pub,Cluster Labels,Latitude,Longitude
0,Arekere‎,0.0,0,12.88568,77.59668
24,Jakkur‎,0.0,0,13.07564,77.60394
25,Jayanagar,0.0,0,12.92872,77.58281
54,Yeshwantpur‎,0.0,0,13.03912,77.57797
29,Konanakunte‎,0.0,0,12.88233,77.56926
31,Krishnarajapura‎,0.0,0,13.00039,77.68368
32,Kundalahalli‎,0.0,0,12.96752,77.715
33,Madiwala‎,0.0,0,12.92052,77.6209
34,Magadi‎,0.0,0,12.986273,77.488591
35,Mahadevapura‎,0.0,0,12.99409,77.66633


In [38]:
len(banglore_merged.loc[banglore_merged['Cluster Labels'] == 0])

37

#### Cluster 2

In [39]:
len(banglore_merged.loc[banglore_merged['Cluster Labels'] == 1])

3

#### Cluster 3

In [40]:
banglore_merged.loc[banglore_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Pub,Cluster Labels,Latitude,Longitude
3,Banaswadi‎,0.020408,2,13.019644,77.654692
18,HSR,0.013699,2,12.91943,77.64913
45,Sahakara,0.018519,2,12.923863,77.54815
30,Koramangala‎,0.01,2,12.92004,77.62546
42,Rajajinagar‎,0.012987,2,13.01192,77.54717
37,Malleswaram‎,0.01,2,13.006322,77.568416
6,Bellandur‎,0.011628,2,12.92735,77.67185
55,Yeswanthpur‎,0.010989,2,13.018644,77.557617


#### Cluster 4

In [41]:
banglore_merged.loc[banglore_merged['Cluster Labels'] == 3]

Unnamed: 0,Neighborhood,Pub,Cluster Labels,Latitude,Longitude
7,Bengaluru,0.04,3,12.97194,77.59369
46,Shivajinagar‎,0.04,3,12.9872,77.60401
40,Murugeshpalya‎,0.035714,3,12.95565,77.65335
49,UB,0.04,3,12.97382,77.6031
28,Kodihalli,0.04,3,12.9662,77.64982
1,BEML‎,0.03,3,12.9649,77.5924


#### Cluster 5

In [42]:
banglore_merged.loc[banglore_merged['Cluster Labels'] == 4]

Unnamed: 0,Neighborhood,Pub,Cluster Labels,Latitude,Longitude
21,Indiranagar‎,0.06,4,12.97822,77.63397
50,Ulsoor‎,0.06,4,12.98908,77.62795


#### Observations

Out of all the cities in Banglore, we have made 5 clusters using KMeans Clustering Algorithm. As we can see that first cluster has 37 datapoints, Cluster 2 has 3 datapoints, Cluster 3 has 8 datapoints, Cluster 4 has 6 datapoint(i.e. BEML), whereas Cluster 5 has 2 datapoints. Clearly from the data, we can say that Cluster 5 is good for opening a Pub.