# <font color=Green>Opening a Debut Bakery for an International Chain in Karachi, Pakistan</font>
#### <font color=Green>By: Roha Farooq</font>

### <font color=Green>The Data will be Analysed and Processed in the Following Order</font>

   - Build a dataframe of neighborhoods in Karchi, Pakistan <br>
       - Scrape data from Wikipedia using BeautifulSoup <br>
       - Clean the neighborhood data (remove redundant information and spaces ets) using Regex<br>
   - Get geographical coordinates (Latitude, Longitude) of the neighborhoods <br>
   - Obtain the venue data for the neighborhoods from Foursquare API<br>
       - Categorize venues and obtain information for Bakeries presesnt in the area<br>
   - Explore venue data and make clusters of the neighborhoods<br>
   - Select the best cluster to open a debut bakery

In [1]:
#!pip install folium
#!pip install geocoder
#!pip install opencage

In [2]:
import numpy as np
import pandas as pd
import json
import geocoder
from geopy.geocoders import Nominatim
from geopy.geocoders import ArcGIS
from opencage.geocoder import OpenCageGeocode
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
print("Libraries imported.")

Libraries imported.


### <font color=Green>Scrap and Clean Data from Wikipedia</font>

In [3]:
data = requests.get("https://en.wikipedia.org/wiki/Category:Towns_in_Karachi").text

In [4]:
soup = BeautifulSoup(data, 'html.parser')

In [5]:
neighborhoodList = []

In [6]:
for row in soup.find_all("div", class_="mw-category")[0].findAll("li"):
    neighborhoodList.append(row.text)

In [7]:
khi_df = pd.DataFrame({"Neighborhood": neighborhoodList})

khi_df['Neighborhood'] = khi_df['Neighborhood'].str.replace(r"\(.*\)","")
khi_df['Neighborhood'] = khi_df['Neighborhood'].str.replace("(,).*","")

khi_df['Neighborhood'] = khi_df['Neighborhood'].str.replace("►","")
khi_df['Neighborhood'] = khi_df['Neighborhood'].str.strip()

neighborhoodList = khi_df['Neighborhood']

khi_df.head()

Unnamed: 0,Neighborhood
0,Baldia Town‎
1,Bin Qasim Town‎
2,Gadap Town‎
3,Gulberg Town
4,Gulshan Town‎


In [8]:
City = 'Karachi'
Country = 'Pakistan'

khi_df['City'] = City
khi_df['Country'] = Country

print(khi_df.shape)
khi_df.head()

(18, 3)


Unnamed: 0,Neighborhood,City,Country
0,Baldia Town‎,Karachi,Pakistan
1,Bin Qasim Town‎,Karachi,Pakistan
2,Gadap Town‎,Karachi,Pakistan
3,Gulberg Town,Karachi,Pakistan
4,Gulshan Town‎,Karachi,Pakistan


### <font color=Green>Get Geographical Coordinates</font>

In [9]:
key = 'a849662e0c9942559430e36a6717ad56'

geocoder = OpenCageGeocode(key)
query = 'Korangi Town, Karachi, Pakistan'  
results = geocoder.geocode(query)
lat = results[0]['geometry']['lat']
lng = results[0]['geometry']['lng']

print (lat, lng)
#print (results)

24.8577394 67.1521356


In [10]:
list_lat = []
list_long = []

for index, row in khi_df.iterrows():

    Area = row['Neighborhood']
    City = row['City']
    Country = row['Country']
    query = str(Area)+','+str(City)+','+str(Country)
    
    results = geocoder.geocode(query)
    Latitude = results[0]['geometry']['lat']
    Longitude = results[0]['geometry']['lng']

    list_lat.append(Latitude)
    list_long.append(Longitude)

khi_df['Latitude'] = list_lat
khi_df['Longitude'] = list_long

In [11]:
print(khi_df.shape)
khi_df.head()

(18, 5)


Unnamed: 0,Neighborhood,City,Country,Latitude,Longitude
0,Baldia Town‎,Karachi,Pakistan,24.91896,66.987736
1,Bin Qasim Town‎,Karachi,Pakistan,24.822718,67.40351
2,Gadap Town‎,Karachi,Pakistan,25.000475,67.131724
3,Gulberg Town,Karachi,Pakistan,24.936514,67.07474
4,Gulshan Town‎,Karachi,Pakistan,24.92977,67.123607


In [12]:
# save the DataFrame as CSV file
khi_df.to_csv("khi_df.csv", index=False)

### <font color=Green>Create a Map of Karachi with Neighborhoods Superimposed on Top</font>

In [13]:
address = 'Karachi, Pakistan'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Karachi, Pakistan {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Karachi, Pakistan 25.1446897, 67.1847767315734.


In [14]:
map_khi = folium.Map(location=[latitude, longitude])

# add markers to map
for lat, lng, neighborhood in zip(khi_df['Latitude'], khi_df['Longitude'], khi_df['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_khi)  
    
map_khi

In [15]:
# save the map as HTML file
map_khi.save('map_khi.html')

### <font color=Green>Use Foursquare API to Explore the Neighborhoods</font>

In [16]:
CLIENT_ID = '54MD2ZXSZQKAX2IFMQRM4CX3YKE3E3VCQTHO502PTAZCRG4U'
CLIENT_SECRET = '0BIEC1C0HLQFWSXGX5KB0SLHXALFEAV43TNALOX44SDEJRCK'
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 54MD2ZXSZQKAX2IFMQRM4CX3YKE3E3VCQTHO502PTAZCRG4U
CLIENT_SECRET:0BIEC1C0HLQFWSXGX5KB0SLHXALFEAV43TNALOX44SDEJRCK


In [17]:
radius = 10000
LIMIT = 200
venues = []

for lat, long, neighborhood in zip(khi_df['Latitude'], khi_df['Longitude'], khi_df['Neighborhood']):

    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)

    results = requests.get(url).json()["response"]['groups'][0]['items']
 
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

KeyError: 'groups'

In [18]:
venues_df = pd.DataFrame(venues)

venues_df.columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(1015, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Baldia Town‎,24.91896,66.987736,Kababjees,24.927708,67.033878,Diner
1,Baldia Town‎,24.91896,66.987736,Cafe Laziz,24.856305,67.015669,Café
2,Baldia Town‎,24.91896,66.987736,Dynasty,24.852399,67.031679,Chinese Restaurant
3,Baldia Town‎,24.91896,66.987736,Noorani Kabab House,24.867698,67.052259,BBQ Joint
4,Baldia Town‎,24.91896,66.987736,Shaikh Abdul Ghaffar Kabab House,24.876738,67.065969,Pakistani Restaurant


In [19]:
venues_df.groupby(["Neighborhood"]).count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Baldia Town‎,58,58,58,58,58,58
Bin Qasim Town‎,5,5,5,5,5,5
Gadap Town‎,48,48,48,48,48,48
Gulberg Town,90,90,90,90,90,90
Gulshan Town‎,97,97,97,97,97,97
Jamshed Town‎,100,100,100,100,100,100
Kiamari Town‎,100,100,100,100,100,100
Korangi Town‎,100,100,100,100,100,100
Landhi Town‎,22,22,22,22,22,22
Liaquatabad Town‎,100,100,100,100,100,100


In [20]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

There are 84 uniques categories.


In [21]:
# print out the list of categories
venues_df['VenueCategory'].unique()[:100]

array(['Diner', 'Café', 'Chinese Restaurant', 'BBQ Joint',
       'Pakistani Restaurant', 'Ice Cream Shop', 'Performing Arts Venue',
       'Fast Food Restaurant', 'Donut Shop', 'Multiplex',
       'Gym / Fitness Center', 'Japanese Restaurant', 'Steakhouse',
       'Falafel Restaurant', 'Pizza Place', 'Burger Joint', 'Bakery',
       'Tea Room', 'Social Club', 'Indian Restaurant', 'Snack Place',
       'Coffee Shop', 'Hotel', 'Theater', 'Dessert Shop',
       'Department Store', 'Restaurant', 'Salad Place', 'Shopping Mall',
       'Middle Eastern Restaurant', 'Historic Site', 'Frozen Yogurt Shop',
       'Beach', 'Mobile Phone Shop', 'Furniture / Home Store', 'Resort',
       'Asian Restaurant', 'Outdoors & Recreation', 'Park',
       'Fried Chicken Joint', 'Golf Course', 'Gym', 'Cricket Ground',
       'Juice Bar', 'Market', 'Farmers Market', 'Other Nightlife',
       'Convenience Store', 'Toll Plaza', 'Sandwich Place',
       'History Museum', 'Hookah Bar', 'Italian Restaurant', 'Bis

In [22]:
# check if the results contain "Bakery"
"Bakery" in venues_df['VenueCategory'].unique()

True

In [23]:
khi_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
khi_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [khi_onehot.columns[-1]] + list(khi_onehot.columns[:-1])
khi_onehot = khi_onehot[fixed_columns]

print(khi_onehot.shape)
khi_onehot.head()

(1015, 85)


Unnamed: 0,Neighborhoods,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Terminal,Asian Restaurant,Auto Dealership,BBQ Joint,Bakery,...,Spa,Sporting Goods Shop,Steakhouse,Street Food Gathering,Supermarket,Tea Room,Thai Restaurant,Theater,Theme Park,Toll Plaza
0,Baldia Town‎,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Baldia Town‎,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Baldia Town‎,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Baldia Town‎,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Baldia Town‎,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
khi_grouped = khi_onehot.groupby(["Neighborhoods"]).mean().reset_index()

print(khi_grouped.shape)
khi_grouped

(14, 85)


Unnamed: 0,Neighborhoods,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Terminal,Asian Restaurant,Auto Dealership,BBQ Joint,Bakery,...,Spa,Sporting Goods Shop,Steakhouse,Street Food Gathering,Supermarket,Tea Room,Thai Restaurant,Theater,Theme Park,Toll Plaza
0,Baldia Town‎,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068966,0.068966,...,0.0,0.0,0.017241,0.0,0.0,0.017241,0.0,0.017241,0.0,0.0
1,Bin Qasim Town‎,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Gadap Town‎,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.104167,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833
3,Gulberg Town,0.0,0.0,0.0,0.0,0.0,0.011111,0.011111,0.066667,0.077778,...,0.0,0.0,0.0,0.0,0.0,0.011111,0.0,0.0,0.0,0.0
4,Gulshan Town‎,0.0,0.0,0.0,0.0,0.010309,0.010309,0.010309,0.092784,0.082474,...,0.0,0.0,0.0,0.0,0.0,0.010309,0.0,0.0,0.0,0.0
5,Jamshed Town‎,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.04,0.05,...,0.01,0.0,0.01,0.0,0.01,0.02,0.01,0.01,0.0,0.0
6,Kiamari Town‎,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.05,0.07,...,0.01,0.0,0.01,0.0,0.01,0.01,0.0,0.01,0.0,0.0
7,Korangi Town‎,0.0,0.0,0.0,0.0,0.01,0.02,0.0,0.07,0.05,...,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.01,0.0,0.0
8,Landhi Town‎,0.0,0.045455,0.045455,0.045455,0.045455,0.0,0.0,0.090909,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Liaquatabad Town‎,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.04,0.06,...,0.01,0.0,0.01,0.01,0.01,0.01,0.01,0.01,0.0,0.0


In [25]:
len(khi_grouped[khi_grouped["Bakery"] > 0])

12

In [26]:
khi_bakery = khi_grouped[["Neighborhoods","Bakery"]]

In [27]:
khi_bakery

Unnamed: 0,Neighborhoods,Bakery
0,Baldia Town‎,0.068966
1,Bin Qasim Town‎,0.0
2,Gadap Town‎,0.104167
3,Gulberg Town,0.077778
4,Gulshan Town‎,0.082474
5,Jamshed Town‎,0.05
6,Kiamari Town‎,0.07
7,Korangi Town‎,0.05
8,Landhi Town‎,0.0
9,Liaquatabad Town‎,0.06


### <font color=Green>Clustering Neighbourhoods</font>

In [28]:
# set number of clusters
kclusters = 4

khi_clustering = khi_bakery.drop(["Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(khi_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 0, 2, 1, 1, 3, 1, 3, 0, 3], dtype=int32)

In [29]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
khi_merged = khi_bakery.copy()

# add clustering labels
khi_merged["Cluster Labels"] = kmeans.labels_

In [30]:
khi_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
khi_merged.head()

Unnamed: 0,Neighborhood,Bakery,Cluster Labels
0,Baldia Town‎,0.068966,1
1,Bin Qasim Town‎,0.0,0
2,Gadap Town‎,0.104167,2
3,Gulberg Town,0.077778,1
4,Gulshan Town‎,0.082474,1


In [31]:
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
khi_merged = khi_merged.join(khi_df.set_index("Neighborhood"), on="Neighborhood")

print(khi_merged.shape)
khi_merged.head() # check the last columns!

(14, 7)


Unnamed: 0,Neighborhood,Bakery,Cluster Labels,City,Country,Latitude,Longitude
0,Baldia Town‎,0.068966,1,Karachi,Pakistan,24.91896,66.987736
1,Bin Qasim Town‎,0.0,0,Karachi,Pakistan,24.822718,67.40351
2,Gadap Town‎,0.104167,2,Karachi,Pakistan,25.000475,67.131724
3,Gulberg Town,0.077778,1,Karachi,Pakistan,24.936514,67.07474
4,Gulshan Town‎,0.082474,1,Karachi,Pakistan,24.92977,67.123607


In [32]:
# sort the results by Cluster Labels
print(khi_merged.shape)
khi_merged.sort_values(["Cluster Labels"], inplace=True)
khi_merged

(14, 7)


Unnamed: 0,Neighborhood,Bakery,Cluster Labels,City,Country,Latitude,Longitude
1,Bin Qasim Town‎,0.0,0,Karachi,Pakistan,24.822718,67.40351
8,Landhi Town‎,0.0,0,Karachi,Pakistan,24.840856,67.193899
10,Lyari Town‎,0.022989,0,Karachi,Pakistan,24.866695,66.992277
0,Baldia Town‎,0.068966,1,Karachi,Pakistan,24.91896,66.987736
3,Gulberg Town,0.077778,1,Karachi,Pakistan,24.936514,67.07474
4,Gulshan Town‎,0.082474,1,Karachi,Pakistan,24.92977,67.123607
6,Kiamari Town‎,0.07,1,Karachi,Pakistan,24.9056,67.0822
13,North Nazimabad Town‎,0.085366,1,Karachi,Pakistan,24.941671,67.045575
2,Gadap Town‎,0.104167,2,Karachi,Pakistan,25.000475,67.131724
12,New Karachi Town‎,0.103448,2,Karachi,Pakistan,24.991363,67.062496


In [33]:
map_clusters = folium.Map(location=[latitude, longitude])

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(khi_merged['Latitude'], khi_merged['Longitude'], khi_merged['Neighborhood'], khi_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [34]:
# save the map as HTML file
map_clusters.save('map_clusters.html')

### <font color=Green>Examining Clusters & Results</font>

In [35]:
khi_merged.loc[khi_merged['Cluster Labels'] == 0]

Unnamed: 0,Neighborhood,Bakery,Cluster Labels,City,Country,Latitude,Longitude
1,Bin Qasim Town‎,0.0,0,Karachi,Pakistan,24.822718,67.40351
8,Landhi Town‎,0.0,0,Karachi,Pakistan,24.840856,67.193899
10,Lyari Town‎,0.022989,0,Karachi,Pakistan,24.866695,66.992277


In [36]:
khi_merged.loc[khi_merged['Cluster Labels'] == 1]

Unnamed: 0,Neighborhood,Bakery,Cluster Labels,City,Country,Latitude,Longitude
0,Baldia Town‎,0.068966,1,Karachi,Pakistan,24.91896,66.987736
3,Gulberg Town,0.077778,1,Karachi,Pakistan,24.936514,67.07474
4,Gulshan Town‎,0.082474,1,Karachi,Pakistan,24.92977,67.123607
6,Kiamari Town‎,0.07,1,Karachi,Pakistan,24.9056,67.0822
13,North Nazimabad Town‎,0.085366,1,Karachi,Pakistan,24.941671,67.045575


In [37]:
khi_merged.loc[khi_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Bakery,Cluster Labels,City,Country,Latitude,Longitude
2,Gadap Town‎,0.104167,2,Karachi,Pakistan,25.000475,67.131724
12,New Karachi Town‎,0.103448,2,Karachi,Pakistan,24.991363,67.062496


In [38]:
khi_merged.loc[khi_merged['Cluster Labels'] == 3]

Unnamed: 0,Neighborhood,Bakery,Cluster Labels,City,Country,Latitude,Longitude
5,Jamshed Town‎,0.05,3,Karachi,Pakistan,24.862581,67.061397
7,Korangi Town‎,0.05,3,Karachi,Pakistan,24.857739,67.152136
9,Liaquatabad Town‎,0.06,3,Karachi,Pakistan,24.90414,67.053276
11,Malir Town‎,0.051282,3,Karachi,Pakistan,24.894369,67.20091
