<a href="https://colab.research.google.com/github/Bromus001/PersistentSearch/blob/master/Capstone_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Capstone Project : Little Pizza Store

## **Part 1 : The Idea**

In Buenos Aires, the capital city of Argentina, there are people from many different cultures living and working.

It is a big city with a huge collection of different companies ranging from small startups to big multinational ones.

Each day thousands of workers need lunch, and every night thousands of families need dinner.

Pizza is very popular because it can be delivered easily and is easy to share among several people.

My idea is to launch a new pizza store. To do this, I want to identify the most promising neighborhoods: those that are similar to neighborhoods where the pizza business is currently doing well, but that do not yet have many pizza stores.

For this, I will get the neighborhood data of the city, including latitude and longitude, from a government website (https://data.buenosaires.gob.ar/dataset/barrios). This data does not provide the lat and lng for each neighborhood but the entire polygon with its corner coordinates. So, some preprocessing will be needed in order to get the center of each of those polygons.

Then I will use the Foursquare API to retrieve the main venues in each neighborhood and use this data to run a clustering model that groups the neighborhoods by similarity, using the venue categories and the frequency of venues of each category in each particular neighborhood.

Using the clusters and the main venues in each one, I will be able to prioritize them as explained above.

## **Part 2 : The Code**

In [0]:
import os

import pandas as pd
import numpy as np

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from bs4 import BeautifulSoup
import requests

import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

from google.colab import files


In [4]:


# Prompt for a file upload through google.colab's `files` helper; the next cell
# reads 'barrios.txt' from the working directory.
uploaded = files.upload()

Saving barrios.txt to barrios.txt


In [0]:
# Load the tab-separated neighborhood file (latin1-encoded)
df_bsas = pd.read_csv('barrios.txt', sep='\t', low_memory=False, encoding="latin1")

# Reduce the WKT text to a bare coordinate list: remove the "POLYGON" tag first,
# then strip every parenthesis from the remaining text.
df_bsas["polygon"] = (
    df_bsas["WKT"]
    .str.replace("POLYGON", "")
    .map(lambda wkt: str(wkt).replace("(", "").replace(")", ""))
)

In [0]:
"""
Parses the polygon field to determine the center. This is a simple formula that does not consider
the earth's curvature. Given the small areas, it has no significant impact.
"""
def get_data(row):
  """
  Add the bounding box and the center of a neighborhood polygon to `row`.

  The "polygon" field is read as a comma-separated list of "lng lat" pairs.
  The center is the midpoint of the bounding box; this simple formula ignores
  the earth's curvature.

  Args:
    row: mapping-like object (e.g. a pandas Series) with a "polygon" entry.

  Returns:
    The same `row`, with "min_lat", "max_lat", "min_lng", "max_lng",
    "center_lat" and "center_lng" set. All six are NaN when the polygon
    contains no valid coordinate pair.
  """
  lats = []
  lngs = []

  for coord in str(row["polygon"]).split(","):
    # split() without an argument tolerates leading/trailing/repeated spaces,
    # so a pair such as " -58.45 -34.59" is no longer silently dropped.
    parts = coord.split()
    if len(parts) == 2:
      lngs.append(float(parts[0]))
      lats.append(float(parts[1]))

  if lats:
    # Take the extremes from the data itself rather than from fixed sentinels,
    # so the result is correct for positive coordinates as well.
    min_lat, max_lat = min(lats), max(lats)
    min_lng, max_lng = min(lngs), max(lngs)
  else:
    min_lat = max_lat = min_lng = max_lng = float("nan")

  row["min_lat"] = min_lat
  row["max_lat"] = max_lat
  row["min_lng"] = min_lng
  row["max_lng"] = max_lng

  row["center_lat"] = min_lat + (max_lat - min_lat)/2
  row["center_lng"] = min_lng + (max_lng - min_lng)/2

  return row
    
  
# Get the center of each neighborhood polygon: get_data is applied to every row (axis=1)
df_bsas = df_bsas.apply(get_data, axis=1)

In [38]:
# Preview the first rows, including the min/max/center columns added by get_data
df_bsas.head()

Unnamed: 0,WKT,barrio,comuna,perimetro,area,polygon,min_lat,max_lat,min_lng,max_lng,center_lat,center_lng
0,"POLYGON ((-58.4528200492791 -34.5959886570639,...",CHACARITA,15,7725.695228,3118100.972,"-58.4528200492791 -34.5959886570639,-58.45365...",-34.597835,-34.578295,-58.466828,-58.438536,-34.588065,-58.452682
1,"POLYGON ((-58.4655768128541 -34.5965577078058,...",PATERNAL,15,7087.513295,2229829.034,"-58.4655768128541 -34.5965577078058,-58.46562...",-34.605311,-34.587445,-58.478831,-58.456236,-34.596378,-58.467534
2,"POLYGON ((-58.4237529813037 -34.5978273383243,...",VILLA CRESPO,15,8132.699348,3613583.69,"-58.4237529813037 -34.5978273383243,-58.42495...",-34.607616,-34.588668,-58.458935,-58.423367,-34.598142,-58.441151
3,"POLYGON ((-58.4946097568899 -34.6148652395239,...",VILLA DEL PARQUE,11,7705.389797,3399595.641,"-58.4946097568899 -34.6148652395239,-58.49478...",-34.615016,-34.596789,-58.506168,-58.474017,-34.605902,-58.490092
4,"POLYGON ((-58.4128700313089 -34.6141162515854,...",ALMAGRO,5,8537.901368,4050752.245,"-58.4128700313089 -34.6141162515854,-58.41281...",-34.622075,-34.597713,-58.433334,-58.411919,-34.609894,-58.422626


In [39]:
# `latitude` and `longitude` are not defined in any earlier cell of this notebook,
# so derive the map center from the data: the mean of the neighborhood centers.
latitude = df_bsas['center_lat'].mean()
longitude = df_bsas['center_lng'].mean()

# create map of Buenos Aires using latitude and longitude values
map_bsas = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one marker per neighborhood, labelled "<barrio>, <comuna>"
for lat, lng, borough, neighborhood in zip(df_bsas['center_lat'], df_bsas['center_lng'], df_bsas['comuna'], df_bsas['barrio']):
    label = folium.Popup('{}, {}'.format(neighborhood, borough))
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill_color='#3186cc').add_to(map_bsas)

map_bsas

In [0]:
# Foursquare credentials are read from the environment so that secrets are not
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before running this cell.
# NOTE(review): the key pair that used to be hard-coded here has been published
# with the notebook and should be treated as compromised and rotated.
CLIENT_ID = os.environ.get("FOURSQUARE_CLIENT_ID", "")
CLIENT_SECRET = os.environ.get("FOURSQUARE_CLIENT_SECRET", "")
if not (CLIENT_ID and CLIENT_SECRET):
    print("WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; Foursquare requests will be rejected.")

VERSION = '20180605' # Foursquare API version
LIMIT = 100 # sent as the `limit` query parameter by get_venues
RADIUS = 500 # sent as the `radius` query parameter (presumably meters; confirm against the Foursquare docs)

Get venue data from foursquare using lat and lng

In [11]:
def get_venues(name, lat, lng, timeout=10):
  """
  Fetch the venues around a point from the Foursquare "explore" endpoint.

  The request uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION, RADIUS
  and LIMIT values.

  Args:
    name: neighborhood name, copied into every returned record.
    lat: latitude of the search center, copied into every returned record.
    lng: longitude of the search center, copied into every returned record.
    timeout: seconds to wait for the HTTP response before giving up.

  Returns:
    A list holding ONE list of tuples
    (name, lat, lng, venue name, venue lat, venue lng, venue category).
    The nested shape is kept because the caller flattens it. The inner list
    is empty when the API returns no groups; the category is None for a venue
    without categories.

  Raises:
    requests.HTTPError: when the API answers with an error status.
  """
  # Let requests build and escape the query string instead of formatting it by hand.
  params = {
      'client_id': CLIENT_ID,
      'client_secret': CLIENT_SECRET,
      'v': VERSION,
      'll': '{},{}'.format(lat, lng),
      'radius': RADIUS,
      'limit': LIMIT,
  }

  response = requests.get(
      'https://api.foursquare.com/v2/venues/explore',
      params=params,
      timeout=timeout)
  # Fail with a clear HTTP error instead of a KeyError on the error payload.
  response.raise_for_status()

  groups = response.json().get('response', {}).get('groups', [])
  results = groups[0]['items'] if groups else []

  # keep only the relevant information for each nearby venue
  venues = []
  for item in results:
    venue = item['venue']
    categories = venue.get('categories') or []
    category = categories[0]['name'] if categories else None
    venues.append((
        name,
        lat,
        lng,
        venue['name'],
        venue['location']['lat'],
        venue['location']['lng'],
        category))

  return [venues]



# Get all Venues from FourSquare, one request per neighborhood.
# The neighborhoods live in df_bsas (the name `df` is not defined in this notebook).
all_venues = []

for _, row in df_bsas.iterrows():
  print("Processing {}".format(row["barrio"]))
  venues = get_venues(row["barrio"], row["center_lat"], row["center_lng"])
  all_venues.extend(venues)


# Create a DataFrame with one row per venue. Passing the column names to the
# constructor also works when no venue was returned at all.
df_venues = pd.DataFrame(
    [item for venue_list in all_venues for item in venue_list],
    columns=['Neighborhood',
             'Neighborhood Latitude',
             'Neighborhood Longitude',
             'Venue',
             'Venue Latitude',
             'Venue Longitude',
             'Venue Category'])
  
  

Processing CHACARITA
Processing PATERNAL
Processing VILLA CRESPO
Processing VILLA DEL PARQUE
Processing ALMAGRO
Processing CABALLITO
Processing VILLA SANTA RITA
Processing MONTE CASTRO
Processing VILLA REAL
Processing FLORES
Processing FLORESTA
Processing CONSTITUCION
Processing SAN CRISTOBAL
Processing BOEDO
Processing VELEZ SARSFIELD
Processing VILLA LURO
Processing PARQUE PATRICIOS
Processing MATADEROS
Processing VILLA LUGANO
Processing SAN TELMO
Processing SAAVEDRA
Processing COGHLAN
Processing VILLA URQUIZA
Processing COLEGIALES
Processing BALVANERA
Processing VILLA GRAL. MITRE
Processing PARQUE CHAS
Processing AGRONOMIA
Processing VILLA ORTUZAR
Processing BARRACAS
Processing PARQUE AVELLANEDA
Processing PARQUE CHACABUCO
Processing NUEVA POMPEYA
Processing PALERMO
Processing VILLA RIACHUELO
Processing VILLA SOLDATI
Processing VILLA PUEYRREDON
Processing VILLA DEVOTO
Processing LINIERS
Processing VERSALLES
Processing PUERTO MADERO
Processing MONSERRAT
Processing SAN NICOLAS
Process

In [40]:
# Preview the first venue records collected above
df_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,CHACARITA,-34.588065,-58.452682,El Imperio de la Pizza,-34.58689,-58.454967,Pizza Place
1,CHACARITA,-34.588065,-58.452682,Santos 4040,-34.588822,-58.449863,Theater
2,CHACARITA,-34.588065,-58.452682,Albamonte Ristorante,-34.587803,-58.453075,Argentinian Restaurant
3,CHACARITA,-34.588065,-58.452682,Fábrica de Churros Olleros,-34.586983,-58.45364,Bakery
4,CHACARITA,-34.588065,-58.452682,Pizzería Santa María,-34.587238,-58.454005,Pizza Place


In [15]:
# One-hot encode the venue categories: one indicator column per category,
# named after the category itself (no prefix).
df_bsas_onehot = pd.get_dummies(df_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighborhood of each venue (rows share df_venues' index)
df_bsas_onehot['Neighborhood'] = df_venues['Neighborhood']

# Put "Neighborhood" first, keeping the other columns in their current order
fixed_columns = ["Neighborhood"] + df_bsas_onehot.columns.drop("Neighborhood").tolist()
df_bsas_onehot = df_bsas_onehot.loc[:, fixed_columns]

df_bsas_onehot.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Lounge,Airport Terminal,American Restaurant,Arcade,Argentinian Restaurant,Art Museum,Arts & Entertainment,...,Theater,Thrift / Vintage Store,Toll Booth,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Veterinarian,Vietnamese Restaurant,Women's Store
0,CHACARITA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHACARITA,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,CHACARITA,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHACARITA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHACARITA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Group by Neighborhood and take the mean of each one-hot column, i.e. the
# frequency of each venue category within the neighborhood
df_bsas_grouped = df_bsas_onehot.groupby(["Neighborhood"]).mean().reset_index()
df_bsas_grouped.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Lounge,Airport Terminal,American Restaurant,Arcade,Argentinian Restaurant,Art Museum,Arts & Entertainment,...,Theater,Thrift / Vintage Store,Toll Booth,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Veterinarian,Vietnamese Restaurant,Women's Store
0,AGRONOMIA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0
1,ALMAGRO,0.0,0.0,0.0,0.0,0.0,0.0,0.151515,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BALVANERA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BARRACAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BELGRANO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
def return_most_common_venues(row, num_top_venues):
    """
    Return the labels of the highest-valued entries of `row`.

    The first entry of `row` is skipped (callers pass rows whose first entry is
    the neighborhood name); the remaining entries are sorted in descending
    order and the labels of the first `num_top_venues` are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [45]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: "1st", "2nd", "3rd", then "<n>th"
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    # explicit bounds check instead of catching every exception from the lookup
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# A later cell adds 'Cluster Labels' to df_bsas_grouped; leave it out so that it
# is never ranked as a venue category when this cell is re-run.
venue_freqs = df_bsas_grouped.drop(columns=['Cluster Labels'], errors='ignore')

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = venue_freqs['Neighborhood']

for ind in range(venue_freqs.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(venue_freqs.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,AGRONOMIA,Soccer Field,Train Station,Garden Center,Restaurant,Convenience Store,Diner,Factory,Event Space,English Restaurant,Empanada Restaurant
1,ALMAGRO,Argentinian Restaurant,Bar,Café,Ice Cream Shop,Bus Stop,Pizza Place,Dessert Shop,Cafeteria,Cheese Shop,Concert Hall
2,BALVANERA,Café,Pizza Place,Fast Food Restaurant,Electronics Store,Food,Bus Stop,Restaurant,Gym,Cultural Center,Dance Studio
3,BARRACAS,Bakery,Clothing Store,Soccer Field,Sports Club,Café,Shoe Store,Plaza,Auto Workshop,Restaurant,General Entertainment
4,BELGRANO,Bus Stop,Park,Paper / Office Supplies Store,College Cafeteria,Convenience Store,Concert Hall,Factory,Event Space,English Restaurant,Empanada Restaurant


In [62]:
# set number of clusters
kclusters = 5

# Keep only the category-frequency columns. `columns=` replaces the positional
# axis argument, which newer pandas versions reject. 'Cluster Labels' is added
# to df_bsas_grouped by a later cell, so it is dropped too (when present) to
# keep this cell re-runnable.
df_bsas_grouped_clustering = df_bsas_grouped.drop(columns=['Neighborhood', 'Cluster Labels'], errors='ignore')

# run k-means clustering; n_init is pinned to the historical default of 10 so
# the result does not depend on the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(df_bsas_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 4, 4, 4, 0, 3, 1, 4, 4, 1], dtype=int32)

In [63]:
# Sanity check: number of cluster labels next to the shapes of the grouped frame and of df_bsas
len(kmeans.labels_), df_bsas_grouped.shape, df_bsas.shape

(48, (48, 171), (48, 12))

In [64]:
# add clustering labels (kmeans was fitted on the rows of df_bsas_grouped, in the same order)
df_bsas_grouped['Cluster Labels'] = kmeans.labels_

# left-join the grouped frequencies and cluster labels onto df_bsas by neighborhood name,
# so every neighborhood row also carries its center latitude/longitude
df_bsas_merged = df_bsas.merge(df_bsas_grouped, how="left", left_on='barrio', right_on="Neighborhood")

df_bsas_merged.shape

(48, 183)

In [65]:
# `latitude` and `longitude` are not defined in any earlier cell of this notebook,
# so derive the map center from the data: the mean of the neighborhood centers.
latitude = df_bsas_merged['center_lat'].mean()
longitude = df_bsas_merged['center_lng'].mean()

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# The left merge leaves 'Cluster Labels' empty (NaN, float dtype) for any
# neighborhood without venue data; skip those rows and cast the label to int
# so that it can be used as a list index.
clustered = df_bsas_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['center_lat'], clustered['center_lng'], clustered['barrio'], clustered['Cluster Labels']):
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster))
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[cluster-1],
        fill_color=rainbow[cluster-1],
        fill_opacity=0.0).add_to(map_clusters)

map_clusters

In [66]:
# Number of neighborhoods assigned to each cluster
df_bsas_merged["Cluster Labels"].value_counts()

4    34
0     7
1     5
3     1
2     1
Name: Cluster Labels, dtype: int64

In [70]:
# Inspect the neighborhood(s) assigned to cluster 3
df_bsas_merged[df_bsas_merged["Cluster Labels"]==3]

Unnamed: 0,WKT,barrio,comuna,perimetro,area,polygon,min_lat,max_lat,min_lng,max_lng,...,Thrift / Vintage Store,Toll Booth,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Veterinarian,Vietnamese Restaurant,Women's Store,Cluster Labels
47,"POLYGON ((-58.3552004576535 -34.6194307028365,...",BOCA,4,20595.90146,5028580.571,"-58.3552004576535 -34.6194307028365,-58.35513...",-34.632619,-34.618479,-58.361864,-58.335143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


In [72]:
# Preview the grouped frequencies, which now include the 'Cluster Labels' column
df_bsas_grouped.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Lounge,Airport Terminal,American Restaurant,Arcade,Argentinian Restaurant,Art Museum,Arts & Entertainment,...,Thrift / Vintage Store,Toll Booth,Toy / Game Store,Train Station,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Veterinarian,Vietnamese Restaurant,Women's Store,Cluster Labels
0,AGRONOMIA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0
1,ALMAGRO,0.0,0.0,0.0,0.0,0.0,0.0,0.151515,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,BALVANERA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,BARRACAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
4,BELGRANO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [75]:
num_top_venues = 5

# Print, for every neighborhood, its most frequent venue categories
for hood in df_bsas_grouped['Neighborhood']:
    print("----"+hood+"----")

    # Turn the neighborhood's single row into a two-column (venue, freq) table
    hood_mask = df_bsas_grouped['Neighborhood'] == hood
    temp = df_bsas_grouped[hood_mask].T.reset_index()
    temp.columns = ['venue','freq']

    # Leave out the cluster label, then the first row (the neighborhood name),
    # and round the numeric frequencies to two decimals
    temp = (
        temp[temp["venue"] != "Cluster Labels"]
        .iloc[1:]
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
    )

    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----AGRONOMIA----
           venue  freq
0   Soccer Field   0.4
1  Train Station   0.2
2  Garden Center   0.2
3     Restaurant   0.2
4        Airport   0.0


----ALMAGRO----
                    venue  freq
0  Argentinian Restaurant  0.15
1          Ice Cream Shop  0.09
2                    Café  0.09
3                     Bar  0.09
4             Pizza Place  0.06


----BALVANERA----
                  venue  freq
0                  Café  0.29
1           Pizza Place  0.21
2  Fast Food Restaurant  0.14
3                   Gym  0.07
4            Restaurant  0.07


----BARRACAS----
            venue  freq
0          Bakery  0.12
1  Clothing Store  0.12
2   Auto Workshop  0.06
3        Bus Stop  0.06
4            Café  0.06


----BELGRANO----
                           venue  freq
0                       Bus Stop   0.4
1              College Cafeteria   0.2
2                           Park   0.2
3  Paper / Office Supplies Store   0.2
4                         Museum   0.0


----BOCA----
   