# Capstone DS Course
## Emet Flores
### Week 05 - Final Assignment
### Part 02_2_2 - Foursquare Data
### Notes:
 - Use Foursquare data to review San Diego venues
 - Using reference Zipcode
 - Venue data
 - And focusing on "Restaurant" type venues

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np # library for vectorized computation
from sklearn.cluster import KMeans  #for modeling clusters


In [2]:
# Load the previously prepared target ZIP code table from its CSV file,
# using the first CSV column as the row index
San_Diego_codes = pd.read_csv('SD_Target_Zipcodes.csv', index_col=0)

# Preview the first five rows
San_Diego_codes.head(5)

Unnamed: 0,ZipCode,ZipCodePopulation,HouseholdsPerZipcode,WhitePop,BlackPop,HispanicPop,AsianPop,IndianPop,HawaiianPop,OtherPop,...,IncomePerHousehold,MedianAge,AverageFamilySize,Latitude,Longitude,AreaLand,AreaWater,City,CountyName,Labels
1,91901,17403,6345,15466,315,2644,564,743,101,856,...,90397.0,41.9,3.1,32.789915,-116.711202,89.261,0.781,ALPINE,SAN DIEGO,2
2,91902,17653,5956,12379,757,7326,2481,272,217,2596,...,92759.0,43.2,3.26,32.67855,-117.013671,8.707,0.297,BONITA,SAN DIEGO,2
5,91910,75802,26063,47051,4255,45275,9351,1229,786,17635,...,59371.0,35.6,3.4,32.635694,-117.052566,12.233,0.034,CHULA VISTA,SAN DIEGO,0
6,91911,82999,24622,48709,4063,58816,8051,1208,955,24733,...,52274.0,33.5,3.72,32.607009,-117.050286,11.712,0.186,CHULA VISTA,SAN DIEGO,0
7,91913,40971,12133,21126,3018,18109,12528,428,532,6313,...,87440.0,32.0,3.67,32.632497,-116.991164,10.198,0.035,CHULA VISTA,SAN DIEGO,4


In [3]:
# Count the target ZIP codes (rows) and their attributes (columns)
San_Diego_codes.shape

(80, 24)

In [4]:
# Import additional libraries for data visualization

import numpy as np
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors


from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import folium # map rendering library

print('Required libraries imported.')

Required libraries imported.


# Review our **TARGET** -San Diego- geographic data on a Map

In [5]:
# Geocode the reference address; the coordinates are used to centre the maps
address = 'San Diego, CA'

geolocator = Nominatim(user_agent="on_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of San Diego is {}, {}.'.format(latitude, longitude))


The geograpical coordinate of San Diego is 32.7174209, -117.1627714.


In [6]:
# Reuse the mapping code from a previous lab to plot the reference clusters
# (the 'Labels' column of San_Diego_codes) on a map centred on San Diego
num_clusters = 5

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=9)

# one hex colour per cluster, evenly spaced along the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, num_clusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one circle marker per ZIP code
for lat, lon, poi, cluster in zip(San_Diego_codes['Latitude'], San_Diego_codes['Longitude'], San_Diego_codes['ZipCode'], San_Diego_codes['Labels']):
    # NOTE: the colour index is cluster - 1, so cluster 0 wraps around to the
    # last colour of the list. This is kept on purpose: the written cluster
    # descriptions in this notebook refer to the colours produced this way.
    marker_color = rainbow[cluster - 1]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# Remembering the initial observations - Zip-Codes.com API

- Cluster 0 (Red): This cluster describes **LOWER INCOME** households.
- Cluster 2 (Light Blue): This cluster describes **AFFLUENT** households.
- Cluster 4 (Orange): This cluster is the most common with **TYPICAL** households.


## Foursquare Data
### NOTE: Keys for the Foursquare API are removed after obtaining the information

In [7]:
# We add additional libraries to handle the response of the Foursquare API

import json # library to handle JSON files
import requests # library to handle requests

# Foursquare API settings read by getNearbyVenues when it builds request URLs.
# The credentials are placeholders; fill in real values before running.

CLIENT_ID = 'XXXX' # your Foursquare ID (sent as client_id)
CLIENT_SECRET = 'XXXX' # your Foursquare Secret (sent as client_secret)
VERSION = '20180605' # Foursquare API version (sent as v)
LIMIT = 200 # sent as the limit query parameter of each request


## And reuse functions to explore venues by their coordinates
### Setting radius to 1000

In [112]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    names : iterable
        Identifier of each location (the ZIP code in this notebook).
    latitudes, longitudes : iterable
        Coordinates of each location, aligned element-wise with ``names``.
    radius : int, default 1000
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'ZipCode',
        'ZipCode Latitude', 'ZipCode Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. When no venue is returned
        (or no location is given) the frame is empty but still has these
        columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['ZipCode',
               'ZipCode Latitude',
               'ZipCode Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # progress indicator: one line per location queried
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; the timeout keeps a stalled connection from
        # hanging the loop, and error statuses surface as HTTPError instead
        # of a KeyError on the missing payload keys
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may carry no category; record it as missing
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor keeps an empty result valid
    return pd.DataFrame(rows, columns=columns)

In [113]:
# Fetch the venues around the coordinates of every target ZIP code
san_diego_venues = getNearbyVenues(
    San_Diego_codes['ZipCode'],
    San_Diego_codes['Latitude'],
    San_Diego_codes['Longitude'],
)

# Preview the first five venues
san_diego_venues.head()

91901
91902
91910
91911
91913
91914
91915
91932
91935
91941
91942
91945
91950
91977
92007
92008
92009
92010
92011
92014
92019
92020
92021
92024
92025
92026
92027
92028
92029
92037
92040
92054
92056
92057
92058
92064
92065
92067
92069
92071
92075
92078
92081
92082
92083
92084
92091
92101
92102
92103
92104
92105
92106
92107
92108
92109
92110
92111
92113
92114
92115
92116
92117
92118
92119
92120
92121
92122
92123
92124
92126
92127
92128
92129
92130
92131
92139
92145
92154
92173


Unnamed: 0,ZipCode,ZipCode Latitude,ZipCode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,91902,32.68,-117.01,Bonita Golf Club,32.68,-117.01,Golf Course
1,91902,32.68,-117.01,Murrieta's,32.67,-117.02,Mexican Restaurant
2,91902,32.68,-117.01,Redbox,32.67,-117.02,Video Store
3,91902,32.68,-117.01,La Finca D'Adobe,32.67,-117.02,Mexican Restaurant
4,91902,32.68,-117.01,Bonita Golf Club Cafe,32.68,-117.01,Café


# And reuse the general approach to analyze Foursquare data
### Unique categories
### Encode the findings


In [114]:
# Review the amount of data received: (number of venues, number of columns)
san_diego_venues.shape

(1887, 7)

In [115]:
# Persist the venue list, with its target ZIP codes, to a CSV file for later use
san_diego_venues.to_csv('SD_Target_Venues.csv', encoding='UTF8')

In [116]:
# Count the non-null values of every column per ZIP code
san_diego_venues.groupby('ZipCode').count()

Unnamed: 0_level_0,ZipCode Latitude,ZipCode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
ZipCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
91902,10,10,10,10,10,10
91910,40,40,40,40,40,40
91911,26,26,26,26,26,26
91913,18,18,18,18,18,18
91914,38,38,38,38,38,38
...,...,...,...,...,...,...
92131,8,8,8,8,8,8
92139,26,26,26,26,26,26
92145,2,2,2,2,2,2
92154,6,6,6,6,6,6


In [167]:
# Report how many distinct venue categories were returned
unique_categories = san_diego_venues['Venue Category'].unique()
print('There are {} uniques categories.'.format(len(unique_categories)))

There are 277 uniques categories.


In [168]:
# Keep only the venues whose category name contains 'Restaurant'
# (this also matches compound names such as 'Fast Food Restaurant').
# regex=False matches the text literally; na=False treats a missing category
# as "not a restaurant" so the mask stays strictly boolean.
is_restaurant = san_diego_venues['Venue Category'].str.contains("Restaurant", regex=False, na=False)
sd_food_venues = san_diego_venues[is_restaurant]

sd_food_venues.head(5)


Unnamed: 0,ZipCode,ZipCode Latitude,ZipCode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
1,91902,32.68,-117.01,Murrieta's,32.67,-117.02,Mexican Restaurant
3,91902,32.68,-117.01,La Finca D'Adobe,32.67,-117.02,Mexican Restaurant
16,91910,32.64,-117.05,D'Lish California Cuisine,32.64,-117.05,Italian Restaurant
22,91910,32.64,-117.05,Daphne's California Greek,32.64,-117.05,Greek Restaurant
26,91910,32.64,-117.05,Bento & Noodles,32.64,-117.05,Japanese Restaurant


In [169]:
# And identify the total amount of restaurant information we obtained
sd_food_venues.shape

(386, 7)

In [170]:
# And better understand how the food venue categories are represented:
# summary statistics for every column, numeric and non-numeric alike
sd_food_venues.describe(include='all')

Unnamed: 0,ZipCode,ZipCode Latitude,ZipCode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
count,386.0,386.0,386.0,386,386.0,386.0,386
unique,,,,319,,,35
top,,,,Jack in the Box,,,Mexican Restaurant
freq,,,,13,,,88
mean,92060.52,32.82,-117.12,,32.82,-117.12,
std,77.23,0.17,0.1,,0.17,0.1,
min,91902.0,32.55,-117.36,,32.55,-117.36,
25%,92020.0,32.73,-117.18,,32.73,-117.18,
50%,92103.0,32.77,-117.13,,32.77,-117.13,
75%,92115.0,32.9,-117.05,,32.89,-117.05,


In [171]:
# Rank the restaurant categories by number of venues and keep the ten largest
category_counts = sd_food_venues.groupby(['Venue Category'])['Venue Category'].count()
sd_general_top10 = category_counts.nlargest(10)

sd_general_top10

Venue Category
Mexican Restaurant       88
Fast Food Restaurant     49
American Restaurant      31
Chinese Restaurant       28
Sushi Restaurant         27
Seafood Restaurant       23
Italian Restaurant       20
Restaurant               16
Thai Restaurant          14
Vietnamese Restaurant    14
Name: Venue Category, dtype: int64

# In general for San Diego, food venue popularity:
1. Mexican Restaurant       88
2. Fast Food Restaurant     49
3. American Restaurant      31
4. Chinese Restaurant       28
5. Sushi Restaurant         27
6. Seafood Restaurant       23
7. Italian Restaurant       20
8. Restaurant               16
9. Thai Restaurant          14
10. Vietnamese Restaurant    14

In [172]:
# Prepare the Foursquare data for analysis by one-hot encoding the categories
san_diego_onehot = pd.get_dummies(
    sd_food_venues[['Venue Category']],
    prefix="",
    prefix_sep="",
)

# attach the ZIP code of every venue (aligned on the row index)
san_diego_onehot['ZipCode'] = sd_food_venues['ZipCode']

# reorder so the last column (the ZIP code just added) comes first
*leading_columns, last_column = san_diego_onehot.columns
fixed_columns = [last_column] + leading_columns
san_diego_onehot = san_diego_onehot[fixed_columns]

san_diego_onehot.head()

Unnamed: 0,ZipCode,American Restaurant,Argentinian Restaurant,Asian Restaurant,Caribbean Restaurant,Chinese Restaurant,Comfort Food Restaurant,Cuban Restaurant,Eastern European Restaurant,Fast Food Restaurant,...,Persian Restaurant,Ramen Restaurant,Restaurant,Russian Restaurant,Seafood Restaurant,Sushi Restaurant,Tex-Mex Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
1,91902,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,91902,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,91910,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,91910,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,91910,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [173]:
# Shape of the encoding matrix: one row per restaurant venue,
# one column for the ZIP code plus one per category
san_diego_onehot.shape

(386, 36)

In [174]:
# Group the venues by ZIP code and average the one-hot columns, which gives
# the share of each restaurant category among a ZIP code's restaurant venues
san_diego_grouped = san_diego_onehot.groupby('ZipCode').mean().reset_index()
san_diego_grouped.head(10)


Unnamed: 0,ZipCode,American Restaurant,Argentinian Restaurant,Asian Restaurant,Caribbean Restaurant,Chinese Restaurant,Comfort Food Restaurant,Cuban Restaurant,Eastern European Restaurant,Fast Food Restaurant,...,Persian Restaurant,Ramen Restaurant,Restaurant,Russian Restaurant,Seafood Restaurant,Sushi Restaurant,Tex-Mex Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,91902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,91910,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,91911,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,91913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
4,91914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2
5,91915,0.06,0.0,0.06,0.0,0.12,0.06,0.0,0.0,0.19,...,0.0,0.0,0.0,0.0,0.06,0.06,0.06,0.0,0.0,0.0
6,91932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,91941,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,91942,0.05,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.14,...,0.0,0.0,0.0,0.05,0.05,0.05,0.05,0.05,0.0,0.05
9,91945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [175]:
# Review the grouped data: one row per ZIP code that has restaurant venues
san_diego_grouped.shape

(55, 36)

In [124]:
# And prepare to identify the top categories of each zipcode,
# using functions presented in the lab,
# so we can use the attributes for clustering purposes

# Reuse function to order venues in descending order
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped (it holds the ZIP code in this
    notebook); the remaining entries are ordered from largest to smallest
    value and the index labels of the first ``num_top_venues`` are returned
    as an array.
    """
    frequencies = row.iloc[1:]
    ranked_labels = frequencies.sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]

In [176]:
# Build a dataframe with the top 10 restaurant categories for each ZIP code

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create one column per rank: 1st, 2nd, 3rd, then 4th, 5th, ...
columns = ['ZipCode']
for ind in range(num_top_venues):
    # ranks beyond the listed suffixes use 'th'; an explicit bounds check is
    # used so that unrelated errors are never silently swallowed
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe keyed by ZIP code
zipcode_venues_sorted = pd.DataFrame(columns=columns)
zipcode_venues_sorted['ZipCode'] = san_diego_grouped['ZipCode']

# fill every row with that ZIP code's categories, ordered by frequency
for ind in range(san_diego_grouped.shape[0]):
    zipcode_venues_sorted.iloc[ind, 1:] = return_most_common_venues(san_diego_grouped.iloc[ind, :], num_top_venues)

zipcode_venues_sorted.head()

Unnamed: 0,ZipCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,91902,Mexican Restaurant,Vietnamese Restaurant,Japanese Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Fast Food Restaurant
1,91910,Mexican Restaurant,Fast Food Restaurant,Chinese Restaurant,Greek Restaurant,Japanese Restaurant,Italian Restaurant,Comfort Food Restaurant,Cuban Restaurant,Eastern European Restaurant,Caribbean Restaurant
2,91911,Asian Restaurant,Filipino Restaurant,Mexican Restaurant,Vietnamese Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Fast Food Restaurant
3,91913,Sushi Restaurant,Fast Food Restaurant,Mexican Restaurant,Vietnamese Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant
4,91914,Fast Food Restaurant,Mongolian Restaurant,Mexican Restaurant,Vietnamese Restaurant,Caribbean Restaurant,Chinese Restaurant,Comfort Food Restaurant,Cuban Restaurant,Eastern European Restaurant,Italian Restaurant


In [177]:
# Shape check: one row per ZIP code, the ZIP code column plus the ranked venue columns
zipcode_venues_sorted.shape

(55, 11)

In [178]:
# We reuse the same steps as the lab and apply K-Means - First exercise

# number of clusters to fit
kclusters = 8

# drop the identifier so only the category frequency columns are clustered;
# the keyword form is required because pandas 2.x no longer accepts the axis
# as a positional argument of drop()
sd_grouped_clustering = san_diego_grouped.drop(columns='ZipCode')

# run k-means clustering with a fixed random_state
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(sd_grouped_clustering)

# check the cluster labels generated for the first ten rows
kmeans.labels_[0:10]

array([0, 4, 7, 7, 4, 7, 5, 6, 7, 0])

In [179]:
# And complement the dataframe information

# add clustering labels as the first column; drop a previous copy first so
# the cell can be re-run (insert() raises if the column already exists)
if 'Cluster Labels' in zipcode_venues_sorted.columns:
    zipcode_venues_sorted = zipcode_venues_sorted.drop(columns='Cluster Labels')
zipcode_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the cluster label and ranked venues onto the ZIP code table; join()
# returns a new dataframe, so San_Diego_codes itself is left unchanged.
# ZIP codes missing from zipcode_venues_sorted get NaN in the added columns.
san_diego_merged = San_Diego_codes.join(zipcode_venues_sorted.set_index('ZipCode'), on='ZipCode')

san_diego_merged.head() # check for the kmeans cluster label

Unnamed: 0,ZipCode,ZipCodePopulation,HouseholdsPerZipcode,WhitePop,BlackPop,HispanicPop,AsianPop,IndianPop,HawaiianPop,OtherPop,...,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,91901,17403,6345,15466,315,2644,564,743,101,856,...,,,,,,,,,,
2,91902,17653,5956,12379,757,7326,2481,272,217,2596,...,Mexican Restaurant,Vietnamese Restaurant,Japanese Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Fast Food Restaurant
5,91910,75802,26063,47051,4255,45275,9351,1229,786,17635,...,Mexican Restaurant,Fast Food Restaurant,Chinese Restaurant,Greek Restaurant,Japanese Restaurant,Italian Restaurant,Comfort Food Restaurant,Cuban Restaurant,Eastern European Restaurant,Caribbean Restaurant
6,91911,82999,24622,48709,4063,58816,8051,1208,955,24733,...,Asian Restaurant,Filipino Restaurant,Mexican Restaurant,Vietnamese Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Fast Food Restaurant
7,91913,40971,12133,21126,3018,18109,12528,428,532,6313,...,Sushi Restaurant,Fast Food Restaurant,Mexican Restaurant,Vietnamese Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant


In [180]:
# The joined results have missing values for ZIP codes that are absent from
# zipcode_venues_sorted; count the rows without a cluster label
number_nan = san_diego_merged['Cluster Labels'].isna().sum()
number_nan

25

In [181]:
# Keep only the ZIP codes that received a cluster label
bool_series = san_diego_merged['Cluster Labels'].notna()
clean_san_diego = san_diego_merged.loc[bool_series]
clean_san_diego.shape

(55, 35)

In [182]:
# Verify that no rows without a cluster label remain
number_nan = clean_san_diego['Cluster Labels'].isna().sum()
number_nan

0

In [206]:
# And visualize the K-Means clustering results - First exercise
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=9)

# one hex colour per cluster, evenly spaced along the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one circle marker per ZIP code
for lat, lon, poi, cluster in zip(clean_san_diego['Latitude'], clean_san_diego['Longitude'], clean_san_diego['ZipCode'], clean_san_diego['Cluster Labels']):
    # the label is cast with int() before it is used as a list index.
    # NOTE: the colour index is cluster - 1, so cluster 0 wraps around to the
    # last colour of the list. This is kept on purpose: the written cluster
    # observations in this notebook refer to the colours produced this way.
    marker_color = rainbow[int(cluster) - 1]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)

    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# Initial Observations - Foursquare API
## At a glance:
- Cluster 0 (Red): This cluster describes a combination of **<span style="color:tomato"> Mexican </span>, <span style="color:DarkSalmon"> Vietnamese </span>, <span style="color:purple"> Japanese </span> and <span style="color:Khaki"> Indian </span>**.
- Cluster 1 (Purple): This cluster describes primarily **<span style="color:blue"> American </span> and <span style="color:green"> Seafood </span>** types.
- Cluster 2 (Blue): This cluster describes primarily **<span style="color:blue"> American </span>, <span style="color:GoldenRod"> Fast food </span> and <span style="color:Khaki"> Indian </span>** types.
- Cluster 3 (Light Blue): This small cluster describes a combination of general **Restaurant, <span style="color:DarkSalmon"> Vietnamese </span>, <span style="color:GoldenRod"> Fast food </span>** types.
- Cluster 4 (Green): This cluster describes primarily **<span style="color:GoldenRod"> Fast food </span>, <span style="color:tomato"> Mexican </span> and <span style="color:CadetBlue"> Chinese </span>** types.
- Cluster 5 (Light Green): This small cluster describes **<span style="color:CornflowerBlue"> Italian </span>, <span style="color:GoldenRod"> Fast food </span> and <span style="color:Khaki"> Indian </span>**
- Cluster 6 (Yellow): This small cluster describes **Chinese, <span style="color:DarkSalmon"> Vietnamese </span>, <span style="color:Khaki"> Indian </span>** .
- Cluster 7 (Orange): This cluster describes a majority of **<span style="color:tomato"> Mexican </span>** and **<span style="color:GoldenRod"> Fast food </span>** venues.

## Note: If you run the cluster algorithms again, the order of the documented clusters changes.

Notes on the HTML colors used for each food type:

<span style="color:blue"> Blue </span>
<span style="color:tomato"> Red </span>
<span style="color:green"> Green </span>
<span style="color:purple"> Purple </span>
<span style="color:CornflowerBlue"> OtherBlue </span>
<span style="color:CadetBlue"> CadetBlue </span>
<span style="color:DarkSalmon"> DarkSalmon </span>
<span style="color:GoldenRod"> GoldenRod </span>
<span style="color:Khaki"> Khaki </span>

<span style="color:blue"> American </span>
<span style="color:tomato"> Mexican </span>
<span style="color:green"> Seafood </span>
<span style="color:purple"> Japanese </span>
<span style="color:CornflowerBlue"> Italian </span>
<span style="color:CadetBlue"> Chinese </span>
<span style="color:DarkSalmon"> Vietnamese </span>
<span style="color:GoldenRod"> Fast food </span>
<span style="color:Khaki"> Indian </span>

In [78]:
# Display floats with two decimal places so the tables are easier to read
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [184]:
# Shape check of the labeled ZIP codes before reviewing each cluster
clean_san_diego.shape

(55, 35)

# Review the information to confirm insight for each cluster

In [197]:
# Show the ZIP codes in cluster 0, restricted to the columns at positions
# 0-2 and 23-34 (ZIP code, population, households, reference label, venue
# cluster label and the ten most common venue categories).
# Chain .describe(include='all') to get summary statistics instead of rows.
summary_positions = list(range(0, 3)) + list(range(23, 35))
in_cluster = clean_san_diego['Cluster Labels'] == 0
clean_san_diego.loc[in_cluster, clean_san_diego.columns[summary_positions]]

Unnamed: 0,ZipCode,ZipCodePopulation,HouseholdsPerZipcode,Labels,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,91902,17653,5956,2,0.0,Mexican Restaurant,Vietnamese Restaurant,Japanese Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Fast Food Restaurant
18,91945,25460,8480,4,0.0,Mexican Restaurant,Vietnamese Restaurant,Japanese Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Fast Food Restaurant
70,92102,43267,13981,0,0.0,Mexican Restaurant,Vietnamese Restaurant,Japanese Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Fast Food Restaurant
87,92120,26317,10971,4,0.0,Mexican Restaurant,American Restaurant,Italian Restaurant,Asian Restaurant,Caribbean Restaurant,Chinese Restaurant,Comfort Food Restaurant,Cuban Restaurant,Eastern European Restaurant,Japanese Restaurant
90,92123,26823,10039,4,0.0,Mexican Restaurant,Japanese Restaurant,Thai Restaurant,Argentinian Restaurant,Filipino Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant
97,92131,32787,11574,4,0.0,Mexican Restaurant,Vietnamese Restaurant,Japanese Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Fast Food Restaurant


In [199]:
# Show the ZIP codes in cluster 1, restricted to the columns at positions
# 0-2 and 23-34 (ZIP code, population, households, reference label, venue
# cluster label and the ten most common venue categories).
# Chain .describe(include='all') to get summary statistics instead of rows.
summary_positions = list(range(0, 3)) + list(range(23, 35))
in_cluster = clean_san_diego['Cluster Labels'] == 1
clean_san_diego.loc[in_cluster, clean_san_diego.columns[summary_positions]]

Unnamed: 0,ZipCode,ZipCodePopulation,HouseholdsPerZipcode,Labels,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,91977,58368,18190,0,1.0,Seafood Restaurant,Vietnamese Restaurant,Fast Food Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant
69,92101,37095,20599,4,1.0,Seafood Restaurant,American Restaurant,Sushi Restaurant,Mexican Restaurant,Fast Food Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant
73,92105,69813,20540,0,1.0,Thai Restaurant,American Restaurant,Fast Food Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant
93,92127,39337,13019,4,1.0,American Restaurant,Thai Restaurant,Sushi Restaurant,Seafood Restaurant,Italian Restaurant,Fast Food Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant
94,92128,47490,20263,4,1.0,American Restaurant,Chinese Restaurant,Thai Restaurant,Fast Food Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant
105,92154,79708,20202,0,1.0,American Restaurant,Seafood Restaurant,Fast Food Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant


In [188]:
# Show the ZIP codes in cluster 2, restricted to the columns at positions
# 0-2 and 23-34 (ZIP code, population, households, reference label, venue
# cluster label and the ten most common venue categories)
summary_positions = list(range(0, 3)) + list(range(23, 35))
in_cluster = clean_san_diego['Cluster Labels'] == 2
clean_san_diego.loc[in_cluster, clean_san_diego.columns[summary_positions]]

Unnamed: 0,ZipCode,ZipCodePopulation,HouseholdsPerZipcode,Labels,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
34,92019,42598,15137,4,2.0,American Restaurant,Fast Food Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant,Japanese Restaurant
66,92084,47654,14671,0,2.0,American Restaurant,Fast Food Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant,Japanese Restaurant
74,92106,19330,7807,2,2.0,American Restaurant,Asian Restaurant,Fast Food Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant
96,92130,48940,17528,4,2.0,American Restaurant,Fast Food Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant,Japanese Restaurant


In [189]:
# Show the ZIP codes in cluster 3, restricted to the columns at positions
# 0-2 and 23-34 (ZIP code, population, households, reference label, venue
# cluster label and the ten most common venue categories)
summary_positions = list(range(0, 3)) + list(range(23, 35))
in_cluster = clean_san_diego['Cluster Labels'] == 3
clean_san_diego.loc[in_cluster, clean_san_diego.columns[summary_positions]]

Unnamed: 0,ZipCode,ZipCodePopulation,HouseholdsPerZipcode,Labels,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
57,92067,9535,3460,2,3.0,Restaurant,Vietnamese Restaurant,Fast Food Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant
68,92091,1048,548,2,3.0,Restaurant,Vietnamese Restaurant,Fast Food Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant


In [201]:
# Venue cluster 4: show its zip codes, restricted to the columns at
# positions 0-2 and 23-34 of clean_san_diego.
# (Append .describe(include='all') to the last line for a statistical summary.)
cluster_4_rows = clean_san_diego['Cluster Labels'].eq(4)
cluster_4_cols = clean_san_diego.columns[[*range(0, 3), *range(23, 35)]]
clean_san_diego.loc[cluster_4_rows, cluster_4_cols]

Unnamed: 0,ZipCode,ZipCodePopulation,HouseholdsPerZipcode,Labels,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,91910,75802,26063,0,4.0,Mexican Restaurant,Fast Food Restaurant,Chinese Restaurant,Greek Restaurant,Japanese Restaurant,Italian Restaurant,Comfort Food Restaurant,Cuban Restaurant,Eastern European Restaurant,Caribbean Restaurant
8,91914,15448,4331,2,4.0,Fast Food Restaurant,Mongolian Restaurant,Mexican Restaurant,Vietnamese Restaurant,Caribbean Restaurant,Chinese Restaurant,Comfort Food Restaurant,Cuban Restaurant,Eastern European Restaurant,Italian Restaurant
20,91950,60322,15869,0,4.0,Mexican Restaurant,Fast Food Restaurant,Sushi Restaurant,Vietnamese Restaurant,Seafood Restaurant,Restaurant,Chinese Restaurant,Filipino Restaurant,Halal Restaurant,Greek Restaurant
36,92021,65068,22649,0,4.0,Fast Food Restaurant,Mexican Restaurant,Italian Restaurant,Asian Restaurant,Caribbean Restaurant,Chinese Restaurant,Comfort Food Restaurant,Cuban Restaurant,Eastern European Restaurant,Japanese Restaurant
49,92057,54096,17768,0,4.0,Fast Food Restaurant,Mexican Restaurant,Japanese Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Vietnamese Restaurant
80,92113,56066,12315,0,4.0,Mexican Restaurant,Fast Food Restaurant,Chinese Restaurant,Filipino Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Vietnamese Restaurant
84,92117,51332,20658,4,4.0,Fast Food Restaurant,Mexican Restaurant,Chinese Restaurant,Filipino Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Vietnamese Restaurant
85,92118,23575,7408,2,4.0,Fast Food Restaurant,Chinese Restaurant,Vietnamese Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant
101,92139,35125,10216,0,4.0,Vietnamese Restaurant,Chinese Restaurant,Mexican Restaurant,Fast Food Restaurant,Caribbean Restaurant,Asian Restaurant,Comfort Food Restaurant,Cuban Restaurant,Eastern European Restaurant,Japanese Restaurant
107,92173,29429,7563,4,4.0,Fast Food Restaurant,Mexican Restaurant,Chinese Restaurant,Filipino Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Vietnamese Restaurant


In [191]:
# Venue cluster 5: show its zip codes, restricted to the columns at
# positions 0-2 and 23-34 of clean_san_diego.
cluster_5_rows = clean_san_diego['Cluster Labels'].eq(5)
cluster_5_cols = clean_san_diego.columns[[*range(0, 3), *range(23, 35)]]
clean_san_diego.loc[cluster_5_rows, cluster_5_cols]

Unnamed: 0,ZipCode,ZipCodePopulation,HouseholdsPerZipcode,Labels,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,91932,25718,9113,4,5.0,Italian Restaurant,Fast Food Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Vietnamese Restaurant,Japanese Restaurant
77,92109,45787,23349,4,5.0,Italian Restaurant,Fast Food Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Vietnamese Restaurant,Japanese Restaurant


In [192]:
# Venue cluster 6: show its zip codes, restricted to the columns at
# positions 0-2 and 23-34 of clean_san_diego.
cluster_6_rows = clean_san_diego['Cluster Labels'].eq(6)
cluster_6_cols = clean_san_diego.columns[[*range(0, 3), *range(23, 35)]]
clean_san_diego.loc[cluster_6_rows, cluster_6_cols]

Unnamed: 0,ZipCode,ZipCodePopulation,HouseholdsPerZipcode,Labels,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,91941,31779,12327,4,6.0,Mexican Restaurant,Chinese Restaurant,Vietnamese Restaurant,Filipino Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Fast Food Restaurant
32,92011,22405,9034,2,6.0,Chinese Restaurant,Vietnamese Restaurant,Fast Food Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant


In [194]:
# Venue cluster 7: summary statistics (numeric and categorical columns alike)
# over the columns at positions 0-2 and 23-34 of clean_san_diego.
cluster_7_rows = clean_san_diego['Cluster Labels'].eq(7)
cluster_7_cols = clean_san_diego.columns[[*range(0, 3), *range(23, 35)]]
cluster_7_view = clean_san_diego.loc[cluster_7_rows, cluster_7_cols]
cluster_7_view.describe(include='all')

Unnamed: 0,ZipCode,ZipCodePopulation,HouseholdsPerZipcode,Labels,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
count,23.0,23.0,23.0,23.0,23.0,23,23,23,23,23,23,23,23,23,23
unique,,,,,,7,13,10,12,14,17,13,12,12,15
top,,,,,,Mexican Restaurant,Fast Food Restaurant,Mexican Restaurant,Chinese Restaurant,Vietnamese Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant
freq,,,,,,6,4,6,5,6,4,5,7,5,4
mean,92063.61,40020.65,15090.87,2.78,7.0,,,,,,,,,,
std,74.08,18353.49,5953.3,1.78,0.0,,,,,,,,,,
min,91911.0,4179.0,1677.0,0.0,7.0,,,,,,,,,,
25%,92039.0,28923.5,10546.5,1.0,7.0,,,,,,,,,,
50%,92103.0,40375.0,16498.0,4.0,7.0,,,,,,,,,,
75%,92115.5,50328.5,19687.0,4.0,7.0,,,,,,,,,,


In [195]:
# Venue cluster 7: show its zip codes, restricted to the columns at
# positions 0-2 and 23-34 of clean_san_diego.
cluster_7_rows = clean_san_diego['Cluster Labels'].eq(7)
cluster_7_cols = clean_san_diego.columns[[*range(0, 3), *range(23, 35)]]
clean_san_diego.loc[cluster_7_rows, cluster_7_cols]

Unnamed: 0,ZipCode,ZipCodePopulation,HouseholdsPerZipcode,Labels,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,91911,82999,24622,0,7.0,Asian Restaurant,Filipino Restaurant,Mexican Restaurant,Vietnamese Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Fast Food Restaurant
7,91913,40971,12133,4,7.0,Sushi Restaurant,Fast Food Restaurant,Mexican Restaurant,Vietnamese Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant,Eastern European Restaurant
9,91915,24659,7070,4,7.0,Fast Food Restaurant,Mexican Restaurant,Chinese Restaurant,Kebab Restaurant,Asian Restaurant,Comfort Food Restaurant,French Restaurant,Italian Restaurant,American Restaurant,Tex-Mex Restaurant
17,91942,38069,16998,4,7.0,Fast Food Restaurant,Mexican Restaurant,Italian Restaurant,Mediterranean Restaurant,Vietnamese Restaurant,New American Restaurant,Chinese Restaurant,Greek Restaurant,Indian Restaurant,American Restaurant
35,92020,57767,19966,0,7.0,Mexican Restaurant,Restaurant,Middle Eastern Restaurant,Italian Restaurant,Fast Food Restaurant,Greek Restaurant,Seafood Restaurant,Chinese Restaurant,Persian Restaurant,Halal Restaurant
37,92024,49121,19649,4,7.0,Sushi Restaurant,Seafood Restaurant,Mexican Restaurant,Vietnamese Restaurant,Fast Food Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant
46,92054,40375,15218,4,7.0,American Restaurant,Fast Food Restaurant,Thai Restaurant,Sushi Restaurant,Mexican Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant,Filipino Restaurant
48,92056,51835,19175,0,7.0,Sushi Restaurant,Mexican Restaurant,Fast Food Restaurant,Chinese Restaurant,Vietnamese Restaurant,Seafood Restaurant,Caribbean Restaurant,Restaurant,Halal Restaurant,Greek Restaurant
61,92075,12056,5304,2,7.0,Mexican Restaurant,Sushi Restaurant,Seafood Restaurant,Chinese Restaurant,Vietnamese Restaurant,Fast Food Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant
63,92081,27404,10239,4,7.0,Mexican Restaurant,Japanese Restaurant,Italian Restaurant,Asian Restaurant,Filipino Restaurant,Indian Restaurant,Hawaiian Restaurant,Halal Restaurant,Greek Restaurant,French Restaurant


In [204]:
# Create a simple CSV file with the information gathered.
# UTF-8 is used instead of 'ANSI': Python only registers 'ansi' as a codec
# alias on Windows, so the previous call raised LookupError ("unknown
# encoding") on other platforms. UTF-8 is also the encoding used by the
# other CSV export in this notebook.

clean_san_diego.to_csv('SD_Foursquare_Clusters.csv', sep=',', encoding='utf-8')

In [209]:
# Visualize both clustering results on one map: each zip code gets a filled
# dot coloured by its Foursquare venue cluster ('Cluster Labels') and a
# slightly larger, unfilled ring coloured by its zip-code cluster ('Labels').
map_combined_clusters = folium.Map(location=[latitude, longitude], zoom_start=9)


def _rainbow_hex(n_colors):
    """Return n_colors hex colour strings evenly spaced over the rainbow colormap."""
    return [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, n_colors))]


# colour scheme for the venue clusters
rainbow = _rainbow_hex(kclusters)

# colour scheme for the secondary (zip-code) clusters
# NOTE(review): 5 is hard-coded; presumably the number of distinct 'Labels'
# values - confirm against the zip-code clustering step.
cluster_2 = 5
rainbow_2 = _rainbow_hex(cluster_2)

# add markers to the map
for lat, lon, poi, cluster_venues, cluster_zip in zip(
        clean_san_diego['Latitude'],
        clean_san_diego['Longitude'],
        clean_san_diego['ZipCode'],
        clean_san_diego['Cluster Labels'],
        clean_san_diego['Labels']):
    # The "- 1" offset is kept as-is: label 0 wraps around to the last
    # palette entry through negative indexing, so every label still maps
    # to its own colour.
    venue_color = rainbow[int(cluster_venues) - 1]
    zip_color = rainbow_2[int(cluster_zip) - 1]

    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster_venues), parse_html=True)

    # inner dot: venue cluster, carries the popup
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=venue_color,
        fill=True,
        fill_color=venue_color,
        fill_opacity=0.8).add_to(map_combined_clusters)

    # outer ring: zip-code cluster, fully transparent fill and no popup
    folium.CircleMarker(
        [lat, lon],
        radius=6,
        color=zip_color,
        fill=True,
        fill_color=zip_color,
        fill_opacity=0.0).add_to(map_combined_clusters)

map_combined_clusters

In [215]:
# Finally, prepare one complete dataframe with all preliminary data so that
# clusters and correlations can be analysed further.
# The columns of san_diego_grouped (the Foursquare venue-category data) are
# left-joined onto clean_san_diego, matching rows on 'ZipCode'; the index of
# clean_san_diego is kept.
venues_by_zip = san_diego_grouped.set_index('ZipCode')
san_diego_all_data = clean_san_diego.join(venues_by_zip, on='ZipCode')



In [216]:
# Verify the combined dataframe by displaying its first rows
san_diego_all_data.head()

Unnamed: 0,ZipCode,ZipCodePopulation,HouseholdsPerZipcode,WhitePop,BlackPop,HispanicPop,AsianPop,IndianPop,HawaiianPop,OtherPop,...,Persian Restaurant,Ramen Restaurant,Restaurant,Russian Restaurant,Seafood Restaurant,Sushi Restaurant,Tex-Mex Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
2,91902,17653,5956,12379,757,7326,2481,272,217,2596,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,91910,75802,26063,47051,4255,45275,9351,1229,786,17635,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,91911,82999,24622,48709,4063,58816,8051,1208,955,24733,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,91913,40971,12133,21126,3018,18109,12528,428,532,6313,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
8,91914,15448,4331,8790,912,7140,4153,168,169,2440,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2


In [217]:
# Dimensions of the combined dataframe as (rows, columns)
san_diego_all_data.shape

(55, 70)

In [218]:
# The join can leave missing values in the added columns; count them in one
# of those columns ('American Restaurant') as a quick check.
american_restaurant = san_diego_all_data['American Restaurant']
number_nan = american_restaurant.isnull().sum()
number_nan

0

In [219]:
# Save all the relevant data for the available target zip codes in a simple
# CSV file for later use.
all_data_csv = 'SD_All_Data_Target_Zipcodes.csv'
san_diego_all_data.to_csv(all_data_csv, sep=',', encoding='UTF8')