# Comparing Anaheim, CA and Minneapolis, MN

In [1]:
import pandas as pd
import numpy as np
import geopy as gp
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
import geocoder
import json
import requests
from pandas.io.json import json_normalize

## Clustering and Mapping of Orange County, CA

In [2]:
## defines url and dataframe for California
url_CA = 'https://www.zipcodestogo.com/California/'
df_CA = pd.read_html(url_CA)

In [3]:
## shows how many tables are in the webpage
len(df_CA)

4

In [4]:
## show particular table in the webpage
df_CA[1]

Unnamed: 0,0,1,2,3
0,Zip Codes for the State of California,Zip Codes for the State of California,Zip Codes for the State of California,Zip Codes for the State of California
1,Zip Code,City,County,Zip Code Map
2,90001,Los Angeles,Los Angeles,View Map
3,90002,Los Angeles,Los Angeles,View Map
4,90003,Los Angeles,Los Angeles,View Map
...,...,...,...,...
2654,96157,South Lake Tahoe,El Dorado,View Map
2655,96158,South Lake Tahoe,El Dorado,View Map
2656,96160,Truckee,Nevada,View Map
2657,96161,Truckee,Nevada,View Map


In [5]:
## extracts correct table from webpage
df_CA_1 = df_CA[1]
df_CA_1

Unnamed: 0,0,1,2,3
0,Zip Codes for the State of California,Zip Codes for the State of California,Zip Codes for the State of California,Zip Codes for the State of California
1,Zip Code,City,County,Zip Code Map
2,90001,Los Angeles,Los Angeles,View Map
3,90002,Los Angeles,Los Angeles,View Map
4,90003,Los Angeles,Los Angeles,View Map
...,...,...,...,...
2654,96157,South Lake Tahoe,El Dorado,View Map
2655,96158,South Lake Tahoe,El Dorado,View Map
2656,96160,Truckee,Nevada,View Map
2657,96161,Truckee,Nevada,View Map


In [6]:
## removes irrelevant first row of df
df_CA_2 = df_CA_1.iloc[1:]
df_CA_2

Unnamed: 0,0,1,2,3
1,Zip Code,City,County,Zip Code Map
2,90001,Los Angeles,Los Angeles,View Map
3,90002,Los Angeles,Los Angeles,View Map
4,90003,Los Angeles,Los Angeles,View Map
5,90004,Los Angeles,Los Angeles,View Map
...,...,...,...,...
2654,96157,South Lake Tahoe,El Dorado,View Map
2655,96158,South Lake Tahoe,El Dorado,View Map
2656,96160,Truckee,Nevada,View Map
2657,96161,Truckee,Nevada,View Map


In [7]:
## replaces column labels
df_CA_2 = df_CA_2.rename(columns=df_CA_2.iloc[0]).drop(df_CA_2.index[0])

In [8]:
## check to make sure code worked
df_CA_2.head()

Unnamed: 0,Zip Code,City,County,Zip Code Map
2,90001,Los Angeles,Los Angeles,View Map
3,90002,Los Angeles,Los Angeles,View Map
4,90003,Los Angeles,Los Angeles,View Map
5,90004,Los Angeles,Los Angeles,View Map
6,90005,Los Angeles,Los Angeles,View Map


In [9]:
## drops irrelevant zip code map column
df_CA_2 = df_CA_2.drop(columns = ["Zip Code Map"])

In [10]:
## check to make sure code worked
df_CA_2.head()

Unnamed: 0,Zip Code,City,County
2,90001,Los Angeles,Los Angeles
3,90002,Los Angeles,Los Angeles
4,90003,Los Angeles,Los Angeles
5,90004,Los Angeles,Los Angeles
6,90005,Los Angeles,Los Angeles


In [11]:
## keeps only Orange County values
df_Orange = df_CA_2[df_CA_2["County"]=="Orange"]

In [12]:
## verify new df
df_Orange

Unnamed: 0,Zip Code,City,County
197,90620,Buena Park,Orange
198,90621,Buena Park,Orange
199,90622,Buena Park,Orange
200,90623,La Palma,Orange
201,90624,Buena Park,Orange
...,...,...,...
1082,92871,Placentia,Orange
1090,92885,Yorba Linda,Orange
1091,92886,Yorba Linda,Orange
1092,92887,Yorba Linda,Orange


In [13]:
## reset index
df_Orange = df_Orange.reset_index(drop=True)

In [14]:
## verifying that index reset worked
df_Orange

Unnamed: 0,Zip Code,City,County
0,90620,Buena Park,Orange
1,90621,Buena Park,Orange
2,90622,Buena Park,Orange
3,90623,La Palma,Orange
4,90624,Buena Park,Orange
...,...,...,...
144,92871,Placentia,Orange
145,92885,Yorba Linda,Orange
146,92886,Yorba Linda,Orange
147,92887,Yorba Linda,Orange


In [15]:
## code for finding coordinates for zip codes
latitude=[]
longitude=[]
for code in df_Orange['Zip Code']:
    g = geocoder.arcgis('{}, California'.format(code))
    print(code, g.latlng)
    while (g.latlng is None):
        g = geocoder.arcgis('{}, California'.format(code))
        print(code, g.latlng)
    latlng = g.latlng
    latitude.append(latlng[0])
    longitude.append(latlng[1])

90620 [33.83478000000002, -118.01703499999996]
90621 [33.867869100000064, -117.99701829999998]
90622 [33.84661500000004, -118.00417499999998]
90623 [33.847830000000044, -118.04340499999995]
90624 [33.86520000000007, -117.99803999999995]
90630 [33.82919500000003, -118.04135499999995]
90631 [33.93594000000007, -117.94338999999997]
90632 [33.917320000000075, -117.95770999999996]
90633 [33.93186500000007, -117.94525999999996]
90680 [33.80564000000004, -117.99793499999998]
90720 [33.804925000000026, -118.06853499999994]
90721 [33.808100000000024, -118.06927999999999]
90740 [33.74354000000005, -118.10199999999998]
90742 [33.71629163000006, -118.06856207699997]
90743 [33.72861450000005, -118.08388096599998]
92602 [33.73287000000005, -117.77036499999997]
92603 [33.63692210000005, -117.8062301]
92604 [33.68290510000003, -117.78684589999995]
92605 [33.71560500000004, -118.01043999999996]
92606 [33.69343500000008, -117.82913499999995]
92607 [33.56065000000007, -117.70821999999998]
92609 [33.62236

In [16]:
df_Orange_2 = pd.DataFrame([latitude, longitude], ["Latitude", "Longitude"]).T
df_Orange_2

Unnamed: 0,Latitude,Longitude
0,33.834780,-118.017035
1,33.867869,-117.997018
2,33.846615,-118.004175
3,33.847830,-118.043405
4,33.865200,-117.998040
...,...,...
144,33.886580,-117.863080
145,33.891865,-117.820090
146,33.891445,-117.817305
147,33.880805,-117.760510


In [17]:
## combines Zip Code and Lat/Long tables
df_Orange_all = pd.concat([df_Orange, df_Orange_2], axis = 1)
df_Orange_all.head()

Unnamed: 0,Zip Code,City,County,Latitude,Longitude
0,90620,Buena Park,Orange,33.83478,-118.017035
1,90621,Buena Park,Orange,33.867869,-117.997018
2,90622,Buena Park,Orange,33.846615,-118.004175
3,90623,La Palma,Orange,33.84783,-118.043405
4,90624,Buena Park,Orange,33.8652,-117.99804


In [18]:
## creates blank map of Orange County using lat/long values
map_Orange = folium.Map(location = [33.7175, -117.8311], zoom_start = 10)
map_Orange

In [19]:
## adds markers for zip code values
for lat, long, city, zipcode in zip(df_Orange_all["Latitude"], df_Orange_all["Longitude"], df_Orange_all["Zip Code"], df_Orange_all["City"]):
    label = "{},{}".format(zipcode, city)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
    [lat, long],
    radius = 5,
    popup = label,
    color = "blue",
    fill = True,
    fill_color = "#3186cc",
    fill_opacity = 0.7,
    parse_html = False).add_to(map_Orange)

map_Orange

In [20]:
## Calls on Foursquare API
CLIENT_ID = 'M2I1QGJOQGDW33GVQENR0PZWFD5AZA0SNWZNLST0J1IMSRSN' # your Foursquare ID
CLIENT_SECRET = 'X3GUNXBQ1E4JZNUZ0N5IY5ZYYOJBWT00FCOPMI2UHMSUX3RM' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: M2I1QGJOQGDW33GVQENR0PZWFD5AZA0SNWZNLST0J1IMSRSN
CLIENT_SECRET:X3GUNXBQ1E4JZNUZ0N5IY5ZYYOJBWT00FCOPMI2UHMSUX3RM


In [21]:
##Function to get Nearby Venues for all Zip Codes
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Zip Code', 
                  'Zip Code Latitude', 
                  'Zip Code Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
## calls above function for each neighbourhood into a new dataframe
Orange_venues = getNearbyVenues(names = df_Orange_all["Zip Code"],
                                latitudes = df_Orange_all["Latitude"],
                                longitudes = df_Orange_all["Longitude"]
                               )

90620
90621
90622
90623
90624
90630
90631
90632
90633
90680
90720
90721
90740
90742
90743
92602
92603
92604
92605
92606
92607
92609
92610
92612
92614
92615
92616
92617
92618
92619
92620
92623
92624
92625
92626
92627
92628
92629
92630
92637
92646
92647
92648
92649
92650
92651
92652
92653
92654
92655
92656
92657
92658
92659
92660
92661
92662
92663
92672
92673
92674
92675
92676
92677
92678
92679
92683
92684
92685
92688
92690
92691
92692
92693
92694
92697
92698
92701
92702
92703
92704
92705
92706
92707
92708
92709
92710
92711
92712
92725
92728
92735
92780
92781
92782
92799
92801
92802
92803
92804
92805
92806
92807
92808
92809
92811
92812
92814
92815
92816
92817
92821
92822
92823
92825
92831
92832
92833
92834
92835
92836
92837
92838
92840
92841
92842
92843
92844
92845
92846
92850
92856
92857
92859
92861
92862
92863
92864
92865
92866
92867
92868
92869
92870
92871
92885
92886
92887
92899


In [23]:
## checks size of dataframe, previews first 20 rows of data
print(Orange_venues.shape)
Orange_venues.head(20)

(3239, 7)


Unnamed: 0,Zip Code,Zip Code Latitude,Zip Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,90620,33.83478,-118.017035,Maki Yaki,33.832474,-118.012574,Sushi Restaurant
1,90620,33.83478,-118.017035,San Antonio Park Playground,33.837466,-118.015545,Playground
2,90620,33.83478,-118.017035,Eddies Liquor Mart,33.832061,-118.019452,Liquor Store
3,90620,33.83478,-118.017035,Buena Park Little League,33.836906,-118.020844,Baseball Stadium
4,90621,33.867869,-117.997018,Poquito Mas,33.864773,-117.998474,Mexican Restaurant
5,90621,33.867869,-117.997018,Love Lash by Mia,33.869606,-117.997818,Spa
6,90621,33.867869,-117.997018,Bernie cafe,33.869717,-117.994929,Mexican Restaurant
7,90621,33.867869,-117.997018,63 Fishale House,33.86575,-117.994354,Gastropub
8,90621,33.867869,-117.997018,Corners Jip,33.865784,-117.994318,Gastropub
9,90621,33.867869,-117.997018,Gol Mok Gil (골목길),33.865778,-117.994192,Korean Restaurant


In [24]:
## prints out how many venues were returned for each zip code
Orange_venues.groupby("Zip Code").count()

Unnamed: 0_level_0,Zip Code Latitude,Zip Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
90620,4,4,4,4,4,4
90621,10,10,10,10,10,10
90622,25,25,25,25,25,25
90623,13,13,13,13,13,13
90624,14,14,14,14,14,14
...,...,...,...,...,...,...
92871,39,39,39,39,39,39
92885,16,16,16,16,16,16
92886,17,17,17,17,17,17
92887,7,7,7,7,7,7


In [25]:
## counts for unique categories
print("There are {} unique categories.".format(len(Orange_venues["Venue Category"].unique())))

There are 290 unique categories.


## Analyzing each Zip Code in Orange County

In [26]:
## One Hot Encoding
Orange_onehot = pd.get_dummies(Orange_venues[["Venue Category"]], prefix = "", prefix_sep = "")

## add zip code column back to dataframe
Orange_onehot["Zip Code"] = Orange_venues["Zip Code"]

## move zip code column to first column
fixed_columns = [Orange_onehot.columns[-1]] + list(Orange_onehot.columns[:-1])
Orange_onehot = Orange_onehot[fixed_columns]

Orange_onehot.head(20)

Unnamed: 0,Zip Code,ATM,Accessories Store,Airport Terminal,American Restaurant,Antique Shop,Arcade,Art Gallery,Arts & Crafts Store,Arts & Entertainment,...,Video Store,Vietnamese Restaurant,Warehouse Store,Weight Loss Center,Whisky Bar,Wine Bar,Winery,Wings Joint,Women's Store,Yoga Studio
0,90620,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,90620,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,90620,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,90620,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,90621,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,90621,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,90621,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,90621,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,90621,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,90621,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
## examines dataframe size
Orange_onehot.shape

(3239, 291)

In [28]:
## grouping rows by zip code and by taking the mean of the frequency of occurence of each category
Orange_grouped = Orange_onehot.groupby("Zip Code").mean().reset_index()
Orange_grouped

Unnamed: 0,Zip Code,ATM,Accessories Store,Airport Terminal,American Restaurant,Antique Shop,Arcade,Art Gallery,Arts & Crafts Store,Arts & Entertainment,...,Video Store,Vietnamese Restaurant,Warehouse Store,Weight Loss Center,Whisky Bar,Wine Bar,Winery,Wings Joint,Women's Store,Yoga Studio
0,90620,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
1,90621,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
2,90622,0.000000,0.0,0.0,0.040000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
3,90623,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
4,90624,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,92871,0.025641,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000
142,92885,0.000000,0.0,0.0,0.062500,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.062500,0.0,0.0,0.0,0.062500
143,92886,0.000000,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.058824
144,92887,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000


In [29]:
## confirming size
Orange_grouped.shape

(146, 291)

In [30]:
## printing each zip code along with the top 5 most common venues
num_top_venues = 5

for hood in Orange_grouped["Zip Code"]:
    print("----"+hood+"----")
    temp = Orange_grouped[Orange_grouped["Zip Code"] == hood].T.reset_index()
    temp.columns = ["Venue", "freq"]
    temp = temp.iloc[1:]
    temp["freq"] = temp["freq"].astype(float)
    temp = temp.round({"freq":2})
    print(temp.sort_values("freq", ascending = False).reset_index(drop = True).head(num_top_venues))
    print("\n")

----90620----
              Venue  freq
0      Liquor Store  0.25
1        Playground  0.25
2  Baseball Stadium  0.25
3  Sushi Restaurant  0.25
4         Pool Hall  0.00


----90621----
                Venue  freq
0        Liquor Store   0.2
1           Gastropub   0.2
2  Mexican Restaurant   0.2
3   Korean Restaurant   0.2
4         Pizza Place   0.1


----90622----
                          Venue  freq
0  Theme Park Ride / Attraction  0.48
1                    Theme Park  0.20
2                          Park  0.04
3                      Pharmacy  0.04
4              Theme Restaurant  0.04


----90623----
                  Venue  freq
0     Korean Restaurant  0.23
1      Asian Restaurant  0.15
2           Supermarket  0.08
3  Fast Food Restaurant  0.08
4                Bakery  0.08


----90624----
               Venue  freq
0  Korean Restaurant  0.21
1        Pizza Place  0.14
2          Gastropub  0.14
3      Shopping Mall  0.07
4       Liquor Store  0.07


----90630----
            

4       Sandwich Place  0.04


----92673----
                     Venue  freq
0         Airport Terminal  0.33
1                     Park  0.33
2                    Trail  0.33
3                      ATM  0.00
4  North Indian Restaurant  0.00


----92674----
                  Venue  freq
0        Clothing Store  0.14
1            Shoe Store  0.08
2  Fast Food Restaurant  0.04
3        Discount Store  0.04
4    Mexican Restaurant  0.04


----92675----
                  Venue  freq
0  Fast Food Restaurant  0.09
1   American Restaurant  0.07
2                Bakery  0.07
3    Mexican Restaurant  0.07
4           Coffee Shop  0.07


----92677----
                     Venue  freq
0           Hardware Store  0.14
1        Convenience Store  0.14
2                      Gym  0.14
3  Comfort Food Restaurant  0.14
4     Fast Food Restaurant  0.14


----92678----
                Venue  freq
0                Park   0.2
1          Campground   0.2
2           Pet Store   0.2
3    Business Service  

                     Venue  freq
0              Coffee Shop  0.12
1           Sandwich Place  0.06
2             Burger Joint  0.06
3  New American Restaurant  0.06
4             Noodle House  0.06


----92825----
          Venue  freq
0   Bus Station  0.11
1  Home Service  0.11
2    Strip Club  0.11
3       Brewery  0.11
4    Steakhouse  0.11


----92831----
                      Venue  freq
0               Pizza Place  0.10
1    Thrift / Vintage Store  0.10
2         Convenience Store  0.10
3          Sushi Restaurant  0.05
4  Mediterranean Restaurant  0.05


----92832----
                Venue  freq
0                 Bar  0.05
1         Pizza Place  0.05
2  Mexican Restaurant  0.05
3  Italian Restaurant  0.04
4                Café  0.04


----92833----
                  Venue  freq
0                  Park   0.4
1             Pool Hall   0.2
2            Playground   0.2
3  Kitchen Supply Store   0.2
4                   ATM   0.0


----92834----
                      Venue  freq
0   

In [31]:
## function to sort venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [32]:
## creating new dataframe and display top 10 venues for each zip code
num_top_venues = 10
indicators = ["st", "nd", "rd"]

#create columns according to number of top venues
columns = ["Zip Code"]
for ind in np.arange(num_top_venues):
    try:
        columns.append("{}{} Most Common Venue".format(ind+1, indicators[ind]))
    except:
        columns.append("{}th Most Common Venue".format(ind+1))
        
## create new dataframe
Orange_venues_sorted = pd.DataFrame(columns = columns)
Orange_venues_sorted["Zip Code"] = Orange_grouped["Zip Code"]

for ind in np.arange(Orange_grouped.shape[0]):
    Orange_venues_sorted.iloc[ind,1:] = return_most_common_venues(Orange_grouped.iloc[ind,:], num_top_venues)
    
Orange_venues_sorted.head()

Unnamed: 0,Zip Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,90620,Baseball Stadium,Sushi Restaurant,Liquor Store,Playground,Yoga Studio,Fish Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop
1,90621,Liquor Store,Korean Restaurant,Gastropub,Mexican Restaurant,Spa,Pizza Place,Fish Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service
2,90622,Theme Park Ride / Attraction,Theme Park,Theme Restaurant,Theater,American Restaurant,Dessert Shop,Electronics Store,Pharmacy,Flower Shop,Park
3,90623,Korean Restaurant,Asian Restaurant,Bubble Tea Shop,Mexican Restaurant,Fast Food Restaurant,Bakery,Soup Place,Spa,Supermarket,Grocery Store
4,90624,Korean Restaurant,Pizza Place,Gastropub,Fried Chicken Joint,Liquor Store,Rental Car Location,Spa,Mexican Restaurant,Shopping Mall,Fast Food Restaurant


In [33]:
## set number of clusters
kclusters = 5
Orange_grouped_clustering = Orange_grouped.drop("Zip Code", 1)

## run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(Orange_grouped_clustering)

## check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [34]:
## add clustering labels
Orange_venues_sorted.insert(0, "Cluster Labels", kmeans.labels_)

In [35]:
## merge Orange_grouped with zip codes to add lat/long for each zip code
Orange_merged = df_Orange_all
Orange_merged = Orange_merged.join(Orange_venues_sorted.set_index("Zip Code"), on = "Zip Code")

## remove any NaN values
Orange_merged = Orange_merged.dropna()
Orange_merged.head(20)

Unnamed: 0,Zip Code,City,County,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,90620,Buena Park,Orange,33.83478,-118.017035,2.0,Baseball Stadium,Sushi Restaurant,Liquor Store,Playground,Yoga Studio,Fish Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop
1,90621,Buena Park,Orange,33.867869,-117.997018,2.0,Liquor Store,Korean Restaurant,Gastropub,Mexican Restaurant,Spa,Pizza Place,Fish Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service
2,90622,Buena Park,Orange,33.846615,-118.004175,2.0,Theme Park Ride / Attraction,Theme Park,Theme Restaurant,Theater,American Restaurant,Dessert Shop,Electronics Store,Pharmacy,Flower Shop,Park
3,90623,La Palma,Orange,33.84783,-118.043405,2.0,Korean Restaurant,Asian Restaurant,Bubble Tea Shop,Mexican Restaurant,Fast Food Restaurant,Bakery,Soup Place,Spa,Supermarket,Grocery Store
4,90624,Buena Park,Orange,33.8652,-117.99804,2.0,Korean Restaurant,Pizza Place,Gastropub,Fried Chicken Joint,Liquor Store,Rental Car Location,Spa,Mexican Restaurant,Shopping Mall,Fast Food Restaurant
5,90630,Cypress,Orange,33.829195,-118.041355,2.0,Construction & Landscaping,Insurance Office,Playground,Rental Car Location,Business Service,Latin American Restaurant,Mexican Restaurant,Fish Market,Farmers Market,Fast Food Restaurant
6,90631,La Habra,Orange,33.93594,-117.94339,2.0,Deli / Bodega,Electronics Store,Pizza Place,Lounge,Burger Joint,Southern / Soul Food Restaurant,Cosmetics Shop,Vietnamese Restaurant,Bowling Alley,Mexican Restaurant
7,90632,La Habra,Orange,33.91732,-117.95771,2.0,Coffee Shop,Chinese Restaurant,Big Box Store,Bakery,Mobile Phone Shop,Smoke Shop,Sandwich Place,Korean Restaurant,Fast Food Restaurant,Salon / Barbershop
8,90633,La Habra,Orange,33.931865,-117.94526,2.0,Pizza Place,Other Repair Shop,Electronics Store,Lawyer,Burger Joint,Museum,Mexican Restaurant,Convenience Store,Theater,Fast Food Restaurant
9,90680,Stanton,Orange,33.80564,-117.997935,2.0,Health & Beauty Service,Movie Theater,Liquor Store,American Restaurant,Golf Driving Range,Arts & Entertainment,Mexican Restaurant,Japanese Restaurant,Pet Store,Yoga Studio


In [36]:
## create map
map_clusters = folium.Map(location=[33.7175,-117.8311], zoom_start = 10)

## set colour scheme for clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

## add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Orange_merged['Latitude'], Orange_merged['Longitude'], Orange_merged['Zip Code'], Orange_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Cluster 1

In [37]:
## Cluster 1
Orange_merged.loc[Orange_merged['Cluster Labels'] == 0, Orange_merged.columns[[1] + list(range(5, Orange_merged.shape[1]))]]

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,Irvine,0.0,Park,Tennis Court,Restaurant,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop
17,Irvine,0.0,Park,Playground,Massage Studio,Gym,Food Court,Food & Drink Shop,Food,Food Service,Flower Shop,Fabric Shop
23,Irvine,0.0,Park,Gym,Food Truck,Playground,Fish & Chips Shop,Farm,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service
27,Irvine,0.0,Park,Dog Run,Pool,Scenic Lookout,Farm,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop
28,Irvine,0.0,Park,Pizza Place,Fish Market,Farm,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop,Flower Shop
30,Irvine,0.0,Park,Pool,Music Venue,Gym / Fitness Center,Fish & Chips Shop,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Filipino Restaurant
47,Laguna Hills,0.0,Park,Rental Car Location,Sports Bar,Convenience Store,Fish Market,Farm,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service
51,Newport Coast,0.0,Pool,Park,Flower Shop,Tennis Court,Indoor Play Area,Coffee Shop,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service
59,San Clemente,0.0,Park,Airport Terminal,Trail,Fish Market,Farm,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop
97,Anaheim,0.0,Park,Performing Arts Venue,Food Truck,Fish & Chips Shop,Farm,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish Market


## Cluster 2

In [38]:
## Cluster 2
Orange_merged.loc[Orange_merged['Cluster Labels'] == 1, Orange_merged.columns[[1] + list(range(5, Orange_merged.shape[1]))]]

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
72,Mission Viejo,1.0,Home Service,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop,Fish Market,Flower Shop,Yoga Studio
101,Anaheim,1.0,Brewery,Music Venue,BBQ Joint,Home Service,Yoga Studio,Flower Shop,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop
126,Garden Grove,1.0,Men's Store,Shopping Plaza,Home Service,Bar,Fish Market,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop
138,Orange,1.0,Thrift / Vintage Store,Home Service,Flower Shop,Mexican Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop,Fish Market


## Cluster 3

In [39]:
## Cluster 3
Orange_merged.loc[Orange_merged['Cluster Labels'] == 2, Orange_merged.columns[[1] + list(range(5, Orange_merged.shape[1]))]]

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Buena Park,2.0,Baseball Stadium,Sushi Restaurant,Liquor Store,Playground,Yoga Studio,Fish Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop
1,Buena Park,2.0,Liquor Store,Korean Restaurant,Gastropub,Mexican Restaurant,Spa,Pizza Place,Fish Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service
2,Buena Park,2.0,Theme Park Ride / Attraction,Theme Park,Theme Restaurant,Theater,American Restaurant,Dessert Shop,Electronics Store,Pharmacy,Flower Shop,Park
3,La Palma,2.0,Korean Restaurant,Asian Restaurant,Bubble Tea Shop,Mexican Restaurant,Fast Food Restaurant,Bakery,Soup Place,Spa,Supermarket,Grocery Store
4,Buena Park,2.0,Korean Restaurant,Pizza Place,Gastropub,Fried Chicken Joint,Liquor Store,Rental Car Location,Spa,Mexican Restaurant,Shopping Mall,Fast Food Restaurant
...,...,...,...,...,...,...,...,...,...,...,...,...
144,Placentia,2.0,Coffee Shop,Mexican Restaurant,Pharmacy,Sandwich Place,Chinese Restaurant,Comfort Food Restaurant,Noodle House,Supermarket,Fried Chicken Joint,Breakfast Spot
145,Yorba Linda,2.0,Yoga Studio,Historic Site,Library,Basketball Court,Grocery Store,New American Restaurant,Boxing Gym,Pharmacy,Optical Shop,Cosmetics Shop
146,Yorba Linda,2.0,Mexican Restaurant,Yoga Studio,Historic Site,Cosmetics Shop,Pharmacy,Sandwich Place,New American Restaurant,Boxing Gym,Tex-Mex Restaurant,Library
147,Yorba Linda,2.0,Trail,Basketball Court,Park,Bed & Breakfast,Yoga Studio,Farm,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop


## Cluster 4

In [40]:
## Cluster 4
Orange_merged.loc[Orange_merged['Cluster Labels'] == 3, Orange_merged.columns[[1] + list(range(5, Orange_merged.shape[1]))]]

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
68,Westminster,3.0,Sporting Goods Shop,Yoga Studio,Fish Market,Farm,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop,Flower Shop


## Cluster 5

In [41]:
## Cluster 5
Orange_merged.loc[Orange_merged['Cluster Labels'] == 4, Orange_merged.columns[[1] + list(range(5, Orange_merged.shape[1]))]]

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
86,Irvine,4.0,Seafood Restaurant,Yoga Studio,Fabric Shop,Farm,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop,Fish Market


## Clustering and Mapping of Hennepin County, MN

In [42]:
## defines url and dataframe for Minnesota
url_MN = 'https://www.zipcodestogo.com/Minnesota/'
df_MN = pd.read_html(url_MN)

In [43]:
## shows how many tables are in the webpage
len(df_MN)

4

In [44]:
## show particular table in the webpage
df_MN[1]

Unnamed: 0,0,1,2,3
0,Zip Codes for the State of Minnesota,Zip Codes for the State of Minnesota,Zip Codes for the State of Minnesota,Zip Codes for the State of Minnesota
1,Zip Code,City,County,Zip Code Map
2,55001,Afton,Washington,View Map
3,55002,Almelund,Chisago,View Map
4,55003,Bayport,Washington,View Map
...,...,...,...,...
1028,56759,Strathcona,Roseau,View Map
1029,56760,Viking,Marshall,View Map
1030,56761,Wannaska,Roseau,View Map
1031,56762,Warren,Marshall,View Map


In [45]:
## extracts correct table from the webpage
df_MN_1 = df_MN[1]
df_MN_1

Unnamed: 0,0,1,2,3
0,Zip Codes for the State of Minnesota,Zip Codes for the State of Minnesota,Zip Codes for the State of Minnesota,Zip Codes for the State of Minnesota
1,Zip Code,City,County,Zip Code Map
2,55001,Afton,Washington,View Map
3,55002,Almelund,Chisago,View Map
4,55003,Bayport,Washington,View Map
...,...,...,...,...
1028,56759,Strathcona,Roseau,View Map
1029,56760,Viking,Marshall,View Map
1030,56761,Wannaska,Roseau,View Map
1031,56762,Warren,Marshall,View Map


In [46]:
## removes irrelevant first row of df
df_MN_2 = df_MN_1.iloc[1:]
df_MN_2

Unnamed: 0,0,1,2,3
1,Zip Code,City,County,Zip Code Map
2,55001,Afton,Washington,View Map
3,55002,Almelund,Chisago,View Map
4,55003,Bayport,Washington,View Map
5,55005,Bethel,Anoka,View Map
...,...,...,...,...
1028,56759,Strathcona,Roseau,View Map
1029,56760,Viking,Marshall,View Map
1030,56761,Wannaska,Roseau,View Map
1031,56762,Warren,Marshall,View Map


In [47]:
## replaces column labels
df_MN_2 = df_MN_2.rename(columns=df_MN_2.iloc[0]).drop(df_MN_2.index[0])

In [48]:
## check to make sure code worked
df_MN_2.head()

Unnamed: 0,Zip Code,City,County,Zip Code Map
2,55001,Afton,Washington,View Map
3,55002,Almelund,Chisago,View Map
4,55003,Bayport,Washington,View Map
5,55005,Bethel,Anoka,View Map
6,55006,Braham,Isanti,View Map


In [49]:
## drops irrelevant zip code map column
df_MN_2 = df_MN_2.drop(columns = ["Zip Code Map"])

In [50]:
## check to make sure code worked
df_MN_2.head()

Unnamed: 0,Zip Code,City,County
2,55001,Afton,Washington
3,55002,Almelund,Chisago
4,55003,Bayport,Washington
5,55005,Bethel,Anoka
6,55006,Braham,Isanti


In [51]:
## keeps only Hennepin County values
df_Hennepin = df_MN_2[df_MN_2["County"]=="Hennepin"]

In [52]:
## verify new df
df_Hennepin

Unnamed: 0,Zip Code,City,County
86,55111,Saint Paul,Hennepin
130,55305,Hopkins,Hennepin
136,55311,Osseo,Hennepin
141,55316,Champlin,Hennepin
148,55323,Crystal Bay,Hennepin
...,...,...,...
338,55595,Loretto,Hennepin
339,55596,Loretto,Hennepin
340,55597,Loretto,Hennepin
341,55598,Loretto,Hennepin


In [53]:
## reset index
df_Hennepin = df_Hennepin.reset_index(drop=True)

In [54]:
## verify that index reset worked
df_Hennepin

Unnamed: 0,Zip Code,City,County
0,55111,Saint Paul,Hennepin
1,55305,Hopkins,Hennepin
2,55311,Osseo,Hennepin
3,55316,Champlin,Hennepin
4,55323,Crystal Bay,Hennepin
...,...,...,...
101,55595,Loretto,Hennepin
102,55596,Loretto,Hennepin
103,55597,Loretto,Hennepin
104,55598,Loretto,Hennepin


In [55]:
## code for finding coordinates for zip codes
latitude=[]
longitude=[]
for code in df_Hennepin['Zip Code']:
    g = geocoder.arcgis('{}, Minnesota'.format(code))
    print(code, g.latlng)
    while (g.latlng is None):
        g = geocoder.arcgis('{}, Minnesota'.format(code))
        print(code, g.latlng)
    latlng = g.latlng
    latitude.append(latlng[0])
    longitude.append(latlng[1])

55111 [44.892485900000054, -93.19748649999997]
55305 [44.971740700000055, -93.44723729999998]
55311 [45.078224900000066, -93.45516909999998]
55316 [45.16441000000003, -93.36128499999995]
55323 [44.95292000000006, -93.57627999999994]
55327 [45.24200500000006, -93.51450999999997]
55331 [44.90003500000006, -93.56950499999994]
55340 [45.04446010000004, -93.52840989999999]
55343 [44.92076000000003, -93.41533999999996]
55344 [44.85089500000004, -93.42545499999994]
55345 [44.90990500000004, -93.50318999999996]
55346 [44.87420580000003, -93.50273849999996]
55347 [44.826360700000066, -93.40649629999996]
55348 [45.00798000000003, -93.65734999999995]
55356 [44.98457270000006, -93.57674809999997]
55357 [45.05747500000007, -93.63541499999997]
55359 [45.00819000000007, -93.65596499999998]
55361 [44.94077000000004, -93.59714999999994]
55364 [44.92528500000003, -93.64464999999996]
55369 [45.11797000000007, -93.39996499999995]
55374 [45.195475000000044, -93.55018499999994]
55375 [44.90560500000004, -93

In [56]:
df_Hennepin_2 = pd.DataFrame([latitude, longitude], ["Latitude", "Longitude"]).T
df_Hennepin_2

Unnamed: 0,Latitude,Longitude
0,44.892486,-93.197486
1,44.971741,-93.447237
2,45.078225,-93.455169
3,45.164410,-93.361285
4,44.952920,-93.576280
...,...,...
101,45.055380,-93.635360
102,45.055380,-93.635360
103,45.055380,-93.635360
104,45.055380,-93.635360


In [57]:
## combines Zip Code and Lat/Long tables
df_Hennepin_all = pd.concat([df_Hennepin, df_Hennepin_2], axis = 1)
df_Hennepin_all.head()

Unnamed: 0,Zip Code,City,County,Latitude,Longitude
0,55111,Saint Paul,Hennepin,44.892486,-93.197486
1,55305,Hopkins,Hennepin,44.971741,-93.447237
2,55311,Osseo,Hennepin,45.078225,-93.455169
3,55316,Champlin,Hennepin,45.16441,-93.361285
4,55323,Crystal Bay,Hennepin,44.95292,-93.57628


In [58]:
## creats blank map of Hennepin County using lat/long values
map_Hennepin = folium.Map(location = [45.0209, -93.5095], zoom_start = 10)
map_Hennepin

In [59]:
## adds markers for zip code values
for lat, long, city, zipcode in zip(df_Hennepin_all["Latitude"], df_Hennepin_all["Longitude"], df_Hennepin_all["Zip Code"], df_Hennepin_all["City"]):
    label = "{},{}".format(zipcode, city)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
    [lat, long],
    radius = 5,
    popup = label,
    color = "blue",
    fill = True,
    fill_color = "#3186cc",
    fill_opacity = 0.7,
    parse_html = False).add_to(map_Hennepin)

map_Hennepin

In [60]:
## Foursqaure API was already called earlier in this project

In [61]:
## Function for Nearby Venues defined earlier in this project

In [62]:
## calls Nearby Venues function for each zip code into a new dataframe
Hennepin_venues = getNearbyVenues(names = df_Hennepin_all["Zip Code"],
                                 latitudes = df_Hennepin_all["Latitude"],
                                 longitudes = df_Hennepin_all["Longitude"]
                                 )

55111
55305
55311
55316
55323
55327
55331
55340
55343
55344
55345
55346
55347
55348
55356
55357
55359
55361
55364
55369
55374
55375
55384
55391
55392
55401
55402
55403
55404
55405
55406
55407
55408
55409
55410
55411
55412
55413
55414
55415
55416
55417
55418
55419
55420
55422
55423
55424
55425
55426
55427
55428
55429
55430
55431
55435
55436
55437
55438
55439
55440
55441
55442
55443
55444
55445
55446
55447
55450
55454
55455
55458
55459
55460
55467
55468
55470
55472
55473
55474
55478
55479
55480
55483
55484
55485
55486
55487
55488
55569
55570
55571
55572
55573
55574
55575
55576
55577
55578
55579
55593
55595
55596
55597
55598
55599


In [63]:
## checks size of dataframe, previews first 20 rows of data
print(Hennepin_venues.shape)
Hennepin_venues.head(20)

(2319, 7)


Unnamed: 0,Zip Code,Zip Code Latitude,Zip Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,55111,44.892486,-93.197486,"Base Camp, Northern Star Council",44.893286,-93.191379,Campground
1,55111,44.892486,-93.197486,Air National Guard Museum,44.893159,-93.199497,Museum
2,55111,44.892486,-93.197486,Fort Snelling Club,44.892161,-93.195725,Bar
3,55111,44.892486,-93.197486,Fort Snelling LRT Station,44.89361,-93.198009,Light Rail Station
4,55111,44.892486,-93.197486,Federal Building--Cafe,44.893936,-93.195946,Café
5,55111,44.892486,-93.197486,Fred Wells Tennis & Education Center,44.892848,-93.194291,Tennis Court
6,55111,44.892486,-93.197486,Leonard H Neiman Youth Athletic Complex,44.892024,-93.192233,Athletics & Sports
7,55305,44.971741,-93.447237,Whole Foods Market,44.973737,-93.445441,Grocery Store
8,55305,44.971741,-93.447237,Target,44.970157,-93.447411,Big Box Store
9,55305,44.971741,-93.447237,Lunds & Byerlys,44.968491,-93.44686,Grocery Store


In [64]:
## prints out how many venues were returned for each zip code
Hennepin_venues.groupby("Zip Code").count()

Unnamed: 0_level_0,Zip Code Latitude,Zip Code Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
55111,7,7,7,7,7,7
55305,29,29,29,29,29,29
55311,4,4,4,4,4,4
55316,2,2,2,2,2,2
55323,5,5,5,5,5,5
...,...,...,...,...,...,...
55595,10,10,10,10,10,10
55596,10,10,10,10,10,10
55597,10,10,10,10,10,10
55598,10,10,10,10,10,10


In [65]:
## counts for unique categories
print("There are {} unique categories.".format(len(Hennepin_venues["Venue Category"].unique())))

There are 256 unique categories.


## Analyzing each Zip Code in Hennepin County

In [66]:
## One Hot Encoding
Hennepin_onehot = pd.get_dummies(Hennepin_venues[["Venue Category"]], prefix = "", prefix_sep = "")

## add zip code column back to dataframe
Hennepin_onehot["Zip Code"] = Hennepin_venues["Zip Code"]

## move zip code column to first column
fixed_columns_1 = [Hennepin_onehot.columns[-1]] + list(Hennepin_onehot.columns[:-1])
Hennepin_onehot = Hennepin_onehot[fixed_columns_1]

Hennepin_onehot.head(20)

Unnamed: 0,Zip Code,ATM,Accessories Store,Acupuncturist,Adult Boutique,Advertising Agency,African Restaurant,Airport,Airport Service,Airport Terminal,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Weight Loss Center,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,55111,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,55111,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,55111,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,55111,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,55111,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,55111,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,55111,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,55305,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,55305,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,55305,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
## examines dataframe size
Hennepin_onehot.shape

(2319, 257)

In [68]:
## grouping rows by zip code and by taking the mean of the frequency of occurence in each category
Hennepin_grouped = Hennepin_onehot.groupby("Zip Code").mean().reset_index()

In [69]:
Hennepin_grouped

Unnamed: 0,Zip Code,ATM,Accessories Store,Acupuncturist,Adult Boutique,Advertising Agency,African Restaurant,Airport,Airport Service,Airport Terminal,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Weight Loss Center,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,55111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,55305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,0.0,0.0,0.0
2,55311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,55316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,55323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,55595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
102,55596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
103,55597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
104,55598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [70]:
## confirming size
Hennepin_grouped.shape

(106, 257)

In [71]:
## printing each zip code along with the top 5 most common venues
num_top_venues = 5

for hood in Hennepin_grouped["Zip Code"]:
    print("----"+hood+"----")
    temp = Hennepin_grouped[Hennepin_grouped["Zip Code"] == hood].T.reset_index()
    temp.columns = ["Venue", "freq"]
    temp = temp.iloc[1:]
    temp["freq"] = temp["freq"].astype(float)
    temp = temp.round({"freq":2})
    print(temp.sort_values("freq", ascending = False).reset_index(drop = True).head(num_top_venues))
    print("\n")

----55111----
                Venue  freq
0  Athletics & Sports  0.14
1              Museum  0.14
2                Café  0.14
3                 Bar  0.14
4  Light Rail Station  0.14


----55305----
                    Venue  freq
0               Pet Store  0.07
1              Shoe Store  0.07
2       Electronics Store  0.07
3             Coffee Shop  0.07
4  Furniture / Home Store  0.07


----55311----
        Venue  freq
0   Locksmith  0.50
1       Field  0.25
2   Roof Deck  0.25
3         ATM  0.00
4  Non-Profit  0.00


----55316----
                Venue  freq
0      Baseball Field   0.5
1            Bus Stop   0.5
2                 ATM   0.0
3          Non-Profit   0.0
4  Miscellaneous Shop   0.0


----55323----
             Venue  freq
0  Harbor / Marina   0.2
1             Food   0.2
2      Post Office   0.2
3    Boat or Ferry   0.2
4      Art Gallery   0.2


----55327----
                     Venue  freq
0           Baseball Field  0.50
1             Liquor Store  0.25
2      Am

4      Department Store  0.04


----55435----
                    Venue  freq
0          Clothing Store  0.07
1                     Spa  0.06
2             Coffee Shop  0.06
3       Accessories Store  0.05
4  Furniture / Home Store  0.05


----55436----
             Venue  freq
0         Pharmacy  0.17
1   Shipping Store  0.08
2      Video Store  0.08
3  Laundry Service  0.08
4    Grocery Store  0.08


----55437----
                Venue  freq
0  Athletics & Sports  0.25
1             Butcher  0.25
2        Dance Studio  0.25
3         Video Store  0.25
4        Music School  0.00


----55438----
                Venue  freq
0         Coffee Shop  0.14
1        Intersection  0.07
2     Automotive Shop  0.07
3  Chinese Restaurant  0.07
4  Salon / Barbershop  0.07


----55439----
                  Venue  freq
0                   Gym  0.14
1           Coffee Shop  0.09
2        Sandwich Place  0.09
3                 Hotel  0.05
4  Arts & Entertainment  0.05


----55440----
                

4  Caribbean Restaurant   0.1


----55599----
                  Venue  freq
0   American Restaurant   0.2
1                   Bar   0.2
2        Baseball Field   0.2
3      Business Service   0.1
4  Caribbean Restaurant   0.1




In [72]:
## function for sort venues already defined

In [73]:
## creating new dataframe and display top 10 venues for each zip code
num_top_venues = 10
indicators = ["st", "nd", "rd"]

#create columns according to number of top venues
columns = ["Zip Code"]
for ind in np.arange(num_top_venues):
    try:
        columns.append("{}{} Most Common Venue".format(ind+1, indicators[ind]))
    except:
        columns.append("{}th Most Common Venue".format(ind+1))
        
## create new dataframe
Hennepin_venues_sorted = pd.DataFrame(columns = columns)
Hennepin_venues_sorted["Zip Code"] = Hennepin_grouped["Zip Code"]

for ind in np.arange(Hennepin_grouped.shape[0]):
    Hennepin_venues_sorted.iloc[ind,1:] = return_most_common_venues(Hennepin_grouped.iloc[ind,:], num_top_venues)
    
Hennepin_venues_sorted.head()

Unnamed: 0,Zip Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,55111,Café,Tennis Court,Campground,Light Rail Station,Museum,Athletics & Sports,Bar,English Restaurant,Dutch Restaurant,Electronics Store
1,55305,Coffee Shop,Pet Store,Shoe Store,Electronics Store,Grocery Store,Furniture / Home Store,Bank,Italian Restaurant,Big Box Store,Spa
2,55311,Locksmith,Field,Roof Deck,Yoga Studio,Escape Room,Dutch Restaurant,Electronics Store,Empanada Restaurant,English Restaurant,Event Space
3,55316,Baseball Field,Bus Stop,Yoga Studio,Dutch Restaurant,Flower Shop,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Exhibit
4,55323,Food,Boat or Ferry,Post Office,Harbor / Marina,Art Gallery,Event Service,Electronics Store,Empanada Restaurant,English Restaurant,Escape Room


In [74]:
## set number of clusters
kclusters = 5
Hennepin_grouped_clustering = Hennepin_grouped.drop("Zip Code", 1)

## run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(Hennepin_grouped_clustering)

## check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([3, 0, 0, 1, 0, 1, 0, 0, 0, 0])

In [75]:
## add clustering labels
Hennepin_venues_sorted.insert(0, "Cluster Labels", kmeans.labels_)

In [76]:
## merge Hennepin_grouped with zip codes to add lat/long for each zip code
Hennepin_merged = df_Hennepin_all
Hennepin_merged = Hennepin_merged.join(Hennepin_venues_sorted.set_index("Zip Code"), on = "Zip Code")

## remove any NaN values
Hennepin_merged = Hennepin_merged.dropna()
Hennepin_merged.head(20)

Unnamed: 0,Zip Code,City,County,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,55111,Saint Paul,Hennepin,44.892486,-93.197486,3,Café,Tennis Court,Campground,Light Rail Station,Museum,Athletics & Sports,Bar,English Restaurant,Dutch Restaurant,Electronics Store
1,55305,Hopkins,Hennepin,44.971741,-93.447237,0,Coffee Shop,Pet Store,Shoe Store,Electronics Store,Grocery Store,Furniture / Home Store,Bank,Italian Restaurant,Big Box Store,Spa
2,55311,Osseo,Hennepin,45.078225,-93.455169,0,Locksmith,Field,Roof Deck,Yoga Studio,Escape Room,Dutch Restaurant,Electronics Store,Empanada Restaurant,English Restaurant,Event Space
3,55316,Champlin,Hennepin,45.16441,-93.361285,1,Baseball Field,Bus Stop,Yoga Studio,Dutch Restaurant,Flower Shop,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Exhibit
4,55323,Crystal Bay,Hennepin,44.95292,-93.57628,0,Food,Boat or Ferry,Post Office,Harbor / Marina,Art Gallery,Event Service,Electronics Store,Empanada Restaurant,English Restaurant,Escape Room
5,55327,Dayton,Hennepin,45.242005,-93.51451,1,Baseball Field,Liquor Store,American Restaurant,Yoga Studio,Empanada Restaurant,English Restaurant,Escape Room,Event Service,Event Space,Exhibit
6,55331,Excelsior,Hennepin,44.900035,-93.569505,0,Bakery,Coffee Shop,Clothing Store,Bridal Shop,Brewery,Boutique,Spa,Bank,Sushi Restaurant,Music Store
7,55340,Hamel,Hennepin,45.04446,-93.52841,0,Coffee Shop,Salon / Barbershop,American Restaurant,Pizza Place,Automotive Shop,Café,Mobile Phone Shop,Martial Arts School,Food,Big Box Store
8,55343,Hopkins,Hennepin,44.92076,-93.41534,0,Park,Rental Car Location,Automotive Shop,Pizza Place,Baseball Field,Fast Food Restaurant,Brazilian Restaurant,Mexican Restaurant,Social Club,Movie Theater
9,55344,Eden Prairie,Hennepin,44.850895,-93.425455,0,Clothing Store,Sandwich Place,Ice Cream Shop,Bakery,Pizza Place,Chinese Restaurant,Fast Food Restaurant,Kids Store,Coffee Shop,Mobile Phone Shop


In [77]:
## create map
map_clusters_2 = folium.Map(location=[45.0209, -93.5095], zoom_start = 10)

## set colour scheme for clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

## add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Hennepin_merged['Latitude'], Hennepin_merged['Longitude'], Hennepin_merged['Zip Code'], Hennepin_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters_2)
       
map_clusters_2

## Cluster 1

In [78]:
## Cluster 1
Hennepin_merged.loc[Hennepin_merged['Cluster Labels'] == 0, Hennepin_merged.columns[[1] + list(range(5, Hennepin_merged.shape[1]))]]

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Hopkins,0,Coffee Shop,Pet Store,Shoe Store,Electronics Store,Grocery Store,Furniture / Home Store,Bank,Italian Restaurant,Big Box Store,Spa
2,Osseo,0,Locksmith,Field,Roof Deck,Yoga Studio,Escape Room,Dutch Restaurant,Electronics Store,Empanada Restaurant,English Restaurant,Event Space
4,Crystal Bay,0,Food,Boat or Ferry,Post Office,Harbor / Marina,Art Gallery,Event Service,Electronics Store,Empanada Restaurant,English Restaurant,Escape Room
6,Excelsior,0,Bakery,Coffee Shop,Clothing Store,Bridal Shop,Brewery,Boutique,Spa,Bank,Sushi Restaurant,Music Store
7,Hamel,0,Coffee Shop,Salon / Barbershop,American Restaurant,Pizza Place,Automotive Shop,Café,Mobile Phone Shop,Martial Arts School,Food,Big Box Store
8,Hopkins,0,Park,Rental Car Location,Automotive Shop,Pizza Place,Baseball Field,Fast Food Restaurant,Brazilian Restaurant,Mexican Restaurant,Social Club,Movie Theater
9,Eden Prairie,0,Clothing Store,Sandwich Place,Ice Cream Shop,Bakery,Pizza Place,Chinese Restaurant,Fast Food Restaurant,Kids Store,Coffee Shop,Mobile Phone Shop
10,Minnetonka,0,ATM,Locksmith,Rental Car Location,Furniture / Home Store,Arcade,Clothing Store,American Restaurant,Advertising Agency,Fast Food Restaurant,Farmers Market
11,Eden Prairie,0,Concert Hall,Skating Rink,Yoga Studio,Dry Cleaner,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Exhibit,Event Space
14,Long Lake,0,Pizza Place,Shopping Mall,Chinese Restaurant,Salon / Barbershop,Sandwich Place,Jewelry Store,Liquor Store,Spa,Gym / Fitness Center,Coffee Shop


## Cluster 2

In [79]:
## Cluster 2
Hennepin_merged.loc[Hennepin_merged['Cluster Labels'] == 1, Hennepin_merged.columns[[1] + list(range(5, Hennepin_merged.shape[1]))]]

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Champlin,1,Baseball Field,Bus Stop,Yoga Studio,Dutch Restaurant,Flower Shop,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Exhibit
5,Dayton,1,Baseball Field,Liquor Store,American Restaurant,Yoga Studio,Empanada Restaurant,English Restaurant,Escape Room,Event Service,Event Space,Exhibit
15,Loretto,1,American Restaurant,Bar,Baseball Field,Construction & Landscaping,Business Service,Caribbean Restaurant,Gym,English Restaurant,Escape Room,Event Service
18,Mound,1,Park,Business Service,Bar,Yoga Studio,Event Service,Electronics Store,Empanada Restaurant,English Restaurant,Escape Room,Event Space
93,Young America,1,Art Gallery,Bar,Antique Shop,Baseball Field,Yoga Studio,Electronics Store,English Restaurant,Escape Room,Event Service,Event Space
101,Loretto,1,American Restaurant,Bar,Baseball Field,Business Service,Caribbean Restaurant,Furniture / Home Store,Gym,Yoga Studio,English Restaurant,Escape Room
102,Loretto,1,American Restaurant,Bar,Baseball Field,Business Service,Caribbean Restaurant,Furniture / Home Store,Gym,Yoga Studio,English Restaurant,Escape Room
103,Loretto,1,American Restaurant,Bar,Baseball Field,Business Service,Caribbean Restaurant,Furniture / Home Store,Gym,Yoga Studio,English Restaurant,Escape Room
104,Loretto,1,American Restaurant,Bar,Baseball Field,Business Service,Caribbean Restaurant,Furniture / Home Store,Gym,Yoga Studio,English Restaurant,Escape Room
105,Loretto,1,American Restaurant,Bar,Baseball Field,Business Service,Caribbean Restaurant,Furniture / Home Store,Gym,Yoga Studio,English Restaurant,Escape Room


## Cluster 3

In [80]:
## Cluster 3
Hennepin_merged.loc[Hennepin_merged['Cluster Labels'] == 2, Hennepin_merged.columns[[1] + list(range(5, Hennepin_merged.shape[1]))]]

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,Maple Plain,2,Café,Construction & Landscaping,Garden,Post Office,Pub,Baseball Field,Escape Room,Electronics Store,Empanada Restaurant,English Restaurant
16,Maple Plain,2,Café,Construction & Landscaping,Garden,Post Office,Pub,Baseball Field,Escape Room,Electronics Store,Empanada Restaurant,English Restaurant
90,Maple Plain,2,Café,Construction & Landscaping,Garden,Post Office,Pub,Baseball Field,Escape Room,Electronics Store,Empanada Restaurant,English Restaurant
91,Maple Plain,2,Café,Construction & Landscaping,Garden,Post Office,Pub,Baseball Field,Escape Room,Electronics Store,Empanada Restaurant,English Restaurant
94,Maple Plain,2,Café,Construction & Landscaping,Garden,Post Office,Pub,Baseball Field,Escape Room,Electronics Store,Empanada Restaurant,English Restaurant
96,Maple Plain,2,Café,Construction & Landscaping,Garden,Post Office,Pub,Baseball Field,Escape Room,Electronics Store,Empanada Restaurant,English Restaurant
98,Maple Plain,2,Café,Construction & Landscaping,Garden,Post Office,Pub,Baseball Field,Escape Room,Electronics Store,Empanada Restaurant,English Restaurant
99,Maple Plain,2,Café,Construction & Landscaping,Garden,Post Office,Pub,Baseball Field,Escape Room,Electronics Store,Empanada Restaurant,English Restaurant
100,Maple Plain,2,Café,Construction & Landscaping,Garden,Post Office,Pub,Baseball Field,Escape Room,Electronics Store,Empanada Restaurant,English Restaurant


## Cluster 4

In [81]:
## Cluster 4
Hennepin_merged.loc[Hennepin_merged['Cluster Labels'] == 3, Hennepin_merged.columns[[1] + list(range(5, Hennepin_merged.shape[1]))]]

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Saint Paul,3,Café,Tennis Court,Campground,Light Rail Station,Museum,Athletics & Sports,Bar,English Restaurant,Dutch Restaurant,Electronics Store
12,Eden Prairie,3,Playground,Plaza,Dog Run,Field,Fast Food Restaurant,Farmers Market,Exhibit,Event Space,Event Service,Escape Room
19,Osseo,3,Bar,Coffee Shop,Greek Restaurant,Cajun / Creole Restaurant,Sandwich Place,Sporting Goods Shop,Thai Restaurant,Motorcycle Shop,Liquor Store,Gourmet Shop
20,Rogers,3,Fast Food Restaurant,Sporting Goods Shop,Coffee Shop,Gas Station,Dive Bar,Diner,Motel,Bar,Big Box Store,Restaurant
21,Saint Bonifacius,3,Coffee Shop,Video Store,Dive Bar,Bar,Baseball Field,Sandwich Place,Convenience Store,Park,Farmers Market,Exhibit
25,Minneapolis,3,Bar,Nightclub,Pizza Place,Café,American Restaurant,Yoga Studio,Scandinavian Restaurant,New American Restaurant,Sushi Restaurant,Food Truck
26,Minneapolis,3,Coffee Shop,Hotel,Theater,American Restaurant,Food Truck,Seafood Restaurant,Breakfast Spot,Steakhouse,Salad Place,Restaurant
27,Minneapolis,3,Coffee Shop,Hotel,Theater,New American Restaurant,Salon / Barbershop,Liquor Store,Seafood Restaurant,Gym,Bar,Plaza
29,Minneapolis,3,Coffee Shop,Pizza Place,Brewery,Men's Store,Park,Furniture / Home Store,Grocery Store,Antique Shop,Escape Room,Electronics Store
33,Minneapolis,3,Park,Market,American Restaurant,Butcher,Restaurant,Latin American Restaurant,Diner,Empanada Restaurant,Thrift / Vintage Store,Sushi Restaurant


## Cluster 5

In [82]:
## Cluster 5
Hennepin_merged.loc[Hennepin_merged['Cluster Labels'] == 4, Hennepin_merged.columns[[1] + list(range(5, Hennepin_merged.shape[1]))]]

Unnamed: 0,City,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
67,Minneapolis,4,Music Venue,Yoga Studio,Hospital,Fish & Chips Shop,Field,Fast Food Restaurant,Farmers Market,Exhibit,Event Space,Event Service
