In [33]:
import pandas as pd
import re
import itertools
import numpy as np
import matplotlib.pyplot as plt
import pgeocode
import folium
import requests
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

## Getting data

In [34]:
#read 
data= pd.read_csv('./Data/dfinitivo.csv',index_col=0)

In [35]:
data.head()

Unnamed: 0,postalCode,Median_Income,rental price,city,persons counted 2018,persons counted 2019,persons in perma housing,latitude,longitude,Median Age,Population,Household,median lengh of residence,average house income,Unemployed %,crimes
0,90023,40225,2201,Commerce,490,295,17,34.0245,-118.1975,31.4,46455,10966,8.0,59679,6.84,128
1,90040,43585,2317,Commerce,490,295,17,33.9909,-118.1532,33.2,14121,3605,10.0,63661,9.51,128
2,90066,75209,3403,Culver City,117,236,69,34.003,-118.4298,41.1,55775,24339,7.3,127240,5.31,188
3,90230,79242,3630,Culver City,117,236,69,33.9949,-118.3991,41.8,33255,13293,8.8,127747,5.01,188
4,90232,82254,3472,Culver City,117,236,69,34.0168,-118.3973,42.2,16107,7116,7.1,122950,3.39,188


### Let's see them in a map

In [36]:
#first map
map_losangeles = folium.Map(location=[34.0194, -118.411], zoom_start=8.4)

# add markers to map
for lat, lng, label in zip(data['latitude'], data['longitude'], data['city']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_losangeles)  
    
map_losangeles

### Starting with foursquare

In [37]:
# client for Foursquare
CLIENT_ID = '0NO4O2SNXJZXFIHKTCE5B0CNL42BASTCCAEPQZ1WX5RK3W2Y'
CLIENT_SECRET = 'BMFNRGRT0VLM5NJ0S4KULRWJW00VFHYBFWQDZML331I5NEC1'
VERSION = '20180605'
LIMIT=100

In [38]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [39]:
#function to get nearby venues
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['zip_code', 
                  'zip Latitude', 
                  'zip Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category'] 
    return(nearby_venues)

In [8]:
losangeles_venues = getNearbyVenues(names=data['postalCode'],
                                   latitudes=data['latitude'],
                                   longitudes=data['longitude']
                                  )

90023
90040
90066
90230
90232
90201
90270
90210
90211
90212
90220
90221
90222
90240
90241
90242
90245
90247
90248
90249
90254
90255
90260
90262
90265
90266
90277
90278
90280
90301
90302
90303
90304
90305
90402
90403
90404
90501
90502
90503
90504
90505
90602
90603
90638
90640
90650
90660
90670
90701
90703
90704
90706
90712
90713
90715
90716
90717
90723
90745
90746
90810
90755
90802
90806
90808
90813
90814
90815
91006
91007
91010
91702
91011
91016
91024
91030
91101
91103
91104
91105
91106
91107
91201
91202
91203
91206
91207
91208
91214
91301
91302
91340
91501
91502
91505
91506
91706
91711
91722
91723
91731
91732
91733
91740
91741
91750
91754
91755
91765
91767
91768
91770
91773
91775
91776
91780
91790
91791
91792
91801
91803
93534
93552


In [40]:
losangeles_venues.head()

Unnamed: 0,zip_code,zip Latitude,zip Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,90023,34.0245,-118.1975,Cemitas La China Poblana,34.025624,-118.196399,Mexican Restaurant
1,90023,34.0245,-118.1975,Mercado Mexico,34.025657,-118.197017,Grocery Store
2,90023,34.0245,-118.1975,GameStop,34.02646,-118.199032,Video Game Store
3,90023,34.0245,-118.1975,Little Caesars Pizza,34.026385,-118.198913,Pizza Place
4,90023,34.0245,-118.1975,99 Cents Only Stores,34.027186,-118.199292,Discount Store


In [41]:
#Let's save it
losangeles_venues.to_csv('./Data/lavenues.csv')

In [42]:
#Reading to avoid request venues
lavenues= pd.read_csv('./Data/lavenues.csv',index_col=0)

### Working on venues

In [43]:
# group by zip to see number of venues on them
lavenues.groupby('zip_code').count()

Unnamed: 0_level_0,zip Latitude,zip Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
90023,11,11,11,11,11,11
90040,1,1,1,1,1,1
90066,42,42,42,42,42,42
90201,8,8,8,8,8,8
90210,3,3,3,3,3,3
...,...,...,...,...,...,...
91792,13,13,13,13,13,13
91801,32,32,32,32,32,32
91803,13,13,13,13,13,13
93534,9,9,9,9,9,9


In [44]:
# Uniques categories
print('There are {} uniques categories.'.format(len(lavenues['Venue Category'].unique())))

There are 262 uniques categories.


In [45]:
#Transform venues to 0 and 1, to work on it.
# one hot encoding
losangeles_onehot = pd.get_dummies(lavenues[['Venue Category']], prefix="", prefix_sep="")

# add zip_code column back to dataframe
losangeles_onehot['zip_code'] = lavenues['zip_code'] 

# move zip column to the first column
fixed_columns = [losangeles_onehot.columns[-1]] + list(losangeles_onehot.columns[:-1])
losangeles_onehot = losangeles_onehot[fixed_columns]

losangeles_onehot.head()

Unnamed: 0,zip_code,ATM,Accessories Store,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Arcade,Argentinian Restaurant,Art Gallery,...,Vietnamese Restaurant,Warehouse Store,Water Park,Weight Loss Center,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,90023,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,90023,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,90023,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,90023,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,90023,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
losangeles_onehot.shape

(2009, 263)

In [47]:
#Group by zip, doing mean.
losangeles_grouped = losangeles_onehot.groupby('zip_code').mean().reset_index()
losangeles_grouped

Unnamed: 0,zip_code,ATM,Accessories Store,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Arcade,Argentinian Restaurant,Art Gallery,...,Vietnamese Restaurant,Warehouse Store,Water Park,Weight Loss Center,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,90023,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0
1,90040,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0
2,90066,0.0,0.02381,0.0,0.0,0.02381,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0
3,90201,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0
4,90210,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.333333,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,91792,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0
117,91801,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.03125,0.0,0.0,0.03125,0.0,0.0
118,91803,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,...,0.076923,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0
119,93534,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.00000,0.0,0.0,0.00000,0.0,0.0


In [48]:
losangeles_grouped.shape

(121, 263)

In [49]:
#function to get most common venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [50]:
#Get top 3 venues and add it to a new dataframe, for the zip code
num_top_venues = 3
indicators = ['st', 'nd', 'rd']
# create columns according to number of top venues
columns = ['zip_code']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
zip_venues_sorted = pd.DataFrame(columns=columns)
zip_venues_sorted['zip_code'] = losangeles_grouped['zip_code']

for ind in np.arange(losangeles_grouped.shape[0]):
    zip_venues_sorted.iloc[ind, 1:] = return_most_common_venues(losangeles_grouped.iloc[ind, :], num_top_venues)

zip_venues_sorted.head()

Unnamed: 0,zip_code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,90023,Mexican Restaurant,Video Game Store,Supermarket
1,90040,Clothing Store,Yoga Studio,Electronics Store
2,90066,Japanese Restaurant,Coffee Shop,Grocery Store
3,90201,Fast Food Restaurant,Fried Chicken Joint,Seafood Restaurant
4,90210,Art Gallery,Clothing Store,Speakeasy


### kmeans to cluster

In [54]:
# set number of clusters
kclusters =7
losangeles_grouped_clustering = losangeles_grouped.drop('zip_code', 1)
# run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(losangeles_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 5, 6, 1, 6, 6, 6, 5, 6, 3], dtype=int32)

In [55]:
#copying data to do not modify the original
datavenue= data

In [56]:
datavenue.rename(columns={'postalCode':'zip_code'},inplace=True)

In [57]:
datavenue.head()

Unnamed: 0,zip_code,Median_Income,rental price,city,persons counted 2018,persons counted 2019,persons in perma housing,latitude,longitude,Median Age,Population,Household,median lengh of residence,average house income,Unemployed %,crimes
0,90023,40225,2201,Commerce,490,295,17,34.0245,-118.1975,31.4,46455,10966,8.0,59679,6.84,128
1,90040,43585,2317,Commerce,490,295,17,33.9909,-118.1532,33.2,14121,3605,10.0,63661,9.51,128
2,90066,75209,3403,Culver City,117,236,69,34.003,-118.4298,41.1,55775,24339,7.3,127240,5.31,188
3,90230,79242,3630,Culver City,117,236,69,33.9949,-118.3991,41.8,33255,13293,8.8,127747,5.01,188
4,90232,82254,3472,Culver City,117,236,69,34.0168,-118.3973,42.2,16107,7116,7.1,122950,3.39,188


In [58]:
#inserting the cluster labels to the df
zip_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

losangeles_merged = datavenue
losangeles_merged = losangeles_merged.merge(zip_venues_sorted, on = 'zip_code')

losangeles_merged.head()

Unnamed: 0,zip_code,Median_Income,rental price,city,persons counted 2018,persons counted 2019,persons in perma housing,latitude,longitude,Median Age,Population,Household,median lengh of residence,average house income,Unemployed %,crimes,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,90023,40225,2201,Commerce,490,295,17,34.0245,-118.1975,31.4,46455,10966,8.0,59679,6.84,128,1,Mexican Restaurant,Video Game Store,Supermarket
1,90040,43585,2317,Commerce,490,295,17,33.9909,-118.1532,33.2,14121,3605,10.0,63661,9.51,128,5,Clothing Store,Yoga Studio,Electronics Store
2,90066,75209,3403,Culver City,117,236,69,34.003,-118.4298,41.1,55775,24339,7.3,127240,5.31,188,6,Japanese Restaurant,Coffee Shop,Grocery Store
3,90230,79242,3630,Culver City,117,236,69,33.9949,-118.3991,41.8,33255,13293,8.8,127747,5.01,188,6,Fast Food Restaurant,Pet Store,ATM
4,90232,82254,3472,Culver City,117,236,69,34.0168,-118.3973,42.2,16107,7116,7.1,122950,3.39,188,6,Gym,Coffee Shop,Deli / Bodega


In [59]:
#saving it
losangeles_merged.to_csv('./Data/lamerged.csv')

In [60]:
lamerged= pd.read_csv('./Data/lamerged.csv',index_col=0)

In [61]:
lamerged.head()

Unnamed: 0,zip_code,Median_Income,rental price,city,persons counted 2018,persons counted 2019,persons in perma housing,latitude,longitude,Median Age,Population,Household,median lengh of residence,average house income,Unemployed %,crimes,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,90023,40225,2201,Commerce,490,295,17,34.0245,-118.1975,31.4,46455,10966,8.0,59679,6.84,128,1,Mexican Restaurant,Video Game Store,Supermarket
1,90040,43585,2317,Commerce,490,295,17,33.9909,-118.1532,33.2,14121,3605,10.0,63661,9.51,128,5,Clothing Store,Yoga Studio,Electronics Store
2,90066,75209,3403,Culver City,117,236,69,34.003,-118.4298,41.1,55775,24339,7.3,127240,5.31,188,6,Japanese Restaurant,Coffee Shop,Grocery Store
3,90230,79242,3630,Culver City,117,236,69,33.9949,-118.3991,41.8,33255,13293,8.8,127747,5.01,188,6,Fast Food Restaurant,Pet Store,ATM
4,90232,82254,3472,Culver City,117,236,69,34.0168,-118.3973,42.2,16107,7116,7.1,122950,3.39,188,6,Gym,Coffee Shop,Deli / Bodega


### Making a map of zip by cluster colors

In [62]:
# create map
map_clusters = folium.Map(location=[34.0194, -118.411], zoom_start=8)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(lamerged['latitude'], lamerged['longitude'], lamerged['city'], lamerged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Check venues

In [64]:
#You can check every cluster changing the int after ==
lamerged.loc[lamerged['Cluster Labels'] == 6, lamerged.columns[[1] + list(range(5, lamerged.shape[1]))]]

Unnamed: 0,Median_Income,persons counted 2019,persons in perma housing,latitude,longitude,Median Age,Population,Household,median lengh of residence,average house income,Unemployed %,crimes,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
2,75209,236,69,34.0030,-118.4298,41.1,55775,24339,7.3,127240,5.31,188,6,Japanese Restaurant,Coffee Shop,Grocery Store
3,79242,236,69,33.9949,-118.3991,41.8,33255,13293,8.8,127747,5.01,188,6,Fast Food Restaurant,Pet Store,ATM
4,82254,236,69,34.0168,-118.3973,42.2,16107,7116,7.1,122950,3.39,188,6,Gym,Coffee Shop,Deli / Bodega
7,149732,19,14,34.0901,-118.4065,49.1,23296,9414,11.2,236159,4.16,106,6,Art Gallery,Clothing Store,Speakeasy
8,71402,19,14,34.0652,-118.3830,43.5,8537,3707,7.5,149531,4.70,106,6,Café,Cosmetics Shop,Movie Theater
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,67910,30,7,34.1016,-118.0537,44.1,35242,11500,10.0,105516,4.09,64,6,Home Service,Historic Site,Pet Store
115,74965,142,42,34.0673,-117.9366,37.0,46256,13167,9.9,101350,6.83,292,6,Bakery,Sushi Restaurant,Gym / Fitness Center
116,70932,142,42,34.0229,-117.8975,38.1,32385,9438,9.3,97211,7.49,292,6,Korean Restaurant,Auto Garage,Restaurant
117,53644,68,39,34.0914,-118.1293,42.6,54337,20094,7.4,87650,4.72,193,6,Ice Cream Shop,Sushi Restaurant,Bakery
