### Install & import libraries

In [1]:
#!pip install folium
#!pip install geocoder

In [2]:
import pandas as pd

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

from geopy.geocoders import Nominatim

import folium

import json

import requests

from pandas.io.json import json_normalize

import numpy as np

import geocoder

print('Libraries imported')

Libraries imported


### Web Scrapping

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_Houston_neighborhoods"
df_scrap = pd.read_html(url)
df_list = df_scrap[0]
df_list.rename(columns={'Name' : 'Neighborhood'}, inplace=True)
df_list = df_list[['Neighborhood']]
df_list.head()

Unnamed: 0,Neighborhood
0,Willowbrook
1,Greater Greenspoint
2,Carverdale
3,Fairbanks / Northwest Crossing
4,Greater Inwood


#### Get coordenates for neighborhoods

In [4]:
# Defining a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Houston, Texas'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords
# Call the function to get the coordinates, store in a new list using list comprehension
coords = [get_latlng(neighborhood) for neighborhood in df_list["Neighborhood"].tolist()]

In [15]:
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
df_list = df_list.merge(right = df_coords, how ='inner', left_index = True, right_index = True) 

In [16]:
df_list

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Willowbrook,29.952400,-95.544630
1,Greater Greenspoint,29.939670,-95.407480
2,Carverdale,29.849590,-95.542450
3,Fairbanks / Northwest Crossing,29.849380,-95.510880
4,Greater Inwood,29.869770,-95.480440
...,...,...,...
83,Spring Branch North,29.803513,-95.515875
84,Spring Branch Central,29.815870,-95.517730
85,Spring Branch East,29.807620,-95.480110
86,Greenway / Upper Kirby,29.732500,-95.441440


#### Get coordinates for Houston

In [17]:
address = 'Houston, Texas'
geolocator = Nominatim(user_agent="houston_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Houston, Texas {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Houston, Texas 29.7589382, -95.3676974.


#### First Map

In [24]:
map_houston = folium.Map(location=[latitude, longitude], zoom_start=9)
# Adding markers to map
for lat, lng, neighborhood in zip(df_list['Latitude'],  df_list['Longitude'], df_list['Neighborhood']):
 label = '{}'.format(neighborhood)
 label = folium.Popup(label, parse_html=True)
 folium.CircleMarker([lat, lng], radius=5, popup=label, color='black', fill=True, fill_color='#808080', fill_opacity=0.7).add_to(map_houston)
map_houston

### Foursquare credentials

In [25]:
# The code was removed by Watson Studio for sharing.

In [26]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [27]:
houston_venues = getNearbyVenues(names = df_list['Neighborhood'], latitudes = df_list['Latitude'], longitudes = df_list['Longitude'])

Willowbrook
Greater Greenspoint
Carverdale
Fairbanks / Northwest Crossing
Greater Inwood
Acres Home
Hidden Valley
Westbranch
Addicks / Park Ten
Spring Branch West
Langwood
Central Northwest (formerly Near Northwest)
Independence Heights
Lazybrook / Timbergrove
Greater Heights
Memorial
Eldridge / West Oaks
Briar Forest
Westchase
Mid-West (formerly Woodlake/Briarmeadow)
Greater Uptown
Washington Avenue Coalition / Memorial Park
Afton Oaks / River Oaks
Neartown / Montrose
Alief
Sharpstown
Gulfton
University Place
Westwood
Braeburn
Meyerland
Braeswood
Medical Center
Astrodome Area
South Main
Brays Oaks (formerly Greater Fondren S.W.)
Westbury
Willow Meadows / Willowbend
Fondren Gardens
Central Southwest
Fort Bend / Houston
IAH Airport
Kingwood
Lake Houston
Northside / Northline
Jensen
East Little York / Homestead
Trinity / Houston Gardens
East Houston
Settegast
Northside Village
Kashmere Gardens
El Dorado / Oates Prairie
Hunterwood
Greater Fifth Ward
Denver Harbor / Port Houston
Pleasantvi

In [28]:
print(houston_venues.shape)
houston_venues.head()

(694, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Willowbrook,29.9524,-95.54463,Bed Bath & Beyond,29.953517,-95.543865,Furniture / Home Store
1,Willowbrook,29.9524,-95.54463,buybuy BABY,29.953127,-95.543557,Kids Store
2,Willowbrook,29.9524,-95.54463,Babin's Seafood House,29.955088,-95.544452,Seafood Restaurant
3,Willowbrook,29.9524,-95.54463,World Market,29.95338,-95.544243,Furniture / Home Store
4,Willowbrook,29.9524,-95.54463,Buffalo Wild Wings,29.954015,-95.541562,Wings Joint


In [29]:
houston_venues[['Neighborhood', 'Venue']].groupby('Neighborhood').count().sort_values('Venue', ascending = False)

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
IAH Airport,44
Gulfgate Riverview / Pine Valley,40
Neartown / Montrose,38
Midtown,36
Medical Center,34
...,...
Greater Inwood,1
Clinton Park / Tri-Community,1
Minnetex,1
Braeburn,1


In [30]:
houston_venues[['Venue', 'Neighborhood']].groupby('Venue').count().sort_values('Neighborhood', ascending = False)

Unnamed: 0_level_0,Neighborhood
Venue,Unnamed: 1_level_1
Starbucks,8
SUBWAY,7
Domino's Pizza,5
Redbox,5
Church's Chicken,5
...,...
Hampton Inn by Hilton,1
Half Price Books,1
Hair Revolution,1
Guayaba Latin Grill,1


### One hot Encoding

In [31]:
# one hot encoding
houston_onehot = pd.get_dummies(houston_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
houston_onehot['Neighborhood'] = houston_venues['Neighborhood'] 

# move neighborhood column to the first column

col = houston_onehot.pop("Neighborhood")
houston_onehot.insert(0, col.name, col)

In [32]:
df_grouped = houston_onehot.groupby('Neighborhood').mean().reset_index()
df_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,African Restaurant,Airport Food Court,Airport Lounge,Airport Service,American Restaurant,Arcade,Art Gallery,Art Museum,...,Video Store,Vietnamese Restaurant,Warehouse Store,Water Park,Weight Loss Center,Whisky Bar,Wine Bar,Wings Joint,Yoga Studio,Zoo Exhibit
0,Acres Home,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Afton Oaks / River Oaks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Alief,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Astrodome Area,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Braeburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Print each neighborhood with top 5 venues

In [33]:
num_top_venues = 5

for hood in df_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = df_grouped[df_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Acres Home----
                venue  freq
0      Discount Store  0.25
1    Business Service  0.25
2         Shoe Repair  0.25
3  Athletics & Sports  0.25
4   Accessories Store  0.00


----Afton Oaks / River Oaks----
               venue  freq
0     Shop & Service   0.5
1         Public Art   0.5
2  Accessories Store   0.0
3        Music Venue   0.0
4         Nail Salon   0.0


----Alief----
              venue  freq
0  Football Stadium   0.5
1              Pool   0.5
2              Park   0.0
3       Music Venue   0.0
4        Nail Salon   0.0


----Astrodome Area----
                venue  freq
0          Food Truck  0.38
1         Pizza Place  0.12
2       Moving Target  0.12
3    Business Service  0.12
4  Chinese Restaurant  0.12


----Braeburn----
               venue  freq
0        Pizza Place   1.0
1  Accessories Store   0.0
2               Park   0.0
3        Music Venue   0.0
4         Nail Salon   0.0


----Braeswood----
                                      venue  freq
0

Transform into Dataframe

In [34]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [87]:
um_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = df_grouped['Neighborhood']

for ind in np.arange(df_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(df_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Acres Home,Athletics & Sports,Shoe Repair,Discount Store,Business Service,Food
1,Afton Oaks / River Oaks,Public Art,Shop & Service,Zoo Exhibit,Dry Cleaner,Food
2,Alief,Football Stadium,Pool,Zoo Exhibit,Dry Cleaner,Food
3,Astrodome Area,Food Truck,Moving Target,Auto Garage,Pizza Place,Business Service
4,Braeburn,Pizza Place,Zoo Exhibit,Dry Cleaner,Food,Flower Shop
...,...,...,...,...,...,...
74,Westbury,Dive Bar,Theater,Pizza Place,Hardware Store,Salon / Barbershop
75,Westchase,IT Services,Pizza Place,Dry Cleaner,Food,Flower Shop
76,Westwood,Fast Food Restaurant,Chinese Restaurant,BBQ Joint,African Restaurant,Flea Market
77,Willow Meadows / Willowbend,Food Truck,Soccer Stadium,Electronics Store,Shoe Store,Zoo Exhibit


### Cluster Neighborhood

In [88]:
# set number of clusters
kclusters = 10

df_grouped_clustering = df_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([9, 1, 6, 9, 2, 9, 9, 9, 9, 1], dtype=int32)

In [89]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

df_merged = df_list

# merge grouped with original dataframe df (neighborhood with lat/lon) to add latitude/longitude for each neighborhood
df_merged = df_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

df_merged.head(10) # check the last columns!

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Willowbrook,29.9524,-95.54463,9.0,Furniture / Home Store,Mobile Phone Shop,Breakfast Spot,Discount Store,Movie Theater
1,Greater Greenspoint,29.93967,-95.40748,9.0,Hotel Pool,Hotel,Japanese Restaurant,American Restaurant,Burger Joint
2,Carverdale,29.84959,-95.54245,9.0,Breakfast Spot,Taco Place,Burger Joint,Gas Station,BBQ Joint
3,Fairbanks / Northwest Crossing,29.84938,-95.51088,1.0,Convenience Store,Asian Restaurant,Clothing Store,Seafood Restaurant,Financial or Legal Service
4,Greater Inwood,29.86977,-95.48044,4.0,Gas Station,Zoo Exhibit,Electronics Store,Food & Drink Shop,Food
5,Acres Home,29.87047,-95.43536,9.0,Athletics & Sports,Shoe Repair,Discount Store,Business Service,Food
6,Hidden Valley,29.88847,-95.4146,9.0,Hotel,Rental Service,Food Truck,Sandwich Place,Zoo Exhibit
7,Westbranch,29.83937,-95.55361,1.0,Park,Nightclub,Clothing Store,Dry Cleaner,Food
8,Addicks / Park Ten,29.81434,-95.61593,,,,,,
9,Spring Branch West,29.8011,-95.54933,1.0,Mexican Restaurant,IT Services,Donut Shop,Fried Chicken Joint,Park


In [90]:
df_merged.isna().sum()

Neighborhood             0
Latitude                 0
Longitude                0
Cluster Labels           9
1st Most Common Venue    9
2nd Most Common Venue    9
3rd Most Common Venue    9
4th Most Common Venue    9
5th Most Common Venue    9
dtype: int64

In [91]:
df_merged.dropna(inplace = True)

In [92]:
df_merged.isna().sum()

Neighborhood             0
Latitude                 0
Longitude                0
Cluster Labels           0
1st Most Common Venue    0
2nd Most Common Venue    0
3rd Most Common Venue    0
4th Most Common Venue    0
5th Most Common Venue    0
dtype: int64

In [93]:
df_merged.groupby('Cluster Labels').count()

Unnamed: 0_level_0,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1,1,1,1,1,1,1,1
1.0,31,31,31,31,31,31,31,31
2.0,3,3,3,3,3,3,3,3
3.0,5,5,5,5,5,5,5,5
4.0,1,1,1,1,1,1,1,1
5.0,1,1,1,1,1,1,1,1
6.0,1,1,1,1,1,1,1,1
7.0,1,1,1,1,1,1,1,1
8.0,5,5,5,5,5,5,5,5
9.0,30,30,30,30,30,30,30,30


In [102]:
df_merged['Cluster Labels'] = df_merged['Cluster Labels'].astype('int64')

In [130]:
df_merged = df_merged.drop(11)
df_merged

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Willowbrook,29.952400,-95.544630,9,Furniture / Home Store,Mobile Phone Shop,Breakfast Spot,Discount Store,Movie Theater
1,Greater Greenspoint,29.939670,-95.407480,9,Hotel Pool,Hotel,Japanese Restaurant,American Restaurant,Burger Joint
2,Carverdale,29.849590,-95.542450,9,Breakfast Spot,Taco Place,Burger Joint,Gas Station,BBQ Joint
3,Fairbanks / Northwest Crossing,29.849380,-95.510880,1,Convenience Store,Asian Restaurant,Clothing Store,Seafood Restaurant,Financial or Legal Service
4,Greater Inwood,29.869770,-95.480440,4,Gas Station,Zoo Exhibit,Electronics Store,Food & Drink Shop,Food
...,...,...,...,...,...,...,...,...,...
82,MacGregor,29.711500,-95.356030,9,Snack Place,BBQ Joint,Seafood Restaurant,Zoo Exhibit,Electronics Store
83,Spring Branch North,29.803513,-95.515875,1,Mexican Restaurant,Convenience Store,Movie Theater,Electronics Store,Food & Drink Shop
84,Spring Branch Central,29.815870,-95.517730,8,Business Service,Construction & Landscaping,Zoo Exhibit,Electronics Store,Food & Drink Shop
86,Greenway / Upper Kirby,29.732500,-95.441440,9,Coffee Shop,Bakery,Sushi Restaurant,Italian Restaurant,Pizza Place


In [134]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Neighborhood'], df_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [147]:
df_0 = df_merged.loc[df_merged['Cluster Labels'] == 0]
df_1 = df_merged.loc[df_merged['Cluster Labels'] == 1]
df_2 = df_merged.loc[df_merged['Cluster Labels'] == 2]
df_3 = df_merged.loc[df_merged['Cluster Labels'] == 3]
df_4 = df_merged.loc[df_merged['Cluster Labels'] == 4]
df_5 = df_merged.loc[df_merged['Cluster Labels'] == 5]
df_6 = df_merged.loc[df_merged['Cluster Labels'] == 6]
df_7 = df_merged.loc[df_merged['Cluster Labels'] == 7]
df_8 = df_merged.loc[df_merged['Cluster Labels'] == 8]
df_9 = df_merged.loc[df_merged['Cluster Labels'] == 9]

In [164]:
df_3

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
54,Greater Fifth Ward,29.78,-95.32484,3,Park,Health Food Store,Zoo Exhibit,Food,Flower Shop
58,Clinton Park / Tri-Community,29.74322,-95.25751,3,Park,Food Court,Food,Flower Shop,Flea Market
71,South Park,29.66705,-95.32869,3,Park,Discount Store,Dry Cleaner,Food,Flower Shop
75,South Acres / Crestmont Park,29.6311,-95.35415,3,Park,Cosmetics Shop,Dry Cleaner,Food,Flower Shop
87,Lawndale / Wayside,29.7227,-95.30594,3,Park,Flower Shop,Food Court,Food,Flea Market
