# Applied Data Science Capstone

This notebook contains the deliverables for the Applied Data Science Capstone.

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import geocoder
import os
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

## Week 3 Capstone Deliverable

### Extracting dataset and build dataframe
After retrieving the page at the URL and creating a BeautifulSoup object:

First, create an empty list (`table_contents`) to hold the parsed rows.

Next, find the table and iterate over its `td` cells. For each cell, create a dictionary called `cell` with three keys: `PostalCode`, `Borough`, and `Neighborhood`.

Because each postal code is three characters long, extract it with `row.p.text[:3]`.

Next, use the `split`, `strip`, and `replace` string methods to extract the Borough and Neighborhood values, then append the dictionary to the list.

Finally, create a DataFrame from the list.

In [2]:
# Download the Wikipedia page that lists the postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of trying to parse an error page below.
response.raise_for_status()
html_data = response.text
soup = BeautifulSoup(html_data, 'html5lib')

# Every <td> of the first table carries the postal code in its <p> element and
# "Borough(Neighborhood / Neighborhood ...)" in its <span> element.
table_contents = []
table = soup.find('table')
for row in table.find_all('td'):
    span_text = row.span.text
    if span_text == 'Not assigned':
        continue  # unassigned postal codes are left out of the dataset

    parts = span_text.split('(')
    neighborhood = parts[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    table_contents.append({
        'PostalCode': row.p.text[:3],  # keep only the first three characters
        'Borough': parts[0],
        'Neighborhood': neighborhood,
    })

df = pd.DataFrame(table_contents)

# A few cells have extra text run together with the borough name; map those
# exact strings to readable borough labels.
df['Borough'] = df['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
})

df.shape

(103, 3)

### Append latitude / longitude to dataset

In [3]:
def getLatLong(postal_code, max_attempts=10):
    """Geocode a Toronto postal code with the Bing geocoder.

    The query sent is ``"<postal_code>, Toronto, Ontario"`` and the API key is
    read from the ``BING_API_KEY`` environment variable.

    Parameters
    ----------
    postal_code : str
        Postal code to look up.
    max_attempts : int, optional
        Maximum number of geocoder calls before giving up (default 10). The
        lookup is retried because a call may come back without coordinates.

    Returns
    -------
    list
        ``[latitude, longitude]`` taken from the first successful response.

    Raises
    ------
    RuntimeError
        If no call returned coordinates within ``max_attempts`` tries. This
        replaces an unbounded retry loop that could spin forever (for example
        when the API key is missing or rejected).
    """
    query = '{}, Toronto, Ontario'.format(postal_code)
    api_key = os.environ.get('BING_API_KEY')

    for _ in range(max_attempts):
        lat_lng_coords = geocoder.bing(query, key=api_key).latlng
        # A falsy value (None or an empty list) means this attempt failed.
        if lat_lng_coords:
            return [lat_lng_coords[0], lat_lng_coords[1]]

    raise RuntimeError(
        'Could not geocode {!r} after {} attempts; check BING_API_KEY and '
        'network access.'.format(query, max_attempts)
    )

# Geocode each postal code once, then attach the two coordinate columns.
coordinates = [getLatLong(postal_code) for postal_code in df['PostalCode']]
df['latitude'] = [pair[0] for pair in coordinates]
df['longitude'] = [pair[1] for pair in coordinates]

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.756123,-79.329636
1,M4A,North York,Victoria Village,43.72678,-79.310738
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654842,-79.365379
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.721996,-79.445915
4,M7A,Queen's Park,Ontario Provincial Government,43.66391,-79.388733


### Display Map with Neighborhoods Plotted as Markers

In [4]:
# Geocode the city itself to get the map centre. The query is a plain string:
# there is no postal code to substitute here, so no "{}" placeholder is sent.
MAX_GEOCODE_ATTEMPTS = 10
lat_lng_coords = None
for _ in range(MAX_GEOCODE_ATTEMPTS):
    g = geocoder.bing('Toronto, Ontario', key=os.environ.get('BING_API_KEY'))
    lat_lng_coords = g.latlng
    if lat_lng_coords:
        break

# Give up with a clear message instead of retrying forever.
if not lat_lng_coords:
    raise RuntimeError(
        'Could not geocode "Toronto, Ontario" after {} attempts; check '
        'BING_API_KEY and network access.'.format(MAX_GEOCODE_ATTEMPTS)
    )

map_toronto = folium.Map(location=[lat_lng_coords[0], lat_lng_coords[1]], zoom_start=11)

# add one circle marker per row of df, labelled with the neighborhood name
for index, row in df.iterrows():
    label = folium.Popup(row['Neighborhood'], parse_html=True)
    folium.CircleMarker(
        [row['latitude'], row['longitude']],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

### Poll Foursquare Data on Neighborhoods

In [5]:
# Foursquare credentials come from the environment so that no secret is stored
# in the notebook. Set FOURSQUARE_CLIENT and FOURSQUARE_SECRET before running.
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# and should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT')
CLIENT_SECRET = os.environ.get('FOURSQUARE_SECRET')
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    One request is made per ``(name, latitude, longitude)`` triple, using the
    module-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT``.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences giving the label and coordinates of each location.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        The frame has these columns even when no venue was returned.
        ``Venue Category`` is ``None`` for a venue whose category list is empty.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        groups = response.json()["response"]['groups']
        # An empty group list is treated as "no venues" rather than an error.
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # The first listed category is used; it may be absent.
                categories[0]['name'] if categories else None))

    # Passing the column names here keeps the schema stable for an empty result.
    return pd.DataFrame(venue_rows, columns=columns)

# Fetch the venues around every neighborhood in df.
toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                 latitudes=df['latitude'],
                                 longitudes=df['longitude'])

print(toronto_venues.shape)
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

# one hot encoding: one indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category may itself be called "Neighborhood". Rename that indicator
# first so that adding the neighborhood label below cannot overwrite it.
if 'Neighborhood' in toronto_onehot.columns:
    toronto_onehot = toronto_onehot.rename(
        columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# put the neighborhood label in the first column
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

#toronto_onehot.head()

(2581, 7)
There are 267 uniques categories.


### Process Data for K-means Clustering

In [6]:
# Average the one-hot rows per neighborhood: each value is the share of that
# neighborhood's venues falling in the given category.
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
num_top_venues = 5

# Print the most frequent categories of every neighborhood.
for _, grouped_row in toronto_grouped.iterrows():
    hood = grouped_row.iloc[0]
    print("----"+hood+"----")
    # Everything after the first entry (the neighborhood label) is a frequency.
    frequencies = grouped_row.iloc[1:].astype(float).round(2)
    temp = pd.DataFrame({'venue': frequencies.index, 'freq': frequencies.values})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')
    
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped (callers pass a row whose first
    entry is the neighborhood label). The remaining entries are sorted in
    descending order and the index labels of the first ``num_top_venues``
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 20

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    # Ranks ending in 1, 2 or 3 take "st"/"nd"/"rd", except 11-13, which take
    # "th". An explicit test replaces the bare except that used to pick "th".
    if rank % 10 in (1, 2, 3) and rank % 100 not in (11, 12, 13):
        suffix = indicators[rank % 10 - 1]
    else:
        suffix = 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill each row with that neighborhood's top categories, best first
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

----Agincourt----
                venue  freq
0  Chinese Restaurant   0.2
1                Pool   0.1
2        Intersection   0.1
3                Bank   0.1
4       Shopping Mall   0.1


----Alderwood, Long Branch----
                             venue  freq
0                   Breakfast Spot   1.0
1                      Opera House   0.0
2               Miscellaneous Shop   0.0
3       Modern European Restaurant   0.0
4  Molecular Gastronomy Restaurant   0.0


----Bathurst Manor, Wilson Heights, Downsview North----
                       venue  freq
0  Middle Eastern Restaurant  0.25
1   Mediterranean Restaurant  0.25
2                Pizza Place  0.25
3                       Park  0.25
4                     Museum  0.00


----Bayview Village----
          venue  freq
0       Dog Run   0.5
1  Tennis Court   0.5
2  Music School   0.0
3  Noodle House   0.0
4     Nightclub   0.0


----Bedford Park, Lawrence Manor East----
                venue  freq
0         Coffee Shop  0.12
1  Italia

4          Yoga Studio  0.00


----Lawrence Manor, Lawrence Heights----
               venue  freq
0        Coffee Shop  0.25
1  Convenience Store  0.12
2      Metro Station  0.12
3             Bakery  0.12
4           Platform  0.12


----Lawrence Park----
         venue  freq
0  Yoga Studio  0.11
1         Park  0.11
2         Pool  0.11
3     Bus Line  0.11
4         Bank  0.11


----Leaside----
                 venue  freq
0  Sporting Goods Shop  0.22
1                 Bank  0.22
2           Restaurant  0.11
3           Sports Bar  0.11
4     Sushi Restaurant  0.11


----Little Portugal, Trinity----
                           venue  freq
0                            Bar  0.10
1          Vietnamese Restaurant  0.05
2               Asian Restaurant  0.05
3  Vegetarian / Vegan Restaurant  0.05
4                    Yoga Studio  0.02


----Malvern, Rouge----
                  venue  freq
0            Hobby Shop  0.33
1         Design Studio  0.33
2  Fast Food Restaurant  0.33
3         

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,...,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue
0,Agincourt,Chinese Restaurant,Pool,Intersection,Bank,Shopping Mall,Skating Rink,Asian Restaurant,Noodle House,Supermarket,...,Office,Music Store,Music Venue,New American Restaurant,Movie Theater,Moroccan Restaurant,Nightclub,Monument / Landmark,Molecular Gastronomy Restaurant,Music School
1,"Alderwood, Long Branch",Breakfast Spot,Opera House,Miscellaneous Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Monument / Landmark,Moroccan Restaurant,Movie Theater,Museum,...,Music Store,Music Venue,New American Restaurant,Nightclub,Noodle House,Office,Optical Shop,Mexican Restaurant,Organic Grocery,Other Great Outdoors
2,"Bathurst Manor, Wilson Heights, Downsview North",Middle Eastern Restaurant,Mediterranean Restaurant,Pizza Place,Park,Museum,New American Restaurant,Music Venue,Music Store,Music School,...,Movie Theater,Noodle House,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop,Nightclub,Yoga Studio,Office,Mexican Restaurant
3,Bayview Village,Dog Run,Tennis Court,Music School,Noodle House,Nightclub,New American Restaurant,Music Venue,Music Store,Museum,...,Movie Theater,Moroccan Restaurant,Monument / Landmark,Molecular Gastronomy Restaurant,Modern European Restaurant,Miscellaneous Shop,Office,Yoga Studio,Opera House,Optical Shop
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Italian Restaurant,Sandwich Place,Women's Store,Bakery,Pizza Place,Pub,Comfort Food Restaurant,Restaurant,...,Sushi Restaurant,Liquor Store,Thai Restaurant,Grocery Store,Toy / Game Store,Cupcake Shop,Bagel Shop,Juice Bar,Wine Shop,American Restaurant


### Cluster Neighborhoods

In [7]:
# set number of clusters
# set number of clusters
kclusters = 10

# Cluster on the category frequencies only. The axis is named explicitly
# because recent pandas versions no longer accept it as a positional argument.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering
# n_init is pinned so the result does not depend on the scikit-learn version's
# default (the default was changed in newer releases -- confirm against the
# installed version).
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# add clustering labels; plain assignment (rather than insert) lets the cell
# be re-run without failing on an already existing column
neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_.astype(int)

# join the cluster label and top venues onto df, which carries the
# latitude/longitude of each neighborhood
toronto_merged = df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# create map centred on the coordinates geocoded for the first map
map_clusters = folium.Map(location=lat_lng_coords, zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for index, row in toronto_merged.iterrows():
    # Rows without a cluster label (no match in the join) are not drawn.
    if np.isnan(row['Cluster Labels']):
        continue
    cluster = int(row['Cluster Labels'])
    label = folium.Popup(str(row['Neighborhood']) + ' Cluster ' + str(cluster), parse_html=True)

    # Labels run from 0 to kclusters - 1, so they index the palette directly.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [row['latitude'], row['longitude']],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters