# Coursera Capstone
This notebook will be mainly used for the capstone project of Coursera

## Import libraries

In [79]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML

# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import folium # plotting library
import matplotlib.cm as cm
import matplotlib.colors as colors

from bs4 import BeautifulSoup # web scraping library

import pgeocode

# import k-means from clustering stage
from sklearn.cluster import KMeans

import config #credentials

print('Libraries imported.')

Libraries imported.


In [80]:
CLIENT_ID = config.access['CLIENT_ID']
CLIENT_SECRET = config.access['CLIENT_SECRET']
ACCESS_TOKEN = config.access['ACCESS_TOKEN']
 # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 30

## Question 1: Scrape Wikipedia page

In this week's assignment, we would like to scrape a table from Wikipedia and convert it into a Pandas dataframe. <br>
Step 1 is to initiate a html parser. I chose to use the BeautifulSoup package.

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

wiki_page_request = requests.get(url)
wiki_page_text = wiki_page_request.text

soup = BeautifulSoup(wiki_page_text, 'html.parser')

Step 2 is to clean up the table, by merging neighborhoods with the same postal code and replace missing values

In [4]:
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

# print(table_contents)
df=pd.DataFrame(table_contents)
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest',
                                             'East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [5]:
df.shape

(103, 3)

## Question 2: Add LatLongs to the dataframe

The package pgeocode is used to obtain the latlongs based on the postal code (the other packages were not working for me. The latlongs are added to the original dataframe

In [6]:
pgeocode.Nominatim('ca')
geolocator = pgeocode.Nominatim('ca')
postal_codes = df['PostalCode'].tolist()
latitudes = []
longitudes = []
for i, postal_code in enumerate(postal_codes):
    # initialize your variable to None
    #print(f'--Getting Postal Code: {postal_code}')
    g = geolocator.query_postal_code(postal_code)
    
    if not g.empty:
        #print(f'Postal Code {postal_code} has been retrieved. {len(postal_codes) - (i + 1)} codes left')
        latitudes.append(g.latitude)
        longitudes.append(g.longitude)
        
df['Latitude'] = latitudes
df['Longitude'] = longitudes

In [7]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889


## Question 3: Explore and cluster the neighborhoods

We work with only Boroughs that contain the word Toronto 

In [8]:
df_selected = df.loc[df.Borough.str.contains('Toronto')]
df_selected.reset_index(drop=True, inplace = True)

# Let's give labels to the borough names

df_selected['Borough'] = df_selected['Borough'].astype('category')
df_selected['Label'] = df_selected['Borough'].cat.codes

borough_names = df_selected[['Borough', 'Label']].drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [9]:
df_selected['Borough'].value_counts()

Downtown Toronto          18
Central Toronto            9
West Toronto               6
East Toronto               4
East York/East Toronto     1
East Toronto Business      1
Name: Borough, dtype: int64

In [10]:
color_list = [
    'red',
    'blue',
    'gray',
    'orange',
    'beige',
    'green',
    'purple',
    'cadetblue',
    'black',
    'pink'
]

In [11]:
#for set the cluster number as label number
kclusters=len(df_selected.Label.unique())

latitude_tor, longitude_tor = df_selected[['Latitude', 'Longitude']].mean().values

# create map
map_toronto = folium.Map(location=[latitude_tor, longitude_tor], zoom_start=12)


# add markers to the map
markers_colors = []
for lat, lon, cluster in zip(df_selected['Latitude'], df_selected['Longitude'], df_selected['Label']):
    label = folium.Popup(str(df_selected['Borough']) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=color_list[cluster],
        fill=True,
        fill_color=color_list[cluster],
        fill_opacity=0.7).add_to(map_toronto)

In [12]:
map_toronto

In [13]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [14]:
df_venues = getNearbyVenues(names=df_selected['Neighborhood'],
                                   latitudes=df_selected['Latitude'],
                                   longitudes=df_selected['Longitude']
                                  )

Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
The Danforth  East
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
High Park, The Junction South
North Toronto West
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Enclave of M5E
St. James Town, Cabbagetown
First Canadi

In [15]:
print(df_venues.shape)
df_venues.head()

(743, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.6555,-79.3626,Tandem Coffee,43.653559,-79.361809,Coffee Shop
1,"Regent Park, Harbourfront",43.6555,-79.3626,Roselle Desserts,43.653447,-79.362017,Bakery
2,"Regent Park, Harbourfront",43.6555,-79.3626,Souvlaki Express,43.655584,-79.364438,Greek Restaurant
3,"Regent Park, Harbourfront",43.6555,-79.3626,Berkeley Church,43.655123,-79.365873,Event Space
4,"Regent Park, Harbourfront",43.6555,-79.3626,Figs Breakfast & Lunch,43.655675,-79.364503,Breakfast Spot


#### Calculate relative share of occurences of venue categories per neighborhood

Create a matrix with count of venue category per neighborhood

In [16]:
df_occurences = df_venues.groupby(['Neighborhood', 'Venue Category']).count().Venue.unstack('Venue Category').fillna(0)

Divide by total count of venue categories per neighborhood to standardize the data

In [17]:
df_share = df_occurences.div(df_occurences.sum(axis=1), axis=0)

#### Select only the top 10 most occuring venue categories to be included in the clustering

In [50]:
top10 = df_share.sum().nlargest(10)
top10_names = top10.index.values
print(top10_names)

['Park' 'Coffee Shop' 'Café' 'Restaurant' 'Grocery Store'
 'Residential Building (Apartment / Condo)' 'Bakery' 'Gym'
 'Sandwich Place' 'Pub']


#### Add the shares of the top 10 venue categories to the original dataframe

In [51]:
df_combined = pd.merge(df_selected, df_share[top10_names], left_on = 'Neighborhood', right_index = True, how ='outer')
df_combined.set_index('Neighborhood', inplace = True)

In [52]:
# features = np.append([ 'Longitude', 'Latitude'], top20_names)


In [53]:
features = top10_names

X = df_combined[features].fillna(0)
kmeans = KMeans(n_clusters=5, random_state=0).fit(X)

In [54]:
df_combined.insert(0, 'Cluster Labels', kmeans.labels_)
df_combined.reset_index(inplace = True)

In [55]:
# create map
map_clusters = folium.Map(location=[latitude_tor, longitude_tor], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_combined['Latitude'], df_combined['Longitude'], df_combined['Neighborhood'], df_combined['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters