## Implementation of the final project for the IBM Data Science Capstone course

### Importing libraries

In [91]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
import numpy as np
# !conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

### Reading the Wikipedia page

In [15]:
# Download the Wikipedia page that lists the "M" postal codes and parse it
from bs4 import BeautifulSoup
import requests

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection
page = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

# Getting the element with the class whose data we need to display
class_data = soup.find(class_="wikitable sortable")
if class_data is None:
    # find() returns None when nothing matches; fail with a clear message
    raise ValueError("No element with class 'wikitable sortable' found at {}".format(url))

### Creating the dataframe from the Wikipedia page

In [195]:
# Column names of the resulting dataframe. Later cells refer to 'Postcode',
# 'Borough' and 'Neighborhood' by these exact names.
cols = ['Postcode', 'Borough', 'Neighborhood']

# postcode -> {'Borough': str, 'Neighbourhood': [str, ...]}
main_dict = {}
IGNORE = 'Not assigned'

# The first <tr> is skipped; every other row is read as
# postcode / borough / neighbourhood cells.
for tr in class_data.find_all('tr')[1:]:
    tds = tr.find_all('td')
    postcode = tds[0].string

    # Use the text of the first link in the cell when there is one,
    # otherwise the cell's own text.
    borough_links = tds[1].find_all('a')
    if borough_links:
        borough = borough_links[0].string
    else:
        borough = tds[1].string
        if borough == IGNORE:
            # Rows without an assigned borough are dropped entirely
            continue

    neighbourhood_links = tds[2].find_all('a')
    if neighbourhood_links:
        neighbourhood = neighbourhood_links[0].string
    else:
        neighbourhood = tds[2].string.rstrip()
        # An unassigned neighbourhood falls back to the borough name
        neighbourhood = borough if neighbourhood == IGNORE else neighbourhood

    postcode, borough, neighbourhood = [x.rstrip() for x in (postcode, borough, neighbourhood)]

    # The first row seen for a postcode fixes its borough; later rows only
    # contribute additional neighbourhoods.
    entry = main_dict.setdefault(postcode, {'Borough': borough, 'Neighbourhood': []})
    entry['Neighbourhood'].append(neighbourhood)

# One list per output column, filled in postcode insertion order
df_dict = {col: [] for col in cols}
for key, entry in main_dict.items():
    df_dict[cols[0]].append(key)
    df_dict[cols[1]].append(entry['Borough'])
    # Neighbourhoods are joined in reverse order of appearance
    df_dict[cols[2]].append(','.join(entry['Neighbourhood'][::-1]))

df = pd.DataFrame(df_dict)
df.head(10)
  

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Manor,Lawrence Heights"
4,M7A,Queen's Park,Queen's Park
5,M9A,Downtown Toronto,Queen's Park
6,M1B,Scarborough,"Malvern,Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill,Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District,Ryerson"


### Reading latitude and longitude from a locally stored file

In [20]:
# Load the postal code -> latitude/longitude table from a CSV file that sits
# next to the notebook.
filename = 'Geospatial_Coordinates.csv'
location_df = pd.read_csv(filepath_or_buffer=filename)
location_df.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Latitude and Longitude columns inserted on dataframe

In [22]:
# Attach the coordinates to each postcode row. The join keys are 'Postcode'
# (left) and 'Postal Code' (right); the right-hand key column is dropped
# afterwards.
df_lats = pd.merge(
    df,
    location_df,
    left_on='Postcode',
    right_on='Postal Code',
).drop(columns='Postal Code')
print(df_lats.head())

  Postcode           Borough                     Neighborhood   Latitude  \
0      M3A        North York                        Parkwoods  43.753259   
1      M4A        North York                 Victoria Village  43.725882   
2      M5A  Downtown Toronto                     Harbourfront  43.654260   
3      M6A        North York  Lawrence Manor,Lawrence Heights  43.718518   
4      M7A      Queen's Park                     Queen's Park  43.662301   

   Longitude  
0 -79.329656  
1 -79.315572  
2 -79.360636  
3 -79.464763  
4 -79.389494  


### Getting the Toronto latitude and longitude from the Nominatim geocoder

In [92]:
# Getting the Toronto latitude and longitude
address = 'Toronto, ON'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a
# clear message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map: one blue circle per row of df_lats, with
# "<neighborhood>, <borough>" as the popup text
for lat, lng, borough, neighborhood in zip(df_lats['Latitude'], df_lats['Longitude'], df_lats['Borough'], df_lats['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Filtering the dataframe for boroughs that have "Toronto" in their name

In [44]:
# Keep only the rows whose borough name contains "toronto", ignoring case.
# reset_index() is called without drop=True, so the previous row labels are
# kept in a new 'index' column.
is_toronto_borough = pd.Series(
    ['toronto' in borough_name.lower() for borough_name in df_lats['Borough']],
    index=df_lats.index,
    dtype=bool,
)
onlyToronto = df_lats[is_toronto_borough].reset_index()
print(onlyToronto.head(), onlyToronto.shape)

   index Postcode           Borough             Neighborhood   Latitude  \
0      2      M5A  Downtown Toronto             Harbourfront  43.654260   
1      5      M9A  Downtown Toronto             Queen's Park  43.667856   
2      9      M5B  Downtown Toronto  Garden District,Ryerson  43.657162   
3     15      M5C  Downtown Toronto           St. James Town  43.651494   
4     19      M4E      East Toronto              The Beaches  43.676357   

   Longitude  
0 -79.360636  
1 -79.532242  
2 -79.378937  
3 -79.375418  
4 -79.293031   (39, 6)


### Creating a Folium map to display all datapoints with Borough that contains "Toronto"

In [108]:
# Map centred on the Toronto latitude/longitude, showing only the rows of
# onlyToronto (boroughs whose name contains "Toronto").
map_only_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Options shared by every marker on this map
marker_style = dict(
    radius=5,
    color='red',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per neighbourhood, with the neighbourhood name as popup
marker_rows = zip(onlyToronto['Latitude'], onlyToronto['Longitude'], onlyToronto['Neighborhood'])
for lat, lng, label in marker_rows:
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=label, **marker_style)
    marker.add_to(map_only_toronto)

map_only_toronto

### Credentials for accessing Foursquare

import os
import warnings

base_url = "https://api.foursquare.com/v2/"

# Credentials are read from the environment so that secrets are never stored
# in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# exposed in the notebook and should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'requests to the Foursquare API will be rejected.'
    )
VERSION = '20180605' # Foursquare API version


### Method to create url

In [198]:
def create_url(row, radius=500, limit=100):
    """Build the Foursquare "venues/explore" request URL for one location.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'Latitude' and 'Longitude'.
    radius : int, optional
        Value placed in the `radius` query parameter (default 500).
    limit : int, optional
        Value placed in the `limit` query parameter (default 100).

    Returns
    -------
    str
        The request URL, including the notebook-level CLIENT_ID,
        CLIENT_SECRET and VERSION values.
    """
    url_template = (
        'https://api.foursquare.com/v2/venues/explore'
        '?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
    )
    return url_template.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        row['Latitude'],
        row['Longitude'],
        radius,
        limit)
# Smoke test: request the venues around the first Toronto row and show how
# many top-level keys the JSON reply contains.
first_row = onlyToronto.loc[0]
url = create_url(first_row)
response = requests.get(url)
results = response.json()
print(len(results))

2


### Extracts the categories from Foursquare results

In [95]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue.

    `row` is indexed with the key 'categories' first; when that key is
    missing, 'venue.categories' is used instead.

    Returns None when the category list is empty. A KeyError is raised when
    neither key is present; any other error raised while indexing `row`
    is propagated unchanged rather than being swallowed.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

### Fetches the nearby venues for each latitude and longitude

In [55]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100, timeout=30):
    """Query Foursquare "venues/explore" for every location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        One name / latitude / longitude triple per location to query.
    radius : int, optional
        Value placed in the `radius` query parameter (default 500).
    limit : int, optional
        Value placed in the `limit` query parameter (default 100).
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but still has these columns, when no venue
        is returned.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status code.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            limit)

        # make the GET request; surface HTTP errors here instead of failing
        # later with a KeyError on the error payload
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        groups = response.json()["response"].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # A venue may come back without any category; record None then
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows
    return pd.DataFrame(venue_rows, columns=column_names)

### Reading all venues

In [159]:
# Fetch the venues around every Toronto neighbourhood
only_toronto_venues = getNearbyVenues(
    names=onlyToronto['Neighborhood'],
    latitudes=onlyToronto['Latitude'],
    longitudes=onlyToronto['Longitude'],
)
print(only_toronto_venues.head())

# Venues whose category is literally 'Neighborhood' are shown once and then
# removed from the frame.
is_neighborhood_category = only_toronto_venues['Venue Category'] == 'Neighborhood'
print(only_toronto_venues[is_neighborhood_category].reset_index())
only_toronto_venues = only_toronto_venues[~is_neighborhood_category].reset_index()

   Neighborhood  Neighborhood Latitude  Neighborhood Longitude  \
0  Harbourfront               43.65426              -79.360636   
1  Harbourfront               43.65426              -79.360636   
2  Harbourfront               43.65426              -79.360636   
3  Harbourfront               43.65426              -79.360636   
4  Harbourfront               43.65426              -79.360636   

                    Venue  Venue Latitude  Venue Longitude  \
0        Roselle Desserts       43.653447       -79.362017   
1           Tandem Coffee       43.653559       -79.361809   
2  Cooper Koo Family YMCA       43.653191       -79.357947   
3     Body Blitz Spa East       43.654735       -79.359874   
4      Morning Glory Cafe       43.653947       -79.361149   

         Venue Category  
0                Bakery  
1           Coffee Shop  
2  Gym / Fitness Center  
3                   Spa  
4        Breakfast Spot  
   index                                     Neighborhood  \
0    251     

### Adding one hot encoding for all venue types

In [158]:
# One-hot encode the venue categories: one indicator column per category,
# named after the category itself (no prefix).
venue_categories = only_toronto_venues[['Venue Category']]
only_toronto_onehot = pd.get_dummies(venue_categories, prefix="", prefix_sep="")

# Put the neighbourhood name in front so the rows can be grouped by it
only_toronto_onehot.insert(loc=0, column='Neighborhood', value=only_toronto_venues['Neighborhood'])

# Mean of each indicator per neighbourhood
grouped_by_neighborhood = only_toronto_onehot.groupby('Neighborhood')
only_toronto_grouped = grouped_by_neighborhood.mean().reset_index()

print(only_toronto_grouped.shape)

(38, 236)


### Function to return 'n' most common venues 

In [115]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in `row`, skipping its first entry.

    The first element of `row` is left out, the remaining values are sorted
    in descending order, and the index labels of the first `num_top_venues`
    of them are returned as an array.
    """
    return (
        row.iloc[1:]
        .sort_values(ascending=False)
        .index
        .values[:num_top_venues]
    )

In [199]:
### Dataframe with most common venues for each neighborhood

In [200]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


def _ordinal(n):
    """Return n followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    last_two, last_one = n % 100, n % 10
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3
    if 11 <= last_two <= 13 or not 1 <= last_one <= 3:
        suffix = 'th'
    else:
        suffix = indicators[last_one - 1]
    return '{}{}'.format(n, suffix)


# create columns according to number of top venues
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# Rank the venue categories of every neighbourhood, then build the dataframe
# in one go instead of filling it row by row.
top_venue_rows = [
    return_most_common_venues(only_toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(only_toronto_grouped.shape[0])
]
only_toronto_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns[1:])
only_toronto_venues_sorted.insert(0, 'Neighborhood', only_toronto_grouped['Neighborhood'].values)

only_toronto_venues_sorted.head()


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Café,Bakery,Cheese Shop,Farmers Market,Seafood Restaurant,Beer Bar,French Restaurant,Steakhouse
1,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Auto Workshop,Pizza Place,Recording Studio,Restaurant,Butcher,Burrito Place,Brewery,Skate Park
2,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Burger Joint,Ice Cream Shop,Japanese Restaurant,Bakery,Salad Place,Bubble Tea Shop
3,Christie,Grocery Store,Café,Park,Baby Store,Candy Store,Diner,Restaurant,Athletics & Sports,Nightclub,Italian Restaurant
4,Church and Wellesley,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Pub,Men's Store,Mediterranean Restaurant,Hotel,Gym


### Function to observe clusters as map using K Means algorithm

In [204]:
def observe_clusters(kclusters=6, seed=0):
    """Cluster the neighbourhoods with k-means and draw the clusters on a map.

    Reads the notebook-level objects `only_toronto_grouped`,
    `only_toronto_venues_sorted`, `onlyToronto`, `latitude` and `longitude`.
    The dataframes are not modified: labels are attached to a copy.

    Parameters
    ----------
    kclusters : int, optional
        Number of clusters passed to KMeans (default 6).
    seed : int, optional
        `random_state` passed to KMeans (default 0).

    Returns
    -------
    folium.Map
        Map centred on (`latitude`, `longitude`) with one circle marker per
        merged neighbourhood row, coloured by cluster label.
    """
    # Feature matrix: every column except the neighbourhood name
    only_toronto_grouped_clustering = only_toronto_grouped.drop(columns='Neighborhood')

    # run k-means clustering
    kmeans = KMeans(n_clusters=kclusters, random_state=seed).fit(only_toronto_grouped_clustering)

    # check cluster labels generated for each row in the dataframe
    print(kmeans.labels_)

    # Attach the fresh labels to a copy, replacing any 'Cluster Labels'
    # column that is already present so stale labels are never reused.
    only_toronto_venues_sorted_copied = only_toronto_venues_sorted.copy()
    if 'Cluster Labels' in only_toronto_venues_sorted_copied.columns:
        only_toronto_venues_sorted_copied = only_toronto_venues_sorted_copied.drop(columns='Cluster Labels')
    only_toronto_venues_sorted_copied.insert(0, 'Cluster Labels', kmeans.labels_)

    only_toronto_merged = onlyToronto.merge(only_toronto_venues_sorted_copied, on='Neighborhood')
    map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

    # One evenly spaced rainbow colour per cluster, as hex strings
    colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
    rainbow = [colors.rgb2hex(i) for i in colors_array]

    for lat, lon, poi, cluster in zip(only_toronto_merged['Latitude'], only_toronto_merged['Longitude'], only_toronto_merged['Neighborhood'], only_toronto_merged['Cluster Labels']):
        label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
        # Labels run from 0 to kclusters-1, so they index the palette directly
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[cluster],
            fill=True,
            fill_color=rainbow[cluster],
            fill_opacity=0.7).add_to(map_clusters)

    return map_clusters
# Render the cluster map for five clusters with a fixed seed
observe_clusters(seed=13, kclusters=5)

[0 4 0 0 0 0 4 0 4 0 0 4 0 2 0 0 0 0 0 1 3 4 0 0 0 0 1 0 0 4 0 0 0 0 0 0 0
 0]
