# IBM Final Capstone Project

## Using Data Science to find the best location to open a Japanese Restaurant in Toronto, CA

#### Start By importing and installing all the libraries

In [1]:
# Installing the libraries that have not yet been installed
!pip install folium
!pip install xlrd
!pip install lxml

# Importing the libraries
import requests
import pandas as pd
import numpy as np
import folium
import lxml

print("All Libraries Installed Successfully!")

All Libraries Installed Successfully!


## Getting the URL information of the postal codes of Canada and storing it into a dataframe

In [2]:
# Downloading the page into an lxml file which can be read into a pandas dataframe later
url  = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = requests.get(url)
if page.status_code == 200:
    print('Page downloaded successfully!')
else:
    print('Page download error. Error code: {}'.format(page.status_code))

Page downloaded successfully!


In [3]:
# Placing the downloaded page into a pandas dataframe
df_html = pd.read_html(url, header=0, na_values = ['Not assigned'])[0] # Changing the 'Not assigned' values to 'NaN'
df_html.dropna(subset=['Borough'], inplace=True) # Dropping the rows that have 'NaN' for their Borough

# Grouping the data by their postal codes and Borough information
df_postalcodes =pd.DataFrame( df_html.groupby(['Postal Code','Borough']).Neighborhood.agg([('Neighborhood', ', '.join)]))
df_postalcodes.reset_index(inplace=True) # Resetting the index back to 0 for the first entry
df_postalcodes.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [4]:
# Displaying the shape of the dataframe
print("The shape of the DataFrame is: ", df_postalcodes.shape)

The shape of the DataFrame is:  (103, 3)


In [5]:
# Converting the dataframe to a csv file to be used later
df_postalcodes.to_csv('Toronto_Postalcodes.csv')

## Converting latitude and longitude information of the neighborhoods into a new dataframe

In [6]:
url_csv = 'http://cocl.us/Geospatial_data' # URL to a CSV file for coordinate information 
df_coordinates = pd.read_csv(url_csv) # converting CSV into a new dataframe
df_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging the two dataframes together

In [7]:
df_neighborhoods = pd.read_csv('Toronto_Postalcodes.csv') # Storing the csv we created earlier into a new dataframe
df_neighborhoods_coordinates = pd.merge(df_neighborhoods, df_coordinates, on='Postal Code') # Merging the dataframes together
df_neighborhoods_coordinates.drop(columns=['Unnamed: 0'], inplace = True) # Dropping the index column that got carried over when merged

df_neighborhoods_coordinates.to_csv('Toronto_Postalcodes_2.csv') # Saving the dataframe to a csv for easy use

# Giving the merged dataframe an easy name like 'df'
df = pd.read_csv('Toronto_Postalcodes_2.csv', index_col=0)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Dropping all Boroughs that are not part of Toronto

In [8]:
df_toronto = df[df['Borough'].str.contains('Toronto')] # This removes all rows that do not have 'Toronto' in their Borough
df_toronto.reset_index(inplace=True) # Resetting the index to start at 0
df_toronto.drop('index', axis=1, inplace=True) # Dropping the 'index' column that was created
df_toronto.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [9]:
# Creating a list with the Boroughs (to be used later)
boroughs = df_toronto['Borough'].unique().tolist()

# Displaying the number of Neighborhoods in each Borough within Toronto
df_toronto.groupby('Borough').count()['Neighborhood']

Borough
Central Toronto      9
Downtown Toronto    19
East Toronto         5
West Toronto         6
Name: Neighborhood, dtype: int64

## Creating a Map with Folium

In [10]:
# Getting the average latitude and longitude of Tornoto
lat_toronto = df_toronto['Latitude'].mean()
lon_toronto = df_toronto['Longitude'].mean()

# Creating a dictionary to hold the Borough and its unique color
borough_color = {}
for borough in boroughs:
    borough_color[borough]= '#%02X%02X%02X' % tuple(np.random.choice(range(256), size=3))

In [11]:
# Creating the map object with folium
map_toronto = folium.Map(location=[lat_toronto, lon_toronto], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], 
                                           df_toronto['Longitude'],
                                           df_toronto['Borough'], 
                                           df_toronto['Neighborhood']):
    label_text = borough + ' - ' + neighborhood
    label = folium.Popup(label_text)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=borough_color[borough],
        fill_color=borough_color[borough],
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

## Accessing Foursquare API

In [12]:
# Creating variables to store Foursquare Credentials
CLIENT_ID = 'IAF2J3CCPZX050IKBLM0FP5Q5IDHVP1OL0WSRX3EMB5LAD5P' 
CLIENT_SECRET = 'TGPQBBAEFFGH4VTAQEZSXGMOOAGBZF15W1I0Z1W4SVXL1SZE' 
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # Limiting the number of venues returned by Foursquare API
radius = 500 # Radius of venues

In [13]:
# Defining a function to return nearby venues using the Foursquare API
# This function will store the nearby venues into a list
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
   
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [14]:
# Getting the nearby venues with the function defined above
toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                latitudes=df_toronto['Latitude'],
                                longitudes=df_toronto['Longitude'])

# Seeing if The unique ID 'Japanese Restaurant is found in toronto_venues['Venue Category']
if "Japanese Restaurant" in toronto_venues['Venue Category'].unique():
    print('\nJapanese Restaurant was FOUND in Venue Category!')
else:
    print('\nJapanese Restaurant was NOT found in Venue Category!')

The Beaches
The Danforth West, Riverdale
India Bazaar, The Beaches West
Studio District
Lawrence Park
Davisville North
North Toronto West, Lawrence Park
Davisville
Moore Park, Summerhill East
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
Rosedale
St. James Town, Cabbagetown
Church and Wellesley
Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North & West, Forest Hill Road Park
The Annex, North Midtown, Yorkville
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Stn A PO Boxes
First Canadian Place, Underground city
Christie
Dufferin, Dovercourt Village
Little Portugal, Trinity
Brockton, Parkdale Village, Exhibition Place
High 

## One-Hot encoding for the restaurant types per neighborhood

In [15]:
to_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
to_onehot['Neighborhoods'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [to_onehot.columns[-1]] + list(to_onehot.columns[:-1])
to_onehot = to_onehot[fixed_columns]

to_onehot.head()

Unnamed: 0,Neighborhoods,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Getting the average of the restaurants in the neighborhoods
to_grouped = to_onehot.groupby(["Neighborhoods"]).mean().reset_index()

to_grouped.head()

Unnamed: 0,Neighborhoods,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017544,...,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0625,0.0625,0.125,0.125,0.125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015152,0.0,0.0,0.015152,0.0,0.015152


### Getting the average Japanese Restaurant distribution per neighborhood

In [17]:
to_japanese = to_grouped[["Neighborhoods","Japanese Restaurant"]]

to_japanese.head()

Unnamed: 0,Neighborhoods,Japanese Restaurant
0,Berczy Park,0.017544
1,"Brockton, Parkdale Village, Exhibition Place",0.0
2,"Business reply mail Processing Centre, South C...",0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0
4,Central Bay Street,0.045455


## K-means Clustering Algorithm

In [18]:
from sklearn.cluster import KMeans

# Number of clusters
toclusters = 3

to_clustering = to_japanese.drop(["Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=toclusters, random_state=1)
kmeans.fit_transform(to_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20]

array([0, 1, 1, 1, 2, 1, 2, 0, 0, 1, 1, 2, 1, 0, 1, 1, 1, 0, 1, 0],
      dtype=int32)

In [19]:
# Merging the dataframes with the clustering labels
to_merged = to_japanese.copy()

# add clustering labels
to_merged["Cluster Labels"] = kmeans.labels_

# Renaming the 'Neighborhoods' column
to_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
to_merged.head()

Unnamed: 0,Neighborhood,Japanese Restaurant,Cluster Labels
0,Berczy Park,0.017544,0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,1
2,"Business reply mail Processing Centre, South C...",0.0,1
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,1
4,Central Bay Street,0.045455,2


In [20]:
# Joining the cluster dataframe with the venues dataframe
to_merged = to_merged.join(toronto_venues.set_index("Neighborhood"), on="Neighborhood")

# Sorting the 'Cluster Labels' Column
to_merged.sort_values(["Cluster Labels"], inplace=True)
to_merged.head()


Unnamed: 0,Neighborhood,Japanese Restaurant,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Berczy Park,0.017544,0,43.644771,-79.373306,LCBO,43.642944,-79.37244,Liquor Store
19,"Little Portugal, Trinity",0.022727,0,43.647927,-79.41975,Paris Paris Bar,43.649237,-79.421436,Wine Bar
19,"Little Portugal, Trinity",0.022727,0,43.647927,-79.41975,Bellwoods Brewery Bottle Shop,43.64712,-79.420044,Beer Store
19,"Little Portugal, Trinity",0.022727,0,43.647927,-79.41975,Pho Rua Vang (Golden Turtle),43.646893,-79.419778,Vietnamese Restaurant
19,"Little Portugal, Trinity",0.022727,0,43.647927,-79.41975,Mamakas Taverna,43.645908,-79.419654,Greek Restaurant


## Displaying a map of the Japanese Restaurant Clusters

In [21]:
# Creating a map using folium to display the clusters
map_clusters = folium.Map(location=[lat_toronto, lon_toronto],zoom_start=12)

# add markers to the map
markers_colors={}
markers_colors[0] = 'red'
markers_colors[1] = 'blue'
markers_colors[2] = 'green'
markers_colors[3] = 'yellow'
markers_colors[4] = 'cyan'
markers_colors[5] = 'black'
for lat, lon, cluster in zip(to_merged['Neighborhood Latitude'], to_merged['Neighborhood Longitude'], to_merged['Cluster Labels']):
    
    
    folium.CircleMarker(
        [lat, lon],
        radius=5,
       
        color =markers_colors[cluster],
        fill_color=markers_colors[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Analyzing each Cluster

In [22]:
#Cluster 0
to_merged.loc[(to_merged['Cluster Labels'] ==0) & (to_merged['Venue Category'] == 'Japanese Restaurant') ]

Unnamed: 0,Neighborhood,Japanese Restaurant,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
19,"Little Portugal, Trinity",0.022727,0,43.647927,-79.41975,Bazara,43.648535,-79.420521,Japanese Restaurant
17,"Kensington Market, Chinatown, Grange Park",0.015152,0,43.653206,-79.400049,Gushi,43.652258,-79.404884,Japanese Restaurant
29,St. James Town,0.023256,0,43.651494,-79.375418,Gyu-Kaku Japanese BBQ,43.651422,-79.375047,Japanese Restaurant
29,St. James Town,0.023256,0,43.651494,-79.375418,NAMI,43.650853,-79.375887,Japanese Restaurant
13,"Garden District, Ryerson",0.03,0,43.657162,-79.378937,KAKA,43.657457,-79.384192,Japanese Restaurant
13,"Garden District, Ryerson",0.03,0,43.657162,-79.378937,Kinka Izakaya Original,43.660596,-79.378891,Japanese Restaurant
13,"Garden District, Ryerson",0.03,0,43.657162,-79.378937,Katsuya,43.65986,-79.378788,Japanese Restaurant
37,"Toronto Dominion Centre, Design Exchange",0.03,0,43.647177,-79.381576,Chotto Matte,43.646473,-79.378782,Japanese Restaurant
37,"Toronto Dominion Centre, Design Exchange",0.03,0,43.647177,-79.381576,Ki Modern Japanese + Bar,43.647223,-79.379374,Japanese Restaurant
31,Stn A PO Boxes,0.030612,0,43.646435,-79.374846,NAMI,43.650853,-79.375887,Japanese Restaurant


In [23]:
#Cluster 1
to_merged.loc[(to_merged['Cluster Labels'] ==1) & (to_merged['Venue Category'] == 'Indian Restaurant') ]

Unnamed: 0,Neighborhood,Japanese Restaurant,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
36,"The Danforth West, Riverdale",0.0,1,43.679557,-79.352188,Sher-E-Punjab,43.677308,-79.353066,Indian Restaurant
34,"The Annex, North Midtown, Yorkville",0.0,1,43.67271,-79.405678,Roti Cuisine of India,43.674618,-79.408249,Indian Restaurant
14,"Harbourfront East, Union Station, Toronto Islands",0.01,1,43.640816,-79.381752,Indian Roti House,43.63906,-79.385422,Indian Restaurant


In [24]:
#Cluster 2
to_merged.loc[(to_merged['Cluster Labels'] ==2) & (to_merged['Venue Category'] == 'Indian Restaurant') ]

Unnamed: 0,Neighborhood,Japanese Restaurant,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
6,Church and Wellesley,0.055556,2,43.66586,-79.38316,Kothur Indian Cuisine,43.667872,-79.385659,Indian Restaurant
4,Central Bay Street,0.045455,2,43.657952,-79.387383,Colaba Junction,43.66094,-79.385635,Indian Restaurant


# Quick Analysis

##### Most of the Japanese Restaurants are located in Cluster 0. I would Recommend that a Restaurant open in Cluster 2 near Central Bay Street.

#### This is because Cluster 2 has the least amount of Japanese restaurants.