# IBM Data Science Capstone

### Initially we will import all the required modules

In [171]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

#### Using requests module we will scrape the following link:
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M <br>
After that we will fetch only the data.

In [172]:
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

#### Using BeautifulSoup I have processed the html data which has been stored in soup variable.

In [173]:
soup = BeautifulSoup(data,'lxml')

#### Below we have use find function to find the table tag
And Then we have use <b>read_html</b> function to read the html table into Dataframe<br>
Since the function returns a list therefore we have taken the first table from the list.

In [174]:
table = soup.find('table')
df = pd.read_html(str(table))
df = df[0]
df = df.rename(columns=df.iloc[0]).drop(df.index[0])
print(df.shape)
df.head(10)

(288, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
10,M8A,Not assigned,Not assigned


#### All the rows where Borough is  Not Assigned have been droped.
#### Futhermore the Not Assigned value have been replaced with the Borough value respectively

In [175]:
df = df.replace('Not assigned',np.nan)
df.dropna(subset = ['Borough'],inplace=True)
df['Neighbourhood'] = df['Neighbourhood'].fillna(df['Borough'])
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


In [176]:
df.shape

(211, 3)

### Since we need Latitude and Longitude for respective postal code so we will be joining two Dataframes

In [177]:
lat = pd.read_csv('Geospatial_Coordinates.csv')
lat.rename(columns={'Postal Code': 'Postcode'}, inplace=True)
lat.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Resultant DataFrame

In [178]:
result = pd.merge(df, lat,on = 'Postcode')
result.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763


### We are filtering the data to get the data of Toronto city only

In [179]:
toronto_data = result[result['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418


## Now is the time to make the clusters

### I am going to cluster the data on the basis of number of Restraunt in the respective Area.
#### Now to get the data of all the restraunts in the toronto I will be using FourSquare Api

In [180]:
CLIENT_ID = '00IVVZKELM2KPVHLPFMUXMY5QZHDGZPX5SSB4HURZJ2KKKMT' # your Foursquare ID
CLIENT_SECRET = 'ZZQNWZTOEZSYA2MXCQBU12OIELDNEQQH3YKKJBDNIWYZ0ZF4' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [181]:
#Max number of venues within radius
LIMIT = 100
radius=500
def getRestaurant(names, latitudes, longitudes, category):
    
    Restaurant_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&categoryId={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius,
            category,
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['venues']

        # return only relevant information for each nearby venue
        Restaurant_list.append([(
            name, 
            lat, 
            lng, 
            v['name'], 
            v['location']['lat'], 
            v['location']['lng'],  
            v['categories'][0]['name']) for v in results])
        

    nearby_Restaurant = pd.DataFrame([item for Restaurant in Restaurant_list for item in Restaurant])
    nearby_Restaurant.columns = ['Postal Code', 
                  'Postal Code Latitude', 
                  'Postal Code Longitude', 
                  'Venue', 
                  'Restaurant Latitude', 
                  'Restaurant Longitude', 
                  'Category']
    return(nearby_Restaurant)

Restaurant = '4bf58dd8d48988d1c4941735'#category code for cafe

toronto_Restaurants = getRestaurant(names=toronto_data['Postcode'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude'],
                                   category=Restaurant
                                  )


In [182]:
toronto_Restaurants.head(20)

Unnamed: 0,Postal Code,Postal Code Latitude,Postal Code Longitude,Venue,Restaurant Latitude,Restaurant Longitude,Category
0,M5A,43.65426,-79.360636,Dominion Pub and Kitchen,43.656919,-79.358967,Pub
1,M5A,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant
2,M5A,43.65426,-79.360636,Flame Shack,43.656844,-79.358917,Restaurant
3,M5A,43.65426,-79.360636,The Cannery - Bldg 58,43.649952,-79.359086,Restaurant
4,M5A,43.65426,-79.360636,Mom and Pops,43.64998,-79.363883,Restaurant
5,M5A,43.65426,-79.360636,Pure Pizza & Burger,43.659454,-79.366029,Restaurant
6,M5A,43.65426,-79.360636,Beyond Food,43.653094,-79.365379,Restaurant
7,M5A,43.65426,-79.360636,Dominion Pub and Kitchen,43.656919,-79.358967,Pub
8,M5A,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant
9,M5A,43.65426,-79.360636,Flame Shack,43.656844,-79.358917,Restaurant


In [183]:
toronto_Restaurants = toronto_Restaurants.groupby(['Postal Code']).size().reset_index(name='Restaurants')

#Add in neighborhood names for convenience
toronto_Restaurants = toronto_Restaurants.set_index('Postal Code').join(result.set_index('Postcode'))
# toronto_Restaurants.reset_index(inplace=True)
toronto_Restaurants

Unnamed: 0,Restaurants,Borough,Neighbourhood,Latitude,Longitude
M4X,12,Downtown Toronto,Cabbagetown,43.667967,-79.367675
M4X,12,Downtown Toronto,St. James Town,43.667967,-79.367675
M4Y,24,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
M5A,14,Downtown Toronto,Harbourfront,43.65426,-79.360636
M5A,14,Downtown Toronto,Regent Park,43.65426,-79.360636
M5B,74,Downtown Toronto,Ryerson,43.657162,-79.378937
M5B,74,Downtown Toronto,Garden District,43.657162,-79.378937
M5C,44,Downtown Toronto,St. James Town,43.651494,-79.375418
M5E,29,Downtown Toronto,Berczy Park,43.644771,-79.373306
M5G,34,Downtown Toronto,Central Bay Street,43.657952,-79.387383


In [184]:
# set number of clusters
kclusters = 5

manhattan_grouped_clustering = toronto_Restaurants[['Latitude','Longitude']]

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
toronto_Restaurants['Cluster Labels'] = kmeans.labels_

In [185]:
toronto_rest

Unnamed: 0,index,Restaurants,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels
0,M4X,12,Downtown Toronto,Cabbagetown,43.667967,-79.367675,4
1,M4X,12,Downtown Toronto,St. James Town,43.667967,-79.367675,4
2,M4Y,24,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,3
3,M5A,14,Downtown Toronto,Harbourfront,43.65426,-79.360636,4
4,M5A,14,Downtown Toronto,Regent Park,43.65426,-79.360636,4
5,M5B,74,Downtown Toronto,Ryerson,43.657162,-79.378937,3
6,M5B,74,Downtown Toronto,Garden District,43.657162,-79.378937,3
7,M5C,44,Downtown Toronto,St. James Town,43.651494,-79.375418,0
8,M5E,29,Downtown Toronto,Berczy Park,43.644771,-79.373306,0
9,M5G,34,Downtown Toronto,Central Bay Street,43.657952,-79.387383,3
