# Data Science Clustering Project 

### In this project, I scrape a Wikipedia page and combine the scraped data with the Foursquare API to build a visual interpretation of the data using a machine learning algorithm called K-Means clustering.

#### Import All Libraries for this project

In [77]:
from bs4 import BeautifulSoup
import requests
import xml

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
#!conda install -c conda-forge folium
#!pip install folium
import folium

## PART 1: Scraping the Wikipedia Page

In [107]:
# Step 1: Download the Wikipedia page and parse its HTML with BeautifulSoup

URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()
response = page.text  # raw HTML of the page
soup = BeautifulSoup(response, 'lxml')
# print(soup) shows the parsed HTML if you need to inspect it

########################################################################################################################

# Step 2: Build a dataframe from the first table on the Wikipedia page

create_table = soup.find('table')
if create_table is None:
    # find() returns None when the page contains no <table>; fail with a clear message
    raise ValueError('No <table> element found at {}'.format(URL))

field = create_table.find_all('td')  # every cell of the table, in document order

# The table is read three cells at a time: postcode, borough, neighbourhood
cells = [cell.text.strip() for cell in field]
if len(cells) % 3 != 0:
    raise ValueError(
        'Expected 3 cells per table row, but found {} cells in total'.format(len(cells)))

# Every third cell, starting at offsets 0, 1 and 2, forms one column
postcode, borough, neighbourhood = cells[0::3], cells[1::3], cells[2::3]

w_df = pd.DataFrame(
    {'Postcode': postcode, 'Borough': borough, 'Neighbourhood': neighbourhood},
    columns=['Postcode', 'Borough', 'Neighbourhood'],
)
    
########################################################################################################################


# Step 3: Clean the data according to the assignment specification

# Discard every row whose borough is 'Not assigned' (w_df is modified in place)
unassigned_borough_rows = w_df[w_df['Borough'] == "Not assigned"].index
w_df.drop(unassigned_borough_rows, axis=0, inplace=True)

# A postal code may appear on several rows; merge them into a single row whose
# neighbourhoods are joined with ', '
neighbourhoods_by_code = w_df.groupby(['Postcode', 'Borough'])['Neighbourhood']
final_w_df = neighbourhoods_by_code.apply(', '.join).reset_index()

# Where a borough has no neighbourhood, reuse the borough name as the neighbourhood
missing_neighbourhood = final_w_df['Neighbourhood'] == "Not assigned"
final_w_df.loc[missing_neighbourhood, 'Neighbourhood'] = final_w_df.loc[missing_neighbourhood, 'Borough']

### The Resultant Dataframe is...

In [108]:
# Display the cleaned dataframe (one row per Postcode/Borough pair)
final_w_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [109]:
# (number of rows, number of columns) of the cleaned dataframe
final_w_df.shape

(103, 3)

## PART 2: Adding Latitude and Longitude Coordinates to each Postal Code

In [112]:
# The code was removed by Watson Studio for sharing.

### The Resultant Dataframe is...

In [115]:
# Display the dataframe that carries the Latitude and Longitude columns
# (presumably built in the cell removed above -- its code is not shown here)
coordinate_w_final_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [116]:
# (number of rows, number of columns) of the dataframe with coordinates
coordinate_w_final_df.shape

(103, 5)

## PART 3: Explore and Cluster Neighbourhoods in Toronto

In [135]:
# Create a map of Toronto with one marker per row of coordinate_w_final_df

address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match for the address
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

# centre the map on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row, labelled "<neighbourhood>, <borough>"
for lat, lng, borough, neighbourhood in zip(
        coordinate_w_final_df['Latitude'],
        coordinate_w_final_df['Longitude'],
        coordinate_w_final_df['Borough'],
        coordinate_w_final_df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): parse_html does not look like a CircleMarker option; it is
    # kept unchanged here -- confirm against the installed folium version.
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

# last expression of the cell: renders the map in the notebook
map_toronto

In [152]:
# The code was removed by Watson Studio for sharing.

In [None]:
# The code was removed by Watson Studio for sharing.

In [None]:
LIMIT = 100  # maximum number of venues requested from the Foursquare API

radius = 500  # search radius sent to the API around the given coordinates

# CLIENT_ID, CLIENT_SECRET, VERSION and the coordinates are not defined in the
# visible cells -- presumably they come from the cells removed above.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    coordinates_latitude,
    coordinates_longitude,
    radius,
    LIMIT)

# Time out instead of hanging on a stalled connection, and fail on an HTTP
# error status before trying to decode the body as JSON.
api_response = requests.get(url, timeout=30)
api_response.raise_for_status()
results = api_response.json()

def get_category_type(row):
    """Return the name of a venue's first category, or None if it has none.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose the venue's category list under the key ``'categories'``
        or, failing that, ``'venue.categories'``.

    Returns
    -------
    The ``'name'`` entry of the first category, or ``None`` when the
    category list is empty or ``None``.

    Raises
    ------
    KeyError
        If neither ``'categories'`` nor ``'venue.categories'`` is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        categories_list = row['venue.categories']

    if not categories_list:
        return None
    return categories_list[0]['name']
    
# The venue entries sit under response -> groups[0] -> items in the API reply
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON records into a dataframe with dotted column names
nearby_venues = json_normalize(venues)

# Keep only the name, category and coordinate columns
filtered_columns = [
    'venue.name',
    'venue.categories',
    'venue.location.lat',
    'venue.location.lng',
]
nearby_venues = nearby_venues[filtered_columns]

# Replace each row's category list with the name of its first category
first_category_names = nearby_venues.apply(get_category_type, axis=1)
nearby_venues['venue.categories'] = first_category_names

# Shorten every column name to the part after its last dot
nearby_venues.columns = [column.rsplit(".", 1)[-1] for column in nearby_venues.columns]

### First 5 Venues Returned for this Neighbourhood

In [150]:
# Show the first rows of the venue table (head() defaults to five rows)
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Mr Congee Chinese Cuisine 龍粥記,Chinese Restaurant,43.798879,-79.318335
1,Phoenix Restaurant 金鳳餐廳,Chinese Restaurant,43.798198,-79.318432
2,Subway,Sandwich Place,43.798983,-79.318838
3,Price Chopper,Grocery Store,43.799445,-79.318563
4,KFC,Fast Food Restaurant,43.798938,-79.318854
