# Explore and cluster the neighborhoods in Toronto
## Scrape the Wiki page to get the list of PostalCode, Borough, and Neighborhood
### Only processing the cells that have an assigned borough. Ignoring the cells with a borough that is Not assigned
### cancating the neighborhoods based on postal code, seperated by ','
### print heads of dataframes
### Analyze neighborhoods details like we did in lab (Newyork, NY)

Let's start with importing the packages. we will use wiki pakage to scrape details from wiki and will use geopy to get latitude and longitude.. 


In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

!conda install -c conda-forge wikipedia
import wikipedia as wp

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
geopy                     1.18.1                     py_0    conda-forge
Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.5.0                      py_0    conda-forge
Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
wikipedia                 1.4.0                    py35_0    conda-forge
Libraries imported.


# Scraping Wiki to prepare Toronto dataframe

#### Tranform the data into a *pandas* dataframe

#### Borough and Neighbourhoods inforamtion extracted from Wiki, but there is no latitude and longitude information.  To get the latitude and longitude values of Borough and Neighbourhood of Toronto use geopy library and update each row of dataset


Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.


In [2]:
html = wp.page("List of postal codes of Canada: M").html().encode("UTF-8")
df = pd.read_html(html)[0]

#df.to_csv('beautifulsoup_pandas.csv',header=0,index=False)
df=df.rename(columns={0: 'PostalCode', 1:"Borough", 2: "Neighborhood"})
df.drop(df[df.Borough =="Not assigned"].index, inplace=True)

df.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge


More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.

In [38]:
grouped= df.groupby(['PostalCode', 'Borough'], as_index=False, sort=True).apply(lambda group: ', '.join(group['Neighborhood'])).reset_index()
grouped.rename(columns={0 : 'Neighborhood'}, inplace=True)

for key, data in grouped.iterrows():
   # print(data['Neighbourhoods'])
   
    if data['Neighborhood'] =='Not assigned':
        grouped.loc[key,'Neighborhood'] = data['Borough']
grouped.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [39]:
grouped.shape


(104, 3)

If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park

Update the Latitude and Longitude columns 

In [40]:


grouped.add('Latitude', fill_value=None)
grouped.add('Longitude', fill_value=None)
# Exclude the first row..
neighborhoods=grouped
#neighborhoods.dropna(thresh=2, inplace=True)

for key, data in neighborhoods.iterrows():
   # print(data['Neighbourhoods'])
   try: 
    address = data['Borough'] + ',' + data['PostalCode']
       #print(address)
    geolocator = Nominatim()
    location = geolocator.geocode(address)
    neighborhoods.loc[key,'Latitude'] = location.latitude
    neighborhoods.loc[key, 'Longitude'] = location.longitude
       #print( location.latitude, location.longitude)
   except:
        #print(address)
        pass
neighborhoods.head(100)



Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",54.28476,-0.409034
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",54.28476,-0.409034
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",,
3,M1G,Scarborough,Woburn,43.762669,-79.230861
4,M1H,Scarborough,Cedarbrae,,
5,M1J,Scarborough,Scarborough Village,,
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",,
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",,
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",,
9,M1N,Scarborough,"Birch Cliff, Cliffside West",,


In [41]:
neighborhoods.dropna(inplace=True)
neighborhoods.head(100)                    

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",54.28476,-0.409034
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",54.28476,-0.409034
3,M1G,Scarborough,Woburn,43.762669,-79.230861
15,M1W,Scarborough,"L'Amoreaux West, Steeles West",43.773077,-79.257774
21,M2M,North York,"Newtonbrook, Willowdale",43.763531,-79.411147
22,M2N,North York,Willowdale South,43.754326,-79.449117
27,M3C,North York,"Flemingdon Park, Don Mills South",43.732822,-79.346961
28,M3H,North York,"Bathurst Manor, Downsview North, Wilson Heights",43.756199,-79.439802
29,M3J,North York,"Northwood Park, York University",43.754326,-79.449117
56,M5E,Downtown Toronto,Berczy Park,43.647744,-79.370378


In [42]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 7 boroughs and 17 neighborhoods.
