# Segmenting and Clustering Neighborhoods in Toronto Project

Author: Dalyah Aljamal, 
Date: Jan 2020

## Step 1: Getting the data from Wikipedia

In [111]:
data_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [2]:
!conda install -c conda-forge lxml --yes
#sudo apt-get install python-lxml


Solving environment: done


  current version: 4.5.11
  latest version: 4.8.0

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - lxml


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    scikit-learn-0.20.1        |   py36h22eb022_0         5.7 MB
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    liblapack-3.8.0            |      11_openblas          10 KB  conda-forge
    liblapacke-3.8.0           |      11_openblas          10 KB  conda-forge
    libopenblas-0.3.6          |       h5a2b251_2         7.7 MB
    numpy-1.17.3               |   py36h95a1406_0         5.2 MB  conda-forge
    scipy-1.4.1                |   py36h921218d_0        18.9 MB  conda-forge
    libcblas-3.8.0             |      11_openblas         

In [112]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np

In [113]:
#Create a handle, page, to handle the contents of the website
page = requests.get(data_url)
#Store the contents of the website under doc
doc = lh.fromstring(page.content)
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')


In [114]:
[len(T) for T in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [115]:

#Create col list
col=[]
#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    header=t.text_content()
    col.append((header,[]))

In [28]:
col

[('Postcode', []), ('Borough', []), ('Neighborhood\n', [])]

In [29]:
#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 10, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [30]:
Dict={title:column for (title,column) in col}
toronto_data = pd.DataFrame(Dict)

In [31]:
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood\n
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


### Drop the rows with 'Not Assigned' Borough values

In [32]:
toronto_data['Borough'].replace('Not assigned', np.nan, inplace=True)

In [33]:
toronto_data.dropna(subset =['Borough'], axis=0, inplace=True)

In [34]:
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M6A,North York,Lawrence Heights\n
6,M6A,North York,Lawrence Manor\n


### Replace "Not assigned" values in ['Neighborhood'] with ['Borough'] values

In [35]:
toronto_data['Neighborhood\n'].replace('Not assigned', toronto_data['Borough'], inplace=True)

In [36]:
toronto_data

Unnamed: 0,Postcode,Borough,Neighborhood\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M6A,North York,Lawrence Heights\n
6,M6A,North York,Lawrence Manor\n
...,...,...,...
281,M8Z,Etobicoke,Kingsway Park South West\n
282,M8Z,Etobicoke,Mimico NW\n
283,M8Z,Etobicoke,The Queensway West\n
284,M8Z,Etobicoke,Royal York South West\n


### Combine Neighborhoods for each postcode

In [40]:
toronto_data['Neighborhood\n']= toronto_data.groupby(toronto_data['Postcode'])['Neighborhood\n'].transform(lambda x: ','.join(x))

In [57]:
toronto_data = toronto_data.drop_duplicates()

### Remove the "\n" from Neighborhood Column

In [58]:
toronto_data['Neighborhood\n'].replace({'\n':''}, regex=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [108]:
toronto_data.rename(columns={'Neighborhood\n':'Neighborhood'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [109]:
toronto_data.shape

(103, 5)

In [65]:
toronto_data.iloc[0,0]

'M3A'

## Step2: getting the latitude and the longitude of each neighborhood 

In [74]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.0

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.0

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Folium installed
Libraries imported.


In [75]:
CLIENT_ID = 'DH01K1OPPGDSEFMMWPYKYDYB0X0GY1XKMUBHN0YSNFQWKZD4' # your Foursquare ID
CLIENT_SECRET = 'T3ESFKYUUTRAZPQGZG5IGOWKENS4QMSPVN21L353JXS55Q1H' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)


Your credentails:
CLIENT_ID: DH01K1OPPGDSEFMMWPYKYDYB0X0GY1XKMUBHN0YSNFQWKZD4
CLIENT_SECRET:T3ESFKYUUTRAZPQGZG5IGOWKENS4QMSPVN21L353JXS55Q1H


In [100]:
latList = []
logList = []
for i in range(0, len(toronto_data)-1):
    neighbor = toronto_data.iloc[i,2]
    geolocator = Nominatim(user_agent="foursquare_agent")
    location = None
    while (location is None):
        location = geolocator.geocode(neighbor)
    latitude = location.latitude
    longitude = location.longitude
    #print(latitude, longitude)
    latList.append(latitude)
    logList.append(longitude)

GeocoderServiceError: [Errno 99] Cannot assign requested address

### Getting the Lta and log from the csv file

In [105]:
latnlog = pd.read_csv('Geospatial_Coordinates.csv')

In [106]:
toronto_data['Latitude'] = latnlog['Latitude']
toronto_data['Longitude'] = latnlog['Longitude']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [110]:
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.763573,-79.188711
3,M4A,North York,Victoria Village,43.770992,-79.216917
4,M5A,Downtown Toronto,Harbourfront,43.773136,-79.239476
5,M6A,North York,"Lawrence Heights,Lawrence Manor",43.744734,-79.239476
7,M7A,Downtown Toronto,Queen's Park,43.711112,-79.284577
