# Segmenting and Clustering Neighborhoods in Toronto

## Essential Libraries

In [None]:
import numpy as np 
import pandas as pd 
import requests 
from bs4 import BeautifulSoup
import geocoder 
from sklearn.cluster import KMeans

## Task 1: Data Collection

### Data Scrapping

Data is scrapped from the [wikipidia page](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M) and converted to a pandas dataframe.

In [None]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M") 
  
soup = BeautifulSoup(r.content, 'html5lib') 
table = soup.find('div', attrs = {'id':'container'}) 

### Dataframe building

In [None]:
data = {
    'postalCodes':[],
    'boroughs':[],
    'neighborhoods':[],
}

columnNum = 1;
passVal = False

for row in soup.find_all('td'):
    for cell in row:
        if cell.string and cell.string[0].isalpha() and len(cell.string) > 2:
            passVal = False
            if columnNum == 1:
                if passVal == False and cell.string[1].isdigit():
                    data['postalCodes'].append(cell.string);   
                    columnNum = 2
                else:
                    continue
            elif columnNum == 2 :
                if cell.string == 'Not assigned':
                    passVal = True
                    del data['postalCodes'][-1]
                    columnNum = 1
                    continue
                else:
                    data['boroughs'].append(cell.string);      
                    columnNum = 3
            elif columnNum == 3 :
                if cell.string == 'Not assigned\n':
                    data['neighborhoods'].append(data['boroughs'][-1])
                else:
                    data['neighborhoods'].append(cell.string); 
                columnNum = 1
                
neighbors = pd.DataFrame.from_dict(data)

neighbors.head()

## Task 2: Coordinates Computing

Locating the Postal codes using `geocoder`. Since the number of processes is limited per 24 hrs. The tables account for any access issues. If the code is executed the next day, it should show the results.

In [15]:
latitude = []
longitude = []

for p in neighbors['postalCodes'].values:
    g = geocoder.google('{}, Toronto, Ontario'.format(p))
    lat_lng_coords = g.latlng
    latitude.append(lat_lng_coords[0] if lat_lng_coords else 'Access Denied!')
    longitude.append(lat_lng_coords[1] if lat_lng_coords else 'Access Denied!')

In [16]:
neighbors.insert(3, 'latitude', latitude)
neighbors.insert(4, 'longitude', longitude)

In [17]:
neighbors.head()

Unnamed: 0,postalCodes,boroughs,neighborhoods,latitude,longitude
0,M1A\n,Not assigned\n,Not assigned\n,Access Denied!,Access Denied!
1,M2A\n,Not assigned\n,Not assigned\n,Access Denied!,Access Denied!
2,M3A\n,North York\n,Parkwoods\n,Access Denied!,Access Denied!
3,M4A\n,North York\n,Victoria Village\n,Access Denied!,Access Denied!
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n",Access Denied!,Access Denied!


In [18]:
neighbors.shape

(180, 5)