# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

### For this assignment, you will be required to explore and cluster the neighborhoods in Toronto.

#### 1- Start by creating a new Notebook for this assignment.
#### 2- Use the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe

### Pre-processing

In [62]:
# importing libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [63]:
# getting data from internet
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw_wikipedia_page= requests.get(url).text

# using beautiful soup to parse the HTML/XML codes.
soup = BeautifulSoup(raw_wikipedia_page,'xml')
#print(soup.prettify())

### Processing-part-1: extracting raw table (from webpage)

In [64]:
# extracting the raw table inside that webpage
table = soup.find('table')

Postcode      = []
Borough       = []
Neighbourhood = []

# print(table)

# extracting a clean form of the table
for tr_cell in table.find_all('tr'):
    
    counter = 1
    Postcode_var      = -1
    Borough_var       = -1
    Neighbourhood_var = -1
    
    for td_cell in tr_cell.find_all('td'):
        if counter == 1: 
            Postcode_var = td_cell.text
        if counter == 2: 
            Borough_var = td_cell.text
            tag_a_Borough = td_cell.find('a')
            
        if counter == 3: 
            Neighbourhood_var = str(td_cell.text).strip()
            tag_a_Neighbourhood = td_cell.find('a')
            
        counter +=1
        
    if (Postcode_var == 'Not assigned' or Borough_var == 'Not assigned' or Neighbourhood_var == 'Not assigned'): 
        continue
    try:
        if ((tag_a_Borough is None) or (tag_a_Neighbourhood is None)):
            continue
    except:
        pass
    if(Postcode_var == -1 or Borough_var == -1 or Neighbourhood_var == -1):
        continue
        
    Postcode.append(Postcode_var)
    Borough.append(Borough_var)
    Neighbourhood.append(Neighbourhood_var)
    

### Processing-part-2: integrating Postal codes with more than 1 neighbour

In [65]:

unique_p = set(Postcode)
print('num of unique Postal codes:', len(unique_p))
Postcode_u      = []
Borough_u       = []
Neighbourhood_u = []


for postcode_unique_element in unique_p:
    p_var = ''; b_var = ''; n_var = ''; 
    for postcode_idx, postcode_element in enumerate(Postcode):
        if postcode_unique_element == postcode_element:
            p_var = postcode_element;
            b_var = Borough[postcode_idx]
            if n_var == '': 
                n_var = Neighbourhood[postcode_idx]
            else:
                n_var = n_var + ', ' + Neighbourhood[postcode_idx]
    Postcode_u.append(p_var)
    Borough_u.append(b_var)
    Neighbourhood_u.append(n_var)

    

num of unique Postal codes: 84


### Post-processing: creating an appropriate Pandas Dataframe

In [66]:
toronto_dict = {'Postcode':Postcode_u, 'Borough':Borough_u, 'Neighbourhood':Neighbourhood_u}
df_toronto = pd.DataFrame.from_dict(toronto_dict)
df_toronto.to_csv('toronto_part1.csv')
df_toronto.head(14)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M4E,East Toronto,The Beaches
1,M4V,Central Toronto,"Deer Park, Rathnelly, South Hill"
2,M2K,North York,Bayview Village
3,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
4,M1W,Scarborough,Steeles West
5,M5H,Downtown Toronto,"Adelaide, King"
6,M6R,West Toronto,"Parkdale, Roncesvalles"
7,M5M,North York,Bedford Park
8,M6K,West Toronto,"Exhibition Place, Parkdale Village"
9,M5J,Downtown Toronto,"Toronto Islands, Union Station"


In [67]:
df_toronto.shape

(84, 3)

In [68]:
df_location=pd.read_csv(r'C:\Users\Thor\Downloads\Geospatial_Coordinates.csv')
df_location.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [69]:
x=pd.merge(df_toronto,df_location,left_on='Postcode',right_on='Postal Code')
x

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M4E,East Toronto,The Beaches,M4E,43.676357,-79.293031
1,M4V,Central Toronto,"Deer Park, Rathnelly, South Hill",M4V,43.686412,-79.400049
2,M2K,North York,Bayview Village,M2K,43.786947,-79.385975
3,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",M1L,43.711112,-79.284577
4,M1W,Scarborough,Steeles West,M1W,43.799525,-79.318389
5,M5H,Downtown Toronto,"Adelaide, King",M5H,43.650571,-79.384568
6,M6R,West Toronto,"Parkdale, Roncesvalles",M6R,43.648960,-79.456325
7,M5M,North York,Bedford Park,M5M,43.733283,-79.419750
8,M6K,West Toronto,"Exhibition Place, Parkdale Village",M6K,43.636847,-79.428191
9,M5J,Downtown Toronto,"Toronto Islands, Union Station",M5J,43.640816,-79.381752


In [70]:
x.drop(['Postal Code'], axis=1)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4V,Central Toronto,"Deer Park, Rathnelly, South Hill",43.686412,-79.400049
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
4,M1W,Scarborough,Steeles West,43.799525,-79.318389
5,M5H,Downtown Toronto,"Adelaide, King",43.650571,-79.384568
6,M6R,West Toronto,"Parkdale, Roncesvalles",43.648960,-79.456325
7,M5M,North York,Bedford Park,43.733283,-79.419750
8,M6K,West Toronto,"Exhibition Place, Parkdale Village",43.636847,-79.428191
9,M5J,Downtown Toronto,"Toronto Islands, Union Station",43.640816,-79.381752
