# Welcome to the first part of Clustering Toronto neighborhoods

In [53]:
#step 1 : importing libraries

import requests 
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup

In [32]:
#step 2 : Getting a dataframe

web_text = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(web_text,'xml')

toronto = soup.find('table',{'class':'wikitable sortable'})
toronto_rows = toronto.find_all('tr')

data = []
for row in toronto_rows:
    data.append([t.text.strip() for t in row.find_all('td')])

toronto_df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### Now, we will clean our data through several steps.

In [42]:
#Step 1 : deleting not assigned data

toronto_cleaned1 = toronto_df[toronto_df.Borough != 'Not assigned'].reset_index(drop=True)
toronto_cleaned1.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,,,
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,"Regent Park, Harbourfront"
4,M6A,North York,"Lawrence Manor, Lawrence Heights"


In [43]:
#Step 2 : merging neighborhoods with same postal code, using a coma

toronto_cleaned2 = toronto_cleaned1.groupby(['PostalCode','Borough'], as_index=False).agg(lambda x: ','.join(x))
toronto_cleaned2.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [45]:
#Step 3 : not assigned neighborhood : then the neighborhood = borough

absentneigh = toronto_cleaned2['Neighbourhood'] == "Not assigned"
toronto_cleaned2.loc[absentneigh, 'Neighbourhood'] = toronto_cleaned2.loc[absentneigh, 'Borough']
toronto_cleaned2.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Let us now look at the shape of our data 

In [52]:
toronto_cleaned2.shape[0]

103

### This is the end of the first part of the task. 