## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
# import libraries

import requests
import lxml.html as lh
import pandas as pd
print('done')

done


In [2]:
# create a page for the website data, then parse stored data

url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

page= requests.get(url)

web_cont=lh.fromstring(page.content)

sd_1=web_cont.xpath('//tr')

In [3]:
# display the length of parsed data to know how many columns per row
[len(a) for a in sd_1[:10]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [4]:
# make the first row as the header, then create an empty list for headers to be stored
sd_1=web_cont.xpath('//tr')

col=[]
b=0

for hd in sd_1[0]:
    b+=1
    name= hd.text_content()
    print ('%d:"%s"'%(b,name))
    col.append((name,[]))

1:"Postcode"
2:"Borough"
3:"Neighborhood
"


## Start to create a Pandas Dataframe

In [5]:
#Create a pandas dataframe by appending each header to a tuple and an empty list

for c in range(1,len(sd_1)):
    a=sd_1[c]
    
    if len(a)!=3:
        break
        
    b=0
    
    for hd in a.iterchildren():
        data=hd.text_content()
        if b>0:
            try:
                data=int(data)
            except:
                pass
            
        col[b][1].append(data)
        
        b+=1

In [6]:
# ensure that all columns are the same length

[len(d) for (title,d) in col]

[287, 287, 287]

In [7]:
# continue creating the pandas dataframe

set_1={title:column for (title,column) in col}

df=pd.DataFrame(set_1)

In [8]:
# display top 5 rows of the dataframe

df.head()

Unnamed: 0,Postcode,Borough,Neighborhood\n
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


## Define Headers and Clean Up the Dataframe

In [9]:
df.columns=['Postcode', 'Borough', 'Neighbourhood']

cols=df.columns.tolist()
cols

cols=cols[-1:] + cols[:-1]

df= df[cols]

df.head()

Unnamed: 0,Neighbourhood,Postcode,Borough
0,Not assigned\n,M1A,Not assigned
1,Not assigned\n,M2A,Not assigned
2,Parkwoods\n,M3A,North York
3,Victoria Village\n,M4A,North York
4,Harbourfront\n,M5A,Downtown Toronto


In [10]:
df=df.replace('\n',' ', regex=True)
df.head()

Unnamed: 0,Neighbourhood,Postcode,Borough
0,Not assigned,M1A,Not assigned
1,Not assigned,M2A,Not assigned
2,Parkwoods,M3A,North York
3,Victoria Village,M4A,North York
4,Harbourfront,M5A,Downtown Toronto


## Dropping and Combining Cells in the Dataframe 

In [11]:
# drop not assigned cells and rest the index due to the dropped cells

df.drop(df.index[df['Borough']=='Not assigned'], inplace=True)

df=df.reset_index(drop=True)

df.head()

Unnamed: 0,Neighbourhood,Postcode,Borough
0,Parkwoods,M3A,North York
1,Victoria Village,M4A,North York
2,Harbourfront,M5A,Downtown Toronto
3,Lawrence Heights,M6A,North York
4,Lawrence Manor,M6A,North York


In [12]:
# combine neighbourhoods based on postcodes and boroughs

df=df.groupby(['Postcode','Borough']) ['Neighbourhood'].apply(','.join).reset_index()
df.columns = ['Postcode','Borough','Neighbourhood']
df.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge ,Malvern"
1,M1C,Scarborough,"Highland Creek ,Rouge Hill ,Port Union"
2,M1E,Scarborough,"Guildwood ,Morningside ,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park ,Ionview ,Kennedy Park"
7,M1L,Scarborough,"Clairlea ,Golden Mile ,Oakridge"
8,M1M,Scarborough,"Cliffcrest ,Cliffside ,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff ,Cliffside West"


In [13]:
# remove extra spacing in order to re-assign borough vales

df['Neighbourhood'] = df["Neighbourhood"].str.strip()

In [14]:
#assigning Borough vales when not assigned is displayed

df.loc[df["Neighbourhood"]=="Not assigned", "Neighbourhood"] = df ['Borough']

In [15]:
# Check to see if Queen's Park changed
df[df['Borough']=='Queen\'s Park']

Unnamed: 0,Postcode,Borough,Neighbourhood
93,M9A,Queen's Park,Queen's Park


##  Dataframe Shape

In [16]:
df.shape

(103, 3)

## Save to a file for the next section of this assignment

In [17]:
df.to_csv(r'week_3_Q1.csv')