## _<center> Webscraping (Part-1)</center>_

### Importing necessary packages 

In [268]:
import pandas as pd
import numpy as np

Importing necessary packages for webscraping

In [269]:
from bs4 import BeautifulSoup as bs
import requests
import re

### Obtaining dataset from Wikipedia

In [3]:
# Wikipedia article listing the Canadian postal codes that start with "M".
# NOTE(review): this is the live article, so its table layout may change
# between runs — consider pinning a specific revision if reproducibility matters.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [4]:
# Download the page. The timeout keeps a stalled connection from hanging the
# kernel, and raise_for_status() stops us from parsing an HTTP error page as
# if it were the postal-code table.
wiki_page = requests.get(wiki_url, timeout=30)
wiki_page.raise_for_status()
wiki_response = wiki_page.content

# Parse the raw bytes with the built-in HTML parser.
htmlsoup = bs(wiki_response, 'html.parser')

#### Getting web content - Toronto table 

In [152]:
# Take the first <table> on the page and flatten it to plain text.
toronto_table = htmlsoup.find('table').get_text()
# Peek at the first characters to see how cells and rows are delimited.
toronto_table[:100]

'\n\nPostal Code\n\nBorough\n\nNeighborhood\n\n\nM1A\n\nNot assigned\n\n\n\n\nM2A\n\nNot assigned\n\n\n\n\nM3A\n\nNorth York\n\n'

####  Making the Toronto Postal Code dataframe

In [159]:
# Swap every newline for a single space so the table becomes one flat string.
table_values = toronto_table.replace('\n', ' ')
table_values[:100]

'  Postal Code  Borough  Neighborhood   M1A  Not assigned     M2A  Not assigned     M3A  North York  '

In [176]:
# In the flattened text, cells within a row are separated by two spaces and
# consecutive rows by three or more, so splitting on three spaces gives one
# entry per table row. When a row ends with an empty cell, the extra spaces
# are left over at the start of the next entry (stripped in a later step).
table_list = table_values.split('   ')
table_list[:300]

['  Postal Code  Borough  Neighborhood',
 'M1A  Not assigned',
 '  M2A  Not assigned',
 '  M3A  North York  Parkwoods',
 'M4A  North York  Victoria Village',
 'M5A  Downtown Toronto  Regent Park, Harbourfront',
 'M6A  North York  Lawrence Manor, Lawrence Heights',
 "M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government",
 'M8A  Not assigned',
 '  M9A  Etobicoke  Islington Avenue, Humber Valley Village',
 'M1B  Scarborough  Malvern, Rouge',
 'M2B  Not assigned',
 '  M3B  North York  Don Mills',
 'M4B  East York  Parkview Hill, Woodbine Gardens',
 'M5B  Downtown Toronto  Garden District, Ryerson',
 'M6B  North York  Glencairn',
 'M7B  Not assigned',
 '  M8B  Not assigned',
 '  M9B  Etobicoke  West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale',
 'M1C  Scarborough  Rouge Hill, Port Union, Highland Creek',
 'M2C  Not assigned',
 '  M3C  North York  Don Mills',
 'M4C  East York  Woodbine Heights',
 'M5C  Downtown Toronto  St. James Town',
 'M6C  York  Humewo

We don't need the leading or trailing whitespace in each entry (for example, in "  M2A  Not assigned"), nor the first entry, which holds the column names.

In [177]:
# Skip the header entry (index 0) and trim the surrounding whitespace of
# every remaining row in a single pass.
clean_table_list = [row.strip() for row in table_list[1:]]
clean_table_list[:20]

['M1A  Not assigned',
 'M2A  Not assigned',
 'M3A  North York  Parkwoods',
 'M4A  North York  Victoria Village',
 'M5A  Downtown Toronto  Regent Park, Harbourfront',
 'M6A  North York  Lawrence Manor, Lawrence Heights',
 "M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government",
 'M8A  Not assigned',
 'M9A  Etobicoke  Islington Avenue, Humber Valley Village',
 'M1B  Scarborough  Malvern, Rouge',
 'M2B  Not assigned',
 'M3B  North York  Don Mills',
 'M4B  East York  Parkview Hill, Woodbine Gardens',
 'M5B  Downtown Toronto  Garden District, Ryerson',
 'M6B  North York  Glencairn',
 'M7B  Not assigned',
 'M8B  Not assigned',
 'M9B  Etobicoke  West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale',
 'M1C  Scarborough  Rouge Hill, Port Union, Highland Creek',
 'M2C  Not assigned']

Now we need to split each entry into its individual elements so that each one can be stored in the matching column of the dataframe.

In [231]:
# Split every row into its cells: [postal code, borough, neighbourhood(s)].
# Cells are separated by two spaces; rows whose borough is 'Not assigned'
# have no third cell.
clean_table_list_v2 = [entry.split('  ') for entry in clean_table_list]
# Preview only the first rows instead of dumping the whole list into the output.
clean_table_list_v2[:20]

[['M1A', 'Not assigned'],
 ['M2A', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Regent Park, Harbourfront'],
 ['M6A', 'North York', 'Lawrence Manor, Lawrence Heights'],
 ['M7A', 'Downtown Toronto', "Queen's Park, Ontario Provincial Government"],
 ['M8A', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue, Humber Valley Village'],
 ['M1B', 'Scarborough', 'Malvern, Rouge'],
 ['M2B', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills'],
 ['M4B', 'East York', 'Parkview Hill, Woodbine Gardens'],
 ['M5B', 'Downtown Toronto', 'Garden District, Ryerson'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned'],
 ['M8B', 'Not assigned'],
 ['M9B',
  'Etobicoke',
  'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale'],
 ['M1C', 'Scarborough', 'Rouge Hill, Port Union, Highland Creek'],
 ['M2C', 'Not assigned'],
 ['M3C', 'North York', 'Don Mills'],
 ['M4C', 'East York', 'Woodbine Heigh

Now each entry is ready

In [263]:
# First cell of each row is the postal code.
# NOTE(review): the [:-1] slice skips the final parsed entry — presumably a
# blank left over from the trailing whitespace of the table text; confirm.
postal_codes = [
    cells[0]
    for cells in clean_table_list_v2[:-1]
]
postal_codes[:10]

['M1A', 'M2A', 'M3A', 'M4A', 'M5A', 'M6A', 'M7A', 'M8A', 'M9A', 'M1B']

In [261]:
# Second cell of each row is the borough (the final parsed entry is skipped,
# as it is for the postal codes).
borough = [
    cells[1]
    for cells in clean_table_list_v2[:-1]
]
borough[:10]

['Not assigned',
 'Not assigned',
 'North York',
 'North York',
 'Downtown Toronto',
 'North York',
 'Downtown Toronto',
 'Not assigned',
 'Etobicoke',
 'Scarborough']

In [260]:
# Third cell of each row is the neighbourhood. Rows with an unassigned borough
# have no third cell, so they get the placeholder 'na' instead.
neighborhood = [
    cells[2] if cells[1] != 'Not assigned' else 'na'
    for cells in clean_table_list_v2[:-1]
]
neighborhood[:10]

['na',
 'na',
 'Parkwoods',
 'Victoria Village',
 'Regent Park, Harbourfront',
 'Lawrence Manor, Lawrence Heights',
 "Queen's Park, Ontario Provincial Government",
 'na',
 'Islington Avenue, Humber Valley Village',
 'Malvern, Rouge']

In [264]:
# Assemble the three parallel lists into one frame; dict insertion order
# fixes the column order as PostalCode, Borough, Neighborhood.
toronto_df = pd.DataFrame(
    {
        'PostalCode': postal_codes,
        'Borough': borough,
        'Neighborhood': neighborhood,
    }
)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,na
1,M2A,Not assigned,na
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Ignoring cells with Borough = 'Not assigned' 

In [265]:
# Keep only the rows that have a real borough, then renumber the index from 0.
toronto_df = (
    toronto_df
    .loc[toronto_df['Borough'].ne('Not assigned')]
    .reset_index(drop=True)
)
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Checking whether there is any borough that has no neighborhood assigned

In [266]:
# Rows with an assigned borough but no neighbourhood. A missing neighbourhood
# is written as the placeholder 'na' when the column is built, so check for
# that as well as for an empty string. An empty result means every remaining
# borough has a neighbourhood value.
toronto_df.loc[toronto_df['Neighborhood'].isin(['', 'na']), 'Neighborhood']

Series([], Name: Neighborhood, dtype: object)

### Checking the dimensions of the df - Final step

In [270]:
# (rows, columns) of the cleaned frame; the three columns are PostalCode,
# Borough and Neighborhood.
toronto_df.shape

(103, 3)

# <center> End of Webscraping (Part-1) </center>

In [252]:
# NOTE(review): repeats the shape check from the "Final step" section and
# sits after the end-of-part heading — looks like a leftover cell; confirm
# whether it can be removed.
toronto_df.shape

(103, 3)

Series([], Name: Neighborhood, dtype: object)

The first entry of the split list holds the three column names, so we can skip it.

# _<center>Final Project Part-2</center>_

# _<center>Final Project Part-3</center>_