# Battle of the Neighbourhoods

#### References

Data & The World: https://www.dataandtheworld.com/2017/06/20/scraping-wikipedia-tables-python-r/

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

# Show up to 20 columns when a DataFrame is printed
pd.set_option('display.max_columns', 20)

print('Libraries loaded')

Libraries loaded


In [3]:
### Web scraping: Wikipedia table of postal codes

WIKI_PAGE = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Grab the page HTML. A timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops us from parsing an HTTP error page.
req = requests.get(WIKI_PAGE, timeout=30)
req.raise_for_status()
page_html = BeautifulSoup(req.text, 'lxml')

# Locate the table and get the list of its rows
wiki_table = page_html.find('table', attrs = {'class':'wikitable sortable'})
if wiki_table is None:
    # Fail with a clear message instead of an AttributeError on None below
    raise ValueError("No 'wikitable sortable' table found at {}".format(WIKI_PAGE))
row_list = wiki_table.find_all('tr')

# First row in the table is the header, so extract that separately
header_row = row_list.pop(0)
header_th = header_row.find_all('th')
header = [el.text for el in header_th]

# One (initially empty) list of cell values per header column
table_dict = {x:[] for x in header}

# Now for the rest of the table: append each cell's text to its column.
# The raw text is kept as-is (including any trailing newline) because the
# cleaning step further down addresses the column by its raw scraped name.
for row in row_list:
    row_td = row.find_all('td')
    for el,td in zip(header,row_td):
        table_dict[el].append(td.text)

df = pd.DataFrame(table_dict)
print(df.head())

  Postcode           Borough     Neighbourhood\n
0      M1A      Not assigned      Not assigned\n
1      M2A      Not assigned      Not assigned\n
2      M3A        North York         Parkwoods\n
3      M4A        North York  Victoria Village\n
4      M5A  Downtown Toronto      Harbourfront\n


In [4]:
## Clean dataframe
# Strip the newline characters from the column names first, so the column can
# be addressed by its clean name. Doing the rename first also makes this cell
# safe to re-run: looking up 'Neighbourhood\n' after the rename raised KeyError.
df.columns = df.columns.str.replace('\n','')

## Remove the \n's from the scraped Neighbourhood values
df['Neighbourhood'] = df['Neighbourhood'].str.replace('\n','')

print (df.head())

  Postcode           Borough     Neighbourhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront


In [5]:
## Keep only the rows whose Borough is not 'Not assigned'
df2 = df.loc[df['Borough'].ne('Not assigned')]

In [6]:
## One row per (Postcode, Borough): join that group's Neighbourhood names with ', '
df3 = (
    df2
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(lambda names: ', '.join(names))
    .reset_index()
)

def CheckNeihgborhood(x):
    """Return the neighbourhood name to use for one row.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series or a dict) that supports
        ``x['Neighbourhood']`` and ``x['Borough']``.

    Returns
    -------
    ``x['Borough']`` when ``x['Neighbourhood']`` is exactly 'Not assigned',
    otherwise ``x['Neighbourhood']`` unchanged.

    The row passed in is not modified.
    """
    if x['Neighbourhood'] == 'Not assigned':
        return x['Borough']
    return x['Neighbourhood']

## Assign Neighbourhood = Borough if Neighbourhood has the 'Not assigned' label
# Vectorised: keep the Neighbourhood where it is assigned, otherwise take the
# Borough from the same row (no row-by-row apply needed).
df3['Neighbourhood'] = df3['Neighbourhood'].where(
    df3['Neighbourhood'] != 'Not assigned',
    df3['Borough'],
)

In [10]:
# Print the first rows of the grouped frame, then its (rows, columns) shape
print(df3.head(), df3.shape, sep='\n')

  Postcode      Borough                           Neighbourhood
0      M1B  Scarborough                          Rouge, Malvern
1      M1C  Scarborough  Highland Creek, Rouge Hill, Port Union
2      M1E  Scarborough       Guildwood, Morningside, West Hill
3      M1G  Scarborough                                  Woburn
4      M1H  Scarborough                               Cedarbrae
(103, 3)


In [None]:
### Add latitude and longitude
import geocoder # import geocoder

# Give up on a postcode after this many unsuccessful lookups, so the cell
# cannot loop forever when the geocoding service keeps returning nothing.
MAX_GEOCODE_ATTEMPTS = 5


def _geocode_postcode(postal_code, max_attempts=MAX_GEOCODE_ATTEMPTS):
    """Look up the coordinates of a single postcode.

    Parameters
    ----------
    postal_code : value formatted into the query '<postal_code>, Toronto, Ontario'.
    max_attempts : maximum number of lookups to try before giving up.

    Returns
    -------
    The ``latlng`` value of the first lookup that yields one, or ``None`` when
    every attempt comes back empty.
    """
    for _ in range(max_attempts):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        coords = g.latlng
        # A missing result is treated as a failed attempt (None or empty).
        if coords:
            return coords
    return None


# One lookup per postcode. Formatting the whole postcode list into a single
# query string would send "['M1B', 'M1C', ...], Toronto, Ontario" instead.
# Each list below has one entry per row of df3, in the same order; a postcode
# that could not be geocoded gets None.
lat_lng_coords = [_geocode_postcode(postal_code) for postal_code in df3['Postcode']]

latitude = [coords[0] if coords else None for coords in lat_lng_coords]
longitude = [coords[1] if coords else None for coords in lat_lng_coords]


In [None]:
# Display the current value of lat_lng_coords (assigned in the geocoding cell)
lat_lng_coords