# Torronto Exploration

## Installs the required libraries

In [1]:
!pip install beautifulsoup4



In [2]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
import json
import xml

## Initiates the initial working dictionary

In [3]:
nei = {"PostalCode":[], "Borough":[], "Neighborhood": []}

## Opens all the html information

In [4]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
source = requests.get(url).text
soup = BeautifulSoup(source, "lxml")
# print(soup.prettify())

## Identifies the table

In [5]:
my_table = soup.find("table", class_="wikitable sortable") # the entire table of the information
# print (my_table.prettify())
# print (type(my_table))

## Creates an initial data frame

In [6]:
# tr-->rows, td-->cells, th-->headings
# headings = my_table.find('th', class_='headerSort')
# print (headings)
for row in my_table.find_all("tr"):
    c = 0
    for cell in row.find_all("td"):
        c = c + 1
        if c == 1:
            nei["PostalCode"].append(cell.text)
        elif c == 2:    
            nei["Borough"].append(cell.text)
        else:    
            nei["Neighborhood"].append(cell.text)

n = pd.DataFrame(nei)
n.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n
5,M5A,Downtown Toronto,Regent Park\n
6,M6A,North York,Lawrence Heights\n
7,M6A,North York,Lawrence Manor\n
8,M7A,Queen's Park,Not assigned\n
9,M8A,Not assigned,Not assigned\n


## Drops Boroughs that are not assigned

In [7]:
counter = 0
index = []
for row in n["Borough"]:
    # print(row, counter)
    if row == "Not assigned":
        index.append(counter)
    counter = counter + 1

for item in index:
    n.iloc[item, 0] = np.nan
    n.iloc[item, 1] = np.nan
    n.iloc[item, 2] = np.nan

n.dropna(inplace=True)
n.reset_index(inplace=True, drop=True)

n.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods\n
1,M4A,North York,Victoria Village\n
2,M5A,Downtown Toronto,Harbourfront\n
3,M5A,Downtown Toronto,Regent Park\n
4,M6A,North York,Lawrence Heights\n
5,M6A,North York,Lawrence Manor\n
6,M7A,Queen's Park,Not assigned\n
7,M9A,Etobicoke,Islington Avenue\n
8,M1B,Scarborough,Rouge\n
9,M1B,Scarborough,Malvern\n


## Check for repeated PostalCodes and format

In [46]:
neig = {"postalCode":[], "borough":[], "neighborhood": []}
counter = 0
for index, row in n.iterrows():
    # print(row['PostalCode'], row['Borough'], row['Neighborhood'])
    if row["PostalCode"] not in neig["postalCode"]:
        neig["postalCode"].append(row["PostalCode"])
        neig["borough"].append(row["Borough"])
        neig["neighborhood"].append(row["Neighborhood"])
    else:
        idx = neig["postalCode"].index(row["PostalCode"])
        new = ', '.join([neig["neighborhood"][idx], row["Neighborhood"]])
        neig["neighborhood"][idx] = new
    counter = counter + 1
    

final = pd.DataFrame(neig)
final = final.replace('\n','', regex=True)
final = final.replace('Not assigned', "Queen's Park", regex=True)
final.head(20)  

final.shape

(103, 3)

## Get the PostCodes

In [64]:
df_ps = pd.read_csv("https://cocl.us/Geospatial_data", dtype='str')
print(df_ps.shape == final.shape)
final["Latitude"] = "def1" 
final["Longitude"] = "def2"

False


## Prepare the dataframe and transfer the information

In [65]:
final.sort_values(by=['postalCode'], inplace=True)
df_ps.sort_values(by=['Postal Code'], inplace=True)
final["Latitude"] = df_ps["Latitude"] 
final["Longitude"] = df_ps["Longitude"]
final.head()

Unnamed: 0,postalCode,borough,neighborhood,Latitude,Longitude
6,M1B,Scarborough,"Rouge, Malvern",43.7279292,-79.2620294
12,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7942003,-79.2620294
18,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7785175,-79.3465557
22,M1G,Scarborough,Woburn,43.7701199,-79.4084928
26,M1H,Scarborough,Cedarbrae,43.7459058,-79.352188
