### The following notebook scrapes and cleans a data frame of Canadian postal codes from Wikipedia
#### The first step is to import pandas and read the dataframe

In [1]:
# Read every HTML table on the Wikipedia page and keep the first one,
# which is the postal-code table shown below.

import pandas as pd

df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df.head()

Unnamed: 0,Postal Code,District,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Step 2 is to remove the rows whose District is "Not assigned"

In [2]:
# Flag the rows whose District is "Not assigned" and preview the rows that remain.
# `mask` is reused by the next step to exclude those rows.

mask = df['District'].eq('Not assigned')
df[~mask].head()

Unnamed: 0,Postal Code,District,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Step 3 is to find neighbourhoods that share a postal code and combine them into a single comma-separated entry

In [3]:
# Combine duplicate postal codes into one row each, joining the distinct
# values of every other column with a comma.
#
# dict.fromkeys() removes duplicates while preserving first-seen order, so the
# joined string is the same on every kernel restart (iterating a set of
# strings gives no such ordering guarantee).

df1 = df[~mask].groupby("Postal Code").agg(lambda x: ','.join(dict.fromkeys(x)))
df1.head()

Unnamed: 0_level_0,District,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


#### Step 4, if a postal code has a district but a neighbourhood which is unassigned, then the neighbourhood will be the same as the district

In [4]:
# Where the Neighbourhood is "Not assigned", substitute the District name
# from the same row; every other Neighbourhood value is left as it is.

df1['Neighbourhood'] = df1['Neighbourhood'].mask(
    df1['Neighbourhood'].eq("Not assigned"),
    df1['District'],
)
df1.head()

Unnamed: 0_level_0,District,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


#### Finally, print the number of rows (shape) of the dataframe

In [5]:
# .shape is (rows, columns); df1 has one row per postal code after the groupby
df1.shape

(103, 2)

### Apply lat and long to postal codes

In [6]:
# Load the geospatial lookup CSV from the course-provided URL and preview it
geo_df = pd.read_csv("https://cocl.us/Geospatial_data")
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
# Attach coordinates by matching on postal code rather than by row position.
# df1 is indexed by "Postal Code" (from the groupby), so assigning a Series
# that is also indexed by postal code lets pandas align the values by label.
# This stays correct if geo_df is ordered differently or has a different
# number of rows; postal codes with no match get NaN.
geo_by_code = geo_df.set_index('Postal Code')
df1['Latitude'] = geo_by_code['Latitude']
df1['Longitude'] = geo_by_code['Longitude']
df1.head()

Unnamed: 0_level_0,District,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Exploratory Analysis
#### 1. Isolate Downtown Toronto districts
#### 2. Create map

In [8]:
import folium

In [16]:
# Keep only the rows whose District is "Downtown Toronto", then discard the
# postal-code index in favour of a fresh 0..n-1 integer index.

toronto_df = df1[df1['District'].eq('Downtown Toronto')].reset_index(drop=True)
toronto_df.head()

Unnamed: 0,District,Neighbourhood,Latitude,Longitude
0,Downtown Toronto,Rosedale,43.679563,-79.377529
1,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
2,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [17]:
# Coordinates used as the initial centre of the map
Lat, Long = 43.6532, -79.3832

map_toronto = folium.Map(location=[Lat, Long], zoom_start=11)

# One red circle marker per row of toronto_df; clicking a marker opens a popup
# showing that row's Neighbourhood text.
for lat, lng, name in zip(
    toronto_df['Latitude'],
    toronto_df['Longitude'],
    toronto_df['Neighbourhood'],
):
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=folium.Popup(name, parse_html=True),
        color='red',
        fill=True,
        fill_color="red",
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option rather than a
        # CircleMarker one; confirm whether it has any effect here.
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell: renders the map inline
map_toronto
