### IBM Applied Data Science Capstone Assignment Week 3
### Week 3 - Submission No. 2 (which includes Submission No. 1)

#### 1. Python Library Initialization

In [1]:
# library to handle data in a vectorized manner
import numpy as np 

# library for data analysis
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# library to handle JSON files
import json 

# convert an address into latitude and longitude values for Four Square API
#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim 

# library to handle requests
import requests 

# transform JSON file into a pandas dataframe
from pandas.io.json import json_normalize

# BeautifulSoup - HTML parser
from bs4 import BeautifulSoup

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# map rendering library
# !conda install -c conda-forge folium=0.5.0 --yes 
import folium 

#### 2. Scraping the Toronto Wikipedia Webpage with BeautifulSoup into a Python Pandas DataFrame

In [19]:
# Download the Wikipedia page of Canadian postal codes starting with "M"
# and parse its HTML with BeautifulSoup.
#
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook here on an HTTP error instead of
# silently parsing an error page as if it were the postal-code table.
Toronto_response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
Toronto_response.raise_for_status()

Toronto_data = Toronto_response.text
Toronto_soup = BeautifulSoup(Toronto_data, "html.parser")

In [22]:
# Read the first <table> of the parsed page into three parallel lists
# (one per column) and build the DF_Toronto DataFrame from them.
Postal_Code = []
Borough = []
Neighborhood = []

Toronto_table = Toronto_soup.find("table")
if Toronto_table is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise ValueError("No <table> element found in the downloaded page")

for row in Toronto_table.find_all("tr"):
    cells = row.find_all("td")
    # Rows without any <td> cells are skipped, as before; rows with fewer
    # than three cells are skipped too so cells[2] cannot raise IndexError.
    if len(cells) < 3:
        continue
    # Strip surrounding whitespace from every cell (not only the last one):
    # a stray newline in the postal code or borough would silently break the
    # later "Not assigned" comparison and the merge on "Postal Code".
    Postal_Code.append(cells[0].text.strip())
    Borough.append(cells[1].text.strip())
    Neighborhood.append(cells[2].text.strip())

DF_Toronto = pd.DataFrame({"Postal Code": Postal_Code, "Borough": Borough, "Neighborhood": Neighborhood})

# Check that the data was loaded correctly into the DataFrame
DF_Toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [23]:
# Clean DF_Toronto: drop rows whose Borough is "Not assigned", then replace a
# "Not assigned" Neighborhood with the Borough name of the same row.

# .copy() makes the filtered frame independent of DF_Toronto, so the
# assignment below is not a write into a slice of another DataFrame.
DF_Toronto_Clean = DF_Toronto[DF_Toronto["Borough"] != "Not assigned"].copy()

# Assign through .loc on the frame itself. Writing to the rows yielded by
# iterrows() is not guaranteed to reach the DataFrame (the iterator may hand
# out copies), so the replacement could be silently lost.
unassigned = DF_Toronto_Clean["Neighborhood"] == "Not assigned"
DF_Toronto_Clean.loc[unassigned, "Neighborhood"] = DF_Toronto_Clean.loc[unassigned, "Borough"]

# Group by Postal Code and Borough, joining the neighborhoods of each group
# into one comma-separated string.
DF_Toronto_Clean_Group = (
    DF_Toronto_Clean
    .groupby(["Postal Code", "Borough"], as_index=False)
    .agg(lambda x: ",".join(x))
)
DF_Toronto_Clean_Group.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [24]:
# Build a subset of DF_Toronto_Clean_Group holding the postal codes listed in
# the Peer Review Assignment example, in the order given by df_row.

Sub_df_columns = ["Postal Code", "Borough", "Neighborhood"]

df_row = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M","M1R", "M9V","M9L","M5V", "M1B", "M5A"]

# DataFrame.append is deprecated and was removed in pandas 2.0, and growing a
# frame row-by-row in a loop is quadratic. Collect the per-code selections and
# concatenate them once instead. Selecting code by code (rather than one isin
# filter) keeps the rows in the order of df_row.
Sub_df = pd.concat(
    [
        DF_Toronto_Clean_Group[DF_Toronto_Clean_Group["Postal Code"] == postcode]
        for postcode in df_row
    ],
    ignore_index=True,
)[Sub_df_columns]
Sub_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M5G,Downtown Toronto,Central Bay Street
1,M2H,North York,Hillcrest Village
2,M4B,East York,"Woodbine Gardens,Parkview Hill"
3,M1J,Scarborough,Scarborough Village
4,M4G,East York,Leaside
5,M4M,East Toronto,Studio District
6,M1R,Scarborough,"Maryvale,Wexford"
7,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."
8,M9L,North York,Humber Summit
9,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf..."


In [18]:
# Display the shape of the grouped DF_Toronto_Clean_Group DataFrame

DF_Toronto_Clean_Group.shape

(103, 3)

### Week 3 Submission No. 2 Geospatial Coordinates For Postal Code

In [45]:
# Load the geospatial CSV from its URL into the Geo_position DataFrame
path = "http://cocl.us/Geospatial_data"
Geo_position = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows
Geo_position.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [46]:
# Left-join Geo_position onto DF_Toronto_Clean_Group, matching rows on
# "Postal Code", so that every grouped row keeps its place and gains the
# coordinate columns of Geo_position.
DF_Toronto_Clean_Group_Geo = pd.merge(
    DF_Toronto_Clean_Group,
    Geo_position,
    how="left",
    on="Postal Code",
)
DF_Toronto_Clean_Group_Geo.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


### Confirming the Toronto Geo Data Frame with Question 2 Data Frame example

In [54]:
# Build a subset of DF_Toronto_Clean_Group_Geo holding the postal codes listed
# in the Peer Review Assignment example, in the order given by df_row_Geo.

Sub_Geo_columns = ["Postal Code", "Borough", "Neighborhood", "Latitude", "Longitude"]

df_row_Geo = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M","M1R", "M9V","M9L","M5V", "M1B", "M5A"]

# DataFrame.append is deprecated and was removed in pandas 2.0, and growing a
# frame row-by-row in a loop is quadratic. Collect the per-code selections and
# concatenate them once instead. Selecting code by code (rather than one isin
# filter) keeps the rows in the order of df_row_Geo.
Sub_Geo = pd.concat(
    [
        DF_Toronto_Clean_Group_Geo[DF_Toronto_Clean_Group_Geo["Postal Code"] == postcode]
        for postcode in df_row_Geo
    ],
    ignore_index=True,
)[Sub_Geo_columns]
Sub_Geo

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,M4G,East York,Leaside,43.70906,-79.363452
5,M4M,East Toronto,Studio District,43.659526,-79.340923
6,M1R,Scarborough,"Maryvale,Wexford",43.750072,-79.295849
7,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437
8,M9L,North York,Humber Summit,43.756303,-79.565963
9,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",43.628947,-79.39442
