# Task 1 (from Course 9, Week 1)

This notebook will be mainly used for the capstone project.

In [54]:
import pandas as pd
import numpy as np

In [55]:
# Smoke test: confirm that the kernel executes code.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


# Task 2 (from Course 9, Week 3)

## Part 1: creating a dataframe

In [56]:
# Scrape the Toronto postal-code table from Wikipedia with BeautifulSoup
# and load it into the DataFrame `df` (one row per postal code).

import requests
from bs4 import BeautifulSoup

URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'


def clean_cell(text):
    """Return scraped cell text with newlines and non-breaking spaces removed."""
    # The replace() calls must be chained. Nesting the second call inside the
    # first one's argument list evaluates ''.replace('\xa0', '') == '' and so
    # never touches the non-breaking spaces.
    return text.replace('\n', '').replace('\xa0', '')


response = requests.get(URL, timeout=30)
response.raise_for_status()  # stop here rather than parse an HTTP error page
soup = BeautifulSoup(response.text, 'html.parser')

wikitable = soup.find('table', {'class': 'wikitable sortable'})
if wikitable is None:
    raise ValueError('No "wikitable sortable" table found at {}'.format(URL))
# html.parser only exposes <tbody> when the markup contains it explicitly.
table = wikitable.tbody or wikitable
rows = table.find_all('tr')
column_names = [clean_cell(th.text) for th in rows[0].find_all('th')]

# Collect every row first and build the frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on each iteration.
records = []
for row in rows[1:]:
    values = [clean_cell(td.text) for td in row.find_all('td')]
    if len(values) != len(column_names):
        # Same failure mode as a length-mismatched pd.Series, but with context.
        raise ValueError(
            'Expected {} cells per row, got {}: {!r}'.format(
                len(column_names), len(values), values))
    records.append(values)

df = pd.DataFrame(records, columns=column_names)

df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [57]:
df.shape  # (rows, columns) of the scraped table, before any filtering

(180, 3)

In [58]:
# Drop the postal codes that have no borough assigned.
#
# 'Not assigned' is the placeholder used in the scraped table. It is turned
# into a real missing value (in every column, not only Borough) so that
# dropna() can filter on it. np.nan is used because the np.NaN alias no
# longer exists in NumPy 2.0.

df1 = (
    df.replace('Not assigned', np.nan)   # placeholder -> missing value
      .dropna(subset=['Borough'])        # keep only rows with a borough
      .reset_index(drop=True)            # renumber rows from 0
)
df1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [59]:
df1.shape  # (rows, columns) left after dropping rows with no borough

(103, 3)

We lost 180 - 103 = 77 rows: the postal codes that have no borough assigned.

## Part 2: getting the coordinates of each neighborhood

In [60]:
# Load the geospatial lookup table; the preview below shows the columns
# Postal Code, Latitude and Longitude.
# NOTE(review): plain-http short link - confirm it still resolves to the CSV.
path = "http://cocl.us/Geospatial_data"
df_coord = pd.read_csv(path, header=0) # header=0: the first line of the file holds the column names
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [61]:
df_coord.shape  # (rows, columns); compare the row count with df1.shape before joining

(103, 3)

In [62]:
# Prepare the borough table for the join: order it by postal code and
# renumber the rows from zero.

df1 = df1.sort_values(by='Postal Code').reset_index(drop=True)

df1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [63]:
# Same preparation for the coordinates table: order df_coord by postal code
# and renumber its rows from zero.
df_coord = df_coord.sort_values(by='Postal Code').reset_index(drop=True)

df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [81]:
# Attach Latitude/Longitude to every borough row by looking up its postal code.

coords_by_code = df_coord.set_index('Postal Code')  # lookup table keyed by postal code
neighbourhoods = df1.join(coords_by_code, on='Postal Code')
neighbourhoods.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3: Explore and cluster the neighborhoods in Toronto

#### A) Let's import all the dependencies that we will need

In [41]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# transform JSON file into a pandas dataframe
try:
    from pandas import json_normalize # public location since pandas 1.0
except ImportError:
    from pandas.io.json import json_normalize # fallback for older pandas releases

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


#### B) Use geopy library to get the latitude and longitude values of Toronto

In [45]:
# Look up the latitude/longitude of Toronto; they are used to centre the map.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; fail with a
    # clear message instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### C) Create a map of Toronto with neighborhoods superimposed on top

In [88]:
# Create a map of Toronto centred on the geocoded city coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of `neighbourhoods`. The popup text is
# "<neighbourhood>, <borough>", so the name must come from the
# 'Neighbourhood' column (not 'Postal Code').
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

# Last expression of the cell: renders the finished map.
map_toronto

# I see no reason to do the remaining part, because it is of no use to me