# Capstone Assignment : 
In this assignment, you will be required to explore, segment, and cluster the neighborhoods in the city of Toronto. However, unlike New York, the neighborhood data is not readily available on the internet. What is interesting about the field of data science is that each project can be challenging in its unique way, so you need to learn to be agile and refine the skill to learn new libraries and tools quickly depending on the project.

In [128]:
import sys
!{sys.executable} -m pip install bs4



In [2]:
# importing necessary libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [3]:
# getting data from internet
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw_wikipedia_page= requests.get(wikipedia_link).text

# using beautiful soup to parse the HTML/XML codes.
soup = BeautifulSoup(raw_wikipedia_page, "html.parser")

In [4]:
debug = False
table = soup.find('table')

Postcode      = []
Borough       = []
Neighbourhood = []


counter = 0
Postcode_var      = None
Borough_var       = None
Neighbourhood_var = None
# extracting a clean form of the table
for tr_cell in table.find_all('tr'):
    if counter == 0:
        counter = 1
        continue
        
    info = tr_cell.find_all('td')
    Postcode_var = info[0].text.strip()
    Borough_var = info[1].text.strip()
    Neighbourhood_var = info[2].text.replace('/', ',').strip()
    

    if Borough_var == 'Not assigned':
        counter += 1
        continue
    
    if Neighbourhood_var == 'Not assigned':
        Neighbouthood_var = Borough_var
    
    
    Postcode.append(Postcode_var)
    Borough.append(Borough_var)
    Neighbourhood.append(Neighbourhood_var)
    
    if debug: 
        print('counter :', counter)
        print('Postcode_var :',Postcode_var)
        print('Borough_var :', Borough_var)
        print('Neighbourhood_var :',Neighbourhood_var) 
        print('x'*40)
            
    counter += 1

# The wikipedia page has been modified to contain all the neighbourhoods in a single Postcode in a single cell seperated by a '/' so I have simply replaced '/' with a ','.

In [5]:
toronto_dict = {'Postcode':Postcode, 'Borough':Borough, 'Neighbourhood':Neighbourhood}
df_toronto = pd.DataFrame.from_dict(toronto_dict)

### To check if there are any repetitions of postcodes I compared the number of rows to the number of unique postcodes

In [6]:
print('Number of rows in the dataframe: {} Rows'.format(df_toronto.shape[0]))
print('Number of unique values in the dataframe: ')
df_toronto.Postcode.unique

Number of rows in the dataframe: 103 Rows
Number of unique values in the dataframe: 


<bound method Series.unique of 0      M3A
1      M4A
2      M5A
3      M6A
4      M7A
      ... 
98     M8X
99     M4Y
100    M7Y
101    M8Y
102    M8Z
Name: Postcode, Length: 103, dtype: object>

In [7]:
# checking correctness of the data
df_toronto[df_toronto['Postcode']=='M5G']

Unnamed: 0,Postcode,Borough,Neighbourhood
24,M5G,Downtown Toronto,Central Bay Street


In [8]:
#shape of the dataframe: 
df_toronto.shape

(103, 3)

In [12]:
df_toronto

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


In [16]:
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



## creating a function to retrive latitude and longitude information

In [17]:
def getlatlog(postal_code):
    address = '{}, Toronto, Ontario'.format(postal_code)
    geolocator = Nominatim(user_agent="foursquare_agent")
    location = geolocator.geocode(address)
    try:
        latitude = location.latitude
        longitude = location.longitude
    except:
        latitude = np.nan
        longitude = np.nan
    
    return latitude, longitude

In [None]:
latlist = []
lonlist = []
for each in df_toronto['Postcode']:
    lat, lon = getlatlog(each)
    latlist.append(lat)
    lonlist.append(lon)

In [None]:
print('Number of missing latitude data \n{}'.format(pd.DataFrame(latlist).isnull().sum()))
print('Number of missing longitude data \n{}'.format(pd.DataFrame(lonlist).isnull().sum()))

## Therefore we retrive the data using the Geospatial data provided in the csv file

In [192]:
geospatial_data = pd.read_csv("Geospatial_Coordinates.csv")

In [193]:
geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [1]:
df_toronto['Latitude'] = np.nan
df_toronto['Longitude'] = np.nan
for each in df_toronto['Postcode']:
    df_toronto.loc[df_toronto['Postcode']==each, 'Latitude'] = geospatial_data[geospatial_data['Postal Code']==each]['Latitude'].values[0]
    df_toronto.loc[df_toronto['Postcode']==each,'Longitude'] = geospatial_data[geospatial_data['Postal Code']==each]['Longitude'].values[0]


NameError: name 'np' is not defined

In [195]:
df_toronto

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.628841,-79.520999
1,M4A,North York,Victoria Village,43.628841,-79.520999
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.628841,-79.520999
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.628841,-79.520999
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.628841,-79.520999
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.628841,-79.520999
99,M4Y,Downtown Toronto,Church and Wellesley,43.628841,-79.520999
100,M7Y,East Toronto,Business reply mail Processing Centre,43.628841,-79.520999
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",43.628841,-79.520999


In [196]:
# checking correctness of the data
df_toronto[df_toronto['Postcode']=='M5G']

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
24,M5G,Downtown Toronto,Central Bay Street,43.628841,-79.520999
