## Import the libraries

In [1]:
import pandas as pd
import numpy as np

## Web Scraping
### Reading the data

In [2]:
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', na_values = 'Not assigned', flavor = 'html5lib')

In [3]:
df

[    Postal code           Borough  \
 0           M1A               NaN   
 1           M2A               NaN   
 2           M3A        North York   
 3           M4A        North York   
 4           M5A  Downtown Toronto   
 ..          ...               ...   
 175         M5Z               NaN   
 176         M6Z               NaN   
 177         M7Z               NaN   
 178         M8Z         Etobicoke   
 179         M9Z               NaN   
 
                                           Neighborhood  
 0                                                  NaN  
 1                                                  NaN  
 2                                            Parkwoods  
 3                                     Victoria Village  
 4                           Regent Park / Harbourfront  
 ..                                                 ...  
 175                                                NaN  
 176                                                NaN  
 177                

In [4]:
print(type(df[0]))
df = df[0]

<class 'pandas.core.frame.DataFrame'>


### Checking the names of the columns

In [5]:
df.columns

Index(['Postal code', 'Borough', 'Neighborhood'], dtype='object')

In [6]:
df

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
...,...,...,...
175,M5Z,,
176,M6Z,,
177,M7Z,,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...


## Droping the NaN values

In [7]:
print(df.isnull().sum())
df.drop(df.index[0], inplace = True)
df.dropna(axis = 0, inplace = True)
print(df.isnull().sum())
df

Postal code      0
Borough         77
Neighborhood    77
dtype: int64
Postal code     0
Borough         0
Neighborhood    0
dtype: int64


Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...
160,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing CentrE
169,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


## Check no of Boroughs and Neighborhoods

In [8]:
print('The dataframe has {} boroughs and {} neighbourhoods.'.format(

len(df['Borough'].unique()),

len(df['Neighborhood'].unique())))
print(df.shape)

The dataframe has 10 boroughs and 98 neighbourhoods.
(103, 3)


### Creating a new csv 

In [9]:
df.to_csv('Toronto_df.csv')

In [10]:
df.shape

(103, 3)

## Adding latitude and longitude coordinates to the DataFrame

### Getting the coordinates and creating a new csv

In [13]:
!wget -q -O 'Geospatial_Coordinates.csv'  http://cocl.us/Geospatial_data
df_lon_lat = pd.read_csv('Geospatial_Coordinates.csv')
df_lon_lat.head()

'wget' is not recognized as an internal or external command,
operable program or batch file.


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Checking the columns of the coordinates csv

In [19]:
df_lon_lat.columns=['Postal code','Latitude','Longitude']
df_lon_lat.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merging the two csv files

In [21]:
Toronto_df = pd.merge(df,
                 df_lon_lat[['Postal code','Latitude', 'Longitude']],
                 on='Postal code')
Toronto_df

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.654260,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North,43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...,43.636258,-79.498509


In [23]:
Toronto_df.shape

(103, 5)