##                                  Segmenting Neighborhoods in Toronto

### Importing libraries

In [1]:
# One-time environment setup, kept commented out so a normal run does not
# reinstall anything. Uncomment the lines you need on a fresh environment.
# NOTE(review): the scraping cell further down asks BeautifulSoup for the 'lxml'
# parser, and none of these commands installs lxml explicitly — confirm it is available.
#!conda update -n base -c defaults conda
#!conda install -c conda-forge geopy --yes 
#!conda install -c conda-forge folium=0.5.0 --yes 
#!conda install -c anaconda beautifulsoup4 
#print('Beautiful soup is installed')

In [2]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis
import json # library to handle JSON files

In [3]:
# Turn off pandas' display truncation: with both limits set to None, a displayed
# DataFrame shows every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [4]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [5]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [6]:
import requests # library to handle requests
# NOTE(review): newer pandas releases expose this helper as pandas.json_normalize;
# this import path may be deprecated or unavailable there — confirm against the installed version.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# import k-means from clustering stage
from sklearn.cluster import KMeans

In [7]:
import folium # map rendering library

In [8]:
from bs4 import BeautifulSoup
print('Libraries imported.')

Libraries imported.


---

### Taking data from internet page and adding  table cells text to numpy array

In [9]:
# Download the Wikipedia list of Canadian postal codes starting with "M" and parse it.
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of silently parsing an
# error page as if it were the article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
                        timeout=30)
response.raise_for_status()
source = response.text                 # raw HTML of the page
soup = BeautifulSoup(source, 'lxml')   # parsed document used by the following cells

In [10]:
# Sanity check: print the document's <title> to confirm the expected page was fetched.
file_title = soup.find('title').get_text()
print(file_title)

List of postal codes of Canada: M - Wikipedia


In [11]:
# Take the first <table> in the page whose class attribute is 'wikitable sortable'.
table = soup.find('table', class_ = 'wikitable sortable')

# find() returns None when nothing matches. Fail here with a clear message rather
# than letting the next cell die with an AttributeError on None.
if table is None:
    raise ValueError("No 'wikitable sortable' table found in the page; its layout may have changed.")

In [12]:
# Gather the text of every <td> cell of the table, in document order, into one flat
# Python list (despite its name, toronto_np is not a NumPy array). Trailing newlines
# are stripped from each cell.
toronto_np = [cell.text.rstrip('\n') for cell in table.find_all('td')]

### Add data from numpy array to dataframe with checking 'Not assigned' values

In [13]:
# Column layout of the result. The scraped cells repeat in this same order.
column_names = ['Postalcode', 'Borough', 'Neighborhood']
toronto = pd.DataFrame(columns=column_names)
print(toronto)  # show the empty frame to confirm the column layout

# toronto_np is a flat list of cell texts: postal code, borough, neighborhood, ...
# Walk it three cells at a time. Stopping at len - 2 guarantees every step is a
# complete triple, so a partial trailing row can never raise an IndexError.
records = []
for start in range(0, len(toronto_np) - 2, 3):
    postalcode, borough, neighborhood = toronto_np[start:start + 3]
    if borough == 'Not assigned':
        continue                    # rows without a borough are ignored
    if neighborhood == 'Not assigned':
        neighborhood = borough      # fall back to the borough name
    records.append({'Postalcode': postalcode,
                    'Borough': borough,
                    'Neighborhood': neighborhood})

# Build the frame once from the collected records. Appending row by row copies the
# whole frame on every iteration, and DataFrame.append no longer exists in pandas 2.x.
toronto = pd.DataFrame(records, columns=column_names)

Empty DataFrame
Columns: [Postalcode, Borough, Neighborhood]
Index: []


In [14]:
# Number of rows and columns collected into the toronto dataframe.
toronto.shape

(211, 3)

### Grouping lines with similar Neighborhoods, adding comma

In [15]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood is the comma-separated list of the group's neighborhoods.
# sort=False keeps groups in order of first appearance; as_index=False keeps the
# grouping keys as ordinary columns.
toronto2 = (
    toronto
    .groupby(['Postalcode', 'Borough'], as_index=False, sort=False)
    .agg(', '.join)
)

### Result

In [16]:
# Dimensions of the grouped dataframe as (rows, columns).
print(toronto2.shape)

(103, 3)


In [17]:
# Order the rows by postal code (ascending).
toronto2 = toronto2.sort_values(by='Postalcode')

In [18]:
# Renumber the rows 0..n-1 to match the sorted order (the old index is discarded),
# then preview the first rows.
toronto2 = toronto2.reset_index(drop=True)
toronto2.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Adding coordinates


In [19]:
# Load the CSV of per-postal-code coordinates from its download link and preview it.
path = "http://cocl.us/Geospatial_data"
lat_lng = pd.read_csv(filepath_or_buffer=path)
lat_lng.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [20]:
# Dimensions of the coordinates table as (rows, columns), for comparison with toronto2.
print(lat_lng.shape)

(103, 3)


In [21]:
# Attach coordinates by postal-code key instead of by row position. The positional
# loop only worked when both tables had identical length and ordering. A different
# row order silently left coordinates missing, and a longer toronto2 ran off the end
# of lat_lng.
# drop_duplicates keeps the lookup index unique, which Series.map requires.
coords_by_code = lat_lng.drop_duplicates(subset='Postal Code').set_index('Postal Code')

# Postal codes absent from the lookup table get NaN coordinates.
toronto2['Latitude'] = toronto2['Postalcode'].map(coords_by_code['Latitude'])
toronto2['Longitude'] = toronto2['Postalcode'].map(coords_by_code['Longitude'])

In [22]:
# Preview toronto2 with the Latitude and Longitude columns added.
toronto2.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
