# Welcome to the Final Assignment
## Battle of Neighborhood (week 5)

## -------------------------
#### Importing and Instalation stage
## -------------------------

In [1]:
!pip install beautifulsoup4
!pip install lxml
!pip3 install lxml
!pip install geopy --user
!conda install -c conda-forge geocoder --yes
from bs4 import BeautifulSoup
import requests
import pandas as pd
import geocoder
from geopy.geocoders import Nominatim
import folium
import json
from pandas.io.json import json_normalize
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
import random

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



## -------------------------
#### 2.-Scrapping stage
## -------------------------

In [2]:
wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wikipedia_page = requests.get(wiki)

df_raw = pd.read_html(wikipedia_page.content, header=0)[0]
df_new = df_raw[df_raw.Borough != 'Not assigned']

df_new.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [3]:
df_new.loc[df_new.Neighbourhood == 'Not assigned']

Unnamed: 0,Postal Code,Borough,Neighbourhood


## -------------------------
#### 3.-Dataframe processing stage
## -------------------------

### 3.1-Grouping PC - Borough - Neigh

In [4]:
df_toronto = df_new.groupby(['Postal Code', 'Borough'])['Neighbourhood'].apply(lambda x: ', '.join(x))
df_toronto = df_toronto.reset_index()

### 3.2-Making easier later coding

In [5]:
df_toronto.rename(columns = {'Postal Code':'PostalCode'}, inplace = True)
df_toronto.rename(columns = {'Neighbourhood':'Neighborhood'}, inplace = True)

### 3.3-Showing results

In [6]:
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## -------------------------
#### Dataframe scope stage
## -------------------------

In [7]:
df_toronto.shape

(103, 3)

## -------------------------
#### 4.Extracting geospational information
## -------------------------

In [8]:
url = 'http://cocl.us/Geospatial_data'
df_geo=pd.read_csv(url)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## -------------------------
#### 5.Merging information into our dataframe
## -------------------------

In [9]:
df_toronto = df_toronto.join(df_geo.set_index('Postal Code'), on='PostalCode')
df_toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


## -------------------------
#### 6.Creating map of Toronto using latitude and longitude values
## -------------------------

In [10]:
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        ).add_to(map_Toronto)  
    
map_Toronto

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


## -------------------------
#### 7.Connecting to Foursquares to retrieve information
## -------------------------

In [11]:

CLIENT_ID = 'G3YTIYHVKXA5PB3BKLMVNIQCVX2TSKK32YCOAH5WSVF2NA2Q' # Foursquare ID
CLIENT_SECRET = 'SJABNEZXGQEZT3XDSAWCDLNJOGYWGG3NRX13OG3NAWVFBAEX' #  Foursquare Secret
ACCESS_TOKEN = 'H1QFRME3HB40HW2BR50ZQ1JCKORQUFMURFLLNEDYSDQA44N0' #  FourSquare Access Token
VERSION = '20180604'

LIMIT = 100
radius = 500


## -------------------------
#### 8.Getting Results, merging and filtering information into the data frame
## -------------------------

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## -------------------------
#### 9.Preparing Toronto Venues information
## -------------------------

In [13]:

toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )

print(toronto_venues.shape)
toronto_venues.head()

toronto_venues.groupby('Neighborhood').count()


Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
York Mills, Silver Hills
Willowdale, Newtonbrook
Willowdale, Willowdale East
York Mills West
Willowdale, Willowdale West
Parkwoods
Don Mills
Don Mills
Bathurst Manor, Wilson Heights, Downsview North
Northwood Park, York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto, Broadview North (Old East York)
The Danforth West, 

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood, Long Branch",7,7,7,7,7,7
"Bathurst Manor, Wilson Heights, Downsview North",21,21,21,21,21,21
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",23,23,23,23,23,23
...,...,...,...,...,...,...
"Willowdale, Willowdale West",6,6,6,6,6,6
Woburn,3,3,3,3,3,3
Woodbine Heights,7,7,7,7,7,7
York Mills West,4,4,4,4,4,4


In [14]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 271 uniques categories.


## -------------------------
#### 10.Encoding and column adaptation
## -------------------------

In [15]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()
toronto_onehot.shape

(2107, 271)

In [16]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
toronto_grouped.shape

(97, 271)

## -------------------------
#### 11.Including Random User information
## -------------------------

In [18]:
category_list = list(toronto_grouped)
del category_list[0]

category_amount = random.randint(1,11)
user_categories = random.sample(category_list, category_amount)


aux_list = []
aux_list2 = []
for i in range(len(category_list)):
    aux_list2.append(1)
    if category_list[i] in user_categories:
        aux_list.append(1)
    else:
        aux_list.append(0)
user_categories

['German Restaurant', 'Baseball Field', 'Skating Rink']

In [19]:
user_df = pd.DataFrame.from_records([aux_list], columns=category_list)
user_df.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
aux_df = pd.DataFrame(aux_list2)
user_profile = user_df.transpose().dot(aux_df.transpose()[0])

## -------------------------
#### 12.Getting the categories table for all neighborhoods
## -------------------------

In [21]:
categories_table = toronto_grouped.set_index(toronto_grouped['Neighborhood'])
categories_table = categories_table.drop('Neighborhood', 1)
categories_table.head()

Unnamed: 0_level_0,Yoga Studio,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## -------------------------
#### 13.Applying the recommendation Matrix to the user needs
## -------------------------

In [22]:
recommendationTable_df = ((categories_table*user_profile).sum(axis=1))
recommendationTable_df.head()

Neighborhood
Agincourt                                          0.250000
Alderwood, Long Branch                             0.142857
Bathurst Manor, Wilson Heights, Downsview North    0.000000
Bayview Village                                    0.000000
Bedford Park, Lawrence Manor East                  0.000000
dtype: float64

In [23]:
recommendationTable_df = pd.DataFrame(recommendationTable_df.sort_values(ascending=False))
recommendationTable_df.columns = ['Score']
recommendationTableFinal_df = recommendationTable_df.reset_index()
recommendationTableFinal_df.head()

Unnamed: 0,Neighborhood,Score
0,"Old Mill South, King's Mill Park, Sunnylea, Hu...",1.0
1,"Humberlea, Emery",1.0
2,Woodbine Heights,0.285714
3,Agincourt,0.25
4,"Birch Cliff, Cliffside West",0.25


## -------------------------
#### 14.Joinning the table Scores with the table Location
## -------------------------

In [24]:
df_neighborhood_score = df_toronto.join(recommendationTableFinal_df.set_index('Neighborhood'), on ='Neighborhood')
df_neighborhood_score.sort_values(by='Score', ascending=False, inplace = True)
df_neighborhood_score.reset_index(drop=True, inplace = True)
df_neighborhood_score.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Score
0,M9M,North York,"Humberlea, Emery",43.724766,-79.532242,1.0
1,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,1.0
2,M4C,East York,Woodbine Heights,43.695344,-79.318389,0.285714
3,M1S,Scarborough,Agincourt,43.7942,-79.262029,0.25
4,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848,0.25


## -------------------------
#### 15.Printing on the map the final result of our Algorithm
## -------------------------


In [25]:
recommendation_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = 5
ys = [i + x + (i*x)**2 for i in range(5)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, score, index in zip(df_neighborhood_score['Latitude'], df_neighborhood_score['Longitude'], df_neighborhood_score['Neighborhood'], df_neighborhood_score['Score'], range(5)):
    label = folium.Popup(str(poi) + ' Score ' + str(score), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(index)],
        fill=True,
        fill_color=rainbow[int(index)],
        fill_opacity=0.7).add_to(recommendation_map)
       
recommendation_map