In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [95]:
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

## (1) Setting up the raw data from Calgary

In [3]:
calgary_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_T'
html_data = requests.get(calgary_url).text
soup = BeautifulSoup(html_data,"html5lib") 

In [5]:
calgary_df_content = []
table = soup.find("table")
for row in table.find_all("td") :
    cell = {}
    if row.span.text=='Not assigned':
        pass
    
    else:
        borough = row.span.text.split('(')[0]
        
        if borough == 'Calgary':
            cell['PostalCode'] = row.text[:3]
            cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
            calgary_df_content.append(cell)
            
        else:
            pass

In [12]:
calgary_df1=pd.DataFrame(data=calgary_df_content)
print(calgary_df1.shape)
calgary_df1.head()

(31, 2)


Unnamed: 0,PostalCode,Neighborhood
0,T2A,"Penbrooke Meadows, Marlborough"
1,T3A,"Dalhousie, Edgemont, Hamptons, Hidden Valley"
2,T2B,"Forest Lawn, Dover, Erin Woods"
3,T3B,"Montgomery, Bowness, Silver Springs, Greenwood"
4,T2C,"Lynnwood Ridge, Ogden, Foothills Industrial, Great Plains"


In [13]:
# Had to find coordinates with https://www.latlong.net/
calgary_coordinates_url ='https://raw.githubusercontent.com/Armando12pdf/Coursera_Capstone/main/Calgary%20coordinates.csv'
calgary_coordinates_df=pd.read_csv(calgary_coordinates_url)
print(calgary_coordinates_df.shape)
calgary_coordinates_df.head()

(31, 3)


Unnamed: 0,Postal code,Latitude,Longitude
0,T2A,51.04968,-113.96432
1,T3A,51.12454,-114.14289
2,T2B,51.02533,-113.9789
3,T3B,51.08963,-114.19751
4,T2C,50.98122,-113.99786


In [18]:
#Concatenate with respective latitude and longitude
calgary_df1['Latitude'] = calgary_df1.PostalCode.map(calgary_coordinates_df.set_index('Postal code')['Latitude'].to_dict())
calgary_df1['Longitude'] = calgary_df1.PostalCode.map(calgary_coordinates_df.set_index('Postal code')['Longitude'].to_dict())
print(calgary_df1.shape)
calgary_df1.head()

(31, 4)


Unnamed: 0,PostalCode,Neighborhood,Latitude,Longitude
0,T2A,"Penbrooke Meadows, Marlborough",51.04968,-113.96432
1,T3A,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12454,-114.14289
2,T2B,"Forest Lawn, Dover, Erin Woods",51.02533,-113.9789
3,T3B,"Montgomery, Bowness, Silver Springs, Greenwood",51.08963,-114.19751
4,T2C,"Lynnwood Ridge, Ogden, Foothills Industrial, Great Plains",50.98122,-113.99786


In [19]:
#Define foursquare credentials
CLIENT_ID = 'ZZBEKKTJGWTU0HNGJ2ROO5JDHSKAXVABHUJKBBYZG1WNWQ0W' # your Foursquare ID
CLIENT_SECRET = 'JS1SDJDRRFBWX0SXVSS5GPCXVG4J3BAKY0AWJMLE2IJH4OFG' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [None]:
#Function that gets venues

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [24]:
#Get venues for calgary neighborhoods
calgary_venues = getNearbyVenues(names=calgary_df1['Neighborhood'],
                                   latitudes=calgary_df1['Latitude'],
                                   longitudes=calgary_df1['Longitude'])
calgary_venues.head()

Penbrooke Meadows, Marlborough
Dalhousie, Edgemont, Hamptons, Hidden Valley
Forest Lawn, Dover, Erin Woods
Montgomery, Bowness, Silver Springs, Greenwood
Lynnwood Ridge, Ogden, Foothills Industrial, Great Plains
Rosscarrock, Westgate, Wildwood, Shaganappi, Sunalta
Bridgeland, Greenview, Zoo, YYC
Lakeview, Glendale, Killarney, Glamorgan
Inglewood, Burnsland, Chinatown, East Victoria Park, Saddledome
Hawkwood, Arbour Lake, Citadel, Ranchlands, Royal Oak, Rocky Ridge
Highfield, Burns Industrial
Discovery Ridge, Signal Hill, West Springs,Christie Park, Patterson, Cougar Ridge
Queensland, Lake Bonavista, Willow Park, Acadia
Martindale, Taradale, Falconridge, Saddle Ridge
Thorncliffe, Tuxedo Park
Sandstone, MacEwan Glen, Beddington, Harvest Hills, Coventry Hills, Panorama Hills
Brentwood, Collingwood, Nose Hill
Tuscany, Scenic Acres
Mount Pleasant, Capitol Hill, Banff Trail
Cranston, Auburn Bay, Mahogany
Kensington, Westmont, Parkdale, University
City Centre, Calgary Tower
Symons Valley
Conn

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12454,-114.14289,Edgemont Construction Ltd,51.123928,-114.144313,Construction & Landscaping
1,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12454,-114.14289,Edgemont City,51.126473,-114.138997,Asian Restaurant
2,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12454,-114.14289,Friends Cappuccino Bar & Bake Shop,51.12637,-114.138676,Café
3,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12454,-114.14289,On the Rocks,51.126893,-114.139355,Wine Shop
4,"Forest Lawn, Dover, Erin Woods",51.02533,-113.9789,Subway,51.026616,-113.981135,Sandwich Place


In [25]:
#One hot encoding of calgary venues
calgary_onehot_df = pd.get_dummies(calgary_venues[['Venue Category']], prefix="", prefix_sep="")
calgary_onehot_df.insert(loc=0, column='Neighborhood name', value=calgary_venues['Neighborhood'].values.tolist()) 
print(calgary_onehot_df.shape)
calgary_onehot_df.head()

(238, 101)


Unnamed: 0,Neighborhood name,American Restaurant,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,Beer Bar,Bookstore,Bowling Alley,Brazilian Restaurant,Breakfast Spot,Brewery,Burger Joint,Café,Camera Store,Cheese Shop,Chinese Restaurant,Clothing Store,Cocktail Bar,Coffee Shop,College Classroom,Construction & Landscaping,Convenience Store,Department Store,Diner,Dive Bar,Dog Run,Donut Shop,Dry Cleaner,Eastern European Restaurant,Elementary School,Fast Food Restaurant,Food,Food & Drink Shop,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Gas Station,Gift Shop,Golf Course,Gourmet Shop,Grocery Store,Gym,Gym / Fitness Center,History Museum,Hobby Shop,Hockey Arena,Hockey Field,Hotel,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Inn,Insurance Office,Italian Restaurant,Japanese Restaurant,Juice Bar,Korean Restaurant,Lake,Latin American Restaurant,Library,Liquor Store,Lounge,Market,Massage Studio,Mediterranean Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Moroccan Restaurant,Music Store,New American Restaurant,Other Repair Shop,Park,Pharmacy,Pier,Pizza Place,Playground,Poutine Place,Professional & Other Places,Pub,Restaurant,Sandwich Place,Scandinavian Restaurant,Seafood Restaurant,Shopping Mall,Soccer Field,Sporting Goods Shop,Sports Bar,Stadium,Steakhouse,Summer Camp,Supermarket,Sushi Restaurant,Thai Restaurant,Theater,Thrift / Vintage Store,Vietnamese Restaurant,Water Park,Wine Shop,Yoga Studio
0,"Dalhousie, Edgemont, Hamptons, Hidden Valley",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Dalhousie, Edgemont, Hamptons, Hidden Valley",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"Dalhousie, Edgemont, Hamptons, Hidden Valley",0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"Dalhousie, Edgemont, Hamptons, Hidden Valley",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,"Forest Lawn, Dover, Erin Woods",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
#Regroup calgary venues. This will be the data used with the ML models
calgary_grouped_df = calgary_onehot_df.groupby('Neighborhood name').mean().reset_index()
print(calgary_grouped_df.shape)
calgary_grouped_df.head()

(28, 101)


Unnamed: 0,Neighborhood name,American Restaurant,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,Beer Bar,Bookstore,Bowling Alley,Brazilian Restaurant,Breakfast Spot,Brewery,Burger Joint,Café,Camera Store,Cheese Shop,Chinese Restaurant,Clothing Store,Cocktail Bar,Coffee Shop,College Classroom,Construction & Landscaping,Convenience Store,Department Store,Diner,Dive Bar,Dog Run,Donut Shop,Dry Cleaner,Eastern European Restaurant,Elementary School,Fast Food Restaurant,Food,Food & Drink Shop,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Gas Station,Gift Shop,Golf Course,Gourmet Shop,Grocery Store,Gym,Gym / Fitness Center,History Museum,Hobby Shop,Hockey Arena,Hockey Field,Hotel,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Inn,Insurance Office,Italian Restaurant,Japanese Restaurant,Juice Bar,Korean Restaurant,Lake,Latin American Restaurant,Library,Liquor Store,Lounge,Market,Massage Studio,Mediterranean Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Moroccan Restaurant,Music Store,New American Restaurant,Other Repair Shop,Park,Pharmacy,Pier,Pizza Place,Playground,Poutine Place,Professional & Other Places,Pub,Restaurant,Sandwich Place,Scandinavian Restaurant,Seafood Restaurant,Shopping Mall,Soccer Field,Sporting Goods Shop,Sports Bar,Stadium,Steakhouse,Summer Camp,Supermarket,Sushi Restaurant,Thai Restaurant,Theater,Thrift / Vintage Store,Vietnamese Restaurant,Water Park,Wine Shop,Yoga Studio
0,"Braeside, Cedarbrae, Woodbine",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brentwood, Collingwood, Nose Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bridgeland, Greenview, Zoo, YYC",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"City Centre, Calgary Tower",0.0,0.0,0.0,0.022222,0.022222,0.066667,0.0,0.022222,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.022222,0.155556,0.0,0.0,0.0,0.022222,0.0,0.0,0.0,0.0,0.0,0.022222,0.0,0.022222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022222,0.022222,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.022222,0.022222,0.0,0.0,0.022222,0.022222,0.022222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022222,0.0,0.0,0.0,0.0,0.022222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044444,0.088889,0.022222,0.0,0.0,0.022222,0.0,0.0,0.0,0.0,0.044444,0.0,0.0,0.044444,0.0,0.0,0.0,0.022222,0.0,0.0,0.0
4,"Connaught, West Victoria Park",0.0,0.0,0.0,0.012821,0.0,0.051282,0.012821,0.012821,0.0,0.012821,0.0,0.038462,0.025641,0.051282,0.012821,0.0,0.012821,0.0,0.012821,0.051282,0.0,0.0,0.0,0.0,0.025641,0.0,0.0,0.012821,0.0,0.0,0.0,0.0,0.0,0.0,0.025641,0.0,0.012821,0.012821,0.012821,0.0,0.012821,0.0,0.0,0.012821,0.012821,0.0,0.0,0.0,0.025641,0.0,0.025641,0.0,0.0,0.0,0.012821,0.025641,0.0,0.012821,0.0,0.012821,0.012821,0.0,0.025641,0.012821,0.0,0.0,0.012821,0.012821,0.012821,0.012821,0.0,0.0,0.012821,0.0,0.0,0.025641,0.0,0.012821,0.0,0.051282,0.051282,0.025641,0.012821,0.012821,0.0,0.0,0.012821,0.0,0.0,0.012821,0.0,0.0,0.025641,0.025641,0.0,0.0,0.038462,0.0,0.0,0.012821


In [28]:
# Some neighborhoods didn't get results, so they will be removed from our initial df. 
print(len(set(calgary_grouped_df['Neighborhood name'])))
print(len(set(calgary_df1['Neighborhood'])))

28
31


In [34]:
# Remove the neighborhoods in original df for which the API did provide information. 
for element in calgary_df1['Neighborhood']:
    if element in set(calgary_grouped_df['Neighborhood name']):
        pass
        
    else:
        print(element)
        calgary_df1 = calgary_df1[calgary_df1.Neighborhood != element]

print(len(calgary_df1['Neighborhood']))

28


## (2) Prepare positively labeled raw data

In [39]:
#Postal codes were gathered manually. Their latitudes and longitudes were obtained from https://www.latlong.net/
positive_coordinates_url ='https://raw.githubusercontent.com/Armando12pdf/Coursera_Capstone/main/Positive%20locations%20coordinates.csv'
example_labs_df=pd.read_csv(positive_coordinates_url)
example_labs_df.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,G1C,46.881771,-71.189369
1,G1E,46.86013,-71.194054
2,G1M,46.81723,-71.269836
3,G6W,46.75756,-71.22557
4,H1K,45.60818,-73.54452


In [40]:
#Add a neighborhood column
labs_labels =[]
for number in range(1,(example_labs_df.shape[0]+1)) :
    new_lab_number = 'example lab ' + str(number)
    labs_labels.append(new_lab_number)

example_labs_df.insert(loc=1, column='Neighborhood', value=labs_labels) 

In [41]:
example_labs_df.head()

Unnamed: 0,Postal code,Neighborhood,Latitude,Longitude
0,G1C,example lab 1,46.881771,-71.189369
1,G1E,example lab 2,46.86013,-71.194054
2,G1M,example lab 3,46.81723,-71.269836
3,G6W,example lab 4,46.75756,-71.22557
4,H1K,example lab 5,45.60818,-73.54452


In [42]:
#Get venues for the labs used as positive examples
example_labs_venues = getNearbyVenues(names=example_labs_df['Neighborhood'],
                                   latitudes=example_labs_df['Latitude'],
                                   longitudes=example_labs_df['Longitude'])

example lab 1
example lab 2
example lab 3
example lab 4
example lab 5
example lab 6
example lab 7
example lab 8
example lab 9
example lab 10
example lab 11
example lab 12
example lab 13
example lab 14
example lab 15
example lab 16
example lab 17
example lab 18
example lab 19
example lab 20
example lab 21
example lab 22
example lab 23
example lab 24
example lab 25
example lab 26
example lab 27
example lab 28
example lab 29
example lab 30
example lab 31
example lab 32
example lab 33
example lab 34
example lab 35
example lab 36
example lab 37
example lab 38
example lab 39
example lab 40
example lab 41
example lab 42
example lab 43
example lab 44
example lab 45
example lab 46
example lab 47
example lab 48
example lab 49
example lab 50
example lab 51
example lab 52
example lab 53
example lab 54
example lab 55
example lab 56
example lab 57
example lab 58
example lab 59
example lab 60
example lab 61
example lab 62
example lab 63
example lab 64
example lab 65
example lab 66
example lab 67
exam

In [44]:
example_labs_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,example lab 1,46.881771,-71.189369,Poliquin Decor Inc,46.882899,-71.190327,Construction & Landscaping
1,example lab 1,46.881771,-71.189369,Groupe Diane Lapointe Designers,46.884144,-71.184635,Construction & Landscaping
2,example lab 2,46.86013,-71.194054,Senor Sombrero,46.861217,-71.188514,Mexican Restaurant
3,example lab 2,46.86013,-71.194054,IGA,46.86216,-71.188334,Grocery Store
4,example lab 2,46.86013,-71.194054,Pharmaprix,46.86076,-71.189769,Pharmacy


In [45]:
#One hot encoding of the example labs
labs_onehot_df = pd.get_dummies(example_labs_venues[['Venue Category']], prefix="", prefix_sep="")
labs_onehot_df.insert(loc=0, column='Neighborhood name', value=example_labs_venues['Neighborhood'].values.tolist()) 
print(labs_onehot_df.shape)
labs_onehot_df.head()

(963, 219)


Unnamed: 0,Neighborhood name,Accessories Store,Adult Boutique,Airport,American Restaurant,Antique Shop,Arcade,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Automotive Shop,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Bath House,Beer Bar,Beer Store,Belgian Restaurant,Big Box Store,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Station,Bus Stop,Business Service,Butcher,Café,Cajun / Creole Restaurant,Camera Store,Candy Store,Caribbean Restaurant,Casino,Cheese Shop,Chinese Restaurant,City,Clothing Store,Cocktail Bar,Coffee Shop,College Bookstore,Comfort Food Restaurant,Comic Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Coworking Space,Cupcake Shop,Dance Studio,Daycare,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run,Donut Shop,Dry Cleaner,Dumpling Restaurant,Electronics Store,English Restaurant,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Field,Financial or Legal Service,Fish & Chips Shop,Fish Market,Flower Shop,Food & Drink Shop,Food Court,Food Truck,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Gaming Cafe,Garden,Gas Station,Gastropub,Gay Bar,Gift Shop,Golf Course,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Harbor / Marina,Hardware Store,Health Food Store,Historic Site,History Museum,Hockey Arena,Hockey Rink,Home Service,Hostel,Hotel,Ice Cream Shop,Indian Chinese Restaurant,Indian Restaurant,Indie Movie Theater,Inn,Insurance Office,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Karaoke Bar,Kids Store,Korean Restaurant,Lawyer,Lingerie Store,Liquor Store,Luggage Store,Market,Massage Studio,Mattress Store,Medical Supply Store,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Military Base,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Monument / Landmark,Movie Theater,Museum,Music Venue,Nail Salon,Nature Preserve,Neighborhood,New American Restaurant,Newsstand,Noodle House,Office,Organic Grocery,Other Nightlife,Outdoor Supply Store,Park,Persian Restaurant,Pet Store,Pharmacy,Pizza Place,Platform,Playground,Plaza,Pool,Pool Hall,Poutine Place,Pub,Ramen Restaurant,Record Shop,Recreation Center,Rental Car Location,Restaurant,Road,Salad Place,Sandwich Place,Scandinavian Restaurant,Seafood Restaurant,Shipping Store,Shoe Store,Shop & Service,Shopping Mall,Skating Rink,Smoke Shop,Soccer Field,Southern / Soul Food Restaurant,Spa,Sporting Goods Shop,Sports Bar,Sri Lankan Restaurant,Supermarket,Sushi Restaurant,Taco Place,Tapas Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Park,Thrift / Vintage Store,Toy / Game Store,Track,Train,Train Station,Tree,Tunnel,University,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,example lab 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,example lab 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,example lab 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,example lab 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,example lab 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [99]:
#Regroup the venues of the example labs
labs_grouped_df = labs_onehot_df.groupby('Neighborhood name').mean().reset_index()
print(labs_grouped_df.shape)
labs_grouped_df.head()

(95, 219)


Unnamed: 0,Neighborhood name,Accessories Store,Adult Boutique,Airport,American Restaurant,Antique Shop,Arcade,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Automotive Shop,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Bath House,Beer Bar,Beer Store,Belgian Restaurant,Big Box Store,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Station,Bus Stop,Business Service,Butcher,Café,Cajun / Creole Restaurant,Camera Store,Candy Store,Caribbean Restaurant,Casino,Cheese Shop,Chinese Restaurant,City,Clothing Store,Cocktail Bar,Coffee Shop,College Bookstore,Comfort Food Restaurant,Comic Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Coworking Space,Cupcake Shop,Dance Studio,Daycare,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run,Donut Shop,Dry Cleaner,Dumpling Restaurant,Electronics Store,English Restaurant,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Field,Financial or Legal Service,Fish & Chips Shop,Fish Market,Flower Shop,Food & Drink Shop,Food Court,Food Truck,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Gaming Cafe,Garden,Gas Station,Gastropub,Gay Bar,Gift Shop,Golf Course,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Harbor / Marina,Hardware Store,Health Food Store,Historic Site,History Museum,Hockey Arena,Hockey Rink,Home Service,Hostel,Hotel,Ice Cream Shop,Indian Chinese Restaurant,Indian Restaurant,Indie Movie Theater,Inn,Insurance Office,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Karaoke Bar,Kids Store,Korean Restaurant,Lawyer,Lingerie Store,Liquor Store,Luggage Store,Market,Massage Studio,Mattress Store,Medical Supply Store,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Military Base,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Monument / Landmark,Movie Theater,Museum,Music Venue,Nail Salon,Nature Preserve,Neighborhood,New American Restaurant,Newsstand,Noodle House,Office,Organic Grocery,Other Nightlife,Outdoor Supply Store,Park,Persian Restaurant,Pet Store,Pharmacy,Pizza Place,Platform,Playground,Plaza,Pool,Pool Hall,Poutine Place,Pub,Ramen Restaurant,Record Shop,Recreation Center,Rental Car Location,Restaurant,Road,Salad Place,Sandwich Place,Scandinavian Restaurant,Seafood Restaurant,Shipping Store,Shoe Store,Shop & Service,Shopping Mall,Skating Rink,Smoke Shop,Soccer Field,Southern / Soul Food Restaurant,Spa,Sporting Goods Shop,Sports Bar,Sri Lankan Restaurant,Supermarket,Sushi Restaurant,Taco Place,Tapas Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Park,Thrift / Vintage Store,Toy / Game Store,Track,Train,Train Station,Tree,Tunnel,University,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,example lab 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,example lab 10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,example lab 100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,example lab 11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,example lab 12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## (3) Train and evaluate the one-class SVM model

In [100]:
# When training a model, the features of the training data set have to match those of the target data set
# They do not match in this situation:
print('features in labs: ' + str(len(labs_grouped_df.columns)))
print('features in calgary: ' + str(len(calgary_grouped_df.columns)))

features in labs: 219
features in calgary: 101


In [103]:
#Columns in the training set that are not within the target data set, will be removed
for element in labs_grouped_df.columns.tolist():
    if element not in calgary_grouped_df.columns.tolist():
        labs_grouped_df.drop(columns=[element], inplace=True)
        
    else:
        pass

print('features in labs: ' + str(len(labs_grouped_df.columns)))
print('features in calgary: ' + str(len(calgary_grouped_df.columns)))

features in labs: 79
features in calgary: 101


In [104]:
# Repeat in other direction for features to match
# Columns in the training set that are not within the target data set, will be removed
for element in calgary_grouped_df.columns.tolist():
    if element not in labs_grouped_df.columns.tolist():
        calgary_grouped_df.drop(columns=[element], inplace=True)
        
    else:
        pass

print('features in labs: ' + str(len(labs_grouped_df.columns)))
print('features in calgary: ' + str(len(calgary_grouped_df.columns)))


features in labs: 79
features in calgary: 79


In [105]:
#Now we're ready to train our model
from sklearn.svm import OneClassSVM

In [106]:
#Split the data. 
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(labs_grouped_df, test_size=0.15, random_state=4)

In [107]:
#Fit
One_class_SVM = OneClassSVM(gamma='auto').fit(X_train.loc[:, X_train.columns != 'Neighborhood name'])

In [108]:
#Test
One_c_SVM_test=One_class_SVM.predict(X_test.loc[:, X_test.columns != 'Neighborhood name'])

In [109]:
#Test how many get labelled as positive
values, counts = np.unique(One_c_SVM_test, return_counts=True)
print(values)
print(counts)
print(counts[1]/(counts[0]+counts[1]))

#The result indicates that 40% of true-positives will be labelled as negatives

[-1  1]
[6 9]
0.6


## (4) Applying the trained one-class SVM model

In [113]:
#Apply the one class SVM model to the calgary data frame
One_c_SVM_result=One_class_SVM.predict(calgary_grouped_df.loc[:, calgary_grouped_df.columns != 'Neighborhood name'])
One_c_SVM_result

array([-1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1,
       -1, -1, -1,  1, -1, -1, -1, -1, -1,  1, -1])

In [115]:
#Add results to calgary df
calgary_df1['Result'] = One_c_SVM_result
calgary_df1.head()

Unnamed: 0,PostalCode,Neighborhood,Latitude,Longitude,Result
1,T3A,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12454,-114.14289,-1
2,T2B,"Forest Lawn, Dover, Erin Woods",51.02533,-113.9789,1
3,T3B,"Montgomery, Bowness, Silver Springs, Greenwood",51.08963,-114.19751,-1
4,T2C,"Lynnwood Ridge, Ogden, Foothills Industrial, Great Plains",50.98122,-113.99786,1
5,T3C,"Rosscarrock, Westgate, Wildwood, Shaganappi, Sunalta",51.04492,-114.1307,1


In [118]:
#Get positive results only
calgary_df1_pos = calgary_df1.loc[calgary_df1['Result']==1]
calgary_df1_pos

Unnamed: 0,PostalCode,Neighborhood,Latitude,Longitude,Result
2,T2B,"Forest Lawn, Dover, Erin Woods",51.02533,-113.9789,1
4,T2C,"Lynnwood Ridge, Ogden, Foothills Industrial, Great Plains",50.98122,-113.99786,1
5,T3C,"Rosscarrock, Westgate, Wildwood, Shaganappi, Sunalta",51.04492,-114.1307,1
6,T2E,"Bridgeland, Greenview, Zoo, YYC",51.07029,-114.04284,1
9,T3G,"Hawkwood, Arbour Lake, Citadel, Ranchlands, Royal Oak, Rocky Ridge",51.13818,-114.20157,1
17,T3L,"Tuscany, Scenic Acres",51.12323,-114.24007,1
22,T3P,Symons Valley,51.17748,-114.10508,1
28,T1Y,"Rundle, Whitehorn, Monterey Park",51.08058,-113.96087,1


## (5) Visualization of one-class SVM model results in a map

In [119]:
# libraries for map
import matplotlib.cm as cm
import matplotlib.colors as colors
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [123]:
# create map. The latitude and longitude of toronto were found on 
# https://www.latlong.net/place/calgary-ab-canada-29104.html
svm_map = folium.Map(location=[51.049999,  -114.066666], zoom_start=10.4)

#Add markers to map
for lat, lon in zip(calgary_df1_pos['Latitude'], calgary_df1_pos['Longitude']):
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.7).add_to(svm_map)
     
svm_map

# 3 Using Positive-Unlabelled data

## 3.1 Get unlabelled data set

In [99]:
#Define a function that gets postal codes 
def getPostalCodes1(url, city_of_interest):
    html_data = requests.get(url).text
    soup = BeautifulSoup(html_data,"html5lib") 
    table = soup.find("table")
    for row in table.find_all("td") :
        cell = {}
        if row.span.text=='Not assigned':
            pass
    
        else:
            borough = row.span.text.split('(')[0]
            if borough== city_of_interest:
                cell['PostalCode'] = row.text[:3]
                cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
                unlabelled_df_content.append(cell)
    
            else:
                pass

In [111]:
url_list1=['https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_K',
          'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_R']
cities=['Ottawa', 'Winnipeg']


In [115]:
unlabelled_df_content = []

getPostalCodes1(url=url_list1[0], city_of_interest=cities[0])
getPostalCodes1(url=url_list1[1], city_of_interest=cities[1])

40
73


In [118]:
unlabelled_df=pd.DataFrame(data=unlabelled_df_content)
print(unlabelled_df.shape)
unlabelled_df

(73, 2)


Unnamed: 0,PostalCode,Neighborhood
0,K2A,"Highland Park, McKellar Park,Westboro,Glabar Park,Carlingwood"
1,K4A,Fallingbrook
2,K1B,"Blackburn Hamlet, Pine View, Sheffield Glen"
3,K2B,"Britannia,Whitehaven, Bayshore, Pinecrest"
4,K4B,Navan
5,K1C,Orleans
6,K2C,"Queensway, Copeland Park, Central Park, Bel Air,Carleton Heights"
7,K4C,Cumberland
8,K1E,Queenswood
9,K2E,"Eastern Nepean: Fisher Heights/ Parkwood Hills, Borden Farm, Pine Glen"


(52, 2)

In [97]:
#unlabelled_df_content

In [27]:
#Define a function that gets postal codes 
def getPostalCodes1(url, city_of_interest):
    html_data = requests.get(url).text
    soup = BeautifulSoup(html_data,"html5lib") 
    table = soup.find("table")
    for row in table.find_all("td") :
        cell = {}
        if row.span.text=='Not assigned':
            pass
    
        else:
            try:
                cell['PostalCode'] = row.text[:3]
                cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
                unlabelled_df_content.append(cell)
            
            
            except:
                try:
                    borough = row.span.text.split('(')[0]
                    if borough=='Not assigned':
                        print(borough)
                    else:
                        cell['PostalCode'] = row.text[:3]
                        cell['Neighborhood']=borough
                        unlabelled_df_content.append(cell)
                except:
                    pass

In [49]:
# Extract the rows of the example labs
example_labs_df = calgary_merged[calgary_merged['Neighborhood'].str.contains('lab', regex=False)]
print(example_labs_df.shape)
example_labs_df.head()

(13, 5)


Unnamed: 0,PostalCode,Neighborhood,Latitude,Longitude,Cluster Labels
31,R3P,example lab 1,49.83926,-97.20586,2
32,J8T,example lab 2,45.47843,-75.70476,2
33,K2P,example lab 3,45.41614,-75.6918,2
34,H4R,example lab 4,45.50834,-73.7115,2
35,H7L,example lab 5,45.60557,-73.78127,0


In [57]:
# Count the appearance of each cluster in the example labs dataframe
example_labs_df['Cluster Labels'].value_counts()

2    12
0     1
Name: Cluster Labels, dtype: int64

In [55]:
# Create a quick visual of the results 


AttributeError: 'Series' object has no attribute 'DataFrame'