In [1]:
import geopy
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

## 1.Setting up the raw data from Calgary

In [3]:
calgary_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_T'
html_data = requests.get(calgary_url).text
soup = BeautifulSoup(html_data,"html5lib") 

In [4]:
calgary_df_content = []
table = soup.find("table")
for row in table.find_all("td") :
    cell = {}
    if row.span.text=='Not assigned':
        pass
    
    else:
        borough = row.span.text.split('(')[0]
        
        if borough == 'Calgary':
            cell['PostalCode'] = row.text[:3]
            cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
            calgary_df_content.append(cell)
            
        else:
            pass

In [5]:
calgary_df1=pd.DataFrame(data=calgary_df_content)
print(calgary_df1.shape)
calgary_df1.head()

(31, 2)


Unnamed: 0,PostalCode,Neighborhood
0,T2A,"Penbrooke Meadows, Marlborough"
1,T3A,"Dalhousie, Edgemont, Hamptons, Hidden Valley"
2,T2B,"Forest Lawn, Dover, Erin Woods"
3,T3B,"Montgomery, Bowness, Silver Springs, Greenwood"
4,T2C,"Lynnwood Ridge, Ogden, Foothills Industrial, Great Plains"


In [6]:
# Had to find coordinates with https://www.latlong.net/
calgary_coordinates_url ='https://raw.githubusercontent.com/Armando12pdf/Coursera_Capstone/main/Calgary%20coordinates.csv'
calgary_coordinates_df=pd.read_csv(calgary_coordinates_url)
print(calgary_coordinates_df.shape)
calgary_coordinates_df.head()

(31, 3)


Unnamed: 0,Postal code,Latitude,Longitude
0,T2A,51.04968,-113.96432
1,T3A,51.12454,-114.14289
2,T2B,51.02533,-113.9789
3,T3B,51.08963,-114.19751
4,T2C,50.98122,-113.99786


In [7]:
#Concatenate with respective latitude and longitude
calgary_df1['Latitude'] = calgary_df1.PostalCode.map(calgary_coordinates_df.set_index('Postal code')['Latitude'].to_dict())
calgary_df1['Longitude'] = calgary_df1.PostalCode.map(calgary_coordinates_df.set_index('Postal code')['Longitude'].to_dict())
print(calgary_df1.shape)
calgary_df1.head()

(31, 4)


Unnamed: 0,PostalCode,Neighborhood,Latitude,Longitude
0,T2A,"Penbrooke Meadows, Marlborough",51.04968,-113.96432
1,T3A,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12454,-114.14289
2,T2B,"Forest Lawn, Dover, Erin Woods",51.02533,-113.9789
3,T3B,"Montgomery, Bowness, Silver Springs, Greenwood",51.08963,-114.19751
4,T2C,"Lynnwood Ridge, Ogden, Foothills Industrial, Great Plains",50.98122,-113.99786


In [8]:
#Define foursquare credentials
CLIENT_ID = 'ZZBEKKTJGWTU0HNGJ2ROO5JDHSKAXVABHUJKBBYZG1WNWQ0W' # your Foursquare ID
CLIENT_SECRET = 'JS1SDJDRRFBWX0SXVSS5GPCXVG4J3BAKY0AWJMLE2IJH4OFG' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [9]:
#Function that gets venues

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [10]:
#Get venues for calgary neighborhoods
calgary_venues = getNearbyVenues(names=calgary_df1['Neighborhood'],
                                   latitudes=calgary_df1['Latitude'],
                                   longitudes=calgary_df1['Longitude'])
calgary_venues.head()

Penbrooke Meadows, Marlborough
Dalhousie, Edgemont, Hamptons, Hidden Valley
Forest Lawn, Dover, Erin Woods
Montgomery, Bowness, Silver Springs, Greenwood
Lynnwood Ridge, Ogden, Foothills Industrial, Great Plains
Rosscarrock, Westgate, Wildwood, Shaganappi, Sunalta
Bridgeland, Greenview, Zoo, YYC
Lakeview, Glendale, Killarney, Glamorgan
Inglewood, Burnsland, Chinatown, East Victoria Park, Saddledome
Hawkwood, Arbour Lake, Citadel, Ranchlands, Royal Oak, Rocky Ridge
Highfield, Burns Industrial
Discovery Ridge, Signal Hill, West Springs,Christie Park, Patterson, Cougar Ridge
Queensland, Lake Bonavista, Willow Park, Acadia
Martindale, Taradale, Falconridge, Saddle Ridge
Thorncliffe, Tuxedo Park
Sandstone, MacEwan Glen, Beddington, Harvest Hills, Coventry Hills, Panorama Hills
Brentwood, Collingwood, Nose Hill
Tuscany, Scenic Acres
Mount Pleasant, Capitol Hill, Banff Trail
Cranston, Auburn Bay, Mahogany
Kensington, Westmont, Parkdale, University
City Centre, Calgary Tower
Symons Valley
Conn

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Penbrooke Meadows, Marlborough",51.04968,-113.96432,Bearcat General Contracting,51.047779,-113.968599,Construction & Landscaping
1,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12454,-114.14289,Edgemont Construction Ltd,51.123928,-114.144313,Construction & Landscaping
2,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12454,-114.14289,Edgemont City,51.126473,-114.138997,Asian Restaurant
3,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12454,-114.14289,Friends Cappuccino Bar & Bake Shop,51.12637,-114.138676,Café
4,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12454,-114.14289,On the Rocks,51.126893,-114.139355,Wine Shop


In [11]:
#One hot encoding of calgary venues
calgary_onehot_df = pd.get_dummies(calgary_venues[['Venue Category']], prefix="", prefix_sep="")
calgary_onehot_df.insert(loc=0, column='Neighborhood name', value=calgary_venues['Neighborhood'].values.tolist()) 
print(calgary_onehot_df.shape)
calgary_onehot_df.head()

(240, 104)


Unnamed: 0,Neighborhood name,ATM,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,Baseball Field,Beer Bar,Bookstore,Bowling Alley,Brazilian Restaurant,Breakfast Spot,Brewery,Burger Joint,Bus Station,Business Service,Café,Camera Store,Cheese Shop,Chinese Restaurant,Clothing Store,Cocktail Bar,Coffee Shop,College Classroom,Construction & Landscaping,Convenience Store,Department Store,Diner,Dive Bar,Dog Run,Donut Shop,Dry Cleaner,Eastern European Restaurant,Electronics Store,Elementary School,Fast Food Restaurant,Food,Food & Drink Shop,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Garden,Gas Station,Gift Shop,Golf Course,Gourmet Shop,Grocery Store,Gym / Fitness Center,History Museum,Hobby Shop,Hockey Arena,Hockey Field,Hotel,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Inn,Insurance Office,Italian Restaurant,Japanese Restaurant,Juice Bar,Korean Restaurant,Lake,Latin American Restaurant,Library,Liquor Store,Lounge,Market,Massage Studio,Mediterranean Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Moroccan Restaurant,Music Store,New American Restaurant,Park,Pharmacy,Pier,Pizza Place,Poutine Place,Professional & Other Places,Pub,Restaurant,Sandwich Place,Scandinavian Restaurant,Scenic Lookout,Seafood Restaurant,Shopping Mall,Soccer Field,Sports Bar,Stadium,Steakhouse,Supermarket,Sushi Restaurant,Thai Restaurant,Theater,Thrift / Vintage Store,Vietnamese Restaurant,Water Park,Wine Shop,Yoga Studio
0,"Penbrooke Meadows, Marlborough",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Dalhousie, Edgemont, Hamptons, Hidden Valley",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"Dalhousie, Edgemont, Hamptons, Hidden Valley",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"Dalhousie, Edgemont, Hamptons, Hidden Valley",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"Dalhousie, Edgemont, Hamptons, Hidden Valley",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [12]:
#Regroup calgary venues. This will be the data used with the ML models
calgary_grouped_df = calgary_onehot_df.groupby('Neighborhood name').mean().reset_index()
print(calgary_grouped_df.shape)
calgary_grouped_df.head()

(30, 104)


Unnamed: 0,Neighborhood name,ATM,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,Bar,Baseball Field,Beer Bar,Bookstore,Bowling Alley,Brazilian Restaurant,Breakfast Spot,Brewery,Burger Joint,Bus Station,Business Service,Café,Camera Store,Cheese Shop,Chinese Restaurant,Clothing Store,Cocktail Bar,Coffee Shop,College Classroom,Construction & Landscaping,Convenience Store,Department Store,Diner,Dive Bar,Dog Run,Donut Shop,Dry Cleaner,Eastern European Restaurant,Electronics Store,Elementary School,Fast Food Restaurant,Food,Food & Drink Shop,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Garden,Gas Station,Gift Shop,Golf Course,Gourmet Shop,Grocery Store,Gym / Fitness Center,History Museum,Hobby Shop,Hockey Arena,Hockey Field,Hotel,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Inn,Insurance Office,Italian Restaurant,Japanese Restaurant,Juice Bar,Korean Restaurant,Lake,Latin American Restaurant,Library,Liquor Store,Lounge,Market,Massage Studio,Mediterranean Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Moroccan Restaurant,Music Store,New American Restaurant,Park,Pharmacy,Pier,Pizza Place,Poutine Place,Professional & Other Places,Pub,Restaurant,Sandwich Place,Scandinavian Restaurant,Scenic Lookout,Seafood Restaurant,Shopping Mall,Soccer Field,Sports Bar,Stadium,Steakhouse,Supermarket,Sushi Restaurant,Thai Restaurant,Theater,Thrift / Vintage Store,Vietnamese Restaurant,Water Park,Wine Shop,Yoga Studio
0,"Braeside, Cedarbrae, Woodbine",0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brentwood, Collingwood, Nose Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bridgeland, Greenview, Zoo, YYC",0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"City Centre, Calgary Tower",0.0,0.0,0.0,0.0,0.0,0.022727,0.022727,0.068182,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.022727,0.204545,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.068182,0.0,0.022727,0.022727,0.0,0.0,0.022727,0.022727,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.090909,0.022727,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.045455,0.0,0.045455,0.0,0.0,0.0,0.022727,0.0,0.0,0.0
4,"Connaught, West Victoria Park",0.0,0.0,0.0,0.0,0.0,0.012987,0.0,0.051948,0.0,0.012987,0.012987,0.0,0.012987,0.0,0.038961,0.025974,0.0,0.0,0.051948,0.012987,0.0,0.012987,0.0,0.012987,0.051948,0.0,0.0,0.0,0.0,0.025974,0.0,0.0,0.012987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025974,0.0,0.012987,0.0,0.012987,0.012987,0.0,0.012987,0.0,0.012987,0.012987,0.0,0.0,0.0,0.025974,0.0,0.025974,0.0,0.0,0.0,0.012987,0.025974,0.0,0.012987,0.0,0.012987,0.012987,0.0,0.025974,0.012987,0.0,0.0,0.012987,0.012987,0.012987,0.012987,0.0,0.012987,0.0,0.0,0.025974,0.012987,0.0,0.051948,0.051948,0.025974,0.012987,0.0,0.012987,0.0,0.0,0.0,0.0,0.012987,0.0,0.025974,0.025974,0.0,0.0,0.038961,0.0,0.0,0.012987


In [13]:
# Some neighborhoods didn't get results, so they will be removed from our initial df. 
print(len(set(calgary_grouped_df['Neighborhood name'])))
print(len(set(calgary_df1['Neighborhood'])))

30
31


In [14]:
# Remove the neighborhoods in original df for which the API did provide information. 
for element in calgary_df1['Neighborhood']:
    if element in set(calgary_grouped_df['Neighborhood name']):
        pass
        
    else:
        print(element)
        calgary_df1 = calgary_df1[calgary_df1.Neighborhood != element]

print(len(set(calgary_grouped_df['Neighborhood name'])))        
print(len(calgary_df1['Neighborhood']))

Queensland, Lake Bonavista, Willow Park, Acadia
30
30


## 2.Positive data learning with SVM

### 2.1 Prepare positively labeled raw data

In [15]:
#Postal codes were gathered manually. Their latitudes and longitudes were obtained from https://www.latlong.net/
positive_coordinates_url ='https://raw.githubusercontent.com/Armando12pdf/Coursera_Capstone/main/Positive%20locations%20coordinates.csv'
example_labs_df=pd.read_csv(positive_coordinates_url)
example_labs_df.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,G1C,46.881771,-71.189369
1,G1E,46.86013,-71.194054
2,G1M,46.81723,-71.269836
3,G6W,46.75756,-71.22557
4,H1K,45.60818,-73.54452


In [16]:
#Add a neighborhood column
labs_labels =[]
for number in range(1,(example_labs_df.shape[0]+1)) :
    new_lab_number = 'example lab ' + str(number)
    labs_labels.append(new_lab_number)

example_labs_df.insert(loc=1, column='Neighborhood', value=labs_labels) 

In [17]:
example_labs_df.head()

Unnamed: 0,Postal code,Neighborhood,Latitude,Longitude
0,G1C,example lab 1,46.881771,-71.189369
1,G1E,example lab 2,46.86013,-71.194054
2,G1M,example lab 3,46.81723,-71.269836
3,G6W,example lab 4,46.75756,-71.22557
4,H1K,example lab 5,45.60818,-73.54452


In [18]:
#Get venues for the labs used as positive examples
example_labs_venues = getNearbyVenues(names=example_labs_df['Neighborhood'],
                                   latitudes=example_labs_df['Latitude'],
                                   longitudes=example_labs_df['Longitude'])

example lab 1
example lab 2
example lab 3
example lab 4
example lab 5
example lab 6
example lab 7
example lab 8
example lab 9
example lab 10
example lab 11
example lab 12
example lab 13
example lab 14
example lab 15
example lab 16
example lab 17
example lab 18
example lab 19
example lab 20
example lab 21
example lab 22
example lab 23
example lab 24
example lab 25
example lab 26
example lab 27
example lab 28
example lab 29
example lab 30
example lab 31
example lab 32
example lab 33
example lab 34
example lab 35
example lab 36
example lab 37
example lab 38
example lab 39
example lab 40
example lab 41
example lab 42
example lab 43
example lab 44
example lab 45
example lab 46
example lab 47
example lab 48
example lab 49
example lab 50
example lab 51
example lab 52
example lab 53
example lab 54
example lab 55
example lab 56
example lab 57
example lab 58
example lab 59
example lab 60
example lab 61
example lab 62
example lab 63
example lab 64
example lab 65
example lab 66
example lab 67
exam

In [19]:
example_labs_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,example lab 1,46.881771,-71.189369,Assaini-Conseil S D Inc,46.88311,-71.186679,Construction & Landscaping
1,example lab 1,46.881771,-71.189369,Lou-Tec Québec (Beauport),46.882766,-71.185699,Rental Service
2,example lab 2,46.86013,-71.194054,IGA,46.86216,-71.188334,Grocery Store
3,example lab 2,46.86013,-71.194054,Senor Sombrero,46.861217,-71.188514,Mexican Restaurant
4,example lab 2,46.86013,-71.194054,Pharmaprix,46.86076,-71.189769,Pharmacy


In [20]:
#One hot encoding of the example labs
labs_onehot_df = pd.get_dummies(example_labs_venues[['Venue Category']], prefix="", prefix_sep="")
labs_onehot_df.insert(loc=0, column='Neighborhood name', value=example_labs_venues['Neighborhood'].values.tolist()) 
print(labs_onehot_df.shape)
labs_onehot_df.head()

(956, 218)


Unnamed: 0,Neighborhood name,Adult Boutique,Airport,American Restaurant,Antique Shop,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Automotive Shop,BBQ Joint,Baby Store,Badminton Court,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Beer Bar,Beer Store,Belgian Restaurant,Big Box Store,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Station,Bus Stop,Business Service,Butcher,Café,Cajun / Creole Restaurant,Candy Store,Caribbean Restaurant,Casino,Cheese Shop,Chinese Restaurant,Chocolate Shop,City,Clothing Store,Cocktail Bar,Coffee Shop,College Bookstore,Comfort Food Restaurant,Comic Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Dive Bar,Doctor's Office,Dog Run,Donut Shop,Dumpling Restaurant,Electronics Store,English Restaurant,Event Space,Farmers Market,Fast Food Restaurant,Field,Financial or Legal Service,Fish & Chips Shop,Fish Market,Fishing Store,Flower Shop,Food & Drink Shop,Food Court,Food Truck,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Gaming Cafe,Garden,Gas Station,Gastropub,Gay Bar,Gift Shop,Golf Course,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Hardware Store,Health & Beauty Service,Health Food Store,Historic Site,History Museum,Hockey Arena,Hockey Rink,Home Service,Hostel,Hotel,Ice Cream Shop,Indian Chinese Restaurant,Indian Restaurant,Indie Movie Theater,Indoor Play Area,Inn,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Juice Bar,Karaoke Bar,Kids Store,Korean Restaurant,Liquor Store,Lounge,Market,Massage Studio,Mattress Store,Medical Supply Store,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Military Base,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Monument / Landmark,Movie Theater,Museum,Music Store,Music Venue,Nail Salon,Nature Preserve,Neighborhood,New American Restaurant,Newsstand,Nightclub,Noodle House,Office,Optical Shop,Organic Grocery,Other Nightlife,Outdoor Supply Store,Park,Performing Arts Venue,Persian Restaurant,Pet Store,Pharmacy,Pizza Place,Platform,Playground,Plaza,Poke Place,Pool,Pool Hall,Poutine Place,Pub,Ramen Restaurant,Record Shop,Recreation Center,Rental Car Location,Rental Service,Restaurant,Salad Place,Sandwich Place,Scandinavian Restaurant,Scenic Lookout,Seafood Restaurant,Shoe Store,Shop & Service,Shopping Mall,Skating Rink,Soccer Field,Southern / Soul Food Restaurant,Spa,Sporting Goods Shop,Sports Bar,Sri Lankan Restaurant,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Taco Place,Tapas Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Park,Thrift / Vintage Store,Toy / Game Store,Track,Trail,Train,Train Station,Tree,Tunnel,Turkish Home Cooking Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,example lab 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,example lab 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,example lab 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,example lab 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,example lab 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
#Regroup the venues of the example labs
labs_grouped_df = labs_onehot_df.groupby('Neighborhood name').mean().reset_index()
print(labs_grouped_df.shape)
labs_grouped_df.head()

(95, 218)


Unnamed: 0,Neighborhood name,Adult Boutique,Airport,American Restaurant,Antique Shop,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Automotive Shop,BBQ Joint,Baby Store,Badminton Court,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Beer Bar,Beer Store,Belgian Restaurant,Big Box Store,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Bus Station,Bus Stop,Business Service,Butcher,Café,Cajun / Creole Restaurant,Candy Store,Caribbean Restaurant,Casino,Cheese Shop,Chinese Restaurant,Chocolate Shop,City,Clothing Store,Cocktail Bar,Coffee Shop,College Bookstore,Comfort Food Restaurant,Comic Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cupcake Shop,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner,Discount Store,Dive Bar,Doctor's Office,Dog Run,Donut Shop,Dumpling Restaurant,Electronics Store,English Restaurant,Event Space,Farmers Market,Fast Food Restaurant,Field,Financial or Legal Service,Fish & Chips Shop,Fish Market,Fishing Store,Flower Shop,Food & Drink Shop,Food Court,Food Truck,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Furniture / Home Store,Gaming Cafe,Garden,Gas Station,Gastropub,Gay Bar,Gift Shop,Golf Course,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Hardware Store,Health & Beauty Service,Health Food Store,Historic Site,History Museum,Hockey Arena,Hockey Rink,Home Service,Hostel,Hotel,Ice Cream Shop,Indian Chinese Restaurant,Indian Restaurant,Indie Movie Theater,Indoor Play Area,Inn,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Juice Bar,Karaoke Bar,Kids Store,Korean Restaurant,Liquor Store,Lounge,Market,Massage Studio,Mattress Store,Medical Supply Store,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Military Base,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Monument / Landmark,Movie Theater,Museum,Music Store,Music Venue,Nail Salon,Nature Preserve,Neighborhood,New American Restaurant,Newsstand,Nightclub,Noodle House,Office,Optical Shop,Organic Grocery,Other Nightlife,Outdoor Supply Store,Park,Performing Arts Venue,Persian Restaurant,Pet Store,Pharmacy,Pizza Place,Platform,Playground,Plaza,Poke Place,Pool,Pool Hall,Poutine Place,Pub,Ramen Restaurant,Record Shop,Recreation Center,Rental Car Location,Rental Service,Restaurant,Salad Place,Sandwich Place,Scandinavian Restaurant,Scenic Lookout,Seafood Restaurant,Shoe Store,Shop & Service,Shopping Mall,Skating Rink,Soccer Field,Southern / Soul Food Restaurant,Spa,Sporting Goods Shop,Sports Bar,Sri Lankan Restaurant,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Taco Place,Tapas Restaurant,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Park,Thrift / Vintage Store,Toy / Game Store,Track,Trail,Train,Train Station,Tree,Tunnel,Turkish Home Cooking Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,example lab 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,example lab 10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,example lab 100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,example lab 11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,example lab 12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 2.2 Train and evaluate the one-class SVM model

In [22]:
# When training a model, the features of the training data set have to match those of the target data set
# They do not match in this situation:
print('features in labs: ' + str(len(labs_grouped_df.columns)))
print('features in calgary: ' + str(len(calgary_grouped_df.columns)))

features in labs: 218
features in calgary: 104


In [23]:
#Columns in the training set that are not within the target data set, will be removed
for element in labs_grouped_df.columns.tolist():
    if element not in calgary_grouped_df.columns.tolist():
        labs_grouped_df.drop(columns=[element], inplace=True)
        
    else:
        pass

print('features in labs: ' + str(len(labs_grouped_df.columns)))
print('features in calgary: ' + str(len(calgary_grouped_df.columns)))

features in labs: 82
features in calgary: 104


In [24]:
# Repeat in other direction for features to match
# Columns in the training set that are not within the target data set, will be removed
for element in calgary_grouped_df.columns.tolist():
    if element not in labs_grouped_df.columns.tolist():
        calgary_grouped_df.drop(columns=[element], inplace=True)
        
    else:
        pass

print('features in labs: ' + str(len(labs_grouped_df.columns)))
print('features in calgary: ' + str(len(calgary_grouped_df.columns)))


features in labs: 82
features in calgary: 82


In [91]:
# Now we're ready to start training our model
# First import the relevant libraries
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score

In [134]:
# Test different test sizes to find an optimal one
test_size_array = np.arange(0.05, 0.55, 0.05)

one_class_SVM_eval = []

for test_size in test_size_array:
    X_train_pos, X_test_pos = train_test_split(labs_grouped_df, test_size=test_size, random_state=4)
    y_test_pos = [1]*X_test_pos.shape[0]
    
    One_class_SVM = OneClassSVM(gamma='auto').fit(X_train_pos.loc[:, X_train_pos.columns != 'Neighborhood name'])
    
    yhat_test_pos=One_class_SVM.predict(X_test_pos.loc[:, X_test_pos.columns != 'Neighborhood name'])
    
    Pos_only_jaccard = jaccard_score(y_test_pos, yhat_test_pos, pos_label=1)
    Pos_only_f1_score = f1_score(y_test_pos, yhat_test_pos, average='weighted')
    
    cell = {}
    cell['Test size'] = test_size
    cell['Jaccard score'] = Pos_only_jaccard
    cell['F1 Score'] = Pos_only_f1_score
    one_class_SVM_eval.append(cell)

In [135]:
# Put into a df to visualize
one_clas_SVM_eval_df = pd.DataFrame(data = one_clas_SVM_eval)
one_clas_SVM_eval_df

Unnamed: 0,Test size,Jaccard score,F1 Score
0,0.05,0.6,0.75
1,0.1,0.6,0.75
2,0.15,0.6,0.75
3,0.2,0.736842,0.848485
4,0.25,0.666667,0.8
5,0.3,0.551724,0.711111
6,0.35,0.617647,0.763636
7,0.4,0.605263,0.754098
8,0.45,0.488372,0.65625
9,0.5,0.5,0.666667


In [136]:
# Determine the test size that was determined to give the best results
max_jaccard = one_clas_SVM_eval_df['Jaccard score'].max()
max_jaccard_position = one_clas_SVM_eval_df['Jaccard score'] == max_jaccard
test_size_1 = one_clas_SVM_eval_df['Test size'][max_jaccard_position].values[0]

max_f1 = one_clas_SVM_eval_df['F1 Score'].max()
max_f1_position = one_clas_SVM_eval_df['F1 Score'] == max_f1
test_size_2 = one_clas_SVM_eval_df['Test size'][max_f1_position].values[0]

if test_size_1 == test_size_2:
    print('Test sizes are congruent \n')
    test_size_to_use = test_size_1
print('The optimal test size is: ', test_size_to_use)

Test sizes are congruent 

The optimal test size is:  0.2


In [101]:
# Split the data using the determined optimal test size
X_train_pos, X_test_pos = train_test_split(labs_grouped_df, test_size=test_size_to_use, random_state=4)
y_test_pos = [1]*X_test_pos.shape[0]

# Trains the model
One_class_SVM = OneClassSVM(gamma='auto').fit(X_train_pos.loc[:, X_train_pos.columns != 'Neighborhood name'])


### 2.3 Applying the trained one-class SVM model

In [102]:
#Apply the one class SVM model to the calgary data frame
One_c_SVM_result=One_class_SVM.predict(calgary_grouped_df.loc[:, calgary_grouped_df.columns != 'Neighborhood name'])
One_c_SVM_result

array([-1, -1, -1,  1,  1,  1, -1,  1, -1, -1, -1, -1, -1,  1, -1,  1, -1,
       -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1])

In [104]:
#Add results to calgary df
calgary_df1['Pos_only_Result'] = One_c_SVM_result
calgary_df1.head()

Unnamed: 0,PostalCode,Neighborhood,Latitude,Longitude,Pos_only_Result
0,T2A,"Penbrooke Meadows, Marlborough",51.04968,-113.96432,-1
1,T3A,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12454,-114.14289,-1
2,T2B,"Forest Lawn, Dover, Erin Woods",51.02533,-113.9789,-1
3,T3B,"Montgomery, Bowness, Silver Springs, Greenwood",51.08963,-114.19751,1
4,T2C,"Lynnwood Ridge, Ogden, Foothills Industrial, Great Plains",50.98122,-113.99786,1


In [106]:
#Get positive results only
calgary_df1_pos = calgary_df1.loc[calgary_df1['Pos_only_Result']==1]
calgary_df1_pos

Unnamed: 0,PostalCode,Neighborhood,Latitude,Longitude,Pos_only_Result
3,T3B,"Montgomery, Bowness, Silver Springs, Greenwood",51.08963,-114.19751,1
4,T2C,"Lynnwood Ridge, Ogden, Foothills Industrial, Great Plains",50.98122,-113.99786,1
5,T3C,"Rosscarrock, Westgate, Wildwood, Shaganappi, Sunalta",51.04492,-114.1307,1
7,T3E,"Lakeview, Glendale, Killarney, Glamorgan",51.02038,-114.13822,1
14,T2K,"Thorncliffe, Tuxedo Park",51.10199,-114.07128,1
16,T2L,"Brentwood, Collingwood, Nose Hill",51.09035,-114.12176,1
22,T3P,Symons Valley,51.17748,-114.10508,1
29,T2Y,"Millrise, Somerset, Bridlewood, Evergreen",50.9111,-114.09638,1


### 2.4 Visualization of one-class SVM model results in a map

In [107]:
# libraries for map
import matplotlib.cm as cm
import matplotlib.colors as colors
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [108]:
# create map. The latitude and longitude of toronto were found on 
# https://www.latlong.net/place/calgary-ab-canada-29104.html
svm_map = folium.Map(location=[51.049999,  -114.066666], zoom_start=10.4)

#Add markers to map
for lat, lon in zip(calgary_df1_pos['Latitude'], calgary_df1_pos['Longitude']):
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.7).add_to(svm_map)
     
svm_map

# 3 Using Positive-Unlabelled data and SVM

## 3.1 Get and prepare the unlabelled data set

In [109]:
#Put the contents in a list
unlabelled_df_content = []

In [110]:
#Define a function that gets postal codes from sites with city and neighborhood in parenthesis
def getPostalCodes1(url, city_of_interest):
    html_data = requests.get(url).text
    soup = BeautifulSoup(html_data,"html5lib") 
    table = soup.find("table")
    for row in table.find_all("td") :
        cell = {}
        if row.span.text=='Not assigned':
            pass
    
        else:
            borough = row.span.text.split('(')[0]
            if borough== city_of_interest:
                example_labs_post_codes = example_labs_df.iloc[:,0].tolist()
                PostalCode=row.text[:3]
            
                if PostalCode in example_labs_post_codes:
                    pass
            
                else:
                    cell['PostalCode'] = PostalCode
                    cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
                    unlabelled_df_content.append(cell)
    
            else:
                pass

In [111]:
url_list1=['https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_K',
          'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_R']
cities=['Ottawa', 'Winnipeg']


In [112]:
#Ottawa and Winnipeg
getPostalCodes1(url=url_list1[0], city_of_interest=cities[0])
getPostalCodes1(url=url_list1[1], city_of_interest=cities[1])

In [113]:
#Define a function that gets postal codes from sites without cities named
def getPostalCodes2(url):
    html_data = requests.get(url).text
    soup = BeautifulSoup(html_data,"html5lib") 
    table = soup.find("table")
    for row in table.find_all("td") :
        cell = {}
        if row.span.text=='Not assigned':
            pass
    
        else:
            example_labs_post_codes = example_labs_df.iloc[:,0].tolist()
            PostalCode=row.text[1:4]
            if PostalCode in example_labs_post_codes:
                pass
            
            else:
                cell['PostalCode'] = PostalCode
                cell['Neighborhood'] = row.span.text
                unlabelled_df_content.append(cell)

In [114]:
#Montreal and Laval
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_H'
getPostalCodes2(url)

In [115]:
#Define a function that gets postal codes from sites with neighborhoods in parathensis
def getPostalCodes3(url):
    html_data = requests.get(url).text
    soup = BeautifulSoup(html_data,"html5lib") 
    table = soup.find("table")
    for row in table.find_all("td") :
        cell = {}
        if row.span.text=='Not assigned':
            pass
    
        else:
            example_labs_post_codes = example_labs_df.iloc[:,0].tolist()
            PostalCode=row.text[1:4]
            if PostalCode in example_labs_post_codes:
                pass
            
            else:
                cell['PostalCode'] = PostalCode
                cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')   
                unlabelled_df_content.append(cell)

In [116]:
#Toronto
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
getPostalCodes3(url)

In [117]:
#Put together in a dataframe
unlabelled_df=pd.DataFrame(data=unlabelled_df_content)
print(unlabelled_df.shape)
print(len(set(unlabelled_df.iloc[:,0].tolist())))
unlabelled_df.head()

(256, 2)
256


Unnamed: 0,PostalCode,Neighborhood
0,K2A,"Highland Park, McKellar Park,Westboro,Glabar Park,Carlingwood"
1,K4A,Fallingbrook
2,K1B,"Blackburn Hamlet, Pine View, Sheffield Glen"
3,K2B,"Britannia,Whitehaven, Bayshore, Pinecrest"
4,K4B,Navan


In [118]:
#Unlabelled coordinates
unlabelled_coordinates_url ='https://raw.githubusercontent.com/Armando12pdf/Coursera_Capstone/main/Unlabelled%20locations%20coordinates.csv'
unlabelled_coordinates_df=pd.read_csv(unlabelled_coordinates_url)
unlabelled_coordinates_df.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,K2A,45.38025,-75.76138
1,K4A,45.46734,-75.47799
2,K1B,45.42042,-75.59603
3,K2B,45.36172,-75.78945
4,K4B,45.41413,-75.40364


In [119]:
#Concatenate with respective latitude and longitude
unlabelled_df['Latitude'] = unlabelled_df.PostalCode.map(unlabelled_coordinates_df.set_index('Postal code')['Latitude'].to_dict())
unlabelled_df['Longitude'] = unlabelled_df.PostalCode.map(unlabelled_coordinates_df.set_index('Postal code')['Longitude'].to_dict())
print(unlabelled_df.shape)
unlabelled_df.head()

(256, 4)


Unnamed: 0,PostalCode,Neighborhood,Latitude,Longitude
0,K2A,"Highland Park, McKellar Park,Westboro,Glabar Park,Carlingwood",45.38025,-75.76138
1,K4A,Fallingbrook,45.46734,-75.47799
2,K1B,"Blackburn Hamlet, Pine View, Sheffield Glen",45.42042,-75.59603
3,K2B,"Britannia,Whitehaven, Bayshore, Pinecrest",45.36172,-75.78945
4,K4B,Navan,45.41413,-75.40364


In [120]:
#Get venues for unlabelled neighborhoods
unlabelled_venues = getNearbyVenues(names=unlabelled_df['Neighborhood'],
                                   latitudes=unlabelled_df['Latitude'],
                                   longitudes=unlabelled_df['Longitude'])

Highland Park, McKellar Park,Westboro,Glabar Park,Carlingwood
Fallingbrook
Blackburn Hamlet, Pine View, Sheffield Glen
Britannia,Whitehaven, Bayshore, Pinecrest
Navan
Queensway, Copeland Park, Central Park, Bel Air,Carleton Heights
Cumberland
Queenswood
Eastern Nepean: Fisher Heights/ Parkwood Hills, Borden Farm, Pine Glen
Centrepointe, Meadowlands, City View, Craig Henry, Tangelwood, Grenfell Glen, Davidson Heights
Bells Corners, Arlington Woods/Redwood, Qualicum, Crystal Beach
Beacon Hill, Cyrville, Carson Grove
Barrhaven
Beaverbrook, South March
Katimavik-Hazeldean, Glen Cairn
Rockcliffe Park, New Edinburgh
Bridlewood
Manotick
Downtown
Greely
Dalhousie Ward
Fallowfield Village, Cedarhill Estates, Orchard Estates
The Glebe, Old Ottawa South, Old Ottawa East, Carleton University, Dow's Lake area
Stittsville
Blossom Park, Greenboro, Leitrim, Findlay Creek
Marchwood
Terry Fox, Palladium
Chapel Hill South, Blackburn
North March
South Gloucester
Civic Hospital, Island Park, Hintonburg, Me

In [121]:
print(unlabelled_venues.shape)
unlabelled_venues.head()

(5014, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Highland Park, McKellar Park,Westboro,Glabar Park,Carlingwood",45.38025,-75.76138,Tillbury Park,45.378744,-75.75959,Park
1,"Highland Park, McKellar Park,Westboro,Glabar Park,Carlingwood",45.38025,-75.76138,McKellar Park,45.382858,-75.765691,Park
2,"Highland Park, McKellar Park,Westboro,Glabar Park,Carlingwood",45.38025,-75.76138,Jademark Technologies,45.383348,-75.756911,Electronics Store
3,Fallingbrook,45.46734,-75.47799,Marcel Lalande Park,45.46996,-75.475977,Park
4,Fallingbrook,45.46734,-75.47799,Shoppers Drugmart Fallingbrook,45.469727,-75.480669,Pharmacy


In [122]:
#One hot encoding of the unlabelled neighborhoods
unlab_onehot_df = pd.get_dummies(unlabelled_venues[['Venue Category']], prefix="", prefix_sep="")
unlab_onehot_df.insert(loc=0, column='Neighborhood name', value=unlabelled_venues['Neighborhood'].values.tolist()) 
print(unlab_onehot_df.shape)
unlab_onehot_df.head()

(5014, 339)


Unnamed: 0,Neighborhood name,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Auto Dealership,Auto Garage,Automotive Shop,BBQ Joint,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Basketball Court,Basketball Stadium,Beach Bar,Bed & Breakfast,Beer Bar,Beer Store,Belgian Restaurant,Big Box Store,Bike Rental / Bike Share,Bike Shop,Bistro,Boat or Ferry,Bookstore,Boutique,Bowling Alley,Boxing Gym,Brazilian Restaurant,Breakfast Spot,Brewery,Bubble Tea Shop,Building,Burger Joint,Burrito Place,Bus Line,Bus Station,Bus Stop,Business Service,Butcher,Café,Cajun / Creole Restaurant,Cambodian Restaurant,Candy Store,Cantonese Restaurant,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Chiropractor,Chocolate Shop,Church,Climbing Gym,Clothing Store,Cocktail Bar,Coffee Shop,College Arts Building,College Gym,College Rec Center,College Stadium,College Theater,Colombian Restaurant,Comedy Club,Comfort Food Restaurant,Comic Shop,Community Center,Concert Hall,Construction & Landscaping,Convenience Store,Convention Center,Cosmetics Shop,Costume Shop,Creperie,Cuban Restaurant,Cupcake Shop,Curling Ice,Cycle Studio,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dive Bar,Dog Run,Donut Shop,Drugstore,Dry Cleaner,Dumpling Restaurant,Duty-free Shop,Eastern European Restaurant,Electronics Store,Empanada Restaurant,English Restaurant,Entertainment Service,Escape Room,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop,Fish Market,Flea Market,Flower Shop,Food & Drink Shop,Food Court,Food Truck,Football Stadium,Fountain,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Garden,Gas Station,Gastropub,Gay Bar,General Entertainment,General Travel,German Restaurant,Gift Shop,Gluten-free Restaurant,Go Kart Track,Golf Course,Golf Driving Range,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Harbor / Marina,Hardware Store,Hawaiian Restaurant,Health & Beauty Service,Health Food Store,Historic Site,History Museum,Hobby Shop,Hockey Arena,Home Service,Hookah Bar,Hostel,Hot Dog Joint,Hotel,Hotel Bar,Housing Development,IT Services,Ice Cream Shop,Indian Chinese Restaurant,Indian Restaurant,Indie Movie Theater,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Jewish Restaurant,Juice Bar,Karaoke Bar,Kids Store,Kitchen Supply Store,Knitting Store,Korean Restaurant,Lake,Laser Tag,Latin American Restaurant,Lebanese Restaurant,Library,Light Rail Station,Lingerie Store,Liquor Store,Lounge,Malay Restaurant,Market,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Memorial Site,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Military Base,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Mongolian Restaurant,Monument / Landmark,Moroccan Restaurant,Motorcycle Shop,Movie Theater,Museum,Music School,Music Store,Music Venue,Neighborhood,New American Restaurant,Newsstand,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Other Nightlife,Outdoors & Recreation,Paper / Office Supplies Store,Park,Pastry Shop,Performing Arts Venue,Persian Restaurant,Peruvian Restaurant,Pet Café,Pet Store,Pharmacy,Pie Shop,Pizza Place,Playground,Plaza,Poke Place,Polish Restaurant,Pool,Pool Hall,Portuguese Restaurant,Post Office,Poutine Place,Pub,Public Art,Ramen Restaurant,Record Shop,Recreation Center,Rental Car Location,Rental Service,Residential Building (Apartment / Condo),Restaurant,Rock Climbing Spot,Rock Club,Sake Bar,Salad Place,Salon / Barbershop,Salvadoran Restaurant,Sandwich Place,Sausage Shop,Scenic Lookout,Sculpture Garden,Seafood Restaurant,Shabu-Shabu Restaurant,Shoe Store,Shopping Mall,Shopping Plaza,Skate Park,Skating Rink,Smoke Shop,Smoothie Shop,Snack Place,Soccer Field,Soccer Stadium,Social Club,Soup Place,South American Restaurant,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Sports Bar,Sri Lankan Restaurant,Stationery Store,Steakhouse,Storage Facility,Street Art,Strip Club,Supermarket,Supplement Shop,Sushi Restaurant,Swiss Restaurant,Szechuan Restaurant,Taco Place,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tapas Restaurant,Tattoo Parlor,Tea Room,Tennis Court,Tex-Mex Restaurant,Thai Restaurant,Theater,Theme Park,Theme Restaurant,Thrift / Vintage Store,Tibetan Restaurant,Toy / Game Store,Track,Trail,Train,Train Station,Transportation Service,Turkish Restaurant,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Highland Park, McKellar Park,Westboro,Glabar Park,Carlingwood",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Highland Park, McKellar Park,Westboro,Glabar Park,Carlingwood",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"Highland Park, McKellar Park,Westboro,Glabar Park,Carlingwood",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fallingbrook,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fallingbrook,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [123]:
#Regroup the venues of the example labs
unlab_grouped_df = unlab_onehot_df.groupby('Neighborhood name').mean().reset_index()
print(unlab_grouped_df.shape)
unlab_grouped_df.head()

(248, 339)


Unnamed: 0,Neighborhood name,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,Asian Restaurant,Athletics & Sports,Auto Dealership,Auto Garage,Automotive Shop,BBQ Joint,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Basketball Court,Basketball Stadium,Beach Bar,Bed & Breakfast,Beer Bar,Beer Store,Belgian Restaurant,Big Box Store,Bike Rental / Bike Share,Bike Shop,Bistro,Boat or Ferry,Bookstore,Boutique,Bowling Alley,Boxing Gym,Brazilian Restaurant,Breakfast Spot,Brewery,Bubble Tea Shop,Building,Burger Joint,Burrito Place,Bus Line,Bus Station,Bus Stop,Business Service,Butcher,Café,Cajun / Creole Restaurant,Cambodian Restaurant,Candy Store,Cantonese Restaurant,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Chiropractor,Chocolate Shop,Church,Climbing Gym,Clothing Store,Cocktail Bar,Coffee Shop,College Arts Building,College Gym,College Rec Center,College Stadium,College Theater,Colombian Restaurant,Comedy Club,Comfort Food Restaurant,Comic Shop,Community Center,Concert Hall,Construction & Landscaping,Convenience Store,Convention Center,Cosmetics Shop,Costume Shop,Creperie,Cuban Restaurant,Cupcake Shop,Curling Ice,Cycle Studio,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dive Bar,Dog Run,Donut Shop,Drugstore,Dry Cleaner,Dumpling Restaurant,Duty-free Shop,Eastern European Restaurant,Electronics Store,Empanada Restaurant,English Restaurant,Entertainment Service,Escape Room,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farm,Farmers Market,Fast Food Restaurant,Field,Filipino Restaurant,Financial or Legal Service,Fish & Chips Shop,Fish Market,Flea Market,Flower Shop,Food & Drink Shop,Food Court,Food Truck,Football Stadium,Fountain,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Garden,Gas Station,Gastropub,Gay Bar,General Entertainment,General Travel,German Restaurant,Gift Shop,Gluten-free Restaurant,Go Kart Track,Golf Course,Golf Driving Range,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Harbor / Marina,Hardware Store,Hawaiian Restaurant,Health & Beauty Service,Health Food Store,Historic Site,History Museum,Hobby Shop,Hockey Arena,Home Service,Hookah Bar,Hostel,Hot Dog Joint,Hotel,Hotel Bar,Housing Development,IT Services,Ice Cream Shop,Indian Chinese Restaurant,Indian Restaurant,Indie Movie Theater,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Jewish Restaurant,Juice Bar,Karaoke Bar,Kids Store,Kitchen Supply Store,Knitting Store,Korean Restaurant,Lake,Laser Tag,Latin American Restaurant,Lebanese Restaurant,Library,Light Rail Station,Lingerie Store,Liquor Store,Lounge,Malay Restaurant,Market,Martial Arts School,Massage Studio,Medical Center,Mediterranean Restaurant,Memorial Site,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Military Base,Miscellaneous Shop,Mobile Phone Shop,Modern European Restaurant,Molecular Gastronomy Restaurant,Mongolian Restaurant,Monument / Landmark,Moroccan Restaurant,Motorcycle Shop,Movie Theater,Museum,Music School,Music Store,Music Venue,Neighborhood,New American Restaurant,Newsstand,Nightclub,Noodle House,Office,Opera House,Optical Shop,Organic Grocery,Other Great Outdoors,Other Nightlife,Outdoors & Recreation,Paper / Office Supplies Store,Park,Pastry Shop,Performing Arts Venue,Persian Restaurant,Peruvian Restaurant,Pet Café,Pet Store,Pharmacy,Pie Shop,Pizza Place,Playground,Plaza,Poke Place,Polish Restaurant,Pool,Pool Hall,Portuguese Restaurant,Post Office,Poutine Place,Pub,Public Art,Ramen Restaurant,Record Shop,Recreation Center,Rental Car Location,Rental Service,Residential Building (Apartment / Condo),Restaurant,Rock Climbing Spot,Rock Club,Sake Bar,Salad Place,Salon / Barbershop,Salvadoran Restaurant,Sandwich Place,Sausage Shop,Scenic Lookout,Sculpture Garden,Seafood Restaurant,Shabu-Shabu Restaurant,Shoe Store,Shopping Mall,Shopping Plaza,Skate Park,Skating Rink,Smoke Shop,Smoothie Shop,Snack Place,Soccer Field,Soccer Stadium,Social Club,Soup Place,South American Restaurant,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Sports Bar,Sri Lankan Restaurant,Stationery Store,Steakhouse,Storage Facility,Street Art,Strip Club,Supermarket,Supplement Shop,Sushi Restaurant,Swiss Restaurant,Szechuan Restaurant,Taco Place,Tailor Shop,Taiwanese Restaurant,Tanning Salon,Tapas Restaurant,Tattoo Parlor,Tea Room,Tennis Court,Tex-Mex Restaurant,Thai Restaurant,Theater,Theme Park,Theme Restaurant,Thrift / Vintage Store,Tibetan Restaurant,Toy / Game Store,Track,Trail,Train,Train Station,Transportation Service,Turkish Restaurant,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,\nNot assigned\n,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.074074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.037037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.018519,0.0,0.0,0.018519,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037037,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074074,0.0,0.0,0.0,0.018519,0.0,0.0,0.018519,0.0,0.0,0.0,0.037037,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Laval-sur-le-Lac,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AhuntsicCentral,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095238,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.047619,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AhuntsicEast,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AhuntsicNorth,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.4,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 3.2 Prepare the PU data set

In [124]:
#The postivie data set, unlabelled data, and calgary data set do not match in features
print('Before matching:')
print('Features in unlabelled data set: ' + str(len(unlab_grouped_df.columns)))
print('Features in positive data set: ' + str(len(labs_grouped_df.columns)))
print('Features in calgary data set: ' + str(len(calgary_grouped_df.columns)))

Before matching:
Features in unlabelled data set: 339
Features in positive data set: 82
Features in calgary data set: 82


In [125]:
#Make the postivie data set, unlabelled data, and calgary data set match in features
all_features =[]
all_features = unlab_grouped_df.columns.tolist() + labs_grouped_df.columns.tolist() + calgary_grouped_df.columns.tolist()

print('Total number of features is: ' + str(len(calgary_grouped_df.columns)+len(labs_grouped_df.columns)+len(unlab_grouped_df.columns)))
print('Number of features put together in a single list: ' + str(len(all_features)))


Total number of features is: 503
Number of features put together in a single list: 503


In [126]:
features_values, features_counts = np.unique(all_features, return_counts=True)

features_dict = {'features':features_values, 'counts':features_counts}
features_df1 = pd.DataFrame(features_dict)
features_df1.iloc[1:7,:]

Unnamed: 0,features,counts
1,Accessories Store,1
2,Adult Boutique,1
3,Afghan Restaurant,1
4,Airport,1
5,Airport Lounge,1
6,Airport Service,1


In [127]:
features_df2=features_df1.loc[features_df1['counts']==3]
features_df2.head()

Unnamed: 0,features,counts
8,American Restaurant,3
14,Arts & Crafts Store,3
16,Asian Restaurant,3
17,Athletics & Sports,3
23,Bakery,3


In [128]:
common_features=features_df2['features'].values.tolist()
unlab_grouped_df2 = unlab_grouped_df[common_features]
labs_grouped_df2 = labs_grouped_df[common_features]
calgary_grouped_df2 = calgary_grouped_df[common_features]

print('Before matching:')
print('Features in unlabelled data set: ' + str(len(unlab_grouped_df2.columns)))
print('Features in positive data set: ' + str(len(labs_grouped_df2.columns)))
print('Features in calgary data set: ' + str(len(calgary_grouped_df2.columns)))

Before matching:
Features in unlabelled data set: 80
Features in positive data set: 80
Features in calgary data set: 80


In [129]:
#Add labels column to positive and unlabelled data sets
# '1' is used for the positive data set
# '0' is used for the unlabelled data set

labs_grouped_df2['Label'] = np.ones(labs_grouped_df2.shape[0])
unlab_grouped_df2['Label'] = np.zeros(unlab_grouped_df2.shape[0])


#labs_grouped_df2['Label']=[1.0]*(labs_grouped_df2.shape[0])
#unlab_grouped_df2['Label'] = [0.0]*(unlab_grouped_df2.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labs_grouped_df2['Label'] = np.ones(labs_grouped_df2.shape[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlab_grouped_df2['Label'] = np.zeros(unlab_grouped_df2.shape[0])


In [130]:
#Concatenate data sets

print('Shape of positive data set: ' + str(labs_grouped_df2.shape))
print('Shape of unlabelled data set: ' + str(unlab_grouped_df2.shape))

pu_df = pd.concat([labs_grouped_df2, unlab_grouped_df2], axis=0, ignore_index=True)

print('Shape of PU data set: ' + str(pu_df.shape))

Shape of positive data set: (95, 81)
Shape of unlabelled data set: (248, 81)
Shape of PU data set: (343, 81)


## 3.3 Train and evaluate an SVM model with the PU data

In [131]:
# Use pulearn library
!pip install pulearn



In [132]:
# Prepare the data for splitting
X_pu = pu_df.drop(columns=['Label', 'Neighborhood name'])
y_pu = pu_df['Label'].tolist()

#Convert to arrays
X_pu = X_pu.to_numpy()
y_pu = np.array(y_pu)

print('Shape of PU set: ' + str(pu_df.shape))
print('Shape of PU features set: ' + str(X_pu.shape))
print('Length of PU labels set: ' + str(len(y_pu)))



Shape of PU set: (343, 81)
Shape of PU features set: (343, 79)
Length of PU labels set: 343


In [133]:
# Needed libraries
from pulearn import ElkanotoPuClassifier
from sklearn.svm import SVC

pu_svc = SVC(C=10, kernel='rbf', gamma=0.4, probability=True)
pu_estimator = ElkanotoPuClassifier(estimator=pu_svc, hold_out_ratio=0.1)

In [140]:
# Test different test sizes to find an optimal one
test_size_array = np.arange(0.05, 0.55, 0.05)

PU_SVM_eval = []

for test_size in test_size_array:
    X_train_pu, X_test_pu, y_train_pu, y_test_pu = train_test_split(X_pu, y_pu, test_size=test_size, random_state=4)
    
    pu_estimator.fit(X_train_pu, y_train_pu)
    
    yhat_test_pu = pu_estimator.predict(X_test_pu)
    
    positive_positions = np.where(y_test_pu == 1)
    
    eval_y_test_pu = y_test_pu[positive_positions]
    eval_yhat_test_pu = yhat_test_pu[positive_positions]
    
    PU_jaccard = jaccard_score(eval_y_test_pu, eval_yhat_test_pu, pos_label=1)
    PU_f1_score = f1_score(eval_y_test_pu, eval_yhat_test_pu, average='weighted')
    
    cell = {}
    cell['Test size'] = test_size
    cell['Jaccard score'] = PU_jaccard
    cell['F1 Score'] = PU_f1_score
    PU_SVM_eval.append(cell)

In [141]:
# Put into a df to visualize
PU_SVM_eval_df = pd.DataFrame(data = PU_SVM_eval)
PU_SVM_eval_df

Unnamed: 0,Test size,Jaccard score,F1 Score
0,0.05,1.0,1.0
1,0.1,1.0,1.0
2,0.15,0.958333,0.978723
3,0.2,1.0,1.0
4,0.25,1.0,1.0
5,0.3,0.966667,0.983051
6,0.35,1.0,1.0
7,0.4,0.947368,0.972973
8,0.45,0.97561,0.987654
9,0.5,0.978723,0.989247


In [159]:
# Use test size
test_size_to_use = 0.20

X_train_pu, X_test_pu, y_train_pu, y_test_pu = train_test_split(X_pu, y_pu, test_size=test_size_to_use, random_state=4)
    
pu_estimator.fit(X_train_pu, y_train_pu)

ElkanotoPuClassifier(estimator=SVC(C=10, gamma=0.4, probability=True))

## 3.4 Apply the model to the calgary data set

In [160]:
#Prepare calgary data set to make prediction
calgary_df_pu_pred = calgary_grouped_df2.loc[:, calgary_grouped_df2.columns != 'Neighborhood name']
calgary_df_pu_pred = calgary_df_pu_pred.to_numpy()


In [161]:
#Make prediction
yhat_calgary_pu = pu_estimator.predict(calgary_df_pu_pred)
yhat_calgary_pu

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [181]:
#All were classified as positive, let's get the 7 with the highest probability
pu_probs = pu_estimator.predict_proba(calgary_df_pu_pred)
pu_probs2 = -np.sort(-pu_probs)
pu_probs2

array([1.02248667, 1.00296934, 0.99573702, 0.9937624 , 0.99335878,
       0.98978023, 0.98969314, 0.98969314, 0.9832716 , 0.9811787 ,
       0.97903245, 0.9770258 , 0.9751606 , 0.96785257, 0.96495589,
       0.96090645, 0.95846523, 0.95509556, 0.94900157, 0.93035933,
       0.92916895, 0.90270512, 0.88206949, 0.84609876, 0.83587888,
       0.81231914, 0.78042138, 0.71955304, 0.67787334, 0.56527977])

In [192]:
#Add results to calgary df
calgary_df1['PU_Result'] = yhat_calgary_pu
calgary_df1['PU_probs_result'] = pu_probs
calgary_df1.head()


Unnamed: 0,PostalCode,Neighborhood,Latitude,Longitude,Pos_only_Result,PU_Result,PU_probs_result
0,T2A,"Penbrooke Meadows, Marlborough",51.04968,-113.96432,-1,1.0,0.975161
1,T3A,"Dalhousie, Edgemont, Hamptons, Hidden Valley",51.12454,-114.14289,-1,1.0,0.960906
2,T2B,"Forest Lawn, Dover, Erin Woods",51.02533,-113.9789,-1,1.0,0.949002
3,T3B,"Montgomery, Bowness, Silver Springs, Greenwood",51.08963,-114.19751,1,1.0,0.977026
4,T2C,"Lynnwood Ridge, Ogden, Foothills Industrial, Great Plains",50.98122,-113.99786,1,1.0,0.993762


In [207]:
#sort based on ascending PU probability values
pu_sorted_calgary_df1=calgary_df1.sort_values('PU_probs_result', ascending = False)

#Get the first seven rows
calgary_df1_pu = pu_sorted_calgary_df1.iloc[0:7, :]
calgary_df1_pu

Unnamed: 0,PostalCode,Neighborhood,Latitude,Longitude,Pos_only_Result,PU_Result,PU_probs_result
14,T2K,"Thorncliffe, Tuxedo Park",51.10199,-114.07128,1,1.0,1.022487
19,T3M,"Cranston, Auburn Bay, Mahogany",50.88795,-113.95621,-1,1.0,1.002969
21,T2P,"City Centre, Calgary Tower",51.0486,-114.07407,-1,1.0,0.995737
4,T2C,"Lynnwood Ridge, Ogden, Foothills Industrial, Great Plains",50.98122,-113.99786,1,1.0,0.993762
17,T3L,"Tuscany, Scenic Acres",51.12323,-114.24007,-1,1.0,0.993359
9,T3G,"Hawkwood, Arbour Lake, Citadel, Ranchlands, Royal Oak, Rocky Ridge",51.13818,-114.20157,-1,1.0,0.98978
5,T3C,"Rosscarrock, Westgate, Wildwood, Shaganappi, Sunalta",51.04492,-114.1307,1,1.0,0.989693


## 3.5 Visualize data on map

In [202]:
# create map. The latitude and longitude of toronto were found on 
# https://www.latlong.net/place/calgary-ab-canada-29104.html
pu_svm_map = folium.Map(location=[51.049999,  -114.066666], zoom_start=10.4)

#Add markers to map
for lat, lon in zip(calgary_df1_pu['Latitude'], calgary_df1_pu['Longitude']):
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.7).add_to(pu_svm_map)
     
pu_svm_map

# 4 Combine methods to obtain a final result

In [209]:
# Filter those that gave positive in positive-only learning
calgary_df_final = calgary_df1_pu[calgary_df1_pu['Pos_only_Result']==1]
calgary_df_final 


Unnamed: 0,PostalCode,Neighborhood,Latitude,Longitude,Pos_only_Result,PU_Result,PU_probs_result
14,T2K,"Thorncliffe, Tuxedo Park",51.10199,-114.07128,1,1.0,1.022487
4,T2C,"Lynnwood Ridge, Ogden, Foothills Industrial, Great Plains",50.98122,-113.99786,1,1.0,0.993762
5,T3C,"Rosscarrock, Westgate, Wildwood, Shaganappi, Sunalta",51.04492,-114.1307,1,1.0,0.989693


In [210]:
# create map. The latitude and longitude of toronto were found on 
# https://www.latlong.net/place/calgary-ab-canada-29104.html
pu_svm_map = folium.Map(location=[51.049999,  -114.066666], zoom_start=10.4)

#Add markers to map
for lat, lon in zip(calgary_df_final ['Latitude'], calgary_df_final ['Longitude']):
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.7).add_to(pu_svm_map)
     
pu_svm_map