# <center>Comparing DBSCAN and HDBSCAN clustering</center>

Both clustering models are trained on a dataset curated by Statistics Canada (StatCan) that lists the names, types, and locations of cultural and art facilities across Canada. The analysis focuses on the museum locations in that dataset.

In [3]:
# Import libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
import hdbscan
from sklearn.preprocessing import StandardScaler

# Geographical tools
import geopandas as gpd # pandas dataframe-like geodataframes for geographical data
import contextily as ctx # used for obtaining a basemap of Canada
from shapely.geometry import Point

# Functions
# Add the src folder to the Python path so the project-local data_loader module
# imported below can be found (presumably it lives in ../src - confirm).
# The relative path is resolved against the current working directory.
sys.path.append(os.path.abspath('../src'))

from data_loader import load_museum_data, download_and_extract_tif

# Suppress warnings
# NOTE: 'ignore' with no category silences every warning from this point on,
# including deprecation warnings from the libraries above.
import warnings
warnings.filterwarnings('ignore')

### Data Exploration

In [None]:
# Fetch the zipped Canada map and unpack its .tif so it can serve as a reference map
zip_file_url = (
    'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/YcUk-ytgrPkmvZAh5bf7zA/Canada.zip'
)
download_and_extract_tif(zip_file_url)

In [4]:
# Read the ODCAF facilities CSV through the project loader, then preview the first rows
csv_url = (
    'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/r-maSj5Yegvw2sJraT15FA/ODCAF-v1-0.csv'
)
df = load_museum_data(csv_url)
df.head()


Downloaded and extracted: Canada.tif
Detected encoding: Windows-1252


Unnamed: 0,Index,Facility_Name,Source_Facility_Type,ODCAF_Facility_Type,Provider,Unit,Street_No,Street_Name,Postal_Code,City,Prov_Terr,Source_Format_Address,CSD_Name,CSDUID,PRUID,Latitude,Longitude
0,1,#Hashtag Gallery,..,gallery,toronto,..,801,dundas st w,M6J 1V2,toronto,on,801 dundas st w,Toronto,3520005,35,43.65169472,-79.40803272
1,2,'Ksan Historical Village & Museum,historic site-building or park,museum,canadian museums association,..,1500,62 hwy,V0J 1Y0,hazelton,bc,1500 hwy 62 hazelton british columbia v0j 1y0 ...,Hazelton,5949022,59,55.2645508,-127.6428124
2,3,'School Days' Museum,community/regional museum,museum,canadian museums association,..,427,queen st,E3B 5R6,fredericton,nb,427 queen st fredericton new brunswick e3b 5r6...,Fredericton,1310032,13,45.963283,-66.6419017
3,4,10 Austin Street,built heritage properties,heritage or historic site,moncton,..,10,austin st,E1C 1Z6,moncton,nb,10 austin st,Moncton,1307022,13,46.09247776,-64.78022946
4,5,10 Gates Dancing Inc.,arts,miscellaneous,ottawa,..,..,..,..,ottawa,on,..,Ottawa,3506008,35,45.40856224,-75.71536766


In [10]:
# Count NaN entries per column (axis=0 sums down each column)
df.isna().sum(axis=0)

Index                    0
Facility_Name            0
Source_Facility_Type     0
ODCAF_Facility_Type      0
Provider                 0
Unit                     0
Street_No                0
Street_Name              0
Postal_Code              0
City                     0
Prov_Terr                0
Source_Format_Address    0
CSD_Name                 0
CSDUID                   0
PRUID                    0
Latitude                 0
Longitude                0
dtype: int64

No missing values are reported above because the dataset encodes missing entries as the string `'..'` rather than as NaN.

In [17]:
# Replace the '..' placeholder with real NaNs so pandas can count missing values.
# Assign the result back instead of using inplace=True: pandas discourages
# in-place mutation, and assignment yields the same frame contents.
df = df.replace('..', np.nan)

# Check missing values
df.isna().sum()

Index                       0
Facility_Name               4
Source_Facility_Type     2533
ODCAF_Facility_Type         0
Provider                    0
Unit                     7623
Street_No                1284
Street_Name              1149
Postal_Code              1454
City                      116
Prov_Terr                   2
Source_Format_Address     804
CSD_Name                  301
CSDUID                    301
PRUID                      90
Latitude                 1224
Longitude                1224
dtype: int64

In [6]:
# Check data types: print each column's non-null count and dtype for the current df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7972 entries, 0 to 7971
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Index                  7972 non-null   int64 
 1   Facility_Name          7972 non-null   object
 2   Source_Facility_Type   7972 non-null   object
 3   ODCAF_Facility_Type    7972 non-null   object
 4   Provider               7972 non-null   object
 5   Unit                   7972 non-null   object
 6   Street_No              7972 non-null   object
 7   Street_Name            7972 non-null   object
 8   Postal_Code            7972 non-null   object
 9   City                   7972 non-null   object
 10  Prov_Terr              7972 non-null   object
 11  Source_Format_Address  7972 non-null   object
 12  CSD_Name               7972 non-null   object
 13  CSDUID                 7972 non-null   object
 14  PRUID                  7972 non-null   object
 15  Latitude             

In [16]:
# Count facilities per ODCAF facility type, most frequent first
df.value_counts(subset=['ODCAF_Facility_Type'])

ODCAF_Facility_Type                 
library or archives                     3013
museum                                  1938
gallery                                  810
heritage or historic site                620
theatre/performance and concert hall     583
festival site                            346
miscellaneous                            343
art or cultural centre                   225
artist                                    94
Name: count, dtype: int64