Geospatial Data Clustering for Fatal Bear Attacks in North America

Density-Based Spatial Clustering of Applications with Noise (DBSCAN) - an unsupervised ML method

Dataset: https://www.kaggle.com/datasets/danela/fatal-bear-attacks-north-america/data

Tutorial referenced in code: https://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/

In [2]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
import geopandas
import matplotlib.pyplot as plt
import folium
import mapclassify


In [3]:
# Source CSV for the bear-attack records (raw file hosted on GitHub).
url = 'https://raw.githubusercontent.com/A-Bin1/Data-Science-Misc/main/bear_attacks_geospatial_dataset.csv'
bear_df = pd.read_csv(url)

# Two-column array of [Latitude, Longitude] rows, used as clustering input.
coords = bear_df.loc[:, ['Latitude', 'Longitude']].to_numpy()

In [4]:
# Preview the first ten [Latitude, Longitude] rows.
coords[:10]

array([[  66.53416   ,  -83.88217   ],
       [  61.16723   ,  -93.85015   ],
       [  62.808913  ,  -92.087741  ],
       [  69.7428    , -163.01125   ],
       [  58.767755  ,  -94.163998  ],
       [  68.360605  , -133.720367  ],
       [  58.767755  ,  -94.163998  ],
       [  46.1334    ,  -79.53293   ],
       [  64.8080879 , -151.00415781],
       [  60.98663   , -149.51277   ]])

In [5]:
def create_clusters(cds, km, min_samples=1):
    """Cluster coordinates with DBSCAN using the haversine (great-circle) metric.

    Parameters
    ----------
    cds : numpy.ndarray of shape (n_points, 2)
        Coordinates as [latitude, longitude] rows in decimal degrees; they are
        converted to radians before being handed to DBSCAN.
    km : float
        Neighbourhood radius (DBSCAN ``eps``) expressed in kilometres.
    min_samples : int, default 1
        Forwarded to DBSCAN. With the default of 1 every point ends up in a
        cluster. With larger values DBSCAN may label points as noise (-1);
        those points are not returned as a cluster.

    Returns
    -------
    pandas.Series
        One entry per cluster; each entry holds the rows of ``cds`` that
        belong to that cluster.

    Also prints the number of clusters found.
    """
    earth_kms_per_radian = 6371  # approximate radius of the Earth in km
    epsilon = km / earth_kms_per_radian  # haversine distances are in radians
    db = DBSCAN(
        eps=epsilon,
        min_samples=min_samples,
        algorithm='ball_tree',
        metric='haversine',
    ).fit(np.radians(cds))
    cluster_labels = db.labels_

    # DBSCAN uses the label -1 for noise; it must not be counted as a cluster.
    cluster_ids = sorted(set(cluster_labels) - {-1})
    num_clusters = len(cluster_ids)

    clusters = pd.Series([cds[cluster_labels == n] for n in cluster_ids])
    print('Number of clusters: {}'.format(num_clusters))
    return clusters

In [6]:
# Cluster with a 100 km neighbourhood radius and display the resulting Series.
create_clusters(coords, km=100)

Number of clusters: 85


0             [[66.53416000000004, -83.88216999999997]]
1                      [[61.16723000000008, -93.85015]]
2             [[62.80891300000008, -92.08774099999994]]
3            [[69.74280000000005, -163.01124999999996]]
4     [[58.76775500000008, -94.16399799999994], [58....
                            ...                        
80           [[41.72244511900004, -111.74354625399997]]
81          [[36.578550000000064, -118.29341999999995]]
82    [[34.01158000000004, -118.49226999999996], [34...
83           [[38.45047000000005, -123.12944999999996]]
84           [[34.87404000000004, -118.89168999999998]]
Length: 85, dtype: object

In [7]:
# Keep the 100 km clustering result for the following cells.
clusters = create_clusters(coords, km=100)

Number of clusters: 85


In [8]:
def get_centermost_point(cluster):
    """Return the member of ``cluster`` that lies closest to the cluster centroid.

    ``cluster`` is a sequence of coordinate pairs. The centroid is taken from a
    shapely ``MultiPoint`` built from those pairs, and each member's distance
    to it is measured with geopy's ``great_circle``. The winning member is
    returned as a tuple.
    """
    center = MultiPoint(cluster).centroid
    centroid = (center.x, center.y)

    def distance_to_centroid(point):
        # Great-circle distance in metres between a member and the centroid.
        return great_circle(point, centroid).m

    return tuple(min(cluster, key=distance_to_centroid))

# One representative point per cluster.
centermost_points = clusters.map(get_centermost_point)


In [9]:
# Preview the first representative points (one tuple per cluster).
centermost_points.head()

0     (66.53416000000004, -83.88216999999997)
1              (61.16723000000008, -93.85015)
2     (62.80891300000008, -92.08774099999994)
3    (69.74280000000005, -163.01124999999996)
4     (58.76775500000008, -94.16399799999994)
dtype: object

In [10]:
# Wrap the Series of coordinate tuples in a one-column frame named 'Location'.
cdf = pd.DataFrame(centermost_points).rename(columns={0: 'Location'})
cdf.head()

Unnamed: 0,Location
0,"(66.53416000000004, -83.88216999999997)"
1,"(61.16723000000008, -93.85015)"
2,"(62.80891300000008, -92.08774099999994)"
3,"(69.74280000000005, -163.01124999999996)"
4,"(58.76775500000008, -94.16399799999994)"


In [11]:
# Split each 'Location' tuple into its own column: element 0 -> 'Latitude',
# element 1 -> 'Longitude'.
new_col_list = ['Latitude', 'Longitude']
for position, column_name in enumerate(new_col_list):
    cdf[column_name] = cdf['Location'].apply(lambda location, i=position: location[i])

# Frame of cluster representatives without the tuple column.
cldf = cdf.drop(columns='Location')

In [12]:
# Preview the Latitude/Longitude columns of the cluster representatives.
cldf.head()

Unnamed: 0,Latitude,Longitude
0,66.53416,-83.88217
1,61.16723,-93.85015
2,62.808913,-92.087741
3,69.7428,-163.01125
4,58.767755,-94.163998


Compare the full list of coordinates with the clustered data.

In [13]:
# Full dataset as a GeoDataFrame: one point geometry per row of bear_df.
# points_from_xy takes x (longitude) first, then y (latitude).
# EPSG:4326 is the 2D WGS 84 longitude/latitude CRS; the previously used
# "EPSG:4327" is a deprecated 3D code and does not describe these 2D points.
# Frames that are overlaid later should use this same CRS.
geodf = geopandas.GeoDataFrame(
    bear_df,
    geometry=geopandas.points_from_xy(bear_df.Longitude, bear_df.Latitude),
    crs="EPSG:4326",
)

In [14]:
# Number of rows (points) in the full dataset.
len(geodf)

156

In [15]:
# Interactive map of every point in the full dataset.
geodf.explore()

In [16]:
# Cluster representatives as a GeoDataFrame: one point geometry per row of cldf.
# points_from_xy takes x (longitude) first, then y (latitude).
# EPSG:4326 is the 2D WGS 84 longitude/latitude CRS; the previously used
# "EPSG:4327" is a deprecated 3D code and does not describe these 2D points.
# Frames that are overlaid later should use this same CRS.
geo_clusterdf = geopandas.GeoDataFrame(
    cldf,
    geometry=geopandas.points_from_xy(cldf.Longitude, cldf.Latitude),
    crs="EPSG:4326",
)

In [17]:
# Number of rows (points) left after clustering.
len(geo_clusterdf)

85

In [18]:
# Interactive map of the cluster representatives only.
geo_clusterdf.explore()

In [19]:
# Simulate overlays of the full and clustered datasets.
def plot_overlay(how):
    """Overlay ``geodf`` with ``geo_clusterdf`` and return an interactive map.

    ``how`` is forwarded unchanged to ``geopandas.overlay`` and selects the
    set operation. The overlay result is rendered with ``explore()``.
    """
    overlay_gdf = geopandas.overlay(
        geodf,
        geo_clusterdf,
        how=how,
        keep_geom_type=None,
        make_valid=True,
    )
    return overlay_gdf.explore()

In [20]:
# Candidate values for `how`:
#   'intersection'
#   'union'
#   'identity'
#   'symmetric_difference'
#   'difference'

plot_overlay(how='intersection')