In [1]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import pandas as pd
import os
from sklearn.cluster import DBSCAN
import geojson
import json

# Read the incident records CSV from the local data folder and preview the first rows
data_path = os.path.join("./data", "df20.csv")
df = pd.read_csv(data_path)
df.head()

Unnamed: 0.1,Unnamed: 0,folio,start_date,start_hour,weekday,incident_code,end_date,end_hour,start_county,type_incident,lat,lng,class_incident,aid_channel,end_county,geopoint,time_response
0,0,IZ/200101/04110,01/01/2020,08:49:57,WE,affirmative,01/01/2020,10:26:26,TLH,accidente-motociclista,19.30303,-99.05604,Medical emergencies,Radio,TLH,"19.30302999,-99.05604012",96.0
1,1,C5/200101/00743,01/01/2020,01:04:38,WE,affirmative,01/01/2020,01:18:36,AZC,accidente-choque sin lesionados,19.49422,-99.19734,Emergencies,911 call,AZC,"19.49421996,-99.19734012",13.0
2,2,C5/200101/00012,31/12/2019,23:59:58,TU,affirmative,01/01/2020,06:59:44,IZP,accidente-choque sin lesionados,19.32178,-99.05313,Emergencies,911 call,IZP,"19.32177996,-99.05312988",1021.0
3,3,GA/191231/08094,31/12/2019,23:46:07,TU,affirmative,01/01/2020,00:22:00,VCA,accidente-motociclista,19.45154,-99.09475,Medical emergencies,Radio,VCA,"19.45153998,-99.09474984",1405.0
4,4,C5/200101/03024,01/01/2020,05:26:06,WE,not found,01/01/2020,06:55:23,MCO,accidente-choque sin lesionados,19.32809,-99.21489,Emergencies,911 call,MCO,"19.32809004,-99.21489012",89.0


In [2]:
# List the distinct values present in the incident_code column
df['incident_code'].unique()

array(['affirmative', 'not found', 'duplicated', 'informative', 'false'],
      dtype=object)

In [3]:
# (rows, columns) of the frame as loaded, before any filtering
df.shape

(91308, 17)

In [4]:
# Drop every record whose incident_code marks it as duplicated, not found or false
excluded_codes = ['duplicated', 'not found', 'false']
keep_mask = ~df['incident_code'].isin(excluded_codes)
clean_df = df[keep_mask]
clean_df.shape

(32227, 17)

In [5]:
# County codes to process (matched against the 'start_county' column)
counties = [
    "TLH", "AZC", "IZP", "VCA", "XOC",
    "TLA", "GAM", "MCO", "MHI", "CUA",
    "MAL", "IZC", "AOB", "COY", "BJU",
]

In [6]:
# Incident types (values of the 'type_incident' column) that are clustered
# separately for every county
incident_types = [
    "accidente-choque sin lesionados",
    "accidente-choque con lesionados",
    "accidente-motociclista",
    "lesionado-atropellado",
    "accidente-ciclista",
]

In [7]:
def getClusterData(original_df, county_code, incident_type, min_samps, eps=.004):
    """Cluster the incidents of one county and one incident type with DBSCAN.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Must provide the columns 'start_county', 'type_incident', 'lat'
        and 'lng'.
    county_code : str
        Value matched against the 'start_county' column.
    incident_type : str
        Value matched against the 'type_incident' column.
    min_samps : int
        Passed to DBSCAN as ``min_samples``.
    eps : float, optional
        Passed to DBSCAN as ``eps``; it is measured in the same units as the
        'lat'/'lng' columns. Defaults to .004, the value that used to be
        hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per clustered point with the columns 'x' (lat), 'y' (lng),
        'label' (cluster id) and 'member_count' (size of the point's
        cluster). Points labelled as noise (-1) are dropped. When no row
        matches the county/type pair an empty frame with the same columns is
        returned instead of letting DBSCAN raise on zero samples.
    """
    # keep only the rows of the requested county and incident type
    selected = original_df[
        (original_df['start_county'] == county_code)
        & (original_df['type_incident'] == incident_type)
    ]
    if selected.empty:
        # nothing to cluster: DBSCAN.fit would fail on an empty array
        return pd.DataFrame({
            'x': pd.Series(dtype='float64'),
            'y': pd.Series(dtype='float64'),
            'label': pd.Series(dtype='int64'),
            'member_count': pd.Series(dtype='int64'),
        })

    # (n_points, 2) array with lat in column 0 and lng in column 1
    X = np.column_stack((selected['lat'].to_numpy(), selected['lng'].to_numpy()))
    labels = DBSCAN(eps=eps, min_samples=min_samps).fit(X).labels_
    clusters_df = pd.DataFrame({'x': X[:, 0], 'y': X[:, 1], 'label': labels})

    # remove the noise cluster; .copy() so the column added below is written
    # to an independent frame and not to a view of the unfiltered one
    clusters_df = clusters_df[clusters_df['label'] != -1].copy()

    # size of the cluster each remaining point belongs to
    clusters_df['member_count'] = clusters_df.groupby('label')['label'].transform('count')
    return clusters_df
    

In [8]:
def df_to_geojson(df, properties, lat='lat', lon='lng'):
    """Convert the rows of a DataFrame into a GeoJSON FeatureCollection dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; must contain the `lat`, `lon` and `properties` columns.
    properties : iterable of str
        Column names copied into each feature's 'properties' mapping.
    lat : str, optional
        Name of the latitude column (default 'lat').
    lon : str, optional
        Name of the longitude column (default 'lng').

    Returns
    -------
    dict
        ``{'type': 'FeatureCollection', 'features': [...]}`` with one Point
        feature per row whose coordinates are ``[lon, lat]``.

    Notes
    -----
    Values are read column by column with ``Series.tolist()`` rather than
    row by row with ``iterrows()``. ``iterrows()`` builds one Series per row,
    which upcasts integer columns to float when the other columns are floats
    (a cluster label 0 became 0.0); ``tolist()`` keeps each column's own type
    and yields plain Python scalars.
    """
    properties = list(properties)
    lon_values = df[lon].tolist()
    lat_values = df[lat].tolist()
    prop_values = {prop: df[prop].tolist() for prop in properties}

    features = [
        {
            'type': 'Feature',
            'properties': {prop: prop_values[prop][i] for prop in properties},
            'geometry': {
                'type': 'Point',
                'coordinates': [lon_values[i], lat_values[i]],
            },
        }
        for i in range(len(lon_values))
    ]
    return {'type': 'FeatureCollection', 'features': features}

In [9]:
def findBestMinSamps(target_clusters, test_cnty, in_tp, source_df=None, max_min_samps=20):
    """Return the clustering whose cluster count is closest to a target.

    getClusterData is run once for every min_samples value from 1 to
    `max_min_samps`; the result whose number of distinct cluster labels is
    closest to `target_clusters` is returned.

    Parameters
    ----------
    target_clusters : int
        Desired number of clusters.
    test_cnty : str
        County code forwarded to getClusterData.
    in_tp : str
        Incident type forwarded to getClusterData.
    source_df : pandas.DataFrame, optional
        Frame to cluster. Defaults to the notebook-level `clean_df`.
    max_min_samps : int, optional
        Largest min_samples value tried (default 20).

    Returns
    -------
    pandas.DataFrame
        The getClusterData result of the best setting. When several settings
        are equally close to the target, the smallest min_samples wins.

    Raises
    ------
    ValueError
        If `max_min_samps` is smaller than 1, since no setting could be tried.
    """
    if max_min_samps < 1:
        raise ValueError("max_min_samps must be at least 1")
    if source_df is None:
        source_df = clean_df

    best_df = None
    best_offset = None
    for min_samps in range(1, max_min_samps + 1):
        candidate_df = getClusterData(source_df, test_cnty, in_tp, min_samps)
        # margin of error: distance between clusters found and the target
        clusters_found = len(set(candidate_df['label']))
        offset = abs(clusters_found - target_clusters)
        # strict '<' keeps the earliest (smallest min_samples) result on ties
        if best_offset is None or offset < best_offset:
            best_df = candidate_df
            best_offset = offset
    # the winning frame was kept during the sweep, so no re-clustering is needed
    return best_df

In [10]:
# Try the search on a single county / incident type with a target of 15 clusters
cluster_df = findBestMinSamps(
    target_clusters=15,
    test_cnty="TLH",
    in_tp="accidente-choque sin lesionados",
)
cluster_df

Unnamed: 0,x,y,label,member_count
0,19.29928,-99.04173,0,191
1,19.28386,-99.01049,1,105
2,19.26616,-99.00663,1,105
3,19.28874,-99.00698,1,105
4,19.29649,-99.03594,0,191
...,...,...,...,...
349,19.29510,-99.02071,1,105
350,19.31021,-99.04779,0,191
351,19.30364,-99.05801,0,191
352,19.30434,-99.04019,0,191


In [11]:
# Build, for every county and incident type, a GeoJSON FeatureCollection of
# the clustered (noise-free) incident points: geojsons[county][incident_type]
geojsons = {}
# target number of clusters requested from findBestMinSamps for every pair
t_cl = 20
for cnty in counties:
    #object to hold cluster coordinates found for each accident types (without noise)
    types_clusters = {}
    for tp in incident_types:
        cluster_df = findBestMinSamps(t_cl, cnty, tp)
        print(f"{cnty} - {tp} : {len(set(cluster_df['label']))}")
        geodata = df_to_geojson(cluster_df, properties=['label','member_count'], lat="x", lon="y")
        types_clusters[tp] = geodata
    # register the county once, after all of its incident types are collected
    geojsons[cnty] = types_clusters
    print("...................................................")
print("geojson object is ready")

TLH - accidente-choque sin lesionados : 26
TLH - accidente-choque con lesionados : 26
TLH - accidente-motociclista : 20
TLH - lesionado-atropellado : 22
TLH - accidente-ciclista : 14
...................................................
AZC - accidente-choque sin lesionados : 16
AZC - accidente-choque con lesionados : 24
AZC - accidente-motociclista : 22
AZC - lesionado-atropellado : 18
AZC - accidente-ciclista : 19
...................................................
IZP - accidente-choque sin lesionados : 21
IZP - accidente-choque con lesionados : 20
IZP - accidente-motociclista : 23
IZP - lesionado-atropellado : 23
IZP - accidente-ciclista : 12
...................................................
VCA - accidente-choque sin lesionados : 6
VCA - accidente-choque con lesionados : 10
VCA - accidente-motociclista : 20
VCA - lesionado-atropellado : 16
VCA - accidente-ciclista : 21
...................................................
XOC - accidente-choque sin lesionados : 15
XOC - accidente-ch

In [12]:
# Write the collected GeoJSON as a JavaScript file that assigns it to a
# global `dataset` variable.
output_filename = './leaflet-map/dataset.js'
# make sure the target folder exists so open() does not fail on a fresh checkout
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
# explicit encoding keeps the written file independent of the platform default
with open(output_filename, 'w', encoding='utf-8') as output_file:
    output_file.write('var dataset = ')
    json.dump(geojsons, output_file, indent=2)