In [1]:
import pandas as pd
import numpy
import requests
from sklearn.cluster import KMeans
from sodapy import Socrata
import warnings
# Blanket filter: suppresses every warning category from every module for the
# remainder of the kernel session, including pandas/scikit-learn warnings
# raised by the cells below.
warnings.filterwarnings('ignore')



## API Setup

In [2]:
# Crash records previously exported to a local CSV file.
crashes = pd.read_csv('../data/crashes_crashes.csv')

# Socrata client for the City of Chicago data portal; None is passed as the
# second positional argument (presumably the app token -- confirm against sodapy).
conn = Socrata("data.cityofchicago.org", None)

# Request at most 2000 rows of dataset 85ca-t3if whose crash_date is later
# than the cutoff timestamp.
results = conn.get(
    "85ca-t3if",
    limit=2000,
    where="crash_date > '2024-01-19T02:02:00.000'",
)

# Build a DataFrame from the returned records and upper-case every column label.
api_df = pd.DataFrame.from_records(results).rename(columns=str.upper)

api_df.head()



Unnamed: 0,CRASH_RECORD_ID,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,...,LOCATION,CRASH_DATE_EST_I,INTERSECTION_RELATED_I,WORK_ZONE_I,WORK_ZONE_TYPE,WORKERS_PRESENT_I,STATEMENTS_TAKEN_I,PRIVATE_PROPERTY_I,PHOTOS_TAKEN_I,DOORING_I
0,d065d9f160ab43b283f43a718bc8f328b1dff9157b02a6...,2024-01-24T23:47:00.000,30,NO CONTROLS,NO CONTROLS,FOG/SMOKE/HAZE,"DARKNESS, LIGHTED ROAD",TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,...,"{'type': 'Point', 'coordinates': [-87.70977884...",,,,,,,,,
1,3efc612e2dcfc39c6eaf454a4471a29b7ab253f6a885ab...,2024-01-24T23:37:00.000,30,NO CONTROLS,NO CONTROLS,FOG/SMOKE/HAZE,"DARKNESS, LIGHTED ROAD",TURNING,DIVIDED - W/MEDIAN (NOT RAISED),STRAIGHT AND LEVEL,...,"{'type': 'Point', 'coordinates': [-87.63234470...",,,,,,,,,
2,f0421ae42adcb3db30459dfeede0eea38f9a5e47bfd438...,2024-01-24T22:49:00.000,30,NO CONTROLS,NO CONTROLS,RAIN,"DARKNESS, LIGHTED ROAD",HEAD ON,NOT DIVIDED,STRAIGHT AND LEVEL,...,"{'type': 'Point', 'coordinates': [-87.60585207...",,,,,,,,,
3,93dace3e01523d634733461f80c0e146cce664154537ee...,2024-01-24T22:40:00.000,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,DIVIDED - W/MEDIAN (NOT RAISED),STRAIGHT AND LEVEL,...,"{'type': 'Point', 'coordinates': [-87.66754374...",,,,,,,,,
4,fa5d00bf307758bc2372e2e0e0796b640e8982519c681b...,2024-01-24T22:30:00.000,30,NO CONTROLS,NO CONTROLS,RAIN,"DARKNESS, LIGHTED ROAD",PARKED MOTOR VEHICLE,DRIVEWAY,STRAIGHT AND LEVEL,...,"{'type': 'Point', 'coordinates': [-87.68341369...",Y,,,,,,,,


In [3]:
# Column counts of the API frame and the CSV-backed frame, for the summary line below.
api_col_num = len(api_df.columns)
dataset_col_num = len(crashes.columns)

# Columns present in both frames, listed in api_df's own column order.
# A set intersection would yield an arbitrary order that can differ between
# kernel sessions (string hashing is randomized), which in turn would make the
# column order of any frame selected with `shared_cols` non-reproducible.
shared_cols = [col for col in api_df.columns if col in crashes.columns]

print(f'api_columns: {api_col_num}, dataset_columns: {dataset_col_num}')
shared_cols

api_columns: 47, dataset_columns: 48


['STREET_NO',
 'NUM_UNITS',
 'LONGITUDE',
 'INJURIES_NO_INDICATION',
 'WORKERS_PRESENT_I',
 'SEC_CONTRIBUTORY_CAUSE',
 'INJURIES_TOTAL',
 'STATEMENTS_TAKEN_I',
 'CRASH_DATE_EST_I',
 'DOORING_I',
 'INJURIES_INCAPACITATING',
 'CRASH_TYPE',
 'CRASH_HOUR',
 'FIRST_CRASH_TYPE',
 'INJURIES_NON_INCAPACITATING',
 'INJURIES_FATAL',
 'DATE_POLICE_NOTIFIED',
 'LATITUDE',
 'CRASH_DATE',
 'DAMAGE',
 'POSTED_SPEED_LIMIT',
 'LIGHTING_CONDITION',
 'CRASH_MONTH',
 'LOCATION',
 'INJURIES_REPORTED_NOT_EVIDENT',
 'WEATHER_CONDITION',
 'MOST_SEVERE_INJURY',
 'STREET_NAME',
 'TRAFFIC_CONTROL_DEVICE',
 'TRAFFICWAY_TYPE',
 'PHOTOS_TAKEN_I',
 'INJURIES_UNKNOWN',
 'CRASH_DAY_OF_WEEK',
 'BEAT_OF_OCCURRENCE',
 'REPORT_TYPE',
 'ROADWAY_SURFACE_COND',
 'PRIM_CONTRIBUTORY_CAUSE',
 'DEVICE_CONDITION',
 'INTERSECTION_RELATED_I',
 'ALIGNMENT',
 'STREET_DIRECTION',
 'WORK_ZONE_I',
 'CRASH_RECORD_ID',
 'HIT_AND_RUN_I',
 'ROAD_DEFECT',
 'WORK_ZONE_TYPE']

In [4]:
# Display the column labels of the `crashes` frame.
crashes.columns

Index(['CRASH_RECORD_ID', 'CRASH_DATE_EST_I', 'CRASH_DATE',
       'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT', 'ROADWAY_SURFACE_COND',
       'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE', 'INTERSECTION_RELATED_I',
       'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'DAMAGE', 'DATE_POLICE_NOTIFIED',
       'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO',
       'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE',
       'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I',
       'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'NUM_UNITS',
       'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL',
       'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
       'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',
       'INJURIES_UNKNOWN', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH',
       'LATITUDE', 

In [5]:
 api_df = api_df.loc[:, shared_cols]

crashes_prime_cause = api_df[(api_df['PRIM_CONTRIBUTORY_CAUSE'] != 'UNABLE TO DETERMINE')\
                              & (api_df['PRIM_CONTRIBUTORY_CAUSE'] != 'NOT APPLICABLE')]

breaking_laws_list = ['DISREGARDING TRAFFIC SIGNALS', 'DISREGARDING STOP SIGN', 'DISREGARDING ROAD MARKINGS', 
                  'DISREGARDING OTHER TRAFFIC SIGNS', 'DISREGARDING YIELD SIGN', 'FAILING TO YIELD RIGHT-OF-WAY'] 

bad_driving_list = ['DRIVING ON WRONG SIDE/WRONG WAY', 'FOLLOWING TOO CLOSELY', 'IMPROPER OVERTAKING/PASSING', 
                    'FAILING TO REDUCE SPEED TO AVOID CRASH', 'TURNING RIGHT ON RED','EXCEEDING SAFE SPEED FOR CONDITIONS',
                    'EXCEEDING AUTHORIZED SPEED LIMIT', 'IMPROPER LANE USAGE', 'PHYSICAL CONDITION OF DRIVER', 
                  'DRIVING SKILLS/KNOWLEDGE/EXPERIENCE','IMPROPER BACKING', 'IMPROPER TURNING/NO SIGNAL']

distraction_list = ['TEXTING', 'DISTRACTION - OTHER ELECTRONIC DEVICE (NAVIGATION DEVICE, DVD PLAYER, ETC.)', 
                    'DISTRACTION - FROM INSIDE VEHICLE','CELL PHONE USE OTHER THAN TEXTING']

drinking_list = ['OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER', 
                 'HAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)', 'UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)']

road_list = ['DISTRACTION - FROM OUTSIDE VEHICLE', 'ROAD ENGINEERING/SURFACE/MARKING DEFECTS', 'ROAD CONSTRUCTION/MAINTENANCE', 'EQUIPMENT - VEHICLE CONDITION', 
             'VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)', 'WEATHER']

other_list= ['PASSING STOPPED SCHOOL BUS', 'OBSTRUCTED CROSSWALKS', 'BICYCLE ADVANCING LEGALLY ON RED LIGHT', 
             'MOTORCYCLE ADVANCING LEGALLY ON RED LIGHT', 'EVASIVE ACTION DUE TO ANIMAL, OBJECT, NONMOTORIST', 'ANIMAL', 'TURNING RIGHT ON RED', 
             'RELATED TO BUS STOP'] 

binning_list = [breaking_laws_list, bad_driving_list, distraction_list, drinking_list, road_list, other_list]
value_list = ['BREAKING LAW', 'BAD DRIVING', 'DISTRACTION INSIDE VEHICLE', 'DRINKING/DRUGS', 'OUTSIDE FACTORS', 'OTHER']



for group, value in zip(binning_list, value_list):
    crashes_prime_cause['PRIM_CONTRIBUTORY_CAUSE'] = crashes_prime_cause['PRIM_CONTRIBUTORY_CAUSE'].replace(to_replace = group, value = value)

crashes_prime_cause['PRIM_CONTRIBUTORY_CAUSE'].value_counts()

PRIM_CONTRIBUTORY_CAUSE
BAD DRIVING                   487
BREAKING LAW                  228
OUTSIDE FACTORS               152
DRINKING/DRUGS                 20
DISTRACTION INSIDE VEHICLE      7
OTHER                           7
Name: count, dtype: int64

In [6]:
# Per-column fill values: a missing entry in any of these columns becomes 'N'.
indicator_defaults = dict.fromkeys(
    ['INTERSECTION_RELATED_I', 'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I'],
    'N',
)

# A row is discarded when any of these fields is still missing after the fill.
required_fields = ['LATITUDE', 'LONGITUDE', 'INJURIES_TOTAL', 'INJURIES_FATAL', 'MOST_SEVERE_INJURY']

crashes_prime_cause_filled = (
    crashes_prime_cause
    .fillna(indicator_defaults)
    .dropna(subset=required_fields)
)

In [None]:
n_clusters = 30  # Number of clusters to create

# Feature matrix: one (longitude, latitude) pair per crash.
# The frame is built directly from the API's JSON records with no numeric
# conversion anywhere upstream, so the coordinates may still be strings --
# cast explicitly instead of relying on the estimator to coerce them.
# The cast is a no-op when the columns are already floating point.
X = crashes_prime_cause_filled[['LONGITUDE', 'LATITUDE']].astype(float)

# Create a K-Means clustering model; the fixed random_state keeps the
# centroid initialisation repeatable between runs.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(X)

# Add cluster labels to the data (labels_ holds one label per fitted row, in row order)
cluster_labels = kmeans.labels_
crashes_prime_cause_filled['GEO_KMEANS_Cluster'] = cluster_labels