# Load the three datasets and create a minimal training table with a few numeric and categorical columns.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

# 1. Load -----------------------------------------------------------
# Crash-level table: every column is read. low_memory=False makes pandas
# parse the file in one pass instead of chunk-by-chunk type inference.
crashes = pd.read_csv(
    'Traffic_Crashes_-_Crashes_20250617.csv',
    low_memory=False,
)

# Person-level table, restricted to the join key, the person id and the
# two demographic columns.
people = pd.read_csv(
    'Traffic_Crashes_-_People_20250617.csv',
    usecols=['CRASH_RECORD_ID','PERSON_ID','AGE','SEX'],
    low_memory=False,
)

# Vehicle-level table, restricted to the join key, the vehicle id and the
# vehicle type.
vehicles = pd.read_csv(
    'Traffic_Crashes_-_Vehicles_20250617.csv',
    usecols=['CRASH_RECORD_ID','VEHICLE_ID','VEHICLE_TYPE'],
    low_memory=False,
)

# 2. Simple aggregation to crash-level -----------------------------
def _most_common(values):
    """Return the first mode of *values*, or NaN when there is none.

    ``Series.mode()`` skips missing values by default, so a group made up
    entirely of NaN produces an empty result and is mapped to NaN here.
    The mode is computed once per group (the lambda it replaces computed
    it twice).
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# One row per CRASH_RECORD_ID: median AGE and most frequent SEX.
# SEX is renamed by name (not by column position) so the result stays
# correct even if the aggregation spec is reordered or extended.
people_agg = (
    people.groupby('CRASH_RECORD_ID')
    .agg({'AGE': 'median', 'SEX': _most_common})
    .rename(columns={'SEX': 'GENDER'})
)

# One row per CRASH_RECORD_ID: most frequent VEHICLE_TYPE.
vehicles_agg = vehicles.groupby('CRASH_RECORD_ID').agg({'VEHICLE_TYPE': _most_common})

# merge
# The crashes CSV names the speed-limit column POSTED_SPEED_LIMIT (there is
# no SPEED_LIMIT column in the printed schema), so selecting 'SPEED_LIMIT'
# raised a KeyError. It is selected under its real name and renamed to
# SPEED_LIMIT so the feature lists used further down keep working unchanged.
crash_base = crashes[
    ['CRASH_RECORD_ID','STREET_NO','POSTED_SPEED_LIMIT','PRIM_CONTRIBUTORY_CAUSE']
].rename(columns={'POSTED_SPEED_LIMIT': 'SPEED_LIMIT'})

# Left joins keep every crash row; the aggregated frames are indexed by
# CRASH_RECORD_ID, hence right_index=True.
crash_df = crash_base.merge(
    people_agg, left_on='CRASH_RECORD_ID', right_index=True, how='left'
).merge(
    vehicles_agg, left_on='CRASH_RECORD_ID', right_index=True, how='left'
)

# Drop rows with missing target
crash_df = crash_df.dropna(subset=['PRIM_CONTRIBUTORY_CAUSE'])

# 3. Train/val split ------------------------------------------------
# Feature matrix: three numeric columns followed by two categorical ones.
feature_columns = ['STREET_NO','AGE','SPEED_LIMIT','GENDER','VEHICLE_TYPE']
X = crash_df[feature_columns]

# Target: the primary contributory cause label.
y = crash_df['PRIM_CONTRIBUTORY_CAUSE']

# Hold out 20% of the rows for validation, stratified on the target,
# with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 4. Preprocessing & model -----------------------------------------
# Numeric features are standardised.
numeric_transformer = StandardScaler()

# Categorical features are one-hot encoded; categories unseen during fit
# are ignored at transform time instead of raising.
# The `sparse` keyword was deprecated in scikit-learn 1.2 and removed in
# 1.4 (renamed `sparse_output`). Sparse output is the default under either
# name, so the argument is omitted to run on both old and new releases.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, ['STREET_NO','AGE','SPEED_LIMIT']),
    ('cat', categorical_transformer, ['GENDER','VEHICLE_TYPE'])
])

# Gradient-boosted classifier; class_weight='balanced' reweights classes
# and random_state fixes the seed.
model = LGBMClassifier(n_estimators=300, learning_rate=0.05, class_weight='balanced', random_state=42)

# Chain preprocessing and the classifier so both are fitted on the
# training split only.
pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', model)
])

pipe.fit(X_train, y_train)

# 5. Evaluation ------------------------------------------------------
# Score the held-out validation rows with the fitted pipeline.
y_pred = pipe.predict(X_val)

# output_dict=False yields the plain-text table rather than a dict.
report = classification_report(
    y_true=y_val,
    y_pred=y_pred,
    output_dict=False,
)

# Show a sample of the modelling table first, then the per-class metrics.
print(crash_df.head())
print(report)

In [4]:
# Load the three datasets and create a minimal training table with a few numeric and categorical columns.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

In [3]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [6]:
# 1. Load -----------------------------------------------------------
# Read the full crashes table. low_memory=False makes pandas parse the
# file in one pass instead of chunk-by-chunk type inference.
crashes = pd.read_csv(
    'Traffic_Crashes_-_Crashes_20250617.csv',
    low_memory=False,
)
# Trailing expression so the notebook renders the first rows.
crashes.head()

Unnamed: 0,CRASH_RECORD_ID,CRASH_DATE_EST_I,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,...,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION
0,01184614b08579d6befa7734427e750d0f10e79dc0aa42...,,04/04/2025 01:57:00 AM,15,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",ANGLE,PARKING LOT,...,0.0,0.0,2.0,0.0,1,6,4,41.766021,-87.572442,POINT (-87.572442196614 41.766020695126)
1,01db2b5a81860ebd2fa3f4564ba0de00508847313ce23a...,,04/24/2025 08:40:00 PM,10,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,REAR TO SIDE,PARKING LOT,...,0.0,0.0,2.0,0.0,20,5,4,41.875048,-87.744921,POINT (-87.744921303711 41.875047815678)
2,0583b2385710470c0a1bc80417d095512e3d04d6f26158...,,04/24/2025 05:45:00 AM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAWN,TURNING,NOT DIVIDED,...,0.0,0.0,2.0,0.0,5,5,4,41.981238,-87.806997,POINT (-87.806997368434 41.981238161467)
3,05d78a76ea643730a5771ac904b43c56236f332b42db0e...,,04/24/2025 11:13:00 AM,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,FIXED OBJECT,ALLEY,...,0.0,1.0,0.0,0.0,11,5,4,41.742295,-87.652157,POINT (-87.652156998855 41.742294907932)
4,00a530520c84927490b597a6220ff3f2a3347472ef3086...,,01/15/2025 01:50:00 PM,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,FIXED OBJECT,RAMP,...,0.0,0.0,1.0,0.0,13,4,1,41.816073,-87.656743,POINT (-87.656742699936 41.816073475493)


In [7]:
# Print a summary of the crashes frame: row count, column names,
# non-null counts and dtypes.
crashes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 955200 entries, 0 to 955199
Data columns (total 48 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH_RECORD_ID                955200 non-null  object 
 1   CRASH_DATE_EST_I               69878 non-null   object 
 2   CRASH_DATE                     955200 non-null  object 
 3   POSTED_SPEED_LIMIT             955200 non-null  int64  
 4   TRAFFIC_CONTROL_DEVICE         955200 non-null  object 
 5   DEVICE_CONDITION               955200 non-null  object 
 6   WEATHER_CONDITION              955200 non-null  object 
 7   LIGHTING_CONDITION             955200 non-null  object 
 8   FIRST_CRASH_TYPE               955200 non-null  object 
 9   TRAFFICWAY_TYPE                955200 non-null  object 
 10  LANE_CNT                       199026 non-null  object 
 11  ALIGNMENT                      955200 non-null  object 
 12  ROADWAY_SURFACE_COND          

In [8]:
# Person-level table, restricted to the join key, the person id and the
# two demographic columns.
people = pd.read_csv(
    'Traffic_Crashes_-_People_20250617.csv',
    usecols=['CRASH_RECORD_ID','PERSON_ID','AGE','SEX'],
    low_memory=False,
)
# Trailing expression so the notebook renders the first rows.
people.head()

Unnamed: 0,PERSON_ID,CRASH_RECORD_ID,SEX,AGE
0,O749947,81dc0de2ed92aa62baccab641fa377be7feb1cc47e6554...,M,25.0
1,O871921,af84fb5c8d996fcd3aefd36593c3a02e6e7509eeb27568...,M,37.0
2,O10018,71162af7bf22799b776547132ebf134b5b438dcf3dac6b...,X,
3,O10038,c21c476e2ccc41af550b5d858d22aaac4ffc88745a1700...,X,
4,O10039,eb390a4c8e114c69488f5fb8a097fe629f5a92fd528cf4...,X,


In [10]:
# Print a summary of the people frame: row count, columns, dtypes and
# memory usage.
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2098146 entries, 0 to 2098145
Data columns (total 4 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PERSON_ID        object 
 1   CRASH_RECORD_ID  object 
 2   SEX              object 
 3   AGE              float64
dtypes: float64(1), object(3)
memory usage: 64.0+ MB


In [11]:
# Vehicle-level table, restricted to the join key, the vehicle id and the
# vehicle type.
vehicles = pd.read_csv(
    'Traffic_Crashes_-_Vehicles_20250617.csv',
    usecols=['CRASH_RECORD_ID','VEHICLE_ID','VEHICLE_TYPE'],
    low_memory=False,
)

In [12]:
# Render the first rows of the vehicles frame as the cell output.
vehicles.head()

Unnamed: 0,CRASH_RECORD_ID,VEHICLE_ID,VEHICLE_TYPE
0,2e31858c0e411f0bdcb337fb7c415aa93763cf2f23e02f...,10.0,PASSENGER
1,e73b35bd7651b0c6693162bee0666db159b28901437009...,96.0,SPORT UTILITY VEHICLE (SUV)
2,f2b1adeb85a15112e4fb7db74bff440d6ca53ff7a21e10...,954.0,VAN/MINI-VAN
3,15a3e24fce3ce7cd2b02d44013d1a93ff2fbdca80632ec...,9561.0,PASSENGER
4,1d3c178880366c77deaf06b8c3198429112a1c8e8807ed...,96745.0,SPORT UTILITY VEHICLE (SUV)


In [13]:
# Print a summary of the vehicles frame: row count, columns, dtypes and
# memory usage.
vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1948744 entries, 0 to 1948743
Data columns (total 3 columns):
 #   Column           Dtype  
---  ------           -----  
 0   CRASH_RECORD_ID  object 
 1   VEHICLE_ID       float64
 2   VEHICLE_TYPE     object 
dtypes: float64(1), object(2)
memory usage: 44.6+ MB


In [None]:
# 2. Simple aggregation to crash-level -----------------------------
def _most_common(values):
    """Return the first mode of *values*, or NaN when there is none.

    ``Series.mode()`` skips missing values by default, so a group made up
    entirely of NaN produces an empty result and is mapped to NaN here.
    The mode is computed once per group (the lambda it replaces computed
    it twice).
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# One row per CRASH_RECORD_ID: median AGE and most frequent SEX.
# SEX is renamed by name (not by column position) so the result stays
# correct even if the aggregation spec is reordered or extended.
people_agg = (
    people.groupby('CRASH_RECORD_ID')
    .agg({'AGE': 'median', 'SEX': _most_common})
    .rename(columns={'SEX': 'GENDER'})
)

# One row per CRASH_RECORD_ID: most frequent VEHICLE_TYPE.
vehicles_agg = vehicles.groupby('CRASH_RECORD_ID').agg({'VEHICLE_TYPE': _most_common})

In [None]:
# merge
# The crashes CSV names the speed-limit column POSTED_SPEED_LIMIT (there is
# no SPEED_LIMIT column in the printed schema), so selecting 'SPEED_LIMIT'
# raised a KeyError. It is selected under its real name and renamed to
# SPEED_LIMIT so the feature lists used in later cells keep working unchanged.
crash_base = crashes[
    ['CRASH_RECORD_ID','STREET_NO','POSTED_SPEED_LIMIT','PRIM_CONTRIBUTORY_CAUSE']
].rename(columns={'POSTED_SPEED_LIMIT': 'SPEED_LIMIT'})

# Left joins keep every crash row; the aggregated frames are indexed by
# CRASH_RECORD_ID, hence right_index=True.
crash_df = crash_base.merge(
    people_agg, left_on='CRASH_RECORD_ID', right_index=True, how='left'
).merge(
    vehicles_agg, left_on='CRASH_RECORD_ID', right_index=True, how='left'
)

In [None]:
# Print every column name of the crashes frame as a plain list
# (handy for checking the exact names selected in the merge step).
print(crashes.columns.tolist())

In [None]:
# Keep only the rows whose target label is present; a model cannot be
# trained or scored on rows without PRIM_CONTRIBUTORY_CAUSE.
has_target = crash_df['PRIM_CONTRIBUTORY_CAUSE'].notna()
crash_df = crash_df[has_target]


In [None]:
# 3. Train/val split ------------------------------------------------
# Feature matrix: three numeric columns followed by two categorical ones.
feature_columns = ['STREET_NO','AGE','SPEED_LIMIT','GENDER','VEHICLE_TYPE']
X = crash_df[feature_columns]

# Target: the primary contributory cause label.
y = crash_df['PRIM_CONTRIBUTORY_CAUSE']

# Hold out 20% of the rows for validation, stratified on the target,
# with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# 4. Preprocessing & model -----------------------------------------
# Numeric features are standardised.
numeric_transformer = StandardScaler()

# Categorical features are one-hot encoded; categories unseen during fit
# are ignored at transform time instead of raising.
# The `sparse` keyword was deprecated in scikit-learn 1.2 and removed in
# 1.4 (renamed `sparse_output`). Sparse output is the default under either
# name, so the argument is omitted to run on both old and new releases.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, ['STREET_NO','AGE','SPEED_LIMIT']),
    ('cat', categorical_transformer, ['GENDER','VEHICLE_TYPE'])
])

# Gradient-boosted classifier; class_weight='balanced' reweights classes
# and random_state fixes the seed.
model = LGBMClassifier(n_estimators=300, learning_rate=0.05, class_weight='balanced', random_state=42)

# Chain preprocessing and the classifier so both are fitted on the
# training split only.
pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', model)
])

pipe.fit(X_train, y_train)

In [None]:
# 5. Evaluation ------------------------------------------------------
# Score the held-out validation rows with the fitted pipeline.
y_pred = pipe.predict(X_val)

# output_dict=False yields the plain-text table rather than a dict.
report = classification_report(
    y_true=y_val,
    y_pred=y_pred,
    output_dict=False,
)

# Show a sample of the modelling table first, then the per-class metrics.
print(crash_df.head())
print(report)