In [17]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Read the accident records from CSV and trim stray whitespace
# around every column label in a single chained expression
df = pd.read_csv("traffic_accidents.csv").rename(columns=str.strip)

# Preview the leading rows of the loaded frame
preview_rows = df.head()
print(preview_rows)


               crash_date traffic_control_device weather_condition  \
0  07/29/2023 01:00:00 PM         TRAFFIC SIGNAL             CLEAR   
1  08/13/2023 12:11:00 AM         TRAFFIC SIGNAL             CLEAR   
2  12/09/2021 10:30:00 AM         TRAFFIC SIGNAL             CLEAR   
3  08/09/2023 07:55:00 PM         TRAFFIC SIGNAL             CLEAR   
4  08/19/2023 02:55:00 PM         TRAFFIC SIGNAL             CLEAR   

       lighting_condition first_crash_type trafficway_type  \
0                DAYLIGHT          TURNING     NOT DIVIDED   
1  DARKNESS, LIGHTED ROAD          TURNING        FOUR WAY   
2                DAYLIGHT         REAR END  T-INTERSECTION   
3                DAYLIGHT            ANGLE        FOUR WAY   
4                DAYLIGHT         REAR END  T-INTERSECTION   

            alignment roadway_surface_cond road_defect  \
0  STRAIGHT AND LEVEL              UNKNOWN     UNKNOWN   
1  STRAIGHT AND LEVEL                  DRY  NO DEFECTS   
2  STRAIGHT AND LEVEL           

In [18]:
# Define features and target
feature_cols = ['traffic_control_device', 'weather_condition', 'lighting_condition', 'roadway_surface_cond']
X = df[feature_cols]  # Independent variables
y = df['first_crash_type']  # Dependent variable (target)

# Fill missing values by carrying the previous row's value forward.
# `.ffill()` is the supported spelling; `fillna(method='ffill')` is deprecated
# in recent pandas releases and emits a FutureWarning on every call.
# NOTE: forward-fill cannot fill a gap in the very first row(s), because there
# is no earlier value to copy from.
X = X.ffill()
y = y.ffill()


  X = X.fillna(method='ffill')
  y = y.fillna(method='ffill')


In [19]:
# One-hot encode the categorical feature columns; drop_first=True omits the
# first level of each feature so the indicator columns are not redundant
X = pd.get_dummies(data=X, drop_first=True)

# Show the first rows of the encoded feature matrix
encoded_preview = X.head()
print(encoded_preview)


   traffic_control_device_DELINEATORS  \
0                               False   
1                               False   
2                               False   
3                               False   
4                               False   

   traffic_control_device_FLASHING CONTROL SIGNAL  \
0                                           False   
1                                           False   
2                                           False   
3                                           False   
4                                           False   

   traffic_control_device_LANE USE MARKING  \
0                                    False   
1                                    False   
2                                    False   
3                                    False   
4                                    False   

   traffic_control_device_NO CONTROLS  traffic_control_device_NO PASSING  \
0                               False                              False   
1    

In [20]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffle (and therefore the split) the same on every run
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the (rows, columns) shape of each feature partition
print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")


Training set: (167444, 40), Testing set: (41862, 40)


In [23]:
# Create the logistic-regression classifier (iteration cap raised to 1000)
# and train it on the training split; fit() returns the fitted estimator
classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Produce class predictions for the held-out test rows
y_pred = classifier.predict(X_test)


In [22]:
# Confusion Matrix (rows = true class, columns = predicted class)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Accuracy Score
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

# Classification Report
# Several classes are never predicted by the model, so their precision is
# undefined (0/0). zero_division=0 reports those entries as 0.0 — the same
# value the default produces — without emitting an UndefinedMetricWarning
# for every affected metric.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))


Confusion Matrix:
[[ 4811     0     1     0     0     0     0     0     0     1   257     0
      0     0     0     1  5347]
 [    5     0     0     0     0     0     0     0     0     1     0     0
      0     0     0     0     5]
 [  175     0     3     0     0     0     0     0     0     0    54     0
      0     0     0     0   753]
 [   57     0     0     0     0     0     0     0     0     0    10     0
      0     0     0     0   271]
 [    7     0     1     0     0     0     0     0     0     0     2     0
      0     0     0     0    44]
 [   32     0     1     0     0     0     0     0     0     0     6     0
      0     0     0     0   117]
 [    3     0     0     0     0     0     0     0     0     0     1     0
      0     0     0     0    16]
 [  287     0     2     0     0     0     0     0     0     0    30     0
      0     0     0     0   627]
 [  321     0     0     0     0     0     0     0     0     5    15     0
      0     0     0     0   747]
 [  327     0     0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                              precision    recall  f1-score   support

                       ANGLE       0.48      0.46      0.47     10418
                      ANIMAL       0.00      0.00      0.00        11
                FIXED OBJECT       0.30      0.00      0.01       985
                     HEAD ON       0.00      0.00      0.00       338
          OTHER NONCOLLISION       0.00      0.00      0.00        54
                OTHER OBJECT       0.00      0.00      0.00       156
                  OVERTURNED       0.00      0.00      0.00        20
        PARKED MOTOR VEHICLE       0.00      0.00      0.00       946
                PEDALCYCLIST       0.00      0.00      0.00      1088
                  PEDESTRIAN       0.61      0.02      0.04      1744
                    REAR END       0.32      0.06      0.10      8464
               REAR TO FRONT       0.00      0.00      0.00       234
                REAR TO REAR       0.00      0.00      0.00         6
                REA

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
