In [1]:
# IMPORT Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
from tensorflow.keras.utils import to_categorical
import numpy as np

In [2]:
# Import the Australian Road Death Dataset
ardd_data = pd.read_csv('../Resources/ardd_fatal_crashes.csv')
ardd_data

  ardd_data = pd.read_csv('../Resources/ardd_fatal_crashes.csv')


Unnamed: 0,Crash ID,State,Month,Year,Dayweek,Time,Crash Type,Number Fatalities,Bus \nInvolvement,Heavy Rigid Truck Involvement,Articulated Truck Involvement,Speed Limit,National Remoteness Areas,SA4 Name 2016,National LGA Name 2017,National Road Type,Christmas Period,Easter Period,Day of week,Time of Day
0,20221049,NSW,3,2022,Tuesday,06:50,Single,1,No,Yes,No,90,Inner Regional Australia,Capital Region,Eurobodalla,Arterial Road,No,No,Weekday,Day
1,20222018,Vic,3,2022,Friday,13:05,Multiple,1,-9,-9,-9,-9,Major Cities of Australia,Melbourne - Inner,Port Phillip (C),Local Road,No,No,Weekday,Day
2,20224006,SA,3,2022,Friday,09:30,Multiple,1,No,No,No,80,Outer Regional Australia,South Australia - Outback,Port Augusta (C),National or State Highway,No,No,Weekday,Day
3,20225008,WA,3,2022,Saturday,06:00,Single,1,No,No,No,110,,,,,No,No,Weekend,Day
4,20226010,Tas,3,2022,Tuesday,11:47,Single,1,No,No,No,100,Outer Regional Australia,Launceston and North East,Break O'Day (M),National or State Highway,No,No,Weekday,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48116,19892204,Vic,1,1989,Friday,13:20,Single,1,No,No,No,60,,,,,No,No,Weekday,Day
48117,19893326,Qld,1,1989,Monday,09:00,Multiple,1,No,-9,No,60,,,,,No,No,Weekday,Day
48118,19892576,Vic,1,1989,Friday,17:15,Multiple,1,No,No,No,60,,,,,No,No,Weekday,Day
48119,19891122,NSW,1,1989,Tuesday,14:10,Multiple,1,No,-9,No,60,,,,,No,No,Weekday,Day


In [3]:
#check the shape of the original CSV
ardd_data.shape

(48121, 20)

In [4]:
# according to the documentation, -9 would mean "Unknown", however after data exploration those data will be of no use for our ML model
ardd_data = ardd_data.replace("-9", np.nan)

In [5]:
#drop rows containing null values
cleaned_ardd_data = ardd_data.dropna()

In [6]:
#check shape 
cleaned_ardd_data.shape

(7122, 20)

In [7]:
cleaned_ardd_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7122 entries, 0 to 8954
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Crash ID                       7122 non-null   int64 
 1   State                          7122 non-null   object
 2   Month                          7122 non-null   int64 
 3   Year                           7122 non-null   int64 
 4   Dayweek                        7122 non-null   object
 5   Time                           7122 non-null   object
 6   Crash Type                     7122 non-null   object
 7   Number Fatalities              7122 non-null   int64 
 8   Bus 
Involvement               7122 non-null   object
 9   Heavy Rigid Truck Involvement  7122 non-null   object
 10  Articulated Truck Involvement  7122 non-null   object
 11  Speed Limit                    7122 non-null   object
 12  National Remoteness Areas      7122 non-null   object
 13  SA4

In [8]:
cleaned_ardd_data1 = cleaned_ardd_data.filter(['State','Month','Year','Crash Type','Bus \nInvolvement','Heavy Rigid Truck Involvement','Articulated Truck Involvement','Speed Limit','National Remoteness Areas','National Road Type','Christmas Period','Easter Period','Day of week'])
cleaned_ardd_data1.head(20)

Unnamed: 0,State,Month,Year,Crash Type,Bus \nInvolvement,Heavy Rigid Truck Involvement,Articulated Truck Involvement,Speed Limit,National Remoteness Areas,National Road Type,Christmas Period,Easter Period,Day of week
0,NSW,3,2022,Single,No,Yes,No,90,Inner Regional Australia,Arterial Road,No,No,Weekday
2,SA,3,2022,Multiple,No,No,No,80,Outer Regional Australia,National or State Highway,No,No,Weekday
4,Tas,3,2022,Single,No,No,No,100,Outer Regional Australia,National or State Highway,No,No,Weekday
5,NSW,3,2022,Multiple,No,No,No,70,Major Cities of Australia,National or State Highway,No,No,Weekend
6,NSW,3,2022,Single,No,No,No,60,Major Cities of Australia,Arterial Road,No,No,Weekend
9,Qld,3,2022,Single,No,No,No,100,Inner Regional Australia,Local Road,No,No,Weekday
10,Qld,3,2022,Single,No,No,No,60,Major Cities of Australia,Local Road,No,No,Weekday
11,NSW,3,2022,Single,No,No,No,70,Inner Regional Australia,Arterial Road,No,No,Weekday
12,NSW,3,2022,Multiple,No,No,No,110,Inner Regional Australia,National or State Highway,No,No,Weekday
13,Qld,3,2022,Single,No,No,No,50,Outer Regional Australia,Local Road,No,No,Weekday


In [9]:
cleaned_ardd_data2 = cleaned_ardd_data.filter(['State','Month','Year','Crash Type','Speed Limit','National Remoteness Areas','National Road Type','Christmas Period','Easter Period','Day of week'])
cleaned_ardd_data2.head(20)

Unnamed: 0,State,Month,Year,Crash Type,Speed Limit,National Remoteness Areas,National Road Type,Christmas Period,Easter Period,Day of week
0,NSW,3,2022,Single,90,Inner Regional Australia,Arterial Road,No,No,Weekday
2,SA,3,2022,Multiple,80,Outer Regional Australia,National or State Highway,No,No,Weekday
4,Tas,3,2022,Single,100,Outer Regional Australia,National or State Highway,No,No,Weekday
5,NSW,3,2022,Multiple,70,Major Cities of Australia,National or State Highway,No,No,Weekend
6,NSW,3,2022,Single,60,Major Cities of Australia,Arterial Road,No,No,Weekend
9,Qld,3,2022,Single,100,Inner Regional Australia,Local Road,No,No,Weekday
10,Qld,3,2022,Single,60,Major Cities of Australia,Local Road,No,No,Weekday
11,NSW,3,2022,Single,70,Inner Regional Australia,Arterial Road,No,No,Weekday
12,NSW,3,2022,Multiple,110,Inner Regional Australia,National or State Highway,No,No,Weekday
13,Qld,3,2022,Single,50,Outer Regional Australia,Local Road,No,No,Weekday


In [10]:
cleaned_ardd_data = cleaned_ardd_data.rename(columns={'Crash Type': 'crash_type','Bus \nInvolvement': 'bus_involvement','Heavy Rigid Truck Involvement': 'heavy_rigid_truck_involvement','Articulated Truck Involvement': 'articulated_truck_involvement','Speed Limit': 'speed_limit','National Remoteness Areas': 'national_remoteness_area','National Road Type': 'national_road_type','Christmas Period': 'christmas_period','Easter Period': 'easter_period','Day of week': 'day_of_week'})

In [11]:
cleaned_ardd_data2 = cleaned_ardd_data2.rename(columns={'Crash Type': 'crash_type','Speed Limit': 'speed_limit','National Remoteness Areas': 'national_remoteness_area','National Road Type': 'national_road_type','Christmas Period': 'christmas_period','Easter Period': 'easter_period','Day of week': 'day_of_week'})
cleaned_ardd_data2

Unnamed: 0,State,Month,Year,crash_type,speed_limit,national_remoteness_area,national_road_type,christmas_period,easter_period,day_of_week
0,NSW,3,2022,Single,90,Inner Regional Australia,Arterial Road,No,No,Weekday
2,SA,3,2022,Multiple,80,Outer Regional Australia,National or State Highway,No,No,Weekday
4,Tas,3,2022,Single,100,Outer Regional Australia,National or State Highway,No,No,Weekday
5,NSW,3,2022,Multiple,70,Major Cities of Australia,National or State Highway,No,No,Weekend
6,NSW,3,2022,Single,60,Major Cities of Australia,Arterial Road,No,No,Weekend
...,...,...,...,...,...,...,...,...,...,...
8936,Tas,1,2014,Multiple,80,Inner Regional Australia,National or State Highway,No,No,Weekday
8944,WA,1,2014,Single,100,Major Cities of Australia,NATIONAL OR STATE HIGHWAY,Yes,No,Weekday
8948,SA,1,2014,Single,50,Major Cities of Australia,Local Road,No,No,Weekday
8950,SA,1,2014,Single,50,Major Cities of Australia,Local Road,No,No,Weekday


In [12]:
# Set features. 
X = cleaned_ardd_data.drop("crash_type", axis=1)
y = cleaned_ardd_data["crash_type"]
print(X.shape, y.shape)

(7122, 19) (7122,)


In [13]:
# Set features. 
X2 = cleaned_ardd_data2.drop("crash_type", axis=1)
y2 = cleaned_ardd_data2["crash_type"]
print(X2.shape, y2.shape)

(7122, 9) (7122,)


In [14]:
X

Unnamed: 0,Crash ID,State,Month,Year,Dayweek,Time,Number Fatalities,bus_involvement,heavy_rigid_truck_involvement,articulated_truck_involvement,speed_limit,national_remoteness_area,SA4 Name 2016,National LGA Name 2017,national_road_type,christmas_period,easter_period,day_of_week,Time of Day
0,20221049,NSW,3,2022,Tuesday,06:50,1,No,Yes,No,90,Inner Regional Australia,Capital Region,Eurobodalla,Arterial Road,No,No,Weekday,Day
2,20224006,SA,3,2022,Friday,09:30,1,No,No,No,80,Outer Regional Australia,South Australia - Outback,Port Augusta (C),National or State Highway,No,No,Weekday,Day
4,20226010,Tas,3,2022,Tuesday,11:47,1,No,No,No,100,Outer Regional Australia,Launceston and North East,Break O'Day (M),National or State Highway,No,No,Weekday,Day
5,20221025,NSW,3,2022,Sunday,11:10,1,No,No,No,70,Major Cities of Australia,Sydney - Sutherland,Sutherland,National or State Highway,No,No,Weekend,Day
6,20221045,NSW,3,2022,Sunday,15:25,1,No,No,No,60,Major Cities of Australia,Sydney - Inner West,Inner West,Arterial Road,No,No,Weekend,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8936,20146015,Tas,1,2014,Monday,12:10,1,No,No,No,80,Inner Regional Australia,Hobart,Kingborough (M),National or State Highway,No,No,Weekday,Day
8944,20145108,WA,1,2014,Wednesday,11:47,1,No,No,No,100,Major Cities of Australia,Perth - South East,Belmont (C),NATIONAL OR STATE HIGHWAY,Yes,No,Weekday,Day
8948,20144007,SA,1,2014,Tuesday,20:00,1,No,No,No,50,Major Cities of Australia,Adelaide - North,Playford (C),Local Road,No,No,Weekday,Night
8950,20144022,SA,1,2014,Monday,09:35,1,No,No,No,50,Major Cities of Australia,Adelaide - North,Tea Tree Gully (C),Local Road,No,No,Weekday,Day


In [15]:
y.unique()

array(['Single', 'Multiple'], dtype=object)

In [16]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()

transformed_X = onehotencoder.fit_transform(X[["State", "Month","Year", "bus_involvement","heavy_rigid_truck_involvement", "articulated_truck_involvement", "speed_limit", "national_remoteness_area", "national_road_type","christmas_period", "easter_period","day_of_week"]])
print(transformed_X)

  (0, 1)	1.0
  (0, 10)	1.0
  (0, 28)	1.0
  (0, 29)	1.0
  (0, 32)	1.0
  (0, 33)	1.0
  (0, 49)	1.0
  (0, 52)	1.0
  (0, 60)	1.0
  (0, 70)	1.0
  (0, 72)	1.0
  (0, 74)	1.0
  (1, 4)	1.0
  (1, 10)	1.0
  (1, 28)	1.0
  (1, 29)	1.0
  (1, 31)	1.0
  (1, 33)	1.0
  (1, 48)	1.0
  (1, 54)	1.0
  (1, 66)	1.0
  (1, 70)	1.0
  (1, 72)	1.0
  (1, 74)	1.0
  (2, 5)	1.0
  :	:
  (7119, 74)	1.0
  (7120, 4)	1.0
  (7120, 8)	1.0
  (7120, 20)	1.0
  (7120, 29)	1.0
  (7120, 31)	1.0
  (7120, 33)	1.0
  (7120, 44)	1.0
  (7120, 53)	1.0
  (7120, 64)	1.0
  (7120, 70)	1.0
  (7120, 72)	1.0
  (7120, 74)	1.0
  (7121, 7)	1.0
  (7121, 8)	1.0
  (7121, 20)	1.0
  (7121, 29)	1.0
  (7121, 31)	1.0
  (7121, 33)	1.0
  (7121, 48)	1.0
  (7121, 53)	1.0
  (7121, 58)	1.0
  (7121, 70)	1.0
  (7121, 72)	1.0
  (7121, 75)	1.0


In [17]:
transformed_X2 = onehotencoder.fit_transform(X2[["State", "Month","Year", "speed_limit", "national_remoteness_area", "national_road_type","christmas_period", "easter_period","day_of_week"]])
print(transformed_X2)

  (0, 1)	1.0
  (0, 10)	1.0
  (0, 28)	1.0
  (0, 43)	1.0
  (0, 46)	1.0
  (0, 54)	1.0
  (0, 64)	1.0
  (0, 66)	1.0
  (0, 68)	1.0
  (1, 4)	1.0
  (1, 10)	1.0
  (1, 28)	1.0
  (1, 42)	1.0
  (1, 48)	1.0
  (1, 60)	1.0
  (1, 64)	1.0
  (1, 66)	1.0
  (1, 68)	1.0
  (2, 5)	1.0
  (2, 10)	1.0
  (2, 28)	1.0
  (2, 30)	1.0
  (2, 48)	1.0
  (2, 60)	1.0
  (2, 64)	1.0
  :	:
  (7119, 20)	1.0
  (7119, 38)	1.0
  (7119, 47)	1.0
  (7119, 58)	1.0
  (7119, 64)	1.0
  (7119, 66)	1.0
  (7119, 68)	1.0
  (7120, 4)	1.0
  (7120, 8)	1.0
  (7120, 20)	1.0
  (7120, 38)	1.0
  (7120, 47)	1.0
  (7120, 58)	1.0
  (7120, 64)	1.0
  (7120, 66)	1.0
  (7120, 68)	1.0
  (7121, 7)	1.0
  (7121, 8)	1.0
  (7121, 20)	1.0
  (7121, 42)	1.0
  (7121, 47)	1.0
  (7121, 52)	1.0
  (7121, 64)	1.0
  (7121, 66)	1.0
  (7121, 69)	1.0


In [18]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)

In [19]:
label_encoder.fit(y2)
encoded_y2 = label_encoder.transform(y2)

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X, encoded_y, random_state=1)


In [22]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    transformed_X2, encoded_y2, random_state=1)


In [23]:
X_train

<5341x76 sparse matrix of type '<class 'numpy.float64'>'
	with 64092 stored elements in Compressed Sparse Row format>

In [24]:
y_train

array([0, 1, 1, ..., 1, 0, 0])

In [25]:
y_test

array([0, 1, 0, ..., 0, 0, 0])

In [26]:
X_test

<1781x76 sparse matrix of type '<class 'numpy.float64'>'
	with 21372 stored elements in Compressed Sparse Row format>

In [27]:
from sklearn import tree

In [28]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.9717281407976034

In [31]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train2, y_train2)
clf.score(X_train2, y_train2)

0.9574985957685826

In [29]:
clf = clf.fit(X_train, y_train)
clf.score(X_train, y_train)

0.9717281407976034

In [30]:
print(f"Training Data Score: {clf.score(X_train, y_train)}")
print(f"Testing Data Score: {clf.score(X_test, y_test)}")

Training Data Score: 0.9717281407976034
Testing Data Score: 0.6019090398652442


In [32]:
print(f"Training Data Score: {clf.score(X_train2, y_train2)}")
print(f"Testing Data Score: {clf.score(X_test2, y_test2)}")

Training Data Score: 0.9574985957685826
Testing Data Score: 0.5429533969679955


In [35]:
y.value_counts()

Single      4089
Multiple    3033
Name: crash_type, dtype: int64

In [36]:
clf.feature_importances_

array([0.00255965, 0.02933954, 0.00453226, 0.02153416, 0.01707256,
       0.00848199, 0.02914625, 0.02004237, 0.02600313, 0.02134031,
       0.03108317, 0.02739662, 0.02738458, 0.02605259, 0.02620477,
       0.0242366 , 0.02228395, 0.02719232, 0.02580141, 0.02337367,
       0.01105383, 0.03180715, 0.02837682, 0.02631093, 0.02468708,
       0.02733057, 0.0334047 , 0.02682318, 0.01198375, 0.00071518,
       0.02784843, 0.01202893, 0.00026532, 0.        , 0.        ,
       0.00011755, 0.00452029, 0.00042261, 0.01385216, 0.01811257,
       0.01652119, 0.        , 0.01807554, 0.00802356, 0.00037733,
       0.        , 0.01539469, 0.01974444, 0.01241838, 0.00520936,
       0.00648512, 0.00066246, 0.00499244, 0.00485283, 0.02246544,
       0.        , 0.01198908, 0.00383184, 0.01549063, 0.0035585 ,
       0.01601891, 0.        , 0.01771467, 0.00103304, 0.00432616,
       0.00812237, 0.00235165, 0.00486657, 0.01382256, 0.02092827])

In [40]:
top_df = pd.DataFrame({"column":X_train.columns, "importance":clf.feature_importances_}).sort_values(by="importance")

AttributeError: columns not found