In [496]:
!pip install scikit-learn



In [497]:
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


accident_df = pd.read_csv('../accident.csv')
#vehicle_df = pd.read_csv('../vehicle.csv')
filtered_vehicle_df = pd.read_csv('../filtered_vehicle.csv')
person_df = pd.read_csv('../person.csv')

# merge only on 'ACCIDENT_NO' will cause a huge duplication in data, that leads to 728,905 rows of data, which by checking
# 345,184 rows are actual duplication rows, that it  47.4%.

#merged_df = pd.merge(accident_df, filtered_vehicle_df, on='ACCIDENT_NO', how='inner')
#merged_df = pd.merge(merged_df, person_df, on='ACCIDENT_NO', how='inner')

# new merge method merges on both 'ACCIDENT_NO' and 'VEHICLE_ID'
vp = pd.merge(person_df, filtered_vehicle_df, on=['ACCIDENT_NO', 'VEHICLE_ID'], how='inner')
merged_df = pd.merge(vp, accident_df, on='ACCIDENT_NO', how='left')



Lets first look at the dimension of the data

In [498]:
# show some examples of each feature

for col in merged_df.columns:
    print(f"{col}: {merged_df[col].dropna().unique()[:5]}")


ACCIDENT_NO: ['T20250000606' 'T20240012414' 'T20240001586' 'T20200004467'
 'T20190018772']
PERSON_ID: ['A' '01' 'B' '02' '04']
VEHICLE_ID: ['A' 'B' 'C' 'D' 'E']
SEX: ['M' 'F' 'U']
AGE_GROUP: ['40-49' '26-29' '18-21' '16-17' '22-25']
INJ_LEVEL: [3 4 2 1]
INJ_LEVEL_DESC: ['Other injury' 'Not injured' 'Serious injury' 'Fatality']
SEATING_POSITION: ['D' 'LF' 'OR' 'NK' 'RR']
HELMET_BELT_WORN: [9. 1. 8. 2. 3.]
ROAD_USER_TYPE: [2 3 9 7 1]
ROAD_USER_TYPE_DESC: ['Drivers' 'Passengers' 'Not Known' 'Pedestrians' 'Motorcyclists']
LICENCE_STATE: ['V' 'Z' 'O' 'W' 'D']
TAKEN_HOSPITAL: ['N' 'Y']
EJECTED_CODE: [0. 9. 1. 3. 2.]
VEHICLE_YEAR_MANUF: [2016. 2008. 2012. 2009. 1994.]
VEHICLE_DCA_CODE: [1. 2. 8. 3.]
INITIAL_DIRECTION: ['N' 'SW' 'E' 'S' 'NE']
ROAD_SURFACE_TYPE: [1. 9. 3. 2.]
ROAD_SURFACE_TYPE_DESC: ['Paved' 'Not known' 'Gravel' 'Unpaved']
REG_STATE: ['V' 'S' 'T' 'W' 'N']
VEHICLE_BODY_STYLE: ['SEDAN' 'WAGON' 'S WAG' 'UTIL' 'DC UTE']
VEHICLE_MAKE: ['TOYOTA' 'M MOVE' 'HOLDEN' 'HONDA' 'MAZDA']
VEH

In [499]:
# number of features
merged_df.shape[1]

71

In [500]:
# number of rows 
len(merged_df)

331993

In [501]:
# check if there is any null value
with pd.option_context('display.max_rows', None):
    print(merged_df.isnull().sum().sort_values(ascending=False))


VEHICLE_POWER             331993
CUBIC_CAPACITY            302305
VEHICLE_WEIGHT            282216
CARRY_CAPACITY            282211
TAKEN_HOSPITAL            232911
LICENCE_STATE              75363
EJECTED_CODE               23523
RMA                        12720
VEHICLE_MODEL               2591
VEHICLE_DCA_CODE             706
CONSTRUCTION_TYPE            669
INITIAL_IMPACT               120
VEHICLE_BODY_STYLE            59
SEATING_POSITION              26
SEX                           22
TRAILER_TYPE                  16
VEHICLE_YEAR_MANUF             8
VEHICLE_COLOUR_2               2
HELMET_BELT_WORN               1
VEHICLE_MOVEMENT               1
PERSON_ID                      0
ACCIDENT_NO                    0
VEHICLE_ID                     0
INJ_LEVEL_DESC                 0
INJ_LEVEL                      0
AGE_GROUP                      0
ROAD_SURFACE_TYPE_DESC         0
VEHICLE_MAKE                   0
REG_STATE                      0
ROAD_SURFACE_TYPE              0
VEHICLE_TY

continue process the data, check if there is any uneccessary features

In [502]:
# these features are considered to be useless with respect with training models
drop_feature = ['ACCIDENT_NO',
                'NODE_ID', 
                'PERSON_ID',
                'VEHICLE_ID',
                'ACCIDENT_TIME',
                'VEHICLE_MODEL',
                'VEHICLE_MAKE',
                'VEHICLE_BODY_STYLE',
                'DCA_CODE'
                ]
merged_df = merged_df.drop(columns=drop_feature)


In [503]:
# lets see if theres some features that is categorical
with pd.option_context('display.max_rows', None):
    print(merged_df.nunique())

SEX                          3
AGE_GROUP                   14
INJ_LEVEL                    4
INJ_LEVEL_DESC               4
SEATING_POSITION             9
HELMET_BELT_WORN             9
ROAD_USER_TYPE               8
ROAD_USER_TYPE_DESC          6
LICENCE_STATE               11
TAKEN_HOSPITAL               2
EJECTED_CODE                 5
VEHICLE_YEAR_MANUF          64
VEHICLE_DCA_CODE             4
INITIAL_DIRECTION            9
ROAD_SURFACE_TYPE            4
ROAD_SURFACE_TYPE_DESC       4
REG_STATE                    7
VEHICLE_POWER                0
VEHICLE_TYPE                22
VEHICLE_TYPE_DESC           22
VEHICLE_WEIGHT            1208
CONSTRUCTION_TYPE            3
FUEL_TYPE                    8
NO_OF_WHEELS                10
NO_OF_CYLINDERS             24
SEATING_CAPACITY            61
TARE_WEIGHT               3676
TOTAL_NO_OCCUPANTS          40
CARRY_CAPACITY            3201
CUBIC_CAPACITY              99
FINAL_DIRECTION              9
DRIVER_INTENT               20
VEHICLE_

it seems that there are some features that worth using one-hot to catagorise

first, there are some duplicated data that pairs with their description. to one hot these data, i will drop the desc features and keep others, since they represents the same thing. 

In [504]:
def drop_and_report_desc_fields(df):
    desc_cols = [col for col in df.columns if col.endswith('_DESC')]
    for col in desc_cols:
        print(f" - {col}")
    return df.drop(columns=desc_cols)

In [505]:
merged_df = drop_and_report_desc_fields(merged_df)

 - INJ_LEVEL_DESC
 - ROAD_USER_TYPE_DESC
 - ROAD_SURFACE_TYPE_DESC
 - VEHICLE_TYPE_DESC
 - TRAFFIC_CONTROL_DESC
 - ACCIDENT_TYPE_DESC
 - DAY_WEEK_DESC
 - DCA_DESC
 - ROAD_GEOMETRY_DESC


In [506]:
# checking the new demsion of the cleaned df
for col in merged_df.columns:
    print(f"{col}: {merged_df[col].dropna().unique()[:5]}")


SEX: ['M' 'F' 'U']
AGE_GROUP: ['40-49' '26-29' '18-21' '16-17' '22-25']
INJ_LEVEL: [3 4 2 1]
SEATING_POSITION: ['D' 'LF' 'OR' 'NK' 'RR']
HELMET_BELT_WORN: [9. 1. 8. 2. 3.]
ROAD_USER_TYPE: [2 3 9 7 1]
LICENCE_STATE: ['V' 'Z' 'O' 'W' 'D']
TAKEN_HOSPITAL: ['N' 'Y']
EJECTED_CODE: [0. 9. 1. 3. 2.]
VEHICLE_YEAR_MANUF: [2016. 2008. 2012. 2009. 1994.]
VEHICLE_DCA_CODE: [1. 2. 8. 3.]
INITIAL_DIRECTION: ['N' 'SW' 'E' 'S' 'NE']
ROAD_SURFACE_TYPE: [1. 9. 3. 2.]
REG_STATE: ['V' 'S' 'T' 'W' 'N']
VEHICLE_POWER: []
VEHICLE_TYPE: [ 1  2  4 71 61]
VEHICLE_WEIGHT: [ 2100.  1805. 26000.  2805.  2800.]
CONSTRUCTION_TYPE: ['R' 'P' 'A']
FUEL_TYPE: ['M' 'P' 'D' 'G' 'E']
NO_OF_WHEELS: [ 4.  6.  8. 12. 10.]
NO_OF_CYLINDERS: [4. 6. 8. 5. 3.]
SEATING_CAPACITY: [ 5.  7.  2.  3. 42.]
TARE_WEIGHT: [1570. 1145. 1805. 1875. 1840.]
TOTAL_NO_OCCUPANTS: [1. 2. 5. 3. 4.]
CARRY_CAPACITY: [  530.   560. 15800.   985.  1305.]
CUBIC_CAPACITY: [1900. 2500. 3500. 4000. 3600.]
FINAL_DIRECTION: ['E' 'SW' 'S' 'NE' 'W']
DRIVER_INTE

Create a new feature, vehicle age = accident date - year of manufacture

In [507]:
merged_df["ACCIDENT_DATE"] = pd.to_datetime(merged_df["ACCIDENT_DATE"], errors='coerce')

merged_df["ACCIDENT_YEAR"] = merged_df["ACCIDENT_DATE"].dt.year

merged_df["VEHICLE_AGE"] = merged_df["ACCIDENT_YEAR"] - merged_df["VEHICLE_YEAR_MANUF"]

In [508]:
drop_feature = ['VEHICLE_YEAR_MANUF',
                'ACCIDENT_DATE',
                'ACCIDENT_YEAR'
                ]
cleaned_df = merged_df.drop(columns=drop_feature)

In [509]:
cleaned_df.shape[1]

52

In [510]:
len(cleaned_df)

331993

before one hot, i will keep a version of df for light GBM,

In [511]:
cleaned_df.to_csv('../merged_cleaned.csv', index=False)

In [512]:
'''
def one_hot_encode_expand(df, columns_to_encode, drop_first=True):
    df_encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=drop_first)
    return df_encoded
'''
# not suitable to expand the one hot encode, it will cause dimension expansion

'\ndef one_hot_encode_expand(df, columns_to_encode, drop_first=True):\n    df_encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=drop_first)\n    return df_encoded\n'

In [513]:
def one_hot_encode_vectorise(df, columns):
    df_new = df.drop(columns=columns).copy()

    for col in columns:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded = encoder.fit_transform(df[[col]])
        # save as list 
        encoded_vectors = pd.Series(list(encoded), index=df.index)
        # name the new features as col + '_vec'
        df_new[col + '_vec'] = encoded_vectors
    return df_new


In [514]:
# lets see if theres some features that is categorical
with pd.option_context('display.max_rows', None):
    print(cleaned_df.nunique())

SEX                      3
AGE_GROUP               14
INJ_LEVEL                4
SEATING_POSITION         9
HELMET_BELT_WORN         9
ROAD_USER_TYPE           8
LICENCE_STATE           11
TAKEN_HOSPITAL           2
EJECTED_CODE             5
VEHICLE_DCA_CODE         4
INITIAL_DIRECTION        9
ROAD_SURFACE_TYPE        4
REG_STATE                7
VEHICLE_POWER            0
VEHICLE_TYPE            22
VEHICLE_WEIGHT        1208
CONSTRUCTION_TYPE        3
FUEL_TYPE                8
NO_OF_WHEELS            10
NO_OF_CYLINDERS         24
SEATING_CAPACITY        61
TARE_WEIGHT           3676
TOTAL_NO_OCCUPANTS      40
CARRY_CAPACITY        3201
CUBIC_CAPACITY          99
FINAL_DIRECTION          9
DRIVER_INTENT           20
VEHICLE_MOVEMENT        20
TRAILER_TYPE            12
VEHICLE_COLOUR_1        18
VEHICLE_COLOUR_2        17
CAUGHT_FIRE              4
INITIAL_IMPACT          17
LAMPS                    4
LEVEL_OF_DAMAGE          7
TOWED_AWAY_FLAG          3
TRAFFIC_CONTROL         17
A

In [515]:
one_hot_columns = [
    'SEX', 'AGE_GROUP', 'INJ_LEVEL', 'SEATING_POSITION', 'HELMET_BELT_WORN',
    'ROAD_USER_TYPE', 'LICENCE_STATE', 'TAKEN_HOSPITAL', 'EJECTED_CODE',
    'VEHICLE_DCA_CODE', 'INITIAL_DIRECTION', 'ROAD_SURFACE_TYPE',
    'REG_STATE', 'VEHICLE_TYPE', 'CONSTRUCTION_TYPE',
    'FUEL_TYPE', 'FINAL_DIRECTION', 'TRAILER_TYPE', 'VEHICLE_COLOUR_1',
    'VEHICLE_COLOUR_2', 'INITIAL_IMPACT', 'LEVEL_OF_DAMAGE', 'TOWED_AWAY_FLAG',
    'TRAFFIC_CONTROL', 'ACCIDENT_TYPE', 'DAY_OF_WEEK',
    'LIGHT_CONDITION', 'POLICE_ATTEND', 'ROAD_GEOMETRY', 'RMA'
]


In [517]:
# each one hot one hot category is encoded as a new feature, as result, there is 518 features
# merged_df_onehot_expand_df = one_hot_encode_expand(desc_cleaned_merged_df, one_hot_columns)

# vectorised one hot df, still 56 features, ready for neuron networking
merged_onehot_df = one_hot_encode_vectorise(cleaned_df, one_hot_columns)


MemoryError: Unable to allocate 35.5 MiB for an array with shape (14, 331993) and data type float64

In [None]:
len(merged_onehot_df)

331993

Now, since MLP is very sensitive to N/A values and range of the data, i will fill N/A and normalize the data.

In [None]:
def missing_value_summary(df):
    total = df.isnull().sum()
    percent = (total / len(df)) * 100
    missing_df = pd.DataFrame({'Missing Count': total, 'Missing Percentage': percent})
    missing_df = missing_df[missing_df['Missing Count'] > 0]
    missing_df = missing_df.sort_values(by='Missing Percentage', ascending=False)
    return missing_df


In [None]:
missing_report = missing_value_summary(merged_onehot_df)
print(missing_report)


                    Missing Count  Missing Percentage
VEHICLE_POWER              331993          100.000000
CUBIC_CAPACITY             302305           91.057643
VEHICLE_WEIGHT             282216           85.006612
CARRY_CAPACITY             282211           85.005106
VEHICLE_BODY_STYLE             59            0.017771
VEHICLE_AGE                     8            0.002410
VEHICLE_MOVEMENT                1            0.000301


wow, a very high percentage of some of theses data are missing, then it is necessary to drop them

In [None]:
merged_onehot_df = merged_onehot_df.drop(columns=["VEHICLE_POWER", "CUBIC_CAPACITY", "VEHICLE_WEIGHT", "CARRY_CAPACITY"])

for the other 2, lemme just fill in median


In [None]:
for col in ["VEHICLE_MOVEMENT"]:
    median_val = merged_onehot_df[col].median()
    merged_onehot_df[col] = merged_onehot_df[col].fillna(median_val)


In [None]:
# 1. 找出所有 one-hot 列（以 "_vec" 结尾）
one_hot_columns = [col for col in merged_onehot_df.columns if col.endswith('_vec')]

# 2. 排除 one-hot 和标签列，剩下的都是要归一化的连续数值列
non_onehot_cols = [col for col in merged_onehot_df.columns if col not in one_hot_columns + ['SEVERITY']]

len(non_onehot_cols)

19

In [None]:
len(merged_onehot_df)

331993

In [None]:
scaler = StandardScaler()
merged_onehot_df[non_onehot_cols] = scaler.fit_transform(merged_onehot_df[non_onehot_cols])

ValueError: could not convert string to float: 'SEDAN'

In [461]:
merged_onehot_df.to_csv('../merged_onehot.csv', index=False)

KeyboardInterrupt: 