In [70]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   -- ------------------------------------- 0.8/11.1 MB 3.7 MB/s eta 0:00:03
   ----- ---------------------------------- 1.6/11.1 MB 4.0 MB/s eta 0:00:03
   ------- -------------------------------- 2.1/11.1 MB 3.4 MB/s eta 0:00:03
   ----------- ---------------------------- 3.1/11.1 MB 4.0 MB/s eta 0:00:02
   --------------- ------------------------ 4.2/11.1 MB 4.1 MB/s eta 0:00:02
   ---------------- -----------------

In [71]:
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder


accident_df = pd.read_csv('../accident.csv')
#vehicle_df = pd.read_csv('../vehicle.csv')
filtered_vehicle_df = pd.read_csv('../filtered_vehicle.csv')
person_df = pd.read_csv('../person.csv')

merged_df = pd.merge(accident_df, filtered_vehicle_df, on='ACCIDENT_NO', how='inner')
merged_df = pd.merge(merged_df, person_df, on='ACCIDENT_NO', how='inner')

Lets first look at the dimension of the data

In [72]:
merged_df.head(1)

Unnamed: 0,ACCIDENT_NO,ACCIDENT_DATE,ACCIDENT_TIME,ACCIDENT_TYPE,ACCIDENT_TYPE_DESC,DAY_OF_WEEK,DAY_WEEK_DESC,DCA_CODE,DCA_DESC,LIGHT_CONDITION,...,AGE_GROUP,INJ_LEVEL,INJ_LEVEL_DESC,SEATING_POSITION,HELMET_BELT_WORN,ROAD_USER_TYPE,ROAD_USER_TYPE_DESC,LICENCE_STATE,TAKEN_HOSPITAL,EJECTED_CODE
0,T20120000046,2012-01-01,16:25:00,2,Struck Pedestrian,1,Sunday,102,FAR SIDE. PED HIT BY VEHICLE FROM THE LEFT,1,...,40-49,3,Other injury,,8.0,1,Pedestrians,V,Y,0.0


72 features

In [73]:
with pd.option_context('display.max_rows', None):
    print(merged_df.isnull().sum().sort_values(ascending=False))


VEHICLE_POWER             728905
CUBIC_CAPACITY            665709
VEHICLE_WEIGHT            617479
CARRY_CAPACITY            617474
TAKEN_HOSPITAL            508074
LICENCE_STATE             175374
EJECTED_CODE               53965
RMA                        24889
SEATING_POSITION           14855
VEHICLE_ID_y               14837
VEHICLE_MODEL               5317
VEHICLE_DCA_CODE            2605
CONSTRUCTION_TYPE           1486
INITIAL_IMPACT               413
VEHICLE_BODY_STYLE           151
SEX                           52
TRAILER_TYPE                  29
VEHICLE_YEAR_MANUF            14
VEHICLE_COLOUR_2               5
VEHICLE_MOVEMENT               4
HELMET_BELT_WORN               2
ACCIDENT_TIME                  0
ACCIDENT_DATE                  0
ACCIDENT_NO                    0
DCA_CODE                       0
DAY_WEEK_DESC                  0
DAY_OF_WEEK                    0
ACCIDENT_TYPE_DESC             0
ACCIDENT_TYPE                  0
VEHICLE_ID_x                   0
SEVERITY  

20 features has N/A values

In [74]:
with pd.option_context('display.max_rows', None):
    print(merged_df.nunique())

ACCIDENT_NO               153997
ACCIDENT_DATE               4565
ACCIDENT_TIME               1439
ACCIDENT_TYPE                  9
ACCIDENT_TYPE_DESC             9
DAY_OF_WEEK                    8
DAY_WEEK_DESC                  7
DCA_CODE                      81
DCA_DESC                      81
LIGHT_CONDITION                7
NODE_ID                   109271
NO_OF_VEHICLES                17
NO_PERSONS_KILLED              6
NO_PERSONS_INJ_2              13
NO_PERSONS_INJ_3              23
NO_PERSONS_NOT_INJ            39
NO_PERSONS                    45
POLICE_ATTEND                  3
ROAD_GEOMETRY                  9
ROAD_GEOMETRY_DESC             9
SEVERITY                       4
SPEED_ZONE                    13
RMA                            5
VEHICLE_ID_x                  21
VEHICLE_YEAR_MANUF            64
VEHICLE_DCA_CODE               4
INITIAL_DIRECTION              9
ROAD_SURFACE_TYPE              4
ROAD_SURFACE_TYPE_DESC         4
REG_STATE                      7
VEHICLE_BO

it seems that there are some features that worth using one-hot to catagorise

first, there are some duplicated data that pairs with their description. to one hot these data, i will keep the desc features and drop others, since they represents the same thing. 

In [75]:
def drop_encoded_keep_desc(df):
    cols = df.columns
    cols_to_drop = []

    for col in cols:
        if col.endswith('_DESC'):
            prefix = col[:-5]  # 去掉 "_DESC"
            if prefix in cols:
                cols_to_drop.append(prefix)

    df_cleaned = df.drop(columns=cols_to_drop)
    return df_cleaned, cols_to_drop


In [76]:
desc_cleaned_merged_df, dropped_columns = drop_encoded_keep_desc(merged_df)
print("Dropped columns:", dropped_columns)

Dropped columns: ['ACCIDENT_TYPE', 'ROAD_GEOMETRY', 'ROAD_SURFACE_TYPE', 'VEHICLE_TYPE', 'TRAFFIC_CONTROL', 'INJ_LEVEL', 'ROAD_USER_TYPE']


In [77]:
def one_hot_encode_expand(df, columns_to_encode, drop_first=True):
    df_encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=drop_first)
    return df_encoded

In [81]:
def one_hot_encode_vectorise(df, columns):
    df_new = df.drop(columns=columns).copy()

    for col in columns:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded = encoder.fit_transform(df[[col]])
        # save as list 
        encoded_vectors = pd.Series(list(encoded), index=df.index)
        # name the new features as col + '_vec'
        df_new[col + '_vec'] = encoded_vectors
    return df_new


In [82]:
one_hot_columns = [
    'ACCIDENT_TYPE_DESC',
    'DAY_WEEK_DESC',
    'LIGHT_CONDITION',
    'ROAD_GEOMETRY_DESC',
    'SEVERITY',
    'SPEED_ZONE',
    'RMA',
    'VEHICLE_MAKE',
    'VEHICLE_BODY_STYLE',
    'VEHICLE_TYPE_DESC',
    'REG_STATE',
    'FUEL_TYPE',
    'SEX',
    'AGE_GROUP',
    'INJ_LEVEL_DESC',
    'SEATING_POSITION',
    'HELMET_BELT_WORN',
    'ROAD_USER_TYPE_DESC',
    'LICENCE_STATE',
    'TRAFFIC_CONTROL_DESC'
]

In [None]:
# each one hot one hot category is encoded as a new feature, as result, there is 518 features
merged_df_onehot_expand_df = one_hot_encode_expand(desc_cleaned_merged_df, one_hot_columns)

# vectorised one hot df, still 72 features, ready for neuron networking
merged_df_onehot_vectorized_df = one_hot_encode_vectorise(merged_df, one_hot_columns)


In [87]:
merged_df_onehot_vectorized_df.head(1)

Unnamed: 0,ACCIDENT_NO,ACCIDENT_DATE,ACCIDENT_TIME,ACCIDENT_TYPE,DAY_OF_WEEK,DCA_CODE,DCA_DESC,NODE_ID,NO_OF_VEHICLES,NO_PERSONS_KILLED,...,REG_STATE_vec,FUEL_TYPE_vec,SEX_vec,AGE_GROUP_vec,INJ_LEVEL_DESC_vec,SEATING_POSITION_vec,HELMET_BELT_WORN_vec,ROAD_USER_TYPE_DESC_vec,LICENCE_STATE_vec,TRAFFIC_CONTROL_DESC_vec
0,T20120000046,2012-01-01,16:25:00,2,1,102,FAR SIDE. PED HIT BY VEHICLE FROM THE LEFT,47545,1,0,...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
