In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import numpy as np

In [2]:
# Relative path to the cleaned person-level CSV that this notebook loads
cleaned_person_path = Path("../Cleaned_Datasets/person_cleaned.csv")

In [3]:
# Read the CSV file into memory.
# low_memory=False makes pandas parse the file in a single pass rather than in
# internal chunks, which avoids mixed-dtype inference within a column.
cleaned_person_data = pd.read_csv(cleaned_person_path, low_memory=False)

In [4]:
# Store the loaded data in a DataFrame and preview its first five rows.
# NOTE(review): pd.read_csv (called without chunksize/iterator) already returns
# a DataFrame, so this constructor call looks redundant — confirm before removing.
cleaned_person_data_df = pd.DataFrame(cleaned_person_data)
cleaned_person_data_df.head()

Unnamed: 0,ACCIDENT_NO,PERSON_ID,VEHICLE_ID,ACCIDENT_VEHICLE_ID,SEX,AGE,Age_Group,INJ_LEVEL,Inj_Level_Desc,SEATING_POSITION,HELMET_BELT_WORN,ROAD_USER_TYPE,Road_User_Type_Desc,LICENCE_STATE,PEDEST_MOVEMENT,POSTCODE,TAKEN_HOSPITAL,EJECTED_CODE
0,T20060000010,01,A,T20060000010A,F,,unknown,4,Not injured,LF,1,3,Passengers,,0,3130.0,,0
1,T20060000010,02,C,T20060000010C,M,43.0,40-49,4,Not injured,LF,1,3,Passengers,,0,7310.0,,0
2,T20060000010,03,C,T20060000010C,M,22.0,22-25,4,Not injured,LR,1,3,Passengers,,0,7310.0,,0
3,T20060000010,A,A,T20060000010A,M,72.0,70+,4,Not injured,D,1,2,Drivers,V,0,3130.0,,0
4,T20060000010,B,B,T20060000010B,F,62.0,60-64,3,Other injury,D,1,2,Drivers,V,0,,N,0


In [5]:
# List every column label of the loaded frame (kept in `column_names`)
column_names = cleaned_person_data_df.columns

print(column_names)

Index(['ACCIDENT_NO', 'PERSON_ID', 'VEHICLE_ID', 'ACCIDENT_VEHICLE_ID', 'SEX',
       'AGE', 'Age_Group', 'INJ_LEVEL', 'Inj_Level_Desc', 'SEATING_POSITION',
       'HELMET_BELT_WORN', 'ROAD_USER_TYPE', 'Road_User_Type_Desc',
       'LICENCE_STATE', 'PEDEST_MOVEMENT', 'POSTCODE', 'TAKEN_HOSPITAL',
       'EJECTED_CODE'],
      dtype='object')


In [6]:
# Columns that are removed before the age-group / injury-level summary
columns_to_drop = [
    "PERSON_ID", "VEHICLE_ID", "SEX", "AGE", "INJ_LEVEL",
    "SEATING_POSITION", "HELMET_BELT_WORN", "ROAD_USER_TYPE",
    "Road_User_Type_Desc", "LICENCE_STATE", "PEDEST_MOVEMENT",
    "POSTCODE", "TAKEN_HOSPITAL", "EJECTED_CODE",
]

# drop() hands back a new frame; the loaded frame itself is not modified
shorted_column_df = cleaned_person_data_df.drop(labels=columns_to_drop, axis="columns")
print(shorted_column_df)

         ACCIDENT_NO ACCIDENT_VEHICLE_ID Age_Group  Inj_Level_Desc
0       T20060000010       T20060000010A   unknown     Not injured
1       T20060000010       T20060000010C     40-49     Not injured
2       T20060000010       T20060000010C     22-25     Not injured
3       T20060000010       T20060000010A       70+     Not injured
4       T20060000010       T20060000010B     60-64    Other injury
...              ...                 ...       ...             ...
490943  T20200019247       T20200019247A     17-21  Serious injury
490944  T20200019250       T20200019250A     40-49     Not injured
490945  T20200019250       T20200019250B     50-59  Serious injury
490946  T20200019253       T20200019253A     64-69  Serious injury
490947  T20200019417       T20200019417A     17-21  Serious injury

[490948 rows x 4 columns]


In [7]:
# Inspect the dtype pandas assigned to each remaining column
shorted_column_df.dtypes

ACCIDENT_NO            object
ACCIDENT_VEHICLE_ID    object
Age_Group              object
Inj_Level_Desc         object
dtype: object

In [19]:
# Cast Age_Group to pandas' dedicated string extension dtype, then re-check dtypes
age_group_as_string = shorted_column_df['Age_Group'].astype(pd.StringDtype())
shorted_column_df['Age_Group'] = age_group_as_string
shorted_column_df.dtypes

ACCIDENT_NO            object
ACCIDENT_VEHICLE_ID    object
Age_Group              string
Inj_Level_Desc         object
Injury_Category        object
dtype: object

In [20]:
# Replace '5-Dec' with '5-12' in the 'Age_Group' column.
# NOTE(review): '5-Dec' is presumably the '5-12' bracket after a spreadsheet
# reinterpreted it as a date — confirm against the raw data.
# The replacement deliberately has no leading space, so a repaired label is
# spelled and sorted like every other 'low-high' bracket. regex=False makes this
# a literal substring replacement regardless of the pandas version's default.
shorted_column_df['Age_Group'] = shorted_column_df['Age_Group'].str.replace('5-Dec', '5-12', regex=False)

In [21]:
# Collect the distinct Age_Group labels, in order of first appearance
age_unique_values = shorted_column_df['Age_Group'].unique()

# Print one label per line so malformed brackets are easy to spot
for value in age_unique_values:
    print(value)

unknown
40-49
22-25
70+
60-64
30-39
17-21
50-59
26-29
16-17
13-15
5-12
0-4
64-69


In [22]:
# Collect the distinct injury-level descriptions, in order of first appearance
inj_unique_values = shorted_column_df['Inj_Level_Desc'].unique()

# Print one description per line; these are the values the category mapping must cover
for value in inj_unique_values:
    print(value)

Not injured
Other injury
Serious injury
Fatality
Unknown


In [23]:
# Collapse the detailed injury levels into a fatal / non-fatal category column.
# Levels are listed in the same order their boolean masks are passed to np.select().
injury_levels = ['Not injured', 'Other injury', 'Serious injury', 'Unknown', 'Fatality']
conditions = [shorted_column_df['Inj_Level_Desc'] == level for level in injury_levels]

# One label per mask above: only 'Fatality' maps to 'Fatal Accident'
categories = ['Non Fatal Accident'] * 4 + ['Fatal Accident']

# Rows matching none of the masks receive the fallback 'Other Category'
shorted_column_df['Injury_Category'] = np.select(conditions, categories, default='Other Category')

# Show the frame together with its new 'Injury_Category' column
print(shorted_column_df)

         ACCIDENT_NO ACCIDENT_VEHICLE_ID Age_Group  Inj_Level_Desc  \
0       T20060000010       T20060000010A   unknown     Not injured   
1       T20060000010       T20060000010C     40-49     Not injured   
2       T20060000010       T20060000010C     22-25     Not injured   
3       T20060000010       T20060000010A       70+     Not injured   
4       T20060000010       T20060000010B     60-64    Other injury   
...              ...                 ...       ...             ...   
490943  T20200019247       T20200019247A     17-21  Serious injury   
490944  T20200019250       T20200019250A     40-49     Not injured   
490945  T20200019250       T20200019250B     50-59  Serious injury   
490946  T20200019253       T20200019253A     64-69  Serious injury   
490947  T20200019417       T20200019417A     17-21  Serious injury   

           Injury_Category  
0       Non Fatal Accident  
1       Non Fatal Accident  
2       Non Fatal Accident  
3       Non Fatal Accident  
4       Non Fa

In [24]:
# Count, within each Age_Group, how many rows fall into each Injury_Category
# ('Fatal Accident' / 'Non Fatal Accident')
injury_counts_by_age = shorted_column_df.groupby('Age_Group')['Injury_Category'].value_counts()

# Pivot the categories into columns; combinations that never occur become 0
aggregated_result_df = injury_counts_by_age.unstack(fill_value=0)
aggregated_result_df

Injury_Category,Fatal Accident,Non Fatal Accident
Age_Group,Unnamed: 1_level_1,Unnamed: 2_level_1
0-4,48,10445
13-15,48,8412
16-17,106,9758
17-21,466,53612
22-25,331,48901
26-29,306,41555
30-39,592,84047
40-49,527,71697
5-12,52,17486
50-59,471,55390


In [25]:
# Store the aggregated table as a CSV in the Cleaned_Aggregated_Datasets folder
aggregated_data_output_path = Path("../Cleaned_Aggregated_Datasets/aggregated_person.csv")

# to_csv does not create missing folders, so make sure the target folder exists
# (a no-op when it is already there)
aggregated_data_output_path.parent.mkdir(parents=True, exist_ok=True)

# index=True writes the frame's index (the Age_Group labels) as the first column
aggregated_result_df.to_csv(aggregated_data_output_path, index=True, header=True)