In [320]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [321]:
train = pd.read_csv('training_data.csv', low_memory = False)
test = pd.read_csv('test_data.csv')

pd.options.mode.chained_assignment = None

In [322]:
train.columns

Index(['accident_index', 'accident_severity', 'number_of_vehicles',
       'number_of_casualties', 'day_of_week', 'time',
       'local_authority_district', 'local_authority_highway',
       'first_road_class', 'road_type', 'speed_limit', 'junction_detail',
       'second_road_class', 'pedestrian_crossing_human_control',
       'pedestrian_crossing_physical_facilities', 'light_conditions',
       'weather_conditions', 'road_surface_conditions', 'urban_or_rural_area',
       'trunk_road_flag', 'lsoa_of_accident_location', 'casualty_class',
       'sex_of_casualty', 'age_of_casualty', 'age_band_of_casualty',
       'casualty_severity', 'pedestrian_location', 'pedestrian_movement',
       'car_passenger', 'bus_or_coach_passenger',
       'pedestrian_road_maintenance_worker', 'casualty_type',
       'casualty_home_area_type', 'casualty_imd_decile', 'Year', 'Month',
       'Day', 'geometry'],
      dtype='object')

# Dropping columns with little information

In [323]:
train.drop(labels=['accident_index', 'accident_severity', 'first_road_class', 'road_type', 'second_road_class', 'trunk_road_flag', 'lsoa_of_accident_location', 'age_band_of_casualty', 'pedestrian_crossing_human_control', 'geometry', 'bus_or_coach_passenger', 'pedestrian_road_maintenance_worker', 'pedestrian_location', 'pedestrian_movement', 'local_authority_highway'], axis=1, inplace=True)

In [324]:
train.columns

Index(['number_of_vehicles', 'number_of_casualties', 'day_of_week', 'time',
       'local_authority_district', 'speed_limit', 'junction_detail',
       'pedestrian_crossing_physical_facilities', 'light_conditions',
       'weather_conditions', 'road_surface_conditions', 'urban_or_rural_area',
       'casualty_class', 'sex_of_casualty', 'age_of_casualty',
       'casualty_severity', 'car_passenger', 'casualty_type',
       'casualty_home_area_type', 'casualty_imd_decile', 'Year', 'Month',
       'Day'],
      dtype='object')

# Combining columns with similar information

In [325]:
print(train.casualty_type.value_counts(normalize=True))
print(train.casualty_class.value_counts(normalize=True))

casualty_type
Car occupant                                                 0.623618
Pedestrian                                                   0.145257
Cyclist                                                      0.086468
Motorcycle 125cc and under rider or passenger                0.030080
Van / Goods vehicle (3.5 tonnes mgw or under) occupant       0.025195
Motorcycle over 500cc rider or passenger                     0.020567
Bus or coach occupant (17 or more pass seats)                0.020053
Taxi/Private hire car occupant                               0.008998
Motorcycle - unknown cc rider or passenger                   0.007541
Other vehicle occupant                                       0.006513
Motorcycle 50cc and under rider or passenger                 0.005742
Motorcycle over 125cc and up to 500cc rider or  passenger    0.003856
Goods vehicle (unknown weight) occupant                      0.003599
Goods vehicle (7.5 tonnes mgw and over) occupant             0.003599
Mobili

The 'casualty_type' column contains some more granular information about the casualties. Here, casualties are seperated into the following groups:
* Car occupant
* Van / Goods vehicle (3.5 tonnes mgw or under) occupant
* Motorcycle over 500cc rider or passenger
* Cyclist
* Pedestrian
* Taxi/Private hire car occupant
* Bus or coach occupant (17 or more pass seats)
* Motorcycle 125cc and under rider or passenger
* Motorcycle 50cc and under rider or passenger
* Mobility scooter rider
* Motorcycle - unknown cc rider or passenger
* Other vehicle occupant
* Motorcycle over 125cc and up to 500cc rider or passenger
* Electric motorcycle rider or passenger
* Good vehicle (over 3.5t. and under 7.5t.) occupant
* Good vehicle (unknown weight) occupant
* Minibus (8 - 16 passenger seats) occupant
* Tram occupant
* Horse rider
* Agricultural vehicle occupant

Wow, that is a lot of information!

From the normalized value_counts() it is clear that there is more information in certain categories of 'casualty_type' than others. Also, while we now have more information (seperating cyclists from drivers, and differentiating between vehicle types) we have lost the information on whether the occupants were drivers or passengers.

Let's combine the 'casualty_class' and 'casualty_type' columns to create more granular information about the casualties. It will also be helpful for modelling down the line to reduce the number of categories. We will:
1. Differentiate between drivers and passengers for all vehicle types.
2. Combine motorcycle categories.
3. Combine minibus drivers and passengers into car drivers and passengers.
4. Combine goods vehicle categories.
5. Combine bus, taxi, and tram categories as public transports (drivers or passengers).
6. Horse rider, Agricultural vehicle occupant, and Mobility scooter rider contain little information, so we will combine those into the 'Other vehicle occupant category'.

Down the line, we may want to simply combine the casualties into one 'Driver' or 'Passenger' feature, rather than differentiating between vehicle type. For now we will leave it to retain the granularity of the information.

In the cell below we will combine and relabel our new categories (albeit in quite a hacky way).

Let's drop the 'casualty_class' column so we're not repeating information.

In [326]:
train.loc[(train.casualty_class == 'Driver or rider') & ((train.casualty_type == 'Car occupant') | (train.casualty_type == 'Minibus (8 - 16 passenger seats) occupant')), 'casualty_type'] = 'Car driver'
train.loc[(train.casualty_class == 'Passenger') & ((train.casualty_type == 'Car occupant') | (train.casualty_type == 'Minibus (8 - 16 passenger seats) occupant')), 'casualty_type'] = 'Car passenger'
train.loc[(train.casualty_class == 'Driver or rider') & (train.casualty_type.str.contains('motorcycle', case = False)), 'casualty_type'] = 'Motorcycle rider'
train.loc[(train.casualty_class == 'Passenger') & (train.casualty_type.str.contains('motorcycle', case = False)), 'casualty_type'] = 'Motorcycle passenger'

train.loc[train.casualty_type.str.contains('goods vehicle', case = False), 'casualty_type'] = 'Goods vehicle occupant'
train.loc[train.casualty_type.str.contains('bus|taxi|tram', case = False), 'casualty_type'] = 'Public transport occupant'
train.loc[train.casualty_type.str.contains('mobility|agricultural|horse', case = False), 'casualty_type'] = 'Other vehicle occupant'


train.loc[(train.casualty_class == 'Driver or rider') & (train.casualty_type == 'Goods vehicle occupant'), 'casualty_type'] = 'Goods vehicle driver'
train.loc[(train.casualty_class == 'Passenger') & (train.casualty_type == 'Goods vehicle occupant'), 'casualty_type'] = 'Goods vehicle passenger'

train.loc[(train.casualty_class == 'Driver or rider') & (train.casualty_type == 'Public transport occupant'), 'casualty_type'] = 'Public transport driver'
train.loc[(train.casualty_class == 'Passenger') & (train.casualty_type == 'Public transport occupant'), 'casualty_type'] = 'Public transport passenger'

train.drop(labels='casualty_class', inplace=True, axis=1)

In [327]:
train.casualty_type.value_counts()

casualty_type
Car driver                    5994
Pedestrian                    1695
Car passenger                 1306
Cyclist                       1009
Motorcycle rider               775
Goods vehicle driver           342
Public transport passenger     245
Other vehicle occupant         126
Public transport driver        104
Goods vehicle passenger         47
Motorcycle passenger            26
Name: count, dtype: int64

We will fill the unknown pedestrian_crossing_physical_facilities with the most common value

In [328]:
print(train.pedestrian_crossing_physical_facilities.value_counts(dropna=False))
train.loc[train.pedestrian_crossing_physical_facilities.str.contains('Data missing'), 'pedestrian_crossing_physical_facilities'] = train.pedestrian_crossing_physical_facilities.value_counts().sort_values(ascending=False).index[0]

pedestrian_crossing_physical_facilities
No physical crossing facilities within 50 metres                             10023
Pedestrian phase at traffic signal junction                                    492
Pelican, puffin, toucan or similar non-junction pedestrian light crossing      469
Zebra                                                                          365
Central refuge                                                                 278
Footbridge or subway                                                            21
Data missing or out of range                                                    21
Name: count, dtype: int64


In [329]:
print(train.light_conditions.value_counts(dropna=False))
train.loc[train.light_conditions.str.contains('no lighting|lighting unknown|lights unlit', case = False), 'light_conditions'] = 'Darkness'


light_conditions
Daylight                       7978
Darkness - lights lit          2798
Darkness - no lighting          637
Darkness - lighting unknown     154
Darkness - lights unlit         102
Name: count, dtype: int64


We will seperate the wind conditions from the weather conditions, and then combine the raining, fine, and snowing categories in the weather conditions column.

In [330]:
train.weather_conditions.value_counts(dropna=False)

weather_conditions
Fine no high winds       9030
Raining no high winds    1537
Other                     381
Unknown                   186
Fine + high winds         185
Raining + high winds      183
Snowing no high winds      85
Fog or mist                62
Snowing + high winds       20
Name: count, dtype: int64

In [331]:
train['wind_conditions'] = 0
train.loc[train.weather_conditions.str.contains(r'\+ high winds', case = False), 'wind_conditions'] = '1'
train.loc[train.weather_conditions.str.contains('fine', case=False), 'weather_conditions'] = 'Fine'
train.loc[train.weather_conditions.str.contains('rain', case=False), 'weather_conditions'] = 'Raining'
train.loc[train.weather_conditions.str.contains('snow', case=False), 'weather_conditions'] = 'Snowing'
train.loc[train.weather_conditions.str.contains('unknown|missing', case=False), 'weather_conditions'] = train.weather_conditions.value_counts().sort_values(ascending=False).index[0]


In [332]:
train.weather_conditions.value_counts(dropna=False)

weather_conditions
Fine           9401
Raining        1720
Other           381
Snowing         105
Fog or mist      62
Name: count, dtype: int64

Again, we can combine some categories in road_surface_conditions, i.e. Snowing, snow, and frost or ice, raining with wet or damp, dry with fine. Missing data will be populated with the max value.

In [333]:
train.road_surface_conditions.value_counts(dropna=False)

road_surface_conditions
Dry                             8044
Wet or damp                     3404
Frost or ice                     122
Snow                              69
Data missing or out of range      26
Flood over 3cm. deep               4
Name: count, dtype: int64

In [334]:
train.loc[train.road_surface_conditions.str.contains('snow|frost|ice', case = False), 'road_surface_conditions'] = 'Freezing'
train.loc[train.road_surface_conditions.str.contains('dry', case = False), 'road_surface_conditions'] = 'Fine'
train.loc[train.road_surface_conditions.str.contains('wet|damp|flood|raining', case = False), 'road_surface_conditions'] = 'Wet or damp'

train.loc[train.road_surface_conditions.str.contains('unknown|missing', case=False), 'road_surface_conditions' ] = train.weather_conditions.value_counts().sort_values(ascending=False).index[0]

In [335]:
train.road_surface_conditions.value_counts(dropna=False)

road_surface_conditions
Fine           8070
Wet or damp    3408
Freezing        191
Name: count, dtype: int64

In [336]:
train = train[~train.sex_of_casualty.str.contains('Data missing or out of range')]

In [337]:
train = train[train.age_of_casualty != -1]

In [338]:
train.loc[train.car_passenger.str.contains('unknown|missing', case=False), 'car_passenger'] = train.car_passenger.value_counts().sort_values(ascending=False).index[0]

Okay, casualty_home_area_type and casualty_imd_decile have a lot of missing data (~23%), so we need to be a bit clever with filling the values. The data seems to be missing completely at random, so let's try a random imputation.

In [339]:
train.casualty_home_area_type.value_counts(dropna=False, normalize=True)

casualty_home_area_type
Urban area                      0.643176
Data missing or out of range    0.233564
Small town                      0.077913
Rural                           0.045347
Name: proportion, dtype: float64

In [340]:
train.casualty_imd_decile.value_counts(dropna=False, normalize=True)

casualty_imd_decile
Data missing or out of range    0.234002
Most deprived 10%               0.163617
More deprived 10-20%            0.115644
More deprived 20-30%            0.105752
More deprived 30-40%            0.078438
Less deprived 40-50%            0.066182
More deprived 40-50%            0.057428
Less deprived 30-40%            0.055677
Less deprived 20-30%            0.050512
Less deprived 10-20%            0.047798
Least deprived 10%              0.024950
Name: proportion, dtype: float64

In [341]:
train.loc[train.casualty_home_area_type.str.contains('Data missing'), 'casualty_home_area_type'] = np.NaN
train.casualty_home_area_type.fillna(value=random.choice(train[train.casualty_home_area_type != np.NaN]['casualty_home_area_type']), inplace=True)
train.casualty_home_area_type.value_counts(dropna=False)

casualty_home_area_type
Urban area    10015
Small town      890
Rural           518
Name: count, dtype: int64

In [342]:
train.loc[train.casualty_imd_decile.str.contains('Data missing'), 'casualty_imd_decile'] = np.NaN
train.casualty_imd_decile.fillna(value=random.choice(train[train.casualty_imd_decile != np.NaN]['casualty_imd_decile']), inplace=True)
train.casualty_imd_decile.value_counts(dropna=False)

casualty_imd_decile
More deprived 30-40%    3569
Most deprived 10%       1869
More deprived 10-20%    1321
More deprived 20-30%    1208
Less deprived 40-50%     756
More deprived 40-50%     656
Less deprived 30-40%     636
Less deprived 20-30%     577
Less deprived 10-20%     546
Least deprived 10%       285
Name: count, dtype: int64

In [343]:
train.casualty_imd_decile.value_counts(dropna=False, normalize=True)


casualty_imd_decile
More deprived 30-40%    0.312440
Most deprived 10%       0.163617
More deprived 10-20%    0.115644
More deprived 20-30%    0.105752
Less deprived 40-50%    0.066182
More deprived 40-50%    0.057428
Less deprived 30-40%    0.055677
Less deprived 20-30%    0.050512
Less deprived 10-20%    0.047798
Least deprived 10%      0.024950
Name: proportion, dtype: float64

In [344]:
train.casualty_home_area_type.value_counts(dropna=False, normalize=True)


casualty_home_area_type
Urban area    0.876740
Small town    0.077913
Rural         0.045347
Name: proportion, dtype: float64

In [345]:
train.columns

Index(['number_of_vehicles', 'number_of_casualties', 'day_of_week', 'time',
       'local_authority_district', 'speed_limit', 'junction_detail',
       'pedestrian_crossing_physical_facilities', 'light_conditions',
       'weather_conditions', 'road_surface_conditions', 'urban_or_rural_area',
       'sex_of_casualty', 'age_of_casualty', 'casualty_severity',
       'car_passenger', 'casualty_type', 'casualty_home_area_type',
       'casualty_imd_decile', 'Year', 'Month', 'Day', 'wind_conditions'],
      dtype='object')

In [346]:
categorical_feats = train.describe(exclude=[np.number]).columns
print(categorical_feats)

Index(['day_of_week', 'local_authority_district', 'junction_detail',
       'pedestrian_crossing_physical_facilities', 'light_conditions',
       'weather_conditions', 'road_surface_conditions', 'urban_or_rural_area',
       'sex_of_casualty', 'casualty_severity', 'car_passenger',
       'casualty_type', 'casualty_home_area_type', 'casualty_imd_decile',
       'wind_conditions'],
      dtype='object')


In [347]:
train = pd.get_dummies(train, columns=categorical_feats, drop_first=True)