In [1]:
import pandas as pd
import geopandas as gpd
import helper

from shapely.geometry import Point
from pathlib import Path

from warnings import filterwarnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): this is a blanket filter, so it also hides warnings raised by
# pandas/geopandas in the cells below -- consider narrowing it to specific
# categories once the notebook is stable.
filterwarnings("ignore")



In [2]:
# Load the raw accident records together with the lookup table that
# documents what the numeric codes mean.
accidents = pd.read_csv("./data/accident.csv")
lookup = pd.read_csv("./data/road-safety-lookups.csv")

# Report the size of the raw table, then preview its first rows.
print("accidents: ", accidents.shape)
accidents.head()

accidents:  (91199, 36)


Unnamed: 0,accident_index,accident_year,accident_reference,location_easting_osgr,location_northing_osgr,longitude,latitude,police_force,accident_severity,number_of_vehicles,...,pedestrian_crossing_physical_facilities,light_conditions,weather_conditions,road_surface_conditions,special_conditions_at_site,carriageway_hazards,urban_or_rural_area,did_police_officer_attend_scene_of_accident,trunk_road_flag,lsoa_of_accident_location
0,2020010219808,2020,10219808,521389.0,175144.0,-0.254001,51.462262,1,3,1,...,9,1,9,9,0,0,1,3,2,E01004576
1,2020010220496,2020,10220496,529337.0,176237.0,-0.139253,51.470327,1,3,1,...,4,1,1,1,0,0,1,1,2,E01003034
2,2020010228005,2020,10228005,526432.0,182761.0,-0.178719,51.529614,1,3,1,...,0,4,1,2,0,0,1,1,2,E01004726
3,2020010228006,2020,10228006,538676.0,184371.0,-0.001683,51.54121,1,2,1,...,4,4,1,1,0,0,1,1,2,E01003617
4,2020010228011,2020,10228011,529324.0,181286.0,-0.137592,51.515704,1,3,1,...,0,4,1,1,0,0,1,1,2,E01004763


### Clean data

In [3]:
# per-column count of missing entries
accidents.isna().sum()

accident_index                                  0
accident_year                                   0
accident_reference                              0
location_easting_osgr                          14
location_northing_osgr                         14
longitude                                      14
latitude                                       14
police_force                                    0
accident_severity                               0
number_of_vehicles                              0
number_of_casualties                            0
date                                            0
day_of_week                                     0
time                                            0
local_authority_district                        0
local_authority_ons_district                    0
local_authority_highway                         0
first_road_class                                0
first_road_number                               0
road_type                                       0


In [4]:
# Listwise deletion: discard every row that contains at least one missing
# value. Acceptable here because the dataset is large and only a handful of
# values are missing.
accidents = accidents.dropna(axis=0, how="any")
print("Dimension after dropping: ", accidents.shape)

Dimension after dropping:  (91185, 36)


In [5]:
# flag rows that repeat an earlier row in every column, then count them
duplicate_rows = accidents.duplicated()
duplicate_rows.sum()

0

### Modify labels for EDA

The dataset stores categorical fields as numeric codes, whose meanings are documented in the **lookup** file. For EDA purposes, we first convert those codes to meaningful text labels.

In [6]:
# Columns whose numeric codes are swapped for the labels provided by the
# lookup table.
columns_to_convert = [
    "accident_severity",
    "urban_or_rural_area",
    "road_type",
    "pedestrian_crossing_physical_facilities",
    "special_conditions_at_site",
    "road_surface_conditions",
    "weather_conditions",
    "carriageway_hazards",
    "light_conditions",
]

for column_name in columns_to_convert:
    # helper.name_lookup supplies the replacement spec for this column
    # (presumably a code -> label mapping; see helper module to confirm)
    label_map = helper.name_lookup(lookup, column_name)
    accidents[column_name] = accidents[column_name].replace(label_map)

**Create useful columns**

In [7]:
# Boolean flag for filtering major vs. non-major accidents: an accident counts
# as "major" when its severity label is 'Fatal' and it involved at least
# three casualties.
is_fatal = accidents["accident_severity"] == "Fatal"
has_three_or_more_casualties = accidents["number_of_casualties"] >= 3
accidents["is_major_accident"] = is_fatal & has_three_or_more_casualties

In [8]:
# Hour of day: the digits that precede the colon in the "time" strings.
hour_text = accidents["time"].str.extract(r"(\d+):", expand=False)
accidents["hour"] = hour_text.astype(int)
# (minutes could be pulled out the same way with the pattern r":(\d+)")

# Parse the date strings, reading the day before the month.
accidents["date"] = pd.to_datetime(accidents["date"], dayfirst=True)

**Make GeoDataFrame**

In [9]:
### Create the geo dataframe unless already made

# Location of the cached GeoJSON export; used for the existence check,
# for loading, and for saving so the path is defined in one place only.
my_file = Path("data/geo_df.geojson") # path for geo_df

if my_file.is_file():
    print("GeoDataFrame is already created.")
    # Load the cached file so that geo_df is defined on this branch as well;
    # previously the name only existed when the file had to be built.
    geo_df = gpd.read_file(str(my_file))
else:
    # Build one point per accident from its longitude/latitude pair.
    geometry = gpd.points_from_xy(accidents["longitude"], accidents["latitude"])

    # Longitude/latitude are geographic WGS84 coordinates, i.e. EPSG:4326.
    # (The former crs="crs" is not a valid CRS identifier.)
    # Work on a copy so building the geo frame cannot add columns to
    # `accidents`, which is exported to CSV further down.
    geo_df = gpd.GeoDataFrame(accidents.copy(), crs="EPSG:4326", geometry=geometry)

    # address will later be fetched from Photon's reverse geocoding API;
    # -1 is the placeholder for "not fetched yet"
    geo_df["address"] = -1

    # export and save geo_df
    geo_df.to_file(str(my_file), driver="GeoJSON")

GeoDataFrame is already created.


**Save data**

In [10]:
# write the cleaned table to disk, leaving the index out of the file
accidents.to_csv(path_or_buf="./data/accidents_clean.csv", index=False)