In [3]:
import pandas as pd
import numpy as np
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
plt.style.use('ggplot')
import warnings
warnings.simplefilter('ignore')

# loading the accident dataset

In [4]:
accidents = pd.read_csv("Accidents0515.csv",index_col="Accident_Index")
accidents.head()

Unnamed: 0_level_0,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,...,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,LSOA_of_Accident_Location
Accident_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200501BS00001,525680.0,178240.0,-0.19117,51.489096,1,2,1,1,04/01/2005,3,...,0,1,1,2,2,0,0,1,1,E01002849
200501BS00002,524170.0,181650.0,-0.211708,51.520075,1,3,1,1,05/01/2005,4,...,0,5,4,1,1,0,0,1,1,E01002909
200501BS00003,524520.0,182240.0,-0.206458,51.525301,1,3,2,1,06/01/2005,5,...,0,0,4,1,1,0,0,1,1,E01002857
200501BS00004,526900.0,177530.0,-0.173862,51.482442,1,3,1,1,07/01/2005,6,...,0,0,1,1,1,0,0,1,1,E01002840
200501BS00005,528060.0,179040.0,-0.156618,51.495752,1,3,1,1,10/01/2005,2,...,0,0,7,1,2,0,0,1,1,E01002863


###### checking for columns with missing values

In [5]:
accidents.columns[accidents.isnull().any()]

Index(['Location_Easting_OSGR', 'Location_Northing_OSGR', 'Longitude',
       'Latitude', 'Time', 'LSOA_of_Accident_Location'],
      dtype='object')

In [6]:
#deleting the least important columns
accidents1=accidents.drop(['Location_Easting_OSGR','1st_Road_Class','Location_Northing_OSGR','Police_Force',
             '1st_Road_Number','2nd_Road_Class','2nd_Road_Number','Pedestrian_Crossing-Human_Control',
             'Pedestrian_Crossing-Physical_Facilities','Special_Conditions_at_Site','Carriageway_Hazards',
             'LSOA_of_Accident_Location'],axis='columns',inplace=False)
accidents1.head()

Unnamed: 0_level_0,Longitude,Latitude,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,Time,Local_Authority_(District),Local_Authority_(Highway),Road_Type,Speed_limit,Junction_Detail,Junction_Control,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident
Accident_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
200501BS00001,-0.19117,51.489096,2,1,1,04/01/2005,3,17:42,12,E09000020,6,30,0,-1,1,2,2,1,1
200501BS00002,-0.211708,51.520075,3,1,1,05/01/2005,4,17:36,12,E09000020,3,30,6,2,4,1,1,1,1
200501BS00003,-0.206458,51.525301,3,2,1,06/01/2005,5,00:15,12,E09000020,6,30,0,-1,4,1,1,1,1
200501BS00004,-0.173862,51.482442,3,1,1,07/01/2005,6,10:35,12,E09000020,6,30,0,-1,1,1,1,1,1
200501BS00005,-0.156618,51.495752,3,1,1,10/01/2005,2,21:13,12,E09000020,6,30,0,-1,7,1,2,1,1


In [7]:
#check for more columns with missing values
accidents1.columns[accidents1.isnull().any()]

Index(['Longitude', 'Latitude', 'Time'], dtype='object')

In [8]:
#we check if the dataframe has columns with object datetypes...and if so we convert them to numeric datatypes....we do this coz 
#interpolation cnt work with obect datatypes in Timeseries
accidents1.dtypes

Longitude                                      float64
Latitude                                       float64
Accident_Severity                                int64
Number_of_Vehicles                               int64
Number_of_Casualties                             int64
Date                                            object
Day_of_Week                                      int64
Time                                            object
Local_Authority_(District)                       int64
Local_Authority_(Highway)                       object
Road_Type                                        int64
Speed_limit                                      int64
Junction_Detail                                  int64
Junction_Control                                 int64
Light_Conditions                                 int64
Weather_Conditions                               int64
Road_Surface_Conditions                          int64
Urban_or_Rural_Area                              int64
Did_Police

# converting object datatype to numeric

In [9]:
#converting object datatypes to numeric datatypes
for col in accidents1:
    accidents1[col] = pd.to_numeric(accidents1[col], errors ='coerce')#with errrors='coerce' any value tht cnt be converted to a number is
    #converted to NaN
accidents1.dtypes

Longitude                                      float64
Latitude                                       float64
Accident_Severity                                int64
Number_of_Vehicles                               int64
Number_of_Casualties                             int64
Date                                           float64
Day_of_Week                                      int64
Time                                           float64
Local_Authority_(District)                       int64
Local_Authority_(Highway)                      float64
Road_Type                                        int64
Speed_limit                                      int64
Junction_Detail                                  int64
Junction_Control                                 int64
Light_Conditions                                 int64
Weather_Conditions                               int64
Road_Surface_Conditions                          int64
Urban_or_Rural_Area                              int64
Did_Police

In [10]:
accidents1.interpolate(method ='linear',order = 3)


Unnamed: 0_level_0,Longitude,Latitude,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,Time,Local_Authority_(District),Local_Authority_(Highway),Road_Type,Speed_limit,Junction_Detail,Junction_Control,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident
Accident_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
200501BS00001,-0.191170,51.489096,2,1,1,,3,,12,,6,30,0,-1,1,2,2,1,1
200501BS00002,-0.211708,51.520075,3,1,1,,4,,12,,3,30,6,2,4,1,1,1,1
200501BS00003,-0.206458,51.525301,3,2,1,,5,,12,,6,30,0,-1,4,1,1,1,1
200501BS00004,-0.173862,51.482442,3,1,1,,6,,12,,6,30,0,-1,1,1,1,1,1
200501BS00005,-0.156618,51.495752,3,1,1,,2,,12,,6,30,0,-1,7,1,2,1,1
200501BS00006,-0.203238,51.515540,3,2,1,,3,,12,,6,30,0,-1,1,2,2,1,1
200501BS00007,-0.211277,51.512695,3,2,1,,5,,12,,6,30,3,4,4,1,1,1,1
200501BS00009,-0.187623,51.502260,3,1,2,,6,,12,,3,30,0,-1,1,1,1,1,1
200501BS00010,-0.167342,51.483420,3,2,2,,7,,12,,6,30,6,2,4,1,1,1,1
200501BS00011,-0.206531,51.512443,3,2,5,,7,,12,,6,30,3,4,1,1,1,1,1


In [11]:
print(accidents1.isnull().values.any())

True


In [12]:
accidents1.fillna(accidents1.mean(axis=0))

Unnamed: 0_level_0,Longitude,Latitude,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,Time,Local_Authority_(District),Local_Authority_(Highway),Road_Type,Speed_limit,Junction_Detail,Junction_Control,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident
Accident_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
200501BS00001,-0.191170,51.489096,2,1,1,,3,,12,,6,30,0,-1,1,2,2,1,1
200501BS00002,-0.211708,51.520075,3,1,1,,4,,12,,3,30,6,2,4,1,1,1,1
200501BS00003,-0.206458,51.525301,3,2,1,,5,,12,,6,30,0,-1,4,1,1,1,1
200501BS00004,-0.173862,51.482442,3,1,1,,6,,12,,6,30,0,-1,1,1,1,1,1
200501BS00005,-0.156618,51.495752,3,1,1,,2,,12,,6,30,0,-1,7,1,2,1,1
200501BS00006,-0.203238,51.515540,3,2,1,,3,,12,,6,30,0,-1,1,2,2,1,1
200501BS00007,-0.211277,51.512695,3,2,1,,5,,12,,6,30,3,4,4,1,1,1,1
200501BS00009,-0.187623,51.502260,3,1,2,,6,,12,,3,30,0,-1,1,1,1,1,1
200501BS00010,-0.167342,51.483420,3,2,2,,7,,12,,6,30,6,2,4,1,1,1,1
200501BS00011,-0.206531,51.512443,3,2,5,,7,,12,,6,30,3,4,1,1,1,1,1


## importing the casualities data set

In [13]:
casualities = pd.read_csv('Casualties0515.csv',error_bad_lines = False,warn_bad_lines = False)
casualities.head()

Unnamed: 0,Accident_Index,Vehicle_Reference,Casualty_Reference,Casualty_Class,Sex_of_Casualty,Age_of_Casualty,Age_Band_of_Casualty,Casualty_Severity,Pedestrian_Location,Pedestrian_Movement,Car_Passenger,Bus_or_Coach_Passenger,Pedestrian_Road_Maintenance_Worker,Casualty_Type,Casualty_Home_Area_Type
0,200501BS00001,1,1,3,1,37,7,2,1,1,0,0,-1,0,1
1,200501BS00002,1,1,2,1,37,7,3,0,0,0,4,-1,11,1
2,200501BS00003,2,1,1,1,62,9,3,0,0,0,0,-1,9,1
3,200501BS00004,1,1,3,1,30,6,3,5,2,0,0,-1,0,1
4,200501BS00005,1,1,1,1,49,8,3,0,0,0,0,-1,3,-1


###### checking for columns with missing values

In [14]:
casualities.columns[casualities.isnull().any()]

Index([], dtype='object')

In [15]:
#check if there any object data types
casualities.dtypes

Accident_Index                        object
Vehicle_Reference                      int64
Casualty_Reference                     int64
Casualty_Class                         int64
Sex_of_Casualty                        int64
Age_of_Casualty                        int64
Age_Band_of_Casualty                   int64
Casualty_Severity                      int64
Pedestrian_Location                    int64
Pedestrian_Movement                    int64
Car_Passenger                          int64
Bus_or_Coach_Passenger                 int64
Pedestrian_Road_Maintenance_Worker     int64
Casualty_Type                          int64
Casualty_Home_Area_Type                int64
dtype: object

## dropping the less important columns

In [16]:
#dropping the less important columns
casualities1 = casualities.drop(['Pedestrian_Movement','Vehicle_Reference','Casualty_Reference','Age_of_Casualty'
             ,'Bus_or_Coach_Passenger','Pedestrian_Road_Maintenance_Worker','Casualty_Type',
             'Casualty_Home_Area_Type'],axis='columns',inplace=False)
casualities1.head()

Unnamed: 0,Accident_Index,Casualty_Class,Sex_of_Casualty,Age_Band_of_Casualty,Casualty_Severity,Pedestrian_Location,Car_Passenger
0,200501BS00001,3,1,7,2,1,0
1,200501BS00002,2,1,7,3,0,0
2,200501BS00003,1,1,9,3,0,0
3,200501BS00004,3,1,6,3,5,0
4,200501BS00005,1,1,8,3,0,0


In [17]:
casualities1.isnull().values.any()

False

# loading the vehicle dataset

In [18]:
vehicles= pd.read_csv("Vehicles0515.csv",error_bad_lines = False,warn_bad_lines = False)
vehicles.head()

Unnamed: 0,Accident_Index,Vehicle_Reference,Vehicle_Type,Towing_and_Articulation,Vehicle_Manoeuvre,Vehicle_Location-Restricted_Lane,Junction_Location,Skidding_and_Overturning,Hit_Object_in_Carriageway,Vehicle_Leaving_Carriageway,...,Was_Vehicle_Left_Hand_Drive?,Journey_Purpose_of_Driver,Sex_of_Driver,Age_of_Driver,Age_Band_of_Driver,Engine_Capacity_(CC),Propulsion_Code,Age_of_Vehicle,Driver_IMD_Decile,Driver_Home_Area_Type
0,200501BS00001,1,9,0,18,0,0,0,0,0,...,1,15,2,74,10,-1,-1,-1,7,1
1,200501BS00002,1,11,0,4,0,3,0,0,0,...,1,1,1,42,7,8268,2,3,-1,-1
2,200501BS00003,1,11,0,17,0,0,0,4,0,...,1,1,1,35,6,8300,2,5,2,1
3,200501BS00003,2,9,0,2,0,0,0,0,0,...,1,15,1,62,9,1762,1,6,1,1
4,200501BS00004,1,9,0,18,0,0,0,0,0,...,1,15,2,49,8,1769,1,4,2,1


## dropping the less important columns

In [19]:
#dropping the Accident_Index in cs dataframe
vehicles1 = vehicles.drop(['Vehicle_Reference','Towing_and_Articulation','Junction_Location','Vehicle_Leaving_Carriageway',
             'Hit_Object_off_Carriageway','Was_Vehicle_Left_Hand_Drive?','Age_Band_of_Driver','Engine_Capacity_(CC)',
             'Propulsion_Code','Driver_IMD_Decile','Driver_Home_Area_Type'],axis='columns',inplace=False)
vehicles1.head()

Unnamed: 0,Accident_Index,Vehicle_Type,Vehicle_Manoeuvre,Vehicle_Location-Restricted_Lane,Skidding_and_Overturning,Hit_Object_in_Carriageway,1st_Point_of_Impact,Journey_Purpose_of_Driver,Sex_of_Driver,Age_of_Driver,Age_of_Vehicle
0,200501BS00001,9,18,0,0,0,1,15,2,74,-1
1,200501BS00002,11,4,0,0,0,4,1,1,42,3
2,200501BS00003,11,17,0,0,4,4,1,1,35,5
3,200501BS00003,9,2,0,0,0,3,15,1,62,6
4,200501BS00004,9,18,0,0,0,1,15,2,49,4


In [20]:
vehicles1.isnull().values.any()

False

In [21]:
#he reduce() function is used to apply a particular function passed in its argument to all of the list elements
from functools import reduce
Final_data = [accidents1, casualities1, vehicles1]
Final_data1 = reduce(lambda left,right: pd.merge(left,right,on='Accident_Index'), Final_data)#lambda takes in other functions as arguments
Final_data1.head()

Unnamed: 0,Accident_Index,Longitude,Latitude,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,Time,Local_Authority_(District),...,Vehicle_Type,Vehicle_Manoeuvre,Vehicle_Location-Restricted_Lane,Skidding_and_Overturning,Hit_Object_in_Carriageway,1st_Point_of_Impact,Journey_Purpose_of_Driver,Sex_of_Driver,Age_of_Driver,Age_of_Vehicle
0,200501BS00001,-0.19117,51.489096,2,1,1,,3,,12,...,9,18,0,0,0,1,15,2,74,-1
1,200501BS00002,-0.211708,51.520075,3,1,1,,4,,12,...,11,4,0,0,0,4,1,1,42,3
2,200501BS00003,-0.206458,51.525301,3,2,1,,5,,12,...,11,17,0,0,4,4,1,1,35,5
3,200501BS00003,-0.206458,51.525301,3,2,1,,5,,12,...,9,2,0,0,0,3,15,1,62,6
4,200501BS00004,-0.173862,51.482442,3,1,1,,6,,12,...,9,18,0,0,0,1,15,2,49,4


In [22]:
Final_data1.isnull().values.any()

True

In [None]:
Final_data1.isnull().sum()

# visualisations

In [118]:
!pip install xgboost

Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/5e/49/b95c037b717b4ceadc76b6e164603471225c27052d1611d5a2e832757945/xgboost-0.90-py2.py3-none-win_amd64.whl (18.3MB)
Installing collected packages: xgboost
Successfully installed xgboost-0.90


In [121]:
%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance