In [566]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import datetime
import time
import seaborn as sns

In [567]:
# Lift pandas' truncation limits so displayed frames show every row and column.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [568]:
# Read one CSV extract per year, 2016 through 2020, in chronological order.
df2016, df2017, df2018, df2019, df2020 = map(
    pd.read_csv,
    [
        'data/rtc_2016.csv',
        'data/rtc_2017.csv',
        'data/rtc_2018.csv',
        'data/rtc_2019.csv',
        'data/rtc_2020.csv',
    ],
)

#### Aggregate Data

In [569]:
# Stack the five yearly frames into one frame with a fresh 0..n-1 index.
yearly_frames = (df2016, df2017, df2018, df2019, df2020)
total_df = pd.concat(yearly_frames, ignore_index=True)

In [571]:
# Remove the 'Unnamed: 0' column carried over from the CSV files.
total_df.drop(columns='Unnamed: 0', inplace=True)


In [572]:
# Treat every 0 in the frame as a missing value so the dropna below can
# discard incomplete rows (a later cell turns the remaining NaN back into 0).
total_df.replace(to_replace=0, value=np.nan, inplace=True)

In [573]:
# Drop every row that is missing any of the fields the timing analysis needs.
required_columns = ["crash_time", "report_time", "arrival_time", "total_involved", "causes"]
total_df.dropna(subset=required_columns, inplace=True)

#### Time Splitter and Cleaner Function

In [574]:
# Keep only the digits of each crash_time entry (strips ':' , '.', letters, ...).
new_crash_time = (
    total_df['crash_time']
    .astype(str)
    .str.replace("[^0-9]", "", regex=True)
)

In [575]:
# Keep only the digits of each arrival_time entry.
new_arrival_time = (
    total_df['arrival_time']
    .astype(str)
    .str.replace("[^0-9]", "", regex=True)
)

In [576]:
# Keep only the digits of each response_time entry.
new_response_time = (
    total_df['response_time']
    .astype(str)
    .str.replace("[^0-9]", "", regex=True)
)

In [577]:
# Store the digit-only crash times back on the frame as 64-bit integers.
total_df['crash_time'] = new_crash_time.map(np.int64)

In [578]:
# Store the digit-only arrival times back on the frame as 64-bit integers.
total_df['arrival_time'] = new_arrival_time.map(np.int64)

In [579]:
# Round report_time to whole numbers, then cast to int.
rounded_report_time = total_df['report_time'].round()
total_df['report_time'] = rounded_report_time.astype(int)

In [580]:
# Convert the date column to strings so each entry can be parsed one by one
# with clean_date below.
total_df['date'] = total_df['date'].astype(str)

In [581]:
def clean_date(date_given):
    """Parse a raw date value into a pandas timestamp.

    Parameters
    ----------
    date_given
        Value to parse; the notebook passes the string form of each date.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed timestamp, or ``NaT`` when the value cannot be parsed.
    """
    try:
        # errors="coerce" already maps unparseable strings to NaT.
        return pd.to_datetime(date_given, errors="coerce")
    except (TypeError, ValueError, OverflowError):
        # Inputs that to_datetime rejects outright also become NaT, so the
        # column stays datetime-typed for the later `.dt` / `np.isnat` calls
        # (a string sentinel such as '0' would turn it into an object column).
        return pd.NaT
    

In [582]:
# Parse every date string; unparseable entries become NaT.
total_df['date'] = total_df['date'].apply(clean_date)

In [583]:
# Four (4) data points have a missing date; display them.
total_df[np.isnat(total_df['date'])]

Unnamed: 0,sn,command,date,crash_time,report_time,arrival_time,response_time,route,location,vehicle_no,vehicle_type,vehicle_cat,vehicle_make,vehicle_model,fleet_operator,name_of_driver,dl_no,causes,no_injured_male_adult,no_injured_female_adult,no_injured_male_child,no_injured_female_child,total_injured,no_killed_male_adult,no_killed_female_adult,no_killed_male_child,no_killed_female_child,total_killed,no_involved_male_adult,no_involved_female_adult,no_involved_male_child,no_involved_female_child,total_involved
107,5,RS11.22,NaT,1640,1653,1658,5.0,OWO - T/SHIP,KM 4 GRA,NO NOS & NO NOS,2 BIKE,2 COM,2 BAJAJ,2 BOXERS,,,,SPV,2.0,,,,2.0,,,,,,2.0,,,,2.0
568,1,RS11.23,NaT,11123,1132,1149,17.0,IKR - OWO,KM 13 ADEFARATI,JJJ 314 CX,CAR,PVT,TYT,CAMRY,,,,SPV,3.0,1.0,,,4.0,,,,,,3.0,1.0,,,4.0
610,4,RS11.22,NaT,1003,1033,1041,8.0,OWO - T/SHIP,KM 10 B4 EMURE JUNCTION,KJA 405 XL,BUS,COM,FOTON,VIEW,,,,TBT,,,,,,,,,,,1.0,,,,1.0
785,1,RS11.21,NaT,1210,1211,1213,2.0,ORE - BEN,KM 7 OPP SUNSHINE PARK,NO NOS,BUS,COM,TYT,HIACE,,,,SPV,1.0,,,,1.0,,,,,,2.0,,,,2.0


In [584]:
# Split the parsed date into numeric month, day and year columns (in that
# order). Where NaT is present the int cast is skipped (errors='ignore').
for date_part in ('month', 'day', 'year'):
    part_values = getattr(total_df['date'].dt, date_part)
    total_df[date_part] = pd.to_numeric(part_values.astype(int, errors='ignore'))

In [585]:
# Fill the month/day/year of rows with a missing date from the next row that
# has one. The result is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection is deprecated and does not
# update the frame under pandas copy-on-write.
total_df['month'] = total_df['month'].bfill()
total_df['day'] = total_df['day'].bfill()
total_df['year'] = total_df['year'].bfill()

In [586]:
# Convert the back-filled year/month/day columns to 64-bit integers.
for filled_part in ('year', 'month', 'day'):
    total_df[filled_part] = total_df[filled_part].apply(np.int64)

In [587]:
def fill_missing_date(date, year, month, day):
    """Return ``date`` unchanged, or build one from its parts when it is null.

    Parameters
    ----------
    date
        Existing date value; may be null (``None``/``NaN``/``NaT``).
    year, month, day
        Integer parts used to construct the replacement date.

    Returns
    -------
    The original ``date`` when it is not null, otherwise a
    ``datetime.datetime`` for the given year, month and day.
    """
    if not pd.isnull(date):
        return date
    return datetime.datetime(year=year, month=month, day=day)

In [588]:
# Rebuild each missing date from that row's back-filled year/month/day.
total_df['date'] = total_df.apply(
    lambda row: fill_missing_date(row['date'], row['year'], row['month'], row['day']),
    axis=1,
)

In [589]:
# Distinct digit counts found in crash_time.
total_df['crash_time'].astype(str).map(len).unique()

array([3, 4, 2, 5], dtype=int64)

In [590]:
# Distinct digit counts found in report_time.
total_df['report_time'].astype(str).map(len).unique()

array([3, 4, 2, 5], dtype=int64)

In [591]:
# Distinct digit counts found in arrival_time.
total_df['arrival_time'].astype(str).map(len).unique()

array([3, 4, 2], dtype=int64)

In [592]:
# Distinct string lengths found in response_time.
total_df['response_time'].astype(str).map(len).unique()

array([3, 4], dtype=int64)

In [593]:
# Show rows whose crash_time has five digits.
total_df.loc[total_df['crash_time'].astype(str).str.len() == 5]


Unnamed: 0,sn,command,date,crash_time,report_time,arrival_time,response_time,route,location,vehicle_no,vehicle_type,vehicle_cat,vehicle_make,vehicle_model,fleet_operator,name_of_driver,dl_no,causes,no_injured_male_adult,no_injured_female_adult,no_injured_male_child,no_injured_female_child,total_injured,no_killed_male_adult,no_killed_female_adult,no_killed_male_child,no_killed_female_child,total_killed,no_involved_male_adult,no_involved_female_adult,no_involved_male_child,no_involved_female_child,total_involved,month,day,year
568,1,RS11.23,2019-02-18,11123,1132,1149,17.0,IKR - OWO,KM 13 ADEFARATI,JJJ 314 CX,CAR,PVT,TYT,CAMRY,,,,SPV,3.0,1.0,,,4.0,,,,,,3.0,1.0,,,4.0,2,18,2019


In [594]:
# Show rows whose report_time has five digits.
total_df.loc[total_df['report_time'].astype(str).str.len() == 5]


Unnamed: 0,sn,command,date,crash_time,report_time,arrival_time,response_time,route,location,vehicle_no,vehicle_type,vehicle_cat,vehicle_make,vehicle_model,fleet_operator,name_of_driver,dl_no,causes,no_injured_male_adult,no_injured_female_adult,no_injured_male_child,no_injured_female_child,total_injured,no_killed_male_adult,no_killed_female_adult,no_killed_male_child,no_killed_female_child,total_killed,no_involved_male_adult,no_involved_female_adult,no_involved_male_child,no_involved_female_child,total_involved,month,day,year
526,4,RS11.21,2019-01-24,1159,12013,1203,4.0,ORE - LAG,KM25,ACA 636 XT & XA 336 EKY,BUS & TRL,2 COM,TYT FOTON,BUS & TRL,,,,SPV,1.0,1.0,,,2.0,,,,,,8.0,7.0,,,15.0,1,24,2019


In [595]:
# Manual correction: the only five-digit crash_time shown above is 11123,
# which is replaced with 1123.
crash_time_is_five_digits = total_df['crash_time'].astype(str).str.len() == 5
total_df.loc[crash_time_is_five_digits, 'crash_time'] = 1123

In [596]:
# Manual correction: 7855 is not a valid HHMM value; replace it with 1855.
report_time_is_7855 = total_df['report_time'].eq(7855)
total_df.loc[report_time_is_7855, 'report_time'] = 1855

In [597]:
# Manual correction: the only five-digit report_time shown above is 12013,
# which is replaced with 1213.
# NOTE(review): that row has crash_time 1159 and arrival_time 1203, so 1213
# puts the report after the arrival - confirm the intended value.
report_time_is_five_digits = total_df['report_time'].astype(str).str.len() == 5
total_df.loc[report_time_is_five_digits, 'report_time'] = 1213

In [598]:
# Preview the first rows only; with display.max_rows set to None a bare
# `total_df` would render the entire frame.
total_df.head(10)

Unnamed: 0,sn,command,date,crash_time,report_time,arrival_time,response_time,route,location,vehicle_no,vehicle_type,vehicle_cat,vehicle_make,vehicle_model,fleet_operator,name_of_driver,dl_no,causes,no_injured_male_adult,no_injured_female_adult,no_injured_male_child,no_injured_female_child,total_injured,no_killed_male_adult,no_killed_female_adult,no_killed_male_child,no_killed_female_child,total_killed,no_involved_male_adult,no_involved_female_adult,no_involved_male_child,no_involved_female_child,total_involved,month,day,year
0,1,RS11.2,2016-02-09,335,405,412,7.0,AKR -IPT,KM 2.5 NEAR ANTI CULTIT P/STATION,KPA 328 LG & LSR 999 XN,2 BUS,2 COM,2 TYT,2 HIACE,,,,SOS,3.0,3.0,,,6.0,,,,,,6.0,3.0,,,9.0,2,9,2016
1,2,RS11.2,2016-02-09,1030,1032,1035,3.0,AKR - IPT,KM 6 ILARA MOKIN,AKD 991 CE & EKY 431 EH,BUS & CAR,2 PVT,HONDA & TYT,ODESS& COROLLA,,,,DGD,2.0,,,,2.0,,,,,,2.0,,,,2.0,2,9,2016
2,3,RS11.2,2016-08-09,45,56,105,9.0,AKR - OWO,KM 2.5 SEENI JUNT.,DKA 378 YYY & TFB 118 XA,2 BUS,2 COM,2 PGT,2 J5,,,,DGD,7.0,,,,7.0,,,,,,9.0,,,,9.0,8,9,2016
3,4,RS11.2,2016-09-18,1600,1623,1635,12.0,AKR - IPT,KM 13 ERO JUNT,"FKJ 611 XB, KNR 377 XA & GED 81 XA",3 BUS,2 COM & PVT,2 TYT & IZUZU,"HIACE, PICNIC & BUS",,,,WOV,3.0,2.0,,,5.0,1.0,1.0,,1.0,3.0,8.0,5.0,,1.0,14.0,9,18,2016
4,5,RS11.2,2016-09-23,1830,1832,1833,1.0,AKR - OWO,KM1.5 NNPC MEGA STATION,AKR 397 AT & WWW 405 AA,2 CAR,2 PVT,2 NISSAN,2 ALMERA,,,,WOV,2.0,,,,2.0,,,,,,2.0,1.0,,1.0,4.0,9,23,2016
5,6,RS11.2,2016-09-25,450,455,500,5.0,AKR - OWO,KM 4 QUARTER GUARD,NO NOS,BUS,PVT,TYT,SIENNA,,,,SOS,3.0,,,,3.0,,,,,,3.0,,,,3.0,9,25,2016
6,1,RS11.21,2016-03-09,930,950,1017,27.0,ORE - LAG,KM 23 OMOTOSHO AXIS,SMK 355 CK & XA 349 DGB,BUS & TRK,PVT & COM,TYT & BEDFORD,SIENNA & TRK,,,,SPV,5.0,1.0,,,6.0,1.0,1.0,,,2.0,6.0,2.0,,,8.0,3,9,2016
7,2,RS11.21,2016-09-22,2020,2025,2027,2.0,ORE - LAG,KM 4 B4 NNPC FILLING STATION,GWK 730 XA,BUS,COM,TYT,SIENNA,,,,SLV,3.0,3.0,,1.0,7.0,1.0,,,,1.0,4.0,3.0,,1.0,8.0,9,22,2016
8,3,RS11.21,2016-09-24,1235,1240,1245,5.0,ORE - BEN,KM 13 OWENA VILLAGE,APP 525 EC,JEEP,PVT,TYT,JEEP,,,,SLV,1.0,1.0,,,2.0,1.0,2.0,,,3.0,2.0,3.0,,,5.0,9,24,2016
9,4,RS11.21,2016-09-25,1410,1417,1422,5.0,ORE - LAG,QKM 7 AFTER MTN MAST,AWK 815 XA & NO NOS,BUS & TRK,2 COM,TYT & MAN DIESEL,HIACE & TRK,,,,SLV & OTH,4.0,2.0,,,6.0,,,,,,9.0,5.0,1.0,,15.0,9,25,2016


In [599]:
def time_splitter(time):
    """Convert a compact clock value such as 45, 335 or 1600 to a timestamp.

    Values of one to four characters are left-padded with zeros to ``HHMM``
    (5 -> 00:05, 45 -> 00:45, 335 -> 03:35, 1600 -> 16:00). Anything else is
    handed to the parser as-is, so an already formatted ``"HH:MM"`` string
    still works.

    Parameters
    ----------
    time
        Integer or string clock value.

    Returns
    -------
    pandas.Timestamp
        Timestamp parsed with the ``%H:%M`` format; callers take ``.dt.time``
        from the result.

    Raises
    ------
    ValueError
        If the value does not form a valid ``%H:%M`` time.
    """
    to_string = str(time)
    if 1 <= len(to_string) <= 4:
        # Zero-padding handles every short form uniformly, including
        # single-digit minutes after midnight, which previously raised.
        padded = to_string.zfill(4)
        return pd.to_datetime(padded[:2] + ":" + padded[2:], format='%H:%M')
    return pd.to_datetime(to_string, format='%H:%M')

In [600]:
# Turn the integer crash times into datetime.time values.
crash_timestamps = total_df['crash_time'].apply(time_splitter)
total_df['crash_time'] = crash_timestamps.dt.time

In [601]:
# Turn the integer report times into datetime.time values.
report_timestamps = total_df['report_time'].apply(time_splitter)
total_df['report_time'] = report_timestamps.dt.time

In [602]:
# Turn the integer arrival times into datetime.time values.
arrival_timestamps = total_df['arrival_time'].apply(time_splitter)
total_df['arrival_time'] = arrival_timestamps.dt.time

In [603]:
def combine_date_time(date, time):
    """Merge a date and a time-of-day into a single ``datetime.datetime``."""
    combined = datetime.datetime.combine(date, time)
    return combined

def subtract_date_time(start, end):
    """Return the difference ``end - start``."""
    elapsed = end - start
    return elapsed

In [604]:
# Attach the row's date to each of its three clock times.
crash_datetime = total_df.apply(
    lambda row: combine_date_time(row['date'], row['crash_time']), axis=1
)
report_datetime = total_df.apply(
    lambda row: combine_date_time(row['date'], row['report_time']), axis=1
)
arrival_datetime = total_df.apply(
    lambda row: combine_date_time(row['date'], row['arrival_time']), axis=1
)

In [605]:
# Seconds between the crash and its report.
crash_to_report = subtract_date_time(crash_datetime, report_datetime)
total_df['report_minus_crash'] = crash_to_report.dt.total_seconds().astype(int)

In [606]:
# Seconds between the crash and the arrival.
crash_to_arrival = subtract_date_time(crash_datetime, arrival_datetime)
total_df['arrival_minus_crash'] = crash_to_arrival.dt.total_seconds().astype(int)

In [607]:
# Seconds between the report and the arrival.
report_to_arrival = subtract_date_time(report_datetime, arrival_datetime)
total_df["arrival_minus_report"] = report_to_arrival.dt.total_seconds().astype(int)

In [608]:
# Store the combined crash date and time as the row's 'datetime' column.
total_df['datetime'] = pd.to_datetime(crash_datetime)

In [609]:
# Turn the remaining NaN values back into 0 (an earlier cell replaced every 0
# with NaN). This applies to every column of the frame.
total_df.replace(np.nan, 0, inplace=True)

In [610]:
# In these three columns a 0 is put back to NaN after the frame-wide fill above.
for sparse_column in ('fleet_operator', 'name_of_driver', 'dl_no'):
    total_df[sparse_column] = total_df[sparse_column].replace(0, np.nan)

In [611]:
# Inspect the raw vehicle_type entries: counts and '&'-joined types such as
# '2 CAR & BUS' appear in the output below.
total_df['vehicle_type']

0                                             2 BUS
1                                         BUS & CAR
2                                             2 BUS
3                                             3 BUS
4                                             2 CAR
5                                               BUS
6                                         BUS & TRK
7                                               BUS
8                                              JEEP
9                                         BUS & TRK
10                                              CAR
11                                              CAR
12                                              CAR
13                                             BIKE
14                                      2 CAR & BUS
15                                             BIKE
16                                           2 BIKE
17                                       TRK & BIKE
18                                            2 TRK
19          

In [612]:
def get_automobile_no(car_det):
    """Count the vehicles described by a category string such as '2 COM & PVT'.

    The text is split on '&', commas and spaces. A numeric token ``n`` adds
    ``n - 1`` (the name that follows it adds the remaining one) and every
    other token adds 1, so '2 COM' gives 2 and '2 COM & PVT' gives 3.
    Hit-and-run markers and '*' are ignored.

    Parameters
    ----------
    car_det
        Category description. Non-string values (for example the 0 that the
        notebook substitutes for missing data) yield 0.

    Returns
    -------
    int
        Number of vehicles, never negative; 0 when nothing can be counted.
    """
    # Explicit guard instead of a bare `except:` so real errors are not hidden.
    if not isinstance(car_det, str):
        return 0

    ignored_tokens = {"", "*", 'HIT', 'RUN', '(HIT', 'RUN)'}
    tokens = []
    for group in car_det.split('&'):
        # Break each '&' group on commas or runs of spaces.
        for val in re.split(',| * ', group.strip()):
            val = val.strip()
            if val not in ignored_tokens:
                tokens.append(val)

    automobile_no = 0
    for token in tokens:
        if token.isdigit():
            try:
                automobile_no += int(token) - 1
            except ValueError:
                # isdigit() accepts characters (e.g. superscripts) that int()
                # rejects; treat such input as uncountable.
                return 0
        else:
            automobile_no += 1
    return max(automobile_no, 0)




In [613]:
# Derive the vehicle count for each row from its vehicle_cat text.
total_df['no_automobile'] = total_df['vehicle_cat'].apply(get_automobile_no)

In [614]:
# Trim leading and trailing whitespace from the route names.
total_df['route'] = total_df['route'].str.strip()

In [615]:
# Show rows whose vehicle_cat is exactly '3COM' (count and code written
# without a separating space).
total_df[total_df['vehicle_cat']== '3COM']

Unnamed: 0,sn,command,date,crash_time,report_time,arrival_time,response_time,route,location,vehicle_no,vehicle_type,vehicle_cat,vehicle_make,vehicle_model,fleet_operator,name_of_driver,dl_no,causes,no_injured_male_adult,no_injured_female_adult,no_injured_male_child,no_injured_female_child,total_injured,no_killed_male_adult,no_killed_female_adult,no_killed_male_child,no_killed_female_child,total_killed,no_involved_male_adult,no_involved_female_adult,no_involved_male_child,no_involved_female_child,total_involved,month,day,year,report_minus_crash,arrival_minus_crash,arrival_minus_report,datetime,no_automobile


In [616]:
# Write the cleaned, aggregated frame to disk without the index column.
total_df.to_csv(path_or_buf='data/cleaned_aggregated.csv', index=False)

In [617]:
# List the columns of the cleaned frame. The original cell referenced `c`,
# a name no cell in this notebook defines, so it failed on a fresh kernel.
total_df.columns

Index(['sn', 'command', 'date', 'crash_time', 'report_time', 'arrival_time',
       'response_time', 'route', 'location', 'vehicle_no', 'vehicle_type',
       'vehicle_cat', 'vehicle_make', 'vehicle_model', 'fleet_operator',
       'name_of_driver', 'dl_no', 'causes', 'no_injured_male_adult',
       'no_injured_female_adult', 'no_injured_male_child',
       'no_injured_female_child', 'total_injured', 'no_killed_male_adult',
       'no_killed_female_adult', 'no_killed_male_child',
       'no_killed_female_child', 'total_killed', 'no_involved_male_adult',
       'no_involved_female_adult', 'no_involved_male_child',
       'no_involved_female_child', 'total_involved', 'month', 'day', 'year',
       'report_minus_crash', 'arrival_minus_crash', 'arrival_minus_report',
       'datetime', 'no_automobile'],
      dtype='object')

In [618]:
# Preview the first rows of the final frame; with display.max_rows set to
# None a bare `total_df` would render every row.
total_df.head(10)

Unnamed: 0,sn,command,date,crash_time,report_time,arrival_time,response_time,route,location,vehicle_no,vehicle_type,vehicle_cat,vehicle_make,vehicle_model,fleet_operator,name_of_driver,dl_no,causes,no_injured_male_adult,no_injured_female_adult,no_injured_male_child,no_injured_female_child,total_injured,no_killed_male_adult,no_killed_female_adult,no_killed_male_child,no_killed_female_child,total_killed,no_involved_male_adult,no_involved_female_adult,no_involved_male_child,no_involved_female_child,total_involved,month,day,year,report_minus_crash,arrival_minus_crash,arrival_minus_report,datetime,no_automobile
0,1,RS11.2,2016-02-09,03:35:00,04:05:00,04:12:00,7.0,AKR -IPT,KM 2.5 NEAR ANTI CULTIT P/STATION,KPA 328 LG & LSR 999 XN,2 BUS,2 COM,2 TYT,2 HIACE,,,,SOS,3.0,3.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,0.0,0.0,9.0,2,9,2016,1800,2220,420,2016-02-09 03:35:00,2
1,2,RS11.2,2016-02-09,10:30:00,10:32:00,10:35:00,3.0,AKR - IPT,KM 6 ILARA MOKIN,AKD 991 CE & EKY 431 EH,BUS & CAR,2 PVT,HONDA & TYT,ODESS& COROLLA,,,,DGD,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,2,9,2016,120,300,180,2016-02-09 10:30:00,2
2,3,RS11.2,2016-08-09,00:45:00,00:56:00,01:05:00,9.0,AKR - OWO,KM 2.5 SEENI JUNT.,DKA 378 YYY & TFB 118 XA,2 BUS,2 COM,2 PGT,2 J5,,,,DGD,7.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,9.0,8,9,2016,660,1200,540,2016-08-09 00:45:00,2
3,4,RS11.2,2016-09-18,16:00:00,16:23:00,16:35:00,12.0,AKR - IPT,KM 13 ERO JUNT,"FKJ 611 XB, KNR 377 XA & GED 81 XA",3 BUS,2 COM & PVT,2 TYT & IZUZU,"HIACE, PICNIC & BUS",,,,WOV,3.0,2.0,0.0,0.0,5.0,1.0,1.0,0.0,1.0,3.0,8.0,5.0,0.0,1.0,14.0,9,18,2016,1380,2100,720,2016-09-18 16:00:00,3
4,5,RS11.2,2016-09-23,18:30:00,18:32:00,18:33:00,1.0,AKR - OWO,KM1.5 NNPC MEGA STATION,AKR 397 AT & WWW 405 AA,2 CAR,2 PVT,2 NISSAN,2 ALMERA,,,,WOV,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,4.0,9,23,2016,120,180,60,2016-09-23 18:30:00,2
5,6,RS11.2,2016-09-25,04:50:00,04:55:00,05:00:00,5.0,AKR - OWO,KM 4 QUARTER GUARD,NO NOS,BUS,PVT,TYT,SIENNA,,,,SOS,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,9,25,2016,300,600,300,2016-09-25 04:50:00,1
6,1,RS11.21,2016-03-09,09:30:00,09:50:00,10:17:00,27.0,ORE - LAG,KM 23 OMOTOSHO AXIS,SMK 355 CK & XA 349 DGB,BUS & TRK,PVT & COM,TYT & BEDFORD,SIENNA & TRK,,,,SPV,5.0,1.0,0.0,0.0,6.0,1.0,1.0,0.0,0.0,2.0,6.0,2.0,0.0,0.0,8.0,3,9,2016,1200,2820,1620,2016-03-09 09:30:00,2
7,2,RS11.21,2016-09-22,20:20:00,20:25:00,20:27:00,2.0,ORE - LAG,KM 4 B4 NNPC FILLING STATION,GWK 730 XA,BUS,COM,TYT,SIENNA,,,,SLV,3.0,3.0,0.0,1.0,7.0,1.0,0.0,0.0,0.0,1.0,4.0,3.0,0.0,1.0,8.0,9,22,2016,300,420,120,2016-09-22 20:20:00,1
8,3,RS11.21,2016-09-24,12:35:00,12:40:00,12:45:00,5.0,ORE - BEN,KM 13 OWENA VILLAGE,APP 525 EC,JEEP,PVT,TYT,JEEP,,,,SLV,1.0,1.0,0.0,0.0,2.0,1.0,2.0,0.0,0.0,3.0,2.0,3.0,0.0,0.0,5.0,9,24,2016,300,600,300,2016-09-24 12:35:00,1
9,4,RS11.21,2016-09-25,14:10:00,14:17:00,14:22:00,5.0,ORE - LAG,QKM 7 AFTER MTN MAST,AWK 815 XA & NO NOS,BUS & TRK,2 COM,TYT & MAN DIESEL,HIACE & TRK,,,,SLV & OTH,4.0,2.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,9.0,5.0,1.0,0.0,15.0,9,25,2016,420,720,300,2016-09-25 14:10:00,2
