In [1]:
import pandas as pd

# Load the CSV file into a pandas DataFrame
vpicdecode_20 = pd.read_csv('VPICDECODE_20.csv', encoding='ISO-8859-1', low_memory=False)
vpicdecode_21 = pd.read_csv('VPICDECODE_21.csv', encoding='ISO-8859-1', low_memory=False)
vpicdecode_22 = pd.read_csv('VPICDECODE_22.csv', encoding='ISO-8859-1', low_memory=False)

# Add the Year column to identify which year
vpicdecode_20['Year'] = 2020
vpicdecode_21['Year'] = 2021
vpicdecode_22['Year'] = 2022

# Inspect data
vpicdecode_20.head()
vpicdecode_21.head()
vpicdecode_22.head()


Unnamed: 0,CASEID,PSU,CASENO,CASENUMBER,CATEGORY,VEHNO,VehicleDescriptor,VINDecodedOn,VINDecodeError,VehicleTypeId,...,KeylessIgnition,SAEAutomationLevel_from,SAEAutomationLevel_to,AutoReverseSystemId,AutoReverseSystem,ActiveSafetySysNote,CASEWGT,PSUSTRAT,VERSION,Year
0,24630,54,5,1-54-2022-005-10,10,1,1G1ZG5E7*CF******,04OCT2023:07:27:26.617,0,2,...,,,,,,,962.306932,9,7,2022
1,24632,17,4,1-17-2022-004-10,10,1,5FNRL387*7B******,04OCT2023:07:27:27.340,0,7,...,,,,,,,1006.177319,4,7,2022
2,24637,19,14,1-19-2022-014-06,6,2,1C6RR7XT*GS******,04OCT2023:07:27:29.290,0,3,...,,,,,,,156.257796,9,7,2022
3,24638,11,3,1-11-2022-003-06,6,2,5N1AR2MM*DC******,04OCT2023:07:27:29.620,0,7,...,,,,,,,2831.781528,8,7,2022
4,24645,13,9,1-13-2022-009-01,1,1,JTLZE4FE*A1******,04OCT2023:07:27:32.377,0,7,...,,,,,,,150.229977,7,7,2022


In [2]:
# Concatenate the datasets
decode_data = pd.concat([vpicdecode_20, vpicdecode_21, vpicdecode_22], ignore_index=True)

decode_data.head()


Unnamed: 0,CASEID,PSU,CASENO,CASENUMBER,CATEGORY,VEHNO,VehicleDescriptor,VINDecodedOn,VINDecodeError,VehicleTypeId,...,KeylessIgnition,SAEAutomationLevel_from,SAEAutomationLevel_to,AutoReverseSystemId,AutoReverseSystem,ActiveSafetySysNote,CASEWGT,PSUSTRAT,VERSION,Year
0,16035,52,2,1-52-2020-002-07,7,1,2FMHK6DC*AB******,14OCT2021:05:06:19.637,0,7,...,,,,,,,64.65143,12,5,2020
1,16035,52,2,1-52-2020-002-07,7,2,5TFUX4EN*CX******,14OCT2021:05:07:14.743,0,3,...,,,,,,,64.65143,12,5,2020
2,16037,66,1,1-66-2020-001-10,10,1,2GCEK13T*51******,14OCT2021:05:07:14.883,0,3,...,,,,,,,3758.565419,6,5,2020
3,16038,16,1,1-16-2020-001-09,9,1,2G4WS52J*31******,14OCT2021:05:06:57.020,0,2,...,,,,,,,2592.567438,5,5,2020
4,16038,16,1,1-16-2020-001-09,9,2,2G1WT57K*91******,14OCT2021:05:07:14.977,0,2,...,,,,,,,2592.567438,5,5,2020


In [3]:
# List of the relevant columns
adas_columns = [
    'ForwardCollisionWarning',
    'DynamicBrakeSupport',
    'CrashImminentBraking',
    'PedestrianAutoEmergencyBraking',
    'BlindSpotWarning',
    'BlindSpotIntervention',
    'LaneDepartureWarning',
    'LaneKeepingAssistance',
    'LaneCenteringAssistance',
    'RearCrossTrafficAlert',
    'RearAutomaticEmergencyBraking',
    'AdaptiveDrivingBeam',
    'AdaptiveCruiseControl',
    'ElectronicStabilityControl',
    'TractionControl',
    'AutoPedestrianAlertingSound'
]

# Create the 'ADAS' column based on whether any of the ADAS-related columns have a non-null value
adas_series = decode_data[adas_columns].notnull().any(axis=1).astype(int)

# Use pd.concat() to add the ADAS column in one go to avoid fragmentation
decode_data = pd.concat([decode_data, adas_series.rename('ADAS')], axis=1)

# To defragment the DataFrame, if needed, you can create a copy
decode_data = decode_data.copy()

# Verify the result
print(decode_data[['ADAS'] + adas_columns].head())


0     0                     NaN                 NaN                  NaN   
1     0                     NaN                 NaN                  NaN   
2     0                     NaN                 NaN                  NaN   
3     0                     NaN                 NaN                  NaN   
4     0                     NaN                 NaN                  NaN   

0                            NaN              NaN                   NaN   
1                            NaN              NaN                   NaN   
2                            NaN              NaN                   NaN   
3                            NaN              NaN                   NaN   
4                            NaN              NaN                   NaN   

0                  NaN                   NaN                     NaN   
1                  NaN                   NaN                     NaN   
2                  NaN                   NaN                     NaN   
3                  NaN     

In [4]:
# Filter rows where VEHNO is '1'
# decode_data = decode_data[decode_data['VEHNO'] == 1]

# decode_data.head()

In [5]:
# Number of rows (total crashes)
total_crashes = len(decode_data)

# Number of vehicles without ADAS (ADAS = 0)
vehicles_without_adas = decode_data[decode_data['ADAS'] == 0].shape[0]

# Number of vehicles with ADAS (ADAS = 1)
vehicles_with_adas = decode_data[decode_data['ADAS'] == 1].shape[0]

# Percentage of vehicles with ADAS
percentage_with_adas = (vehicles_with_adas / total_crashes) * 100

# Print the results
print(f"Total crashes: {total_crashes}")
print(f"Number of vehicles without ADAS (ADAS = 0): {vehicles_without_adas}")
print(f"Number of vehicles with ADAS (ADAS = 1): {vehicles_with_adas}")
print(f"Percentage of vehicles with ADAS: {percentage_with_adas:.2f}%")


Total crashes: 17182
Number of vehicles without ADAS (ADAS = 0): 12710
Number of vehicles with ADAS (ADAS = 1): 4472
Percentage of vehicles with ADAS: 26.03%


In [6]:
decode_data.head()

Unnamed: 0,CASEID,PSU,CASENO,CASENUMBER,CATEGORY,VEHNO,VehicleDescriptor,VINDecodedOn,VINDecodeError,VehicleTypeId,...,SAEAutomationLevel_from,SAEAutomationLevel_to,AutoReverseSystemId,AutoReverseSystem,ActiveSafetySysNote,CASEWGT,PSUSTRAT,VERSION,Year,ADAS
0,16035,52,2,1-52-2020-002-07,7,1,2FMHK6DC*AB******,14OCT2021:05:06:19.637,0,7,...,,,,,,64.65143,12,5,2020,0
1,16035,52,2,1-52-2020-002-07,7,2,5TFUX4EN*CX******,14OCT2021:05:07:14.743,0,3,...,,,,,,64.65143,12,5,2020,0
2,16037,66,1,1-66-2020-001-10,10,1,2GCEK13T*51******,14OCT2021:05:07:14.883,0,3,...,,,,,,3758.565419,6,5,2020,0
3,16038,16,1,1-16-2020-001-09,9,1,2G4WS52J*31******,14OCT2021:05:06:57.020,0,2,...,,,,,,2592.567438,5,5,2020,0
4,16038,16,1,1-16-2020-001-09,9,2,2G1WT57K*91******,14OCT2021:05:07:14.977,0,2,...,,,,,,2592.567438,5,5,2020,0


In [7]:
# Create decode_new by selecting the specified columns from decode_data
decode_new = decode_data[['CASEID', 'VEHNO', 'VehicleType', 'Make', 'Model', 'ModelYear', 'ADAS']]

# Verify the result by printing the first few rows
print(decode_new.head())


   CASEID  VEHNO                           VehicleType       Make      Model  \
0   16035      1  MULTIPURPOSE PASSENGER VEHICLE (MPV)       FORD       Flex   
1   16035      2                                 TRUCK     TOYOTA     Tacoma   
2   16037      1                                 TRUCK  CHEVROLET  Silverado   
3   16038      1                         PASSENGER CAR      BUICK    Century   
4   16038      2                         PASSENGER CAR  CHEVROLET     Impala   

   ModelYear  ADAS  
0       2010     0  
1       2012     0  
2       2005     0  
3       2003     0  
4       2009     0  


In [8]:
# Create adas by filtering where ADAS is '1'
adas_data = decode_new[decode_new['ADAS'] == 1]
adas_data.to_csv("DataNew/adas.csv")

In [9]:
adas_data.head()

Unnamed: 0,CASEID,VEHNO,VehicleType,Make,Model,ModelYear,ADAS
12,16083,2,MULTIPURPOSE PASSENGER VEHICLE (MPV),LEXUS,RX,2018,1
15,16086,1,PASSENGER CAR,HONDA,Civic,2017,1
16,16086,2,MULTIPURPOSE PASSENGER VEHICLE (MPV),TOYOTA,RAV4,2019,1
38,16059,3,MULTIPURPOSE PASSENGER VEHICLE (MPV),FORD,Escape,2017,1
45,16274,1,MULTIPURPOSE PASSENGER VEHICLE (MPV),JEEP,Renegade,2018,1


In [10]:
# Create no_adas by filtering where ADAS is '0'
no_adas_data = decode_new[decode_new['ADAS'] == 0]
no_adas_data.to_csv("DataNew/no_adas.csv")

In [11]:
no_adas_data.head()

Unnamed: 0,CASEID,VEHNO,VehicleType,Make,Model,ModelYear,ADAS
0,16035,1,MULTIPURPOSE PASSENGER VEHICLE (MPV),FORD,Flex,2010,0
1,16035,2,TRUCK,TOYOTA,Tacoma,2012,0
2,16037,1,TRUCK,CHEVROLET,Silverado,2005,0
3,16038,1,PASSENGER CAR,BUICK,Century,2003,0
4,16038,2,PASSENGER CAR,CHEVROLET,Impala,2009,0


In [12]:
df_all = pd.read_csv('DataNew/Mergedata.csv',encoding='cp1252')

In [13]:
df_all = df_all.drop(columns=['VEHNO'], errors='ignore')

In [14]:
adas_data.shape

(4472, 7)

In [15]:
no_adas_data.shape

(12710, 7)

In [16]:
df_all.shape

(9951, 145)

In [17]:
dfad = pd.merge(adas_data, df_all, how="inner", on="CASEID")
dfad.head()

Unnamed: 0.1,CASEID,VEHNO,VehicleType,Make,Model,ModelYear,ADAS,Unnamed: 0,CASEWGT,DISTRACTN_1,...,WEATHER_8.0,WEATHER_9.0,WEATHER_10.0,WEATHER_98.0,WEATHER_99.0,DISTRACT_0.0,DISTRACT_1.0,DISTRACT_2.0,DISTRACT_3.0,DISTRACT_9.0
0,16083,2,MULTIPURPOSE PASSENGER VEHICLE (MPV),LEXUS,RX,2018,1,94,427,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,16086,1,PASSENGER CAR,HONDA,Civic,2017,1,4719,109,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,16086,2,MULTIPURPOSE PASSENGER VEHICLE (MPV),TOYOTA,RAV4,2019,1,4719,109,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,16059,3,MULTIPURPOSE PASSENGER VEHICLE (MPV),FORD,Escape,2017,1,66,502,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,16274,1,MULTIPURPOSE PASSENGER VEHICLE (MPV),JEEP,Renegade,2018,1,3622,41,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [18]:
dfnad = pd.merge(adas_data, df_all, how="inner", on="CASEID")
dfnad.head()

Unnamed: 0.1,CASEID,VEHNO,VehicleType,Make,Model,ModelYear,ADAS,Unnamed: 0,CASEWGT,DISTRACTN_1,...,WEATHER_8.0,WEATHER_9.0,WEATHER_10.0,WEATHER_98.0,WEATHER_99.0,DISTRACT_0.0,DISTRACT_1.0,DISTRACT_2.0,DISTRACT_3.0,DISTRACT_9.0
0,16083,2,MULTIPURPOSE PASSENGER VEHICLE (MPV),LEXUS,RX,2018,1,94,427,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,16086,1,PASSENGER CAR,HONDA,Civic,2017,1,4719,109,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,16086,2,MULTIPURPOSE PASSENGER VEHICLE (MPV),TOYOTA,RAV4,2019,1,4719,109,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,16059,3,MULTIPURPOSE PASSENGER VEHICLE (MPV),FORD,Escape,2017,1,66,502,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,16274,1,MULTIPURPOSE PASSENGER VEHICLE (MPV),JEEP,Renegade,2018,1,3622,41,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [19]:
#Driver Error (ADAS)

In [20]:
cols=["DISTRACTN_1",\
    "DISTRACTN_2","DISTRACTN_3","DISTRACTN_4","DISTRACTN_5","DISTRACTN_6","DISTRACTN_7",\
     "DISTRACTN_8","DISTRACTN_9","DISTRACTN_10","DISTRACTN_99","DISTRACT_2.0","DISTRACT_3.0","CRITEVENT_6.0",\
    "CRITEVENT_8.0","CRITEVENT_10.0","CRITEVENT_11.0","CRITEVENT_12.0","CRITEVENT_13.0","CRITEVENT_15.0"\
     ,"CRITEVENT_16.0","CRITEVENT_17.0","CRITEVENT_18.0","CRITEVENT_19.0","CRITEVENT_20.0","CRITEVENT_21.0",\
    "CRITEVENT_51.0","CRITEVENT_52.0","CRITEVENT_53.0","CRITEVENT_54.0"\
    ,"CRITEVENT_55.0","CRITEVENT_56.0","CRITEVENT_59.0","CRITEVENT_60.0","CRITEVENT_61.0","CRITEVENT_62.0","CRITEVENT_63.0","CRITEVENT_64.0"\
     ,"CRITEVENT_65.0","CRITEVENT_66.0","CRITEVENT_67.0","CRITEVENT_68.0","CRITEVENT_70.0","CRITEVENT_71.0","CRITEVENT_72.0","CRITEVENT_73.0","CRITEVENT_74.0","CRITEVENT_78.0","CRITEVENT_80.0","CRITEVENT_81.0"\
     ,"CRITEVENT_83.0","PRELOC_2.0","PRELOC_4.0","PRELOC_6.0","PRELOC_7.0","OBJCONT_77","OBJCONT_78"]
dfde=dfad[cols].copy()
dfde["DE"]=dfde.any(axis="columns")
dfde.head()

Unnamed: 0,DISTRACTN_1,DISTRACTN_2,DISTRACTN_3,DISTRACTN_4,DISTRACTN_5,DISTRACTN_6,DISTRACTN_7,DISTRACTN_8,DISTRACTN_9,DISTRACTN_10,...,CRITEVENT_80.0,CRITEVENT_81.0,CRITEVENT_83.0,PRELOC_2.0,PRELOC_4.0,PRELOC_6.0,PRELOC_7.0,OBJCONT_77,OBJCONT_78,DE
0,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,True
1,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,,,,,,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,True
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True


In [21]:
dfde['DE'].value_counts()

DE
True     4271
False     200
Name: count, dtype: int64

In [22]:
data=[dfad["CASEID"],dfad['CASEWGT'],dfde["DE"]]
headers=["ypc","combwgt","de"]
dffinal=pd.concat(data,axis=1, keys=headers)
dffinal["de"]=dffinal["de"].astype(int)
dffinal.head()

Unnamed: 0,ypc,combwgt,de
0,16083,427,1
1,16086,109,1
2,16086,109,1
3,16059,502,1
4,16274,41,1


In [23]:
dfde.to_csv("DataNew/DE1.csv")

In [24]:
dfde.shape

(4471, 58)

In [25]:
#Human Factors

In [26]:
cols=["OBJCONT_72","OBJCONT_73","OBJCONT_75","DISTRACTN_1","DISTRACTN_2","CRITEVENT_50.0","CRITEVENT_56.0","PRELOC_6.0","PRELOC_7.0"]
dfhf=dfad[cols].copy()
dfhf["HF"]=dfhf.any(axis="columns")
dfhf.head()

Unnamed: 0,OBJCONT_72,OBJCONT_73,OBJCONT_75,DISTRACTN_1,DISTRACTN_2,CRITEVENT_50.0,CRITEVENT_56.0,PRELOC_6.0,PRELOC_7.0,HF
0,,,,,,0.0,0.0,0.0,0.0,False
1,0.0,0.0,0.0,,,1.0,0.0,0.0,0.0,True
2,0.0,0.0,0.0,,,1.0,0.0,0.0,0.0,True
3,,,,,,0.0,0.0,0.0,0.0,False
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,True


In [27]:
dfhf['HF'].value_counts()

HF
False    3778
True      693
Name: count, dtype: int64

In [28]:
data=[dfad["CASEID"],dfad['CASEWGT'],dfhf["HF"]]
headers=["ypc","combwgt","de"]
dffinal=pd.concat(data,axis=1, keys=headers)
dffinal["de"]=dffinal["de"].astype(int)
dffinal.head()

Unnamed: 0,ypc,combwgt,de
0,16083,427,0
1,16086,109,1
2,16086,109,1
3,16059,502,0
4,16274,41,1


In [29]:
dfhf.to_excel("DataNew/HF1.xlsx")

In [30]:
dfhf.shape

(4471, 10)

In [31]:
#Vehicle Factors

In [32]:
cols=["OBJCONT_78","CRITEVENT_3.0","CRITEVENT_4.0","CRITEVENT_8.0"]
dfvf=dfad[cols].copy()
dfvf["VF"]=dfvf.any(axis="columns")
dfvf.head()

Unnamed: 0,OBJCONT_78,CRITEVENT_3.0,CRITEVENT_4.0,CRITEVENT_8.0,VF
0,,0.0,0.0,0.0,False
1,0.0,0.0,0.0,0.0,False
2,0.0,0.0,0.0,0.0,False
3,,0.0,0.0,0.0,False
4,0.0,0.0,0.0,0.0,False


In [33]:
dfvf['VF'].value_counts()

VF
False    4429
True       42
Name: count, dtype: int64

In [34]:
data=[dfad["CASEID"],dfad['CASEWGT'],dfvf["VF"]]
headers=["ypc","combwgt","vf"]
dffinal=pd.concat(data,axis=1, keys=headers)
dffinal["vf"]=dffinal["vf"].astype(int)
dffinal.head()

Unnamed: 0,ypc,combwgt,vf
0,16083,427,0
1,16086,109,0
2,16086,109,0
3,16059,502,0
4,16274,41,0


In [35]:
dfol=dffinal[(dffinal.vf==1)]
dfol["combwgt"].sum()

11833

In [36]:
dfvf.shape

(4471, 5)

In [37]:
dfvf.to_excel("DataNew/VF1.xlsx")

In [38]:
#Vehicle Maintenance

In [39]:
cols=["CRITEVENT_1.0"]
dfvm=dfad[cols].copy()
dfvm["VM"]=dfvm.any(axis="columns")
dfvm.head()

Unnamed: 0,CRITEVENT_1.0,VM
0,0.0,False
1,0.0,False
2,0.0,False
3,0.0,False
4,0.0,False


In [40]:
dfvm['VM'].value_counts()

VM
False    4460
True       11
Name: count, dtype: int64

In [41]:
data=[dfad["CASEID"],dfad['CASEWGT'],dfvm["VM"]]
headers=["ypc","combwgt","vm"]
dffinal=pd.concat(data,axis=1, keys=headers)
dffinal["vm"]=dffinal["vm"].astype(int)
dffinal.head()

Unnamed: 0,ypc,combwgt,vm
0,16083,427,0
1,16086,109,0
2,16086,109,0
3,16059,502,0
4,16274,41,0


In [42]:
dfol=dffinal[(dffinal.vm==1)]
dfol["combwgt"].sum()

10129

In [43]:
dfvm.shape

(4471, 2)

In [44]:
dfvm.to_excel("DataNew/VM1.xlsx")

In [45]:
#Infrastructure Factors

In [46]:
cols=[ 'OBJCONT_41','OBJCONT_42', 'OBJCONT_43', 'OBJCONT_44', 'OBJCONT_45',
      'OBJCONT_47','OBJCONT_48','OBJCONT_49','OBJCONT_50','OBJCONT_51','OBJCONT_52','OBJCONT_53','OBJCONT_54','OBJCONT_55','OBJCONT_56','OBJCONT_57',\
   'OBJCONT_58','OBJCONT_59','OBJCONT_60','OBJCONT_61','OBJCONT_62','OBJCONT_63','OBJCONT_64','OBJCONT_68','OBJCONT_69',\
      "CRITEVENT_5.0","TRAFFUNCT_1.0"]#'OBJCONT_76.0',"CRITEVENT_87.0","CRITEVENT_88.0","CRITEVENT_89.0","CRITEVENT_90.0","CRITEVENT_91.0","CRITEVENT_92.0"]
#cols=["WEATHER_2.0","WEATHER_3.0","WEATHER_4.0","WEATHER_5.0","WEATHER_6.0","WEATHER_7.0","WEATHER_9.0","WEATHER_10.0"]
dfif=dfad[cols].copy()
dfif["IF"]=dfif.any(axis="columns")
dfif.head()

Unnamed: 0,OBJCONT_41,OBJCONT_42,OBJCONT_43,OBJCONT_44,OBJCONT_45,OBJCONT_47,OBJCONT_48,OBJCONT_49,OBJCONT_50,OBJCONT_51,...,OBJCONT_60,OBJCONT_61,OBJCONT_62,OBJCONT_63,OBJCONT_64,OBJCONT_68,OBJCONT_69,CRITEVENT_5.0,TRAFFUNCT_1.0,IF
0,,,,,,,,,,,...,,,,,,,,0.0,0.0,False
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,,,,,,,,,,,...,,,,,,,,0.0,0.0,False
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True


In [47]:
dfif['IF'].value_counts()

IF
False    3092
True     1379
Name: count, dtype: int64

In [48]:
data=[dfad["CASEID"],dfad['CASEWGT'],dfif["IF"]]
headers=["ypc","combwgt","df"]
dffinal=pd.concat(data,axis=1, keys=headers)
dffinal["df"]=dffinal["df"].astype(int)
dffinal.head()

Unnamed: 0,ypc,combwgt,df
0,16083,427,0
1,16086,109,1
2,16086,109,1
3,16059,502,0
4,16274,41,1


In [49]:
dfol=dffinal[(dffinal.df==1)]
dfol["combwgt"].sum()

595750

In [50]:
dfif.shape

(4471, 28)

In [51]:
dfif.to_excel("DataNew/IF1.xlsx")

In [52]:
#Weather Factors

In [53]:
cols=['OBJCONT_61','OBJCONT_68','OBJCONT_69','OBJCONT_76',"CRITEVENT_5.0","CRITEVENT_87.0","CRITEVENT_88.0","CRITEVENT_89.0","CRITEVENT_90.0","CRITEVENT_91.0","CRITEVENT_92.0",\
      "SURFCOND_2.0","SURFCOND_3.0","SURFCOND_4.0","SURFCOND_5.0","SURFCOND_6.0","SURFCOND_8.0","SURFCOND_9.0",\
      "WEATHER_2.0","WEATHER_3.0","WEATHER_4.0","WEATHER_5.0","WEATHER_6.0","WEATHER_7.0","WEATHER_9.0","WEATHER_10.0"]
dfwe=dfad[cols].copy()
dfwe["WE"]=dfwe.any(axis="columns")
dfwe.head()

Unnamed: 0,OBJCONT_61,OBJCONT_68,OBJCONT_69,OBJCONT_76,CRITEVENT_5.0,CRITEVENT_87.0,CRITEVENT_88.0,CRITEVENT_89.0,CRITEVENT_90.0,CRITEVENT_91.0,...,SURFCOND_9.0,WEATHER_2.0,WEATHER_3.0,WEATHER_4.0,WEATHER_5.0,WEATHER_6.0,WEATHER_7.0,WEATHER_9.0,WEATHER_10.0,WE
0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3,,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [54]:
dfwe['WE'].value_counts()

WE
False    3404
True     1067
Name: count, dtype: int64

In [55]:
data=[dfad["CASEID"],dfad['CASEWGT'],dfwe["WE"]]
headers=["ypc","combwgt","df"]
dffinal=pd.concat(data,axis=1, keys=headers)
dffinal["df"]=dffinal["df"].astype(int)
dffinal.head()

Unnamed: 0,ypc,combwgt,df
0,16083,427,0
1,16086,109,0
2,16086,109,0
3,16059,502,1
4,16274,41,0


In [56]:
dfol=dffinal[(dffinal.df==1)]
dfol["combwgt"].sum()

549485

In [57]:
dfwe.shape

(4471, 27)

In [58]:
dfwe.to_excel("DataNew/WE1.xlsx")

In [59]:
#Socio Economic Factors

In [60]:
cols=['DISTRACTN_1',"CRITEVENT_6.0"]
dfsf=dfad[cols].copy()
dfsf["SF"]=dfsf.any(axis="columns")
dfsf.head()

Unnamed: 0,DISTRACTN_1,CRITEVENT_6.0,SF
0,,0.0,False
1,,0.0,False
2,,0.0,False
3,,0.0,False
4,1.0,0.0,True


In [61]:
dfsf['SF'].value_counts()

SF
False    4166
True      305
Name: count, dtype: int64

In [62]:
data=[dfad["CASEID"],dfad['CASEWGT'],dfsf["SF"]]
headers=["ypc","combwgt","sf"]
dffinal=pd.concat(data,axis=1, keys=headers)
dffinal["sf"]=dffinal["sf"].astype(int)
dffinal.head()

Unnamed: 0,ypc,combwgt,sf
0,16083,427,0
1,16086,109,0
2,16086,109,0
3,16059,502,0
4,16274,41,1


In [63]:
dfol=dffinal[(dffinal.sf==1)]
dfol["combwgt"].sum()

113025

In [64]:
dfsf.shape

(4471, 3)

In [65]:
dfsf.to_excel("DataNew/SF1.xlsx")

In [66]:
data=[dfad["CASEID"],dfad['CASEWGT'],dfde["DE"],dfhf["HF"],dfvf['VF'],dfvm['VM'],dfif['IF'],dfwe['WE'],dfsf['SF']]
headers=["ypc","combwgt","de","hf","vf","vm","if","we","sf"]
dffinal=pd.concat(data,axis=1, keys=headers)
dffinal["de"]=dffinal["de"].astype(int)
dffinal["hf"]=dffinal["hf"].astype(int)
dffinal["vf"]=dffinal["vf"].astype(int)
dffinal["vm"]=dffinal["vm"].astype(int)
dffinal["if"]=dffinal["if"].astype(int)
dffinal["we"]=dffinal["we"].astype(int)
dffinal["sf"]=dffinal["sf"].astype(int)
dffinal.head()

Unnamed: 0,ypc,combwgt,de,hf,vf,vm,if,we,sf
0,16083,427,1,0,0,0,0,0,0
1,16086,109,1,1,0,0,1,0,0
2,16086,109,1,1,0,0,1,0,0
3,16059,502,1,0,0,0,0,1,0
4,16274,41,1,1,0,0,1,0,1


In [67]:
dffinal.to_csv("DataNew/Contributingfactor.CSV")

In [68]:
df = pd.read_csv('DataNew/Contributingfactor.CSV')

In [69]:
df.head()

Unnamed: 0.1,Unnamed: 0,ypc,combwgt,de,hf,vf,vm,if,we,sf
0,0,16083,427,1,0,0,0,0,0,0
1,1,16086,109,1,1,0,0,1,0,0
2,2,16086,109,1,1,0,0,1,0,0
3,3,16059,502,1,0,0,0,0,1,0
4,4,16274,41,1,1,0,0,1,0,1


In [70]:
df["combwgt"].sum()

2733586

In [71]:
cols=["de","hf"]
dfhuman=df[cols].copy()
dfhuman["DEHFSF"]=dfhuman.any(axis="columns")
dfhuman.head()

Unnamed: 0,de,hf,DEHFSF
0,1,0,True
1,1,1,True
2,1,1,True
3,1,0,True
4,1,1,True


In [72]:
dfhuman["DEHFSF"].value_counts()

DEHFSF
True     4272
False     199
Name: count, dtype: int64

In [73]:
cols=["vf","vm"]
dfvehicle=df[cols].copy()
dfvehicle["VFVM"]=dfvehicle.any(axis="columns")
dfvehicle.head()

Unnamed: 0,vf,vm,VFVM
0,0,0,False
1,0,0,False
2,0,0,False
3,0,0,False
4,0,0,False


In [74]:
dfvehicle["VFVM"].value_counts()

VFVM
False    4418
True       53
Name: count, dtype: int64

In [75]:
cols=["if","we"]
dfenvironment=df[cols].copy()
dfenvironment["IFWE"]=dfenvironment.any(axis="columns")
dfenvironment.head()

Unnamed: 0,if,we,IFWE
0,0,0,False
1,1,0,True
2,1,0,True
3,0,1,True
4,1,0,True


In [76]:
dfenvironment['IFWE'].value_counts()

IFWE
False    2494
True     1977
Name: count, dtype: int64

In [77]:
data=[df["ypc"],df['combwgt'],dfhuman["DEHFSF"],dfvehicle["VFVM"],dfenvironment['IFWE']]
headers=["ypc","combwgt","dehfsf","vfvm","ifwe"]
dffinal=pd.concat(data,axis=1, keys=headers)
dffinal["dehfsf"]=dffinal["dehfsf"].astype(int)
dffinal["vfvm"]=dffinal["vfvm"].astype(int)
dffinal["ifwe"]=dffinal["ifwe"].astype(int)
dffinal.head()

Unnamed: 0,ypc,combwgt,dehfsf,vfvm,ifwe
0,16083,427,1,0,0
1,16086,109,1,0,1
2,16086,109,1,0,1
3,16059,502,1,0,1
4,16274,41,1,0,1
