In [52]:
import pandas as pd
import numpy as np
import regex as re

import requests
import asyncio
import json as js

import time

import os

# Limit rendered DataFrames to 6 rows so cell outputs stay short.
pd.options.display.max_rows = 6

#### <b>Notebook Description</b>
<br/>
Converts the driver factor data into an analyzable format and writes the result to a CSV file.

In [53]:
# The data folder is the `data` directory located in the parent of the current
# working directory. NOTE: `dir` shadows the builtin of the same name; the name
# is kept because the export cell at the end of the notebook reads it.
dir = f"{os.path.abspath(os.path.dirname(os.getcwd()))}/data"

# Load the raw driver-factor list.
df_data = pd.read_csv(f"{dir}/auxiliary/driver_behavioral_factors.csv")

df_data.head()

Unnamed: 0,driver_factor
0,0=No factors identified for this driver
1,Physical/Mental Condition:
2,"1=(through 2009) Drowsy, Asleep"
3,"2=(through 2009) Ill, Passed Out, Blacked Out"
4,"3=(through 2009) Emotional (e.g., depressed, a..."


In [54]:
# One or more digits followed by "=" at the start of a label, e.g. "12=...".
index_pat = r'([0-9])+='
def is_valid_index(s:str) -> bool:
    """Tell whether ``s`` starts with a numeric factor index followed by ``=``.

    Parameters
    ----------
    s : str
        A raw ``driver_factor`` cell, e.g. ``"1=(through 2009) Drowsy, Asleep"``.

    Returns
    -------
    bool
        ``True`` when the text begins with ``<digits>=``; ``False`` otherwise,
        including for section headers such as ``"Physical/Mental Condition:"``
        and for non-string values.
    """
    # Empty CSV cells are loaded by pandas as NaN (a float), which would make
    # ``re.match`` raise TypeError; such cells are simply not valid factor rows.
    if not isinstance(s, str):
        return False
    return re.match(pattern=index_pat, string=s) is not None

In [55]:
# Keep only the rows whose label starts with "<number>=" (section-header rows
# are discarded), then renumber the remaining rows from 0.
is_factor_row = df_data.driver_factor.map(is_valid_index)
df_data = df_data.loc[is_factor_row].reset_index(drop=True)

In [56]:
# Parenthesised validity notes embedded in a label, e.g. "(since 2010)" or
# "(through 2009)"; the four-digit year is captured in the `year` group.
since_pat = r'\(since (?P<year>[0-9]{4})\)'
through_pat = r'\(through (?P<year>[0-9]{4})\)'
def get_valid_period(s:str):
    """Extract the validity window announced inside a factor label.

    Parameters
    ----------
    s : str
        Factor label that may contain "(since YYYY)" and/or "(through YYYY)".

    Returns
    -------
    tuple
        ``(since, thru)``. Each element is the captured year as a *string* when
        the corresponding note is present; otherwise ``float('-inf')`` for the
        start and ``float('inf')`` for the end.
    """
    start_match = re.search(since_pat, s)
    end_match = re.search(through_pat, s)
    # Open-ended bounds are represented with infinities.
    since = start_match.group('year') if start_match else float('-inf')
    thru = end_match.group('year') if end_match else float('inf')
    return (since, thru)


def clear_parentheses(s:str):
    """Return ``s`` with every "(through YYYY)" and "(since YYYY)" note removed.

    Only the parenthesised note itself is deleted; whitespace that surrounded
    it is left untouched. Text without such notes is returned unchanged.
    """
    # re.sub leaves the text as-is when the pattern is absent, so no prior
    # search is needed.
    for note_pattern in (through_pat, since_pat):
        s = re.sub(note_pattern, '', s)
    return s

In [57]:
# Parse each label once into a (start, end) pair, then split the pair into the
# two period columns.
valid_periods = df_data.driver_factor.map(get_valid_period)
df_data['effect_start'] = valid_periods.map(lambda period: period[0])
df_data['effect_end'] = valid_periods.map(lambda period: period[1])

# With the years extracted, remove the "(since ...)" / "(through ...)" notes
# from the label text.
df_data.driver_factor = df_data.driver_factor.map(clear_parentheses)

df_data

Unnamed: 0,driver_factor,effect_start,effect_end
0,0=No factors identified for this driver,-inf,inf
1,"1= Drowsy, Asleep",-inf,2009
2,"2= Ill, Passed Out, Blacked Out",-inf,2009
...,...,...,...
102,95=Fire Personnel,-inf,inf
103,96=Tow Operator,-inf,inf
104,"97=Transportation i.e. maintenance workers, sa...",-inf,inf


In [58]:
# Every remaining label has the form "<index>=<description>". Store the numeric
# index in its own leading column.
df_data.insert(
    0,
    'factor_index',
    df_data.driver_factor.map(lambda v: int(v.split('=')[0])),
)

# Remove the "<index>=" prefix. The pattern is anchored so that only the
# leading prefix is removed (a "digits=" sequence later in a description is
# kept), and the result is stripped because cutting out a "(since YYYY)" /
# "(through YYYY)" note leaves stray whitespace behind, e.g.
# "1= Drowsy, Asleep" -> "Drowsy, Asleep".
df_data.driver_factor = df_data.driver_factor.map(
    lambda s: re.sub(pattern=r'^[0-9]+=', string=s, repl='').strip()
)

# Extracted years are strings at this point; cast both bounds to float so they
# compare numerically with the -inf / inf placeholders.
df_data.effect_start = df_data.effect_start.astype(float)
df_data.effect_end = df_data.effect_end.astype(float)

In [59]:
def get_factor_category(ind:int, timeframe: tuple[float, float]) -> str:
    """Return the category label for a driver-factor index.

    Parameters
    ----------
    ind : int
        Numeric factor index.
    timeframe : tuple[float, float]
        ``(effect_start, effect_end)`` of the factor. Only the end bound is
        consulted, and only for index 73.

    Returns
    -------
    str
        The category name; ``'default'`` for index 0.
    """
    if ind == 0:
        return 'default'

    # 73 and 74 sit between two ranges and are resolved individually; 73 also
    # depends on when the factor stopped being in effect.
    if ind == 73:
        return 'Vision Obscured' if timeframe[1] <= 2001 else 'Special Circumstances'
    if ind == 74:
        return 'Special Circumstances'

    # (inclusive upper bound, category), checked in ascending order.
    upper_bounds = (
        (13, 'Physical/Mental Condition'),
        (60, 'Miscellaneous Factors'),
        (72, 'Vision Obscured'),
        (92, 'Skidding, Swerving, Sliding'),
    )
    for upper, category in upper_bounds:
        if ind <= upper:
            return category
    return 'Possible Distraction Inside Vehicle'


# Classify every row from its factor index and validity window.
df_data['factor_category'] = [
    get_factor_category(factor_index, (start, end))
    for factor_index, start, end in zip(
        df_data.factor_index, df_data.effect_start, df_data.effect_end
    )
]

df_data

Unnamed: 0,factor_index,driver_factor,effect_start,effect_end,factor_category
0,0,No factors identified for this driver,-inf,inf,default
1,1,"Drowsy, Asleep",-inf,2009.0,Physical/Mental Condition
2,2,"Ill, Passed Out, Blacked Out",-inf,2009.0,Physical/Mental Condition
...,...,...,...,...,...
102,95,Fire Personnel,-inf,inf,Possible Distraction Inside Vehicle
103,96,Tow Operator,-inf,inf,Possible Distraction Inside Vehicle
104,97,"Transportation i.e. maintenance workers, safte...",-inf,inf,Possible Distraction Inside Vehicle


In [60]:
# Write the cleaned factor table to the output folder (the row index is written too).
output_csv = dir + '/output/driver_behavioral_factors.csv'
df_data.to_csv(output_csv)