In [8]:
import pandas as pd
import numpy as np
import regex as re

import requests
import asyncio
import json as js

import time

import os

# Limit DataFrame previews in this notebook to at most 9 rows.
pd.options.display.max_rows = 9

#### <b>Notebook Description</b>
<br/>
Converts the driver factor data into an analyzable format and writes the result to a CSV file.

In [9]:
# The data folder is a sibling of the current working directory:
# <parent of cwd>/data.
parent_dir = os.path.dirname(os.getcwd())
dir = os.path.abspath(parent_dir) + '/data'

# Raw lookup table of driver factor descriptions.
factors_csv = dir + '/auxiliary/driver_behavioral_factors.csv'
df_data = pd.read_csv(factors_csv)

# Preview the first rows.
df_data.head()

Unnamed: 0,driver_factor
0,0=No factors identified for this driver
1,Physical/Mental Condition:
2,"1=(through 2009) Drowsy, Asleep"
3,"2=(through 2009) Ill, Passed Out, Blacked Out"
4,"3=(through 2009) Emotional (e.g., depressed, a..."


In [10]:
# A usable row starts with one or more digits immediately followed by "=".
index_pat = r'([0-9])+='


def is_valid_index(s:str) -> bool:
    """Tell whether ``s`` begins with a numeric code followed by ``=``.

    Args:
        s: Text of one ``driver_factor`` entry.

    Returns:
        True when ``index_pat`` matches at the very start of ``s``,
        False otherwise.
    """
    return re.match(index_pat, s) is not None

In [11]:
# Keep only the rows that carry a factor code ("<digits>=..."); rows such as
# "Physical/Mental Condition:" are section headers and are discarded.
# The boolean mask is applied directly, so no helper column is added, and the
# filtered frame is rebuilt with a fresh 0..n-1 index instead of being mutated
# in place (in-place edits on a filtered selection can trigger pandas'
# SettingWithCopyWarning).
df_data = df_data[df_data.driver_factor.map(is_valid_index)].reset_index(drop=True)

In [12]:
# Parenthesised validity markers that can appear inside a factor description.
tuple_pat = r'\((?P<year1>[0-9]{4})-(?P<year2>[0-9]{4})\)'
only_pat = r'\((?P<year>[0-9]{4}) only\)'
since_pat = r'\(since (?P<year>[0-9]{4})\)'
through_pat = r'\(through (?P<year>[0-9]{4})\)'


def get_valid_period(s:str) -> tuple[int, int]:
    """Extract the years named by the validity marker(s) found in ``s``.

    Markers are looked up in this order:

    * ``(YYYY-YYYY)``  -> both years are returned as written;
    * ``(YYYY only)``  -> the same year is returned as start and end;
    * otherwise ``(since YYYY)`` sets the start and ``(through YYYY)`` sets
      the end, independently of each other.

    Args:
        s: Text of one ``driver_factor`` entry.

    Returns:
        ``(start, end)`` as integers. A bound that is not stated in ``s`` is
        reported with a sentinel: ``0`` for the start, ``9999`` for the end.
    """
    tuple_search = re.search(pattern=tuple_pat, string=s)
    if tuple_search:
        # Convert the captured text so the result honours the int annotation.
        return (int(tuple_search.group('year1')), int(tuple_search.group('year2')))

    only_search = re.search(pattern=only_pat, string=s)
    if only_search:
        year = int(only_search.group('year'))
        return (year, year)

    since_search = re.search(pattern=since_pat, string=s)
    through_search = re.search(pattern=through_pat, string=s)
    start = int(since_search.group('year')) if since_search else 0
    end = int(through_search.group('year')) if through_search else 9999
    return (start, end)


def clear_parentheses(s:str) -> str:
    """Remove every validity marker from ``s``.

    Only the four marker forms described by ``through_pat``, ``since_pat``,
    ``tuple_pat`` and ``only_pat`` are removed; any other parenthesised text
    is kept. Whitespace surrounding a removed marker is left untouched.

    Args:
        s: Text of one ``driver_factor`` entry.

    Returns:
        ``s`` without its validity markers.
    """
    # ``re.sub`` returns the string unchanged when nothing matches, so no
    # preliminary ``re.search`` is needed.
    for pat in (through_pat, since_pat, tuple_pat, only_pat):
        s = re.sub(pattern=pat, repl='', string=s)
    return s

In [13]:
# Parse every description exactly once; both period columns are derived from
# this single result instead of re-running the regex searches per column.
# This must happen before the markers are stripped from `driver_factor` below
# (re-running this cell afterwards would find no markers any more).
valid_periods = df_data.driver_factor.map(get_valid_period)

df_data['effect_start'] = valid_periods.map(lambda period: int(period[0])).astype(int)

# The open-ended sentinel 9999 is kept as is; any explicit end year is stored
# minus one.
# NOTE(review): this turns "(through 2009)" into 2008 and makes "(YYYY only)"
# end one year before it starts — confirm that the offset is intended.
df_data['effect_end'] = valid_periods.map(
    lambda period: 9999 if period[1] == 9999 else int(period[1]) - 1
).astype(int)


# With the period captured in its own columns, drop the markers from the text.
df_data.driver_factor = df_data.driver_factor.map(clear_parentheses)

In [14]:
# The numeric code is everything in front of the first "=".
# Note: `insert` raises if `factor_index` already exists, so this cell is meant
# to be run once per kernel session.
df_data.insert(0, 'factor_index', df_data.driver_factor.map(lambda v: int(v.split('=')[0])))


# Strip only the leading "<digits>=" prefix. The pattern is anchored and limited
# to one replacement so that an "=" (or digits followed by "=") occurring later
# in the description is preserved, and codes of any length are removed whole.
df_data.driver_factor = df_data.driver_factor.map(
    lambda s: re.sub(pattern=r'^[0-9]+=', repl='', string=s, count=1)
)

In [15]:
def get_factor_category(ind:int, timeframe: tuple[float, float]) -> str:
    """Map a factor index to the name of the category it belongs to.

    Args:
        ind: Numeric factor index.
        timeframe: ``(start, end)`` validity years of the factor. Only the end
            year is consulted, and only for index 73.

    Returns:
        The category label as a string.
    """
    if ind == 0:
        return 'default'

    # Index 73 belongs to a different category depending on when it was in use.
    if ind == 73:
        return 'Vision Obscured' if timeframe[1] <= 2001 else 'Special Circumstances'
    if ind == 74:
        return 'Special Circumstances'

    # Remaining indexes are bucketed by inclusive upper bound, checked in
    # ascending order.
    upper_bounds = (
        (13, 'Physical/Mental Condition'),
        (60, 'Miscellaneous Factors'),
        (72, 'Vision Obscured'),
        (92, 'Skidding, Swerving, Sliding'),
    )
    for upper, category in upper_bounds:
        if ind <= upper:
            return category

    # NOTE(review): every index above 92 lands here — confirm this label is
    # right for the highest codes.
    return 'Possible Distraction Inside Vehicle'


# Label every row with its category. The three input columns are walked in
# parallel in a single pass, which avoids one scalar `.loc` lookup per value
# and does not rely on the index labels being unique.
df_data['factor_category'] = [
    get_factor_category(factor_index, (effect_start, effect_end))
    for factor_index, effect_start, effect_end in zip(
        df_data['factor_index'], df_data['effect_start'], df_data['effect_end']
    )
]

df_data

Unnamed: 0,factor_index,driver_factor,effect_start,effect_end,factor_category
0,0,No factors identified for this driver,0,9999,default
1,1,"Drowsy, Asleep",0,2008,Physical/Mental Condition
2,2,"Ill, Passed Out, Blacked Out",0,2008,Physical/Mental Condition
3,3,"Emotional (e.g., depressed, angry, disturbed)",0,2008,Physical/Mental Condition
...,...,...,...,...,...
103,94,Emergency Medical Service Personnel,0,2018,Possible Distraction Inside Vehicle
104,95,Fire Personnel,0,2018,Possible Distraction Inside Vehicle
105,96,Tow Operator,0,2018,Possible Distraction Inside Vehicle
106,97,"Transportation i.e. maintenance workers, safte...",0,2018,Possible Distraction Inside Vehicle


In [16]:
# Sanity check: how many factors were assigned to each category.
df_data['factor_category'].value_counts()

Miscellaneous Factors                  51
Physical/Mental Condition              14
Skidding, Swerving, Sliding            14
Vision Obscured                        13
Possible Distraction Inside Vehicle    12
Special Circumstances                   2
default                                 1
Name: factor_category, dtype: int64

In [17]:
# Create the destination folder if it is missing so the export does not fail
# on a fresh checkout, then write the cleaned lookup table.
output_dir = dir + '/output'
os.makedirs(output_dir, exist_ok=True)
df_data.to_csv(output_dir + '/driver_behavioral_factors.csv')