In [23]:
import pandas as pd
import numpy as np
import regex as re

import requests
import asyncio
import json as js

import time

import os

# Keep rendered frames/series short: at most 6 rows are displayed.
pd.options.display.max_rows = 6

#### Notebook Description
<br/>
Converts the driver factor data into an analyzable format and writes the result to a CSV file.

In [24]:
# Data folder: <parent of the current working directory>/data/output/
parent_of_cwd = os.path.dirname(os.getcwd())
dir = os.path.abspath(parent_of_cwd) + '/data/output/'

df_data = pd.read_csv(dir + 'driver_factors.csv')
# Cast to str so every period is text; missing values become the literal 'nan'.
df_data['valid_period'] = df_data['valid_period'].astype(str)

df_data.head()

Unnamed: 0,valid_period,content
0,,0=No factors identified for this driver
1,",2009","1=(through 2009) Drowsy, Asleep"
2,",2009","2=(through 2009) Ill, Passed Out, Blacked Out"
3,",2009","3=(through 2009) Emotional (e.g., depressed, a..."
4,,4=Reaction to/Failure to take Drugs/Medication


In [25]:
# Frequency of each distinct validity-period string.
df_data['valid_period'].value_counts()

nan      78
,2009    18
2010,    15
         ..
2002,     1
2021,     1
2015,     1
Name: valid_period, Length: 23, dtype: int64

In [26]:
def get_factor_period(s: str):
    """Parse a 'start,end' validity string into a (start, end) pair of strings.

    Either side of the comma may be empty, meaning the period is open on
    that side: an open start is reported as '-inf' and an open end as 'inf'.
    A missing period (the literal 'nan', None, or a float NaN) is open on
    both sides.

    Both elements are always strings, whichever branch is taken, so columns
    built from the result hold a single type.

    Args:
        s: Period text such as ',2009' or '2010,', or 'nan' when missing.

    Returns:
        Tuple ``(start, end)`` of strings; each is a year such as '2009',
        or '-inf' / 'inf' for an open bound.

    Raises:
        ValueError: If a non-empty bound cannot be parsed as an integer.
    """
    open_start, open_end = '-inf', 'inf'

    # `s != s` is only true for NaN, so this also covers a raw float NaN
    # that was not cast to str beforehand.
    if s is None or s != s or s == 'nan':
        return (open_start, open_end)

    parts = s.split(',')
    start_text = parts[0]
    end_text = parts[1] if len(parts) > 1 else ''

    # int() validates the year; str() keeps the return type uniform.
    start = str(int(start_text)) if start_text != '' else open_start
    end = str(int(end_text)) if end_text != '' else open_end

    return (start, end)

In [27]:
# Parse every validity string once, then project the (start, end) pairs
# into their own columns instead of re-parsing for each column.
factor_periods = df_data.valid_period.map(get_factor_period)
df_data['valid_period_start'] = factor_periods.map(lambda bounds: bounds[0])
df_data['valid_period_end'] = factor_periods.map(lambda bounds: bounds[1])
df_data.head()

Unnamed: 0,valid_period,content,valid_period_start,valid_period_end
0,,0=No factors identified for this driver,-inf,inf
1,",2009","1=(through 2009) Drowsy, Asleep",-inf,2009.0
2,",2009","2=(through 2009) Ill, Passed Out, Blacked Out",-inf,2009.0
3,",2009","3=(through 2009) Emotional (e.g., depressed, a...",-inf,2009.0
4,,4=Reaction to/Failure to take Drugs/Medication,-inf,inf


In [32]:
# Markers of the form "(through YYYY)" and "(since YYYY)", YYYY being four digits.
parenthesis_pat1 = r'\(through ([0-9]{4})\)'
parenthesis_pat2 = r'\(since ([0-9]{4})\)'
def clear_parentheses(s:str):
    """Return `s` with every "(through YYYY)" and "(since YYYY)" marker removed.

    Only the marker itself is deleted; any whitespace around it is left
    in place. Other parenthesised text is not touched, and a string
    without markers is returned unchanged.
    """
    # "through" markers are removed first, then "since" markers.
    for marker_pat in (parenthesis_pat1, parenthesis_pat2):
        s = re.sub(pattern=marker_pat, repl='', string=s)
    return s

In [33]:
# Remove the "(through YYYY)" / "(since YYYY)" markers from every description.
df_data['content'] = df_data['content'].map(clear_parentheses)
df_data.head()

Unnamed: 0,valid_period,content,valid_period_start,valid_period_end
0,,0=No factors identified for this driver,-inf,inf
1,",2009","1= Drowsy, Asleep",-inf,2009.0
2,",2009","2= Ill, Passed Out, Blacked Out",-inf,2009.0
3,",2009","3= Emotional (e.g., depressed, angry,disturbed)",-inf,2009.0
4,,4=Reaction to/Failure to take Drugs/Medication,-inf,inf


In [37]:
# Show the descriptions in sorted order.
df_data['content'].sort_values()

128                                         Bicycle Lane
120      Non-Intersection, On Road, Not in Marked Cro...
118                 Non-Intersection,In Marked Crosswalk
                             ...                        
117                 9=At Intersection, Unknown Location 
10                        Aggressive Driving Road Rage  
95                  Possible Distraction Inside Vehicle 
Name: content, Length: 141, dtype: object