In [None]:
import pandas as pd
import chardet
import re
import tabula

In [19]:
extraction_dataframe = pd.read_pickle("master_import_dataframe.pkl")

First, design the schema.

Because the data is unstructured, we will identify key values via visual exploration and extract them via keyword recognition (regex)

In [20]:
extraction_dataframe.head()

Unnamed: 0,Complete Records
0,Complet...
1,Complete...
2,Complet...
3,Complete...
4,Complet...


Inspect data structure within instances here:

In [12]:
extraction_dataframe["Complete Records"][24]

individual_exploratory_instance = extraction_dataframe["Complete Records"][24]

Let's break out primary keys in the schema first. Here is the VIN:

There are two options here: Use RegEx with the dataframes imported as strings, or work with the dataframes themselves.

The second option means less transformation of data and thus less potential to lose data granularity, but it also means that objects within cells must be split eventually.

In [21]:
extraction_dataframe['Complete Records String'] =  extraction_dataframe['Complete Records'].apply(lambda x: x.to_string(index=False, header=False))

In [22]:
extraction_dataframe.head()

Unnamed: 0,Complete Records,Complete Records String
0,Complet...,...
1,Complete...,...
2,Complet...,...
3,Complete...,...
4,Complet...,...


In [23]:
# Define regex pattern for VIN
vin_pattern = re.compile(r"User Entered VIN,([0-9A-Za-z]+)")

# Apply regex pattern on the dataframe and create a new column 'VIN'
extraction_dataframe['VIN'] = extraction_dataframe['Complete Records String'].apply(lambda x: vin_pattern.search(x).group(1) if vin_pattern.search(x) else None)

extraction_dataframe.head()


Unnamed: 0,Complete Records,Complete Records String,VIN
0,Complet...,...,1FDXE45S29DA10452
1,Complete...,...,1FDXE4FS3BDB02206
2,Complet...,...,1FDXE4FS4BDA50634
3,Complete...,...,1FDXE4FS5BDA96599
4,Complet...,...,1FDXE4FSXCDA06364


User information:

In [24]:
# Define regex pattern for VIN
#This regex pattern will stop at \r\n (a 'carriage return') because \ is not an included character in the search pattern.
user_pattern = re.compile(r"User,([A-Za-z' '.]+)")

# Apply regex pattern on the dataframe and create a new column 'VIN'
extraction_dataframe['User'] = extraction_dataframe['Complete Records String'].apply(lambda x: user_pattern.search(x).group(1) if user_pattern.search(x) else None)

extraction_dataframe.head()

Unnamed: 0,Complete Records,Complete Records String,VIN,User
0,Complet...,...,1FDXE45S29DA10452,M. Mikhailov
1,Complete...,...,1FDXE4FS3BDB02206,SP
2,Complet...,...,1FDXE4FS4BDA50634,M. Mikhailov
3,Complete...,...,1FDXE4FS5BDA96599,DENIS SHARSHUNSKIY
4,Complet...,...,1FDXE4FSXCDA06364,SP


Case number breakout and some data cleaning:

In [25]:
case_number_pattern = re.compile(r"Case Number,([0-9-]+)")

extraction_dataframe['Case Number'] = extraction_dataframe['Complete Records String'].apply(lambda x: case_number_pattern.search(x).group(1) if case_number_pattern.search(x) else None)

extraction_dataframe.head()

#This step requires additional cleaning, as all case numbers begin with 0 but are not always included. We must iterate through the instances of extraction_dataframe['Case Number'] and check if not NaN, and if .startswith() == 0, then prepend 0.

Unnamed: 0,Complete Records,Complete Records String,VIN,User,Case Number
0,Complet...,...,1FDXE45S29DA10452,M. Mikhailov,01371524-2019
1,Complete...,...,1FDXE4FS3BDB02206,SP,1047504-2018
2,Complet...,...,1FDXE4FS4BDA50634,M. Mikhailov,02596301-2022
3,Complete...,...,1FDXE4FS5BDA96599,DENIS SHARSHUNSKIY,00871457
4,Complet...,...,1FDXE4FSXCDA06364,SP,866715-2017


In [26]:
def has_seven_digits_before_hyphen(case_number):
    number_segment = case_number.split('-')[0]
    return len(number_segment) == 7

extraction_dataframe['Case Number'] = extraction_dataframe['Case Number'].apply(
    lambda x: '0' + x if pd.notna(x) and not x.startswith('0') and has_seven_digits_before_hyphen(x) else x
)
extraction_dataframe['Case Number']

0     01371524-2019
1     01047504-2018
2     02596301-2022
3          00871457
4       866715-2017
5       547078-2014
6       685630-2015
7     01901400-2020
8       734406-2016
9              None
10    01358628-2019
11    01424658-2019
12    02464402-2021
13    01825864-2020
14    01144564-2018
15             None
16    02091914-2021
17    01869039-2020
18    02846539-2022
19    02083717-2020
20    02146598-2021
21    01199426-2018
22    02086865-2020
23    01858026-2020
24             None
25             None
26      924258-2018
27             None
28             None
29    01622036-2019
Name: Case Number, dtype: object

Collect EDR Data Imaging Date

In [27]:
imaging_date_pattern = re.compile(r"\d{2}/\d{2}/\d{4}")

extraction_dataframe['EDR Data Imaging Date'] = extraction_dataframe['Complete Records String'].apply(lambda x: imaging_date_pattern.search(x).group() if imaging_date_pattern.search(x) else None)

extraction_dataframe.head()

Unnamed: 0,Complete Records,Complete Records String,VIN,User,Case Number,EDR Data Imaging Date
0,Complet...,...,1FDXE45S29DA10452,M. Mikhailov,01371524-2019,02/20/2019
1,Complete...,...,1FDXE4FS3BDB02206,SP,01047504-2018,08/01/2018
2,Complet...,...,1FDXE4FS4BDA50634,M. Mikhailov,02596301-2022,04/29/2022
3,Complete...,...,1FDXE4FS5BDA96599,DENIS SHARSHUNSKIY,00871457,08/12/2017
4,Complet...,...,1FDXE4FSXCDA06364,SP,866715-2017,08/07/2017


Filenames:

In [36]:
filename_pattern = re.compile(r"Filename,(?:.+?X )*?([^X]+\.CDRX)")

extraction_dataframe['Filename'] = extraction_dataframe['Complete Records String'].apply(lambda x: filename_pattern.search(x).group(1) if filename_pattern.search(x) else None)

extraction_dataframe.head()

Unnamed: 0,Complete Records,Complete Records String,VIN,User,Case Number,EDR Data Imaging Date,Filename
0,Complet...,...,1FDXE45S29DA10452,M. Mikhailov,01371524-2019,02/20/2019,
1,Complete...,...,1FDXE4FS3BDB02206,SP,01047504-2018,08/01/2018,
2,Complet...,...,1FDXE4FS4BDA50634,M. Mikhailov,02596301-2022,04/29/2022,
3,Complete...,...,1FDXE4FS5BDA96599,DENIS SHARSHUNSKIY,00871457,08/12/2017,
4,Complet...,...,1FDXE4FSXCDA06364,SP,866715-2017,08/07/2017,


Saved On Date:

In [42]:
#This is a difficult segment because we do not want to include the variable name 'Saved On.' Thus, we must filter for days of the work.
saved_on_pattern = re.compile(r"((?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)day, .+? at \d{1,2}:\d{2}:\d{2})")

extraction_dataframe['Saved On'] = extraction_dataframe['Complete Records String'].apply(lambda x: saved_on_pattern.search(x).group() if saved_on_pattern.search(x) else None)

extraction_dataframe.head()

Unnamed: 0,Complete Records,Complete Records String,VIN,User,Case Number,EDR Data Imaging Date,Filename,Saved On
0,Complet...,...,1FDXE45S29DA10452,M. Mikhailov,01371524-2019,02/20/2019,,
1,Complete...,...,1FDXE4FS3BDB02206,SP,01047504-2018,08/01/2018,,
2,Complet...,...,1FDXE4FS4BDA50634,M. Mikhailov,02596301-2022,04/29/2022,,"Friday, April 29 2022 at 13:07:54"
3,Complete...,...,1FDXE4FS5BDA96599,DENIS SHARSHUNSKIY,00871457,08/12/2017,,
4,Complet...,...,1FDXE4FSXCDA06364,SP,866715-2017,08/07/2017,,"Monday, August 7 2017 at 12:13:58"


CDR Version Info:

In [47]:
imaged_with_pattern = re.compile(r'(Crash Data Retrieval Tool \d+\.\d+)')

def extract_two_instances(text):
    matches = imaged_with_pattern.findall(text)
    first_instance = matches[0] if len(matches) >= 1 else None
    second_instance = matches[1] if len(matches) >= 2 else None
    return first_instance, second_instance

extraction_dataframe['Imaged with CDR Version:'], extraction_dataframe['Reported with CDR Version:'] = zip(*extraction_dataframe['Complete Records String'].apply(lambda x: extract_two_instances(x)))

extraction_dataframe.head()


Unnamed: 0,Complete Records,Complete Records String,VIN,User,Case Number,EDR Data Imaging Date,Filename,Saved On,Imaged with CDR Version:,Reported with CDR Version:
0,Complet...,...,1FDXE45S29DA10452,M. Mikhailov,01371524-2019,02/20/2019,,,Crash Data Retrieval Tool 17.10,Crash Data Retrieval Tool 21.5
1,Complete...,...,1FDXE4FS3BDB02206,SP,01047504-2018,08/01/2018,,,Crash Data Retrieval Tool 17.8,Crash Data Retrieval Tool 21.5
2,Complet...,...,1FDXE4FS4BDA50634,M. Mikhailov,02596301-2022,04/29/2022,,"Friday, April 29 2022 at 13:07:54",Crash Data Retrieval Tool 21.4,Crash Data Retrieval Tool 21.5
3,Complete...,...,1FDXE4FS5BDA96599,DENIS SHARSHUNSKIY,00871457,08/12/2017,,,Crash Data Retrieval Tool 17.3,Crash Data Retrieval Tool 21.5
4,Complet...,...,1FDXE4FSXCDA06364,SP,866715-2017,08/07/2017,,"Monday, August 7 2017 at 12:13:58",Crash Data Retrieval Tool 17.4,Crash Data Retrieval Tool 23.0


In [2]:
extraction_dataframe.to_pickle("extraction_dataframe_with_keys.pkl")

NameError: name 'extraction_dataframe' is not defined