Recursively search the data directory (`test_data_path`) for all CSV files.

In [22]:
import os
import pandas as pd
from m0_1_system_variables import test_data_path

Build a list of full paths to the source CSV files:

In [4]:
# Recursively collect the full path of every CSV file found under test_data_path.
# os.path.join picks the platform's path separator (the previous version
# hard-coded "\") and does not double it when `root` already ends in one.
# Matching on '.csv' skips names that merely end in the letters "csv".
complete_csv_path_list = [
    os.path.join(root, filename)
    for root, _dirs, files in os.walk(test_data_path)
    for filename in files
    if filename.lower().endswith('.csv')
]

Detect the character encoding of each source file, then load the data into memory:

In [5]:
import chardet

# Sniff the character encoding of every CSV file from its raw bytes.
# data_type[i] is the encoding chardet reports for complete_csv_path_list[i].
data_type = []
for csv_path in complete_csv_path_list:
    with open(csv_path, 'rb') as handle:
        raw_bytes = handle.read()
    data_type.append(chardet.detect(raw_bytes)['encoding'])

In [6]:
# Show the detected encoding of each file, in the same order as complete_csv_path_list.
print(data_type)

['ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'Windows-1252', 'Windows-1252', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii', 'ascii']


In [7]:
# Load every source file into its own one-column dataframe of raw lines.
# (pandas is already imported as `pd` in the notebook's import cell.)
df_list = []
vin_list = []  # defined here but not filled in by this cell

# Difficulty here: some files are ascii and some are 'Windows-1252' encoded.
# The per-file encodings were already detected into data_type (same order as
# complete_csv_path_list), so reuse them instead of reading and sniffing
# every file a second time.
assert len(data_type) == len(complete_csv_path_list), \
    "data_type is out of sync with complete_csv_path_list; re-run the encoding detection cell"

for file_path, detected_encoding in zip(complete_csv_path_list, data_type):
    # Using '\r\n' as the separator stops pandas from splitting on commas, so
    # each line lands intact in the single 'Complete Record' column.
    df = pd.read_csv(file_path, delimiter = '\r\n', encoding = detected_encoding, engine = 'python', names = ['Complete Record'])
    df_list.append(df)

In [8]:
# Preview the dataframe built from the first file (the slice keeps it wrapped in a list).
df_list[:1]

[                                       Complete Record
 0                                 CDR FILE INFORMATION
 1                   User Entered VIN,1FDXE45S29DA10452
 2                                   User,M.  Mikhailov
 3                            Case Number,01371524-2019
 4    EDR Data Imaging Date,"02/20/2019             ...
 ..                                                 ...
 202                          Contains No Recorded Data
 203           LONGITUDINAL CRASH PULSE (SECOND RECORD)
 204                          Contains No Recorded Data
 205                LATERAL CRASH PULSE (SECOND RECORD)
 206                          Contains No Recorded Data
 
 [207 rows x 1 columns]]

In [9]:
# Gather the per-file dataframes into one master dataframe: one row per source
# file, with that file's entire dataframe stored as the cell value of the
# 'Complete Records' column.
# Built in a single constructor call rather than by concatenating onto an
# empty frame inside a loop, which recopied the frame on every pass and
# used a loop variable that shadowed the builtin `object`.
master_import_dataframe = pd.DataFrame({'Complete Records': df_list})


In [25]:
# Peek at the first rows; each cell holds an entire per-file dataframe.
master_import_dataframe.head()

Unnamed: 0,Complete Records
0,Complet...
1,Complete...
2,Complet...
3,Complete...
4,Complet...


In [14]:
# The in-memory dataframe is complete, but the CSV comes out truncated.
# Each cell of 'Complete Records' is itself a dataframe, and what ends up in
# the file matches its abbreviated text form (the "Complet..." seen in the
# preview) rather than its full contents.
# NOTE(review): presumably to_csv writes the string form of object cells — confirm.
master_import_dataframe.to_csv('master_dataframe.csv', index=False)

In [16]:
# Export one nested per-file dataframe (row 24) to its own CSV file.
single_instance = master_import_dataframe.loc[24, "Complete Records"]
single_instance.to_csv('test_single_instance.csv', index=False)

In [12]:
# Reload master_dataframe.csv into a separate dataframe for inspection.
master_import_dataframe_from_file = pd.read_csv('master_dataframe.csv')

In [13]:
# Preview the first rows of the data read back from the CSV file.
master_import_dataframe_from_file.head()

Unnamed: 0,Complete Records
0,Complet...
1,Complete...
2,Complet...
3,Complete...
4,Complet...


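pandas can export a single nested dataframe to CSV, but `to_csv` does not adequately preserve the master dataframe as a whole: each of its cells holds an entire dataframe, and only an abbreviated text form of that cell ends up in the file. Pickle or Parquet will be more suitable. Because Parquet is interoperable with other programs, it was tried first.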

In [27]:
# Attempt a Parquet export using the fastparquet engine. As the captured output
# shows, this raises ValueError ("Can't infer object conversion type") for the
# 'Complete Records' object column.
master_import_dataframe.to_parquet("master_import_dataframe.parquet", engine = 'fastparquet')

ValueError: Can't infer object conversion type: 0                                            Complet...
1                                           Complete...
2                                            Complet...
3                                           Complete...
4                                            Complet...
5                                           Complete...
6                                            Complet...
7                                            Complet...
8                                           Complete...
9                                           Complete...
10                                           Complet...
11                                          Complete...
12                                          Complete...
13                                           Complet...
14                                          Complete...
15                                          Complete...
16                                           Complet...
17                                           Complet...
18                                           Complet...
19                                           Complet...
20                                           Complet...
21                                           Complet...
22                                          Complete...
23                                          Complete...
24                     Complete Record
0      Claim ...
25                                          Complete...
26                                           Complet...
27                                           Complet...
28                                           Complet...
29                                           Complet...
Name: Complete Records, dtype: object

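The Parquet export failed: fastparquet could not infer a conversion type for the `Complete Records` object column, whose cells are themselves dataframes. Pickling will be used instead.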

In [None]:
# Persist the master dataframe as a pickle file.
# NOTE: only unpickle files from a trusted source — loading a pickle can execute arbitrary code.
master_import_dataframe.to_pickle("master_import_dataframe.pkl")