# File ingestion and schema validation
### Used Cars dataset taken from Kaggle: (66 columns, 3 million entrees)

In [1]:
import os
import time

In [2]:
#Size of the file
os.path.getsize('C://Users//JCCLI//Downloads//used_cars_data.csv')

9980208148

### Read data with Dask

- Since our data has 66 rows, we will only be focusing on 6 columns of the entire dataset for this file we want to create

In [3]:
cols = ["vin", "body_type","daysonmarket","engine_displacement","engine_type","exterior_color"]
from dask import dataframe as dd
start = time.time()
dask_df = dd.read_csv('C://Users//JCCLI//Downloads//used_cars_data.csv',usecols = cols)
end = time.time()
print("Time to read CSV using Dask:", (end-start), "sec")

Time to read CSV using Dask: 0.15624594688415527 sec


### Read data with Pandas


In [4]:
import pandas as pd
start = time.time()
pd_df = pd.read_csv('C://Users//JCCLI//Downloads//used_cars_data.csv',usecols = cols, nrows = 3000000)
end = time.time()
print("Time to read CSV using Pandas:", (end-start), "sec")

Time to read CSV using Pandas: 282.5036242008209 sec


### Read data with modin and ray

#### As you can see, we will use dask for the final file since it's speed is much much faster than pandas after getting rid of most of the columns

In [5]:
from dask import dataframe as dd
df = dd.read_csv('C:/Users//JCCLI//Downloads//used_cars_data.csv', usecols = cols)

In [6]:
len(df.index)

3000040

In [7]:
# remove underscores
df.columns=df.columns.str.replace('[_]','')

  df.columns=df.columns.str.replace('[_]','')


In [8]:
df.columns

Index(['vin', 'bodytype', 'daysonmarket', 'enginedisplacement', 'enginetype',
       'exteriorcolor'],
      dtype='object')

#### Data Validation
- Here we will perform data validation. I saved the updated data frame as a csv so our final write contains only our selected columns

In [9]:
%%writefile testutility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re
def read_config_file(filepath):
    with open(filepath, 'r') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            logging.error(exc)


def replacer(string, char):
    pattern = char + '{2,}'
    string = re.sub(pattern, char, string) 
    return string

def col_header_val(df,table_config):
    '''
    replace whitespaces in the column
    and standardized column names
    '''
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace('[^\w]','_',regex=True)
    df.columns = list(map(lambda x: x.strip('_'), list(df.columns)))
    df.columns = list(map(lambda x: replacer(x,'_'), list(df.columns)))
    expected_col = list(map(lambda x: x.lower(),  table_config['columns']))
    expected_col.sort()
    df.columns =list(map(lambda x: x.lower(), list(df.columns)))
    df = df.reindex(sorted(df.columns), axis=1)
    if len(df.columns) == len(expected_col) and list(expected_col)  == list(df.columns):
        print("column name and column length validation passed")
        return 1
    else:
        print("column name and column length validation failed")
        mismatched_columns_file = list(set(df.columns).difference(expected_col))
        print("Following File columns are not in the YAML file",mismatched_columns_file)
        missing_YAML_file = list(set(expected_col).difference(df.columns))
        print("Following YAML columns are not in the file uploaded",missing_YAML_file)
        logging.info(f'df columns: {df.columns}')
        logging.info(f'expected columns: {expected_col}')
        return 0

Overwriting testutility.py


### YAML

In [10]:
%%writefile file.yaml
file_type: csv
dataset_name: file
file_name: used_carsnew
table_name: edsurv
inbound_delimiter: ","
outbound_delimiter: "|"
skip_leading_rows: 1
columns: 
    - vin
    - bodytype
    - daysonmarket
    - enginedisplacement
    - enginetype
    - exteriorcolor

Overwriting file.yaml


In [11]:
# Read config file
import testutility as util
config_data = util.read_config_file("file.yaml")

In [12]:
config_data['inbound_delimiter']

','

In [13]:
#inspecting data of config file
config_data

{'file_type': 'csv',
 'dataset_name': 'file',
 'file_name': 'used_carsnew',
 'table_name': 'edsurv',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['vin',
  'bodytype',
  'daysonmarket',
  'enginedisplacement',
  'enginetype',
  'exteriorcolor']}

In [14]:
# Normal reading process of the file
import dask.dataframe as dd
df_sample = dd.read_csv("used_carsnew.csv",delimiter=',')
df_sample.head()

Unnamed: 0.1,Unnamed: 0,vin,bodytype,daysonmarket,enginedisplacement,enginetype,exteriorcolor
0,0,ZACNJABB5KPJ92081,SUV / Crossover,522,1300.0,I4,Solar Yellow
1,1,SALCJ2FX1LH858117,SUV / Crossover,207,2000.0,I4,Narvik Black
2,2,JF1VA2M67G9829723,Sedan,1233,2500.0,H4,
3,3,SALRR2RV0L2433391,SUV / Crossover,196,3000.0,V6,Eiger Gray
4,4,SALCJ2FXXLH862327,SUV / Crossover,137,2000.0,I4,Narvik Black


In [15]:
# read the file using config file
file_type = config_data['file_type']
source_file = "./" + config_data['file_name'] + f'.{file_type}'
#print("",source_file)
df = pd.read_csv(source_file,config_data['inbound_delimiter'])
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,vin,bodytype,daysonmarket,enginedisplacement,enginetype,exteriorcolor
0,0,ZACNJABB5KPJ92081,SUV / Crossover,522,1300.0,I4,Solar Yellow
1,1,SALCJ2FX1LH858117,SUV / Crossover,207,2000.0,I4,Narvik Black
2,2,JF1VA2M67G9829723,Sedan,1233,2500.0,H4,
3,3,SALRR2RV0L2433391,SUV / Crossover,196,3000.0,V6,Eiger Gray
4,4,SALCJ2FXXLH862327,SUV / Crossover,137,2000.0,I4,Narvik Black


In [16]:
#validate the header of the file
util.col_header_val(df,config_data)

column name and column length validation failed
Following File columns are not in the YAML file ['unnamed_0']
Following YAML columns are not in the file uploaded []


0

In [17]:
print("columns of files are:" ,df.columns)
print("columns of YAML are:" ,config_data['columns'])

columns of files are: Index(['unnamed_0', 'vin', 'bodytype', 'daysonmarket', 'enginedisplacement',
       'enginetype', 'exteriorcolor'],
      dtype='object')
columns of YAML are: ['vin', 'bodytype', 'daysonmarket', 'enginedisplacement', 'enginetype', 'exteriorcolor']


In [18]:
if util.col_header_val(df,config_data)==0:
    print("validation failed")
    # write code to reject the file
else:
    print("col validation passed")
    # write the code to perform further action
    # in the pipleine

column name and column length validation failed
Following File columns are not in the YAML file ['unnamed_0']
Following YAML columns are not in the file uploaded []
validation failed


In [32]:
# Write our csv file in gz format (pipe separated "|")
import csv
import datetime
import gzip

from dask import dataframe as dd
df = dd.read_csv('used_carsnew.csv',delimiter=',')

df.to_csv('used_carsnew.csv.gz',
          sep='|',
          header=True,
          index=False,
          quoting=csv.QUOTE_ALL,
          compression='gzip',
          quotechar='"',
          doublequote=True,
          line_terminator='\n')

['C:\\Users\\JCCLI\\used_carsnew.csv.gz\\0.part',
 'C:\\Users\\JCCLI\\used_carsnew.csv.gz\\1.part',
 'C:\\Users\\JCCLI\\used_carsnew.csv.gz\\2.part']

In [23]:
# checking number of files in gz format folder
import os
partitions = os.listdir('used_carsnew.csv.gz/')
for partition in partitions:
    print(partition)

0.part
1.part
2.part


In [24]:
#size of the gz format folder
os.path.getsize('used_carsnew.csv.gz')

4096

PermissionError: [Errno 13] Permission denied: 'used_carsnew.csv.gz'