In [2]:
%%writefile ingestion.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re


def read_config_file(filepath):
    """Parse the YAML file at ``filepath`` and return its content.

    The file is opened in text mode and handed to ``yaml.safe_load``.
    A ``yaml.YAMLError`` raised while parsing is logged through
    ``logging.error`` and the function then returns ``None``; callers
    should check for that before indexing into the result.  Errors from
    opening the file itself are not caught here.
    """
    with open(filepath, 'r') as stream:
        try:
            parsed = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            logging.error(exc)
            parsed = None
    return parsed


def replacer(string, char):
    """Collapse every run of two or more consecutive ``char`` into one.

    ``char`` is matched literally, so regex metacharacters such as
    ``.``, ``+`` or ``*`` are safe to pass.

    Args:
        string: Text to clean up.
        char: The token whose repeated occurrences are collapsed.

    Returns:
        ``string`` with each run of repeated ``char`` replaced by a
        single ``char``.  An empty ``char`` leaves ``string`` unchanged.
    """
    if not char:
        # Nothing to collapse; a bare '{2,}' quantifier has nothing to repeat.
        return string
    # Escape the token and group it so the quantifier applies to all of it.
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement inserts `char` verbatim; a plain replacement
    # string would be parsed as a template and choke on a backslash.
    return re.sub(pattern, lambda _match: char, string)

def col_header_val(df, table_config):
    '''
    Normalise the column names of ``df`` and validate them against
    ``table_config['columns']``.

    Normalisation lower-cases each name, replaces every non-word
    character with '_', strips leading/trailing '_' and collapses runs
    of '_' into one.  The normalised names are assigned back to
    ``df.columns``, so the caller's frame is renamed in place (column
    order is left as it was).

    The comparison ignores column order: both name lists are sorted
    before being compared.  Expected names are only lower-cased, not
    otherwise normalised, so the config should list them in their
    normalised form.

    Returns 1 when the names match exactly, otherwise 0 after printing
    the names found on only one side.
    '''
    # Raw-string patterns: '\w' in a plain literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    df.columns = (
        df.columns.str.lower()
        .str.replace(r'[^\w]', '_', regex=True)
        .str.strip('_')
        .str.replace(r'_{2,}', '_', regex=True)
    )

    # Sort the names only; reindexing the frame would copy all of its data
    # and raise if normalisation produced duplicate column names.
    actual_cols = sorted(df.columns)
    # str() keeps YAML scalars that parsed as numbers from crashing .lower().
    expected_col = sorted(str(name).lower() for name in table_config['columns'])

    if actual_cols == expected_col:
        print("column name and column length validation passed")
        return 1

    print("column name and column length validation failed")
    # Sorted so the report is deterministic from run to run.
    mismatched_columns_file = sorted(set(actual_cols).difference(expected_col))
    print("Following File columns are not in the YAML file",mismatched_columns_file)
    missing_YAML_file = sorted(set(expected_col).difference(actual_cols))
    print("Following YAML columns are not in the file uploaded",missing_YAML_file)
    logging.info(f'df columns: {actual_cols}')
    logging.info(f'expected columns: {expected_col}')
    return 0

Overwriting ingestion.py


In [3]:
%%writefile file.yaml
# Ingestion config: parsed by ingestion.read_config_file and passed to
# ingestion.col_header_val for header validation.
file_type: csv
dataset_name: NYC parking tickets
# The notebook loads "./" + file_name + "." + file_type.
file_name: Parking_Violations_Issued_-_Fiscal_Year_2015
table_name: Parking_Violations_Issued_-_Fiscal_Year_2015
# Delimiter used when reading the source file.
inbound_delimiter: ","
outbound_delimiter: "|"
skip_leading_rows: 1
# Expected header names. They are lower-cased and compared, in any order,
# against the file's normalised column names.
# NOTE(review): the sample data shown in the notebook also has an 'Illness'
# column - confirm this list matches the CSV header.
columns: 
    - Number
    - City
    - Gender
    - Age
    - Income

Overwriting file.yaml


In [4]:
# Import the helpers written to ingestion.py and parse file.yaml.
# read_config_file returns None if the YAML cannot be parsed.
import ingestion as ing
config_data = ing.read_config_file("file.yaml")

In [5]:
# Show the parsed configuration.
config_data

{'file_type': 'csv',
 'dataset_name': 'toy',
 'file_name': 'toy_dataset',
 'table_name': 'toy_dataset',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['Number', 'City', 'Gender', 'Age', 'Income', 'Illness']}

In [6]:
# Preview the raw CSV through a hard-coded file name, independent of the
# YAML config.
import pandas as pd
df_sample = pd.read_csv("Parking_Violations_Issued_-_Fiscal_Year_2015.csv",delimiter=',')
df_sample.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [7]:
# Build the source path from the config and load it with the configured
# inbound delimiter.
file_type = config_data['file_type']
source_file = "./" + config_data['file_name'] + f'.{file_type}'
# The delimiter must be passed by keyword: pandas 2.0 made every read_csv
# argument after the path keyword-only, so a positional value raises TypeError.
df = pd.read_csv(source_file, sep=config_data['inbound_delimiter'])
df.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [8]:
# Validate the file's header against the YAML column list (1 = match, 0 = mismatch).
# Side effect: the call assigns the normalised names back onto df.columns.
ing.col_header_val(df,config_data)

column name and column length validation passed


1

In [9]:
# df.columns is already normalised here because col_header_val renamed the
# frame's columns in place; the YAML names are printed as configured.
print("columns of files are:" ,df.columns)
print("columns of YAML are:" ,config_data['columns'])

columns of files are: Index(['number', 'city', 'gender', 'age', 'income', 'illness'], dtype='object')
columns of YAML are: ['Number', 'City', 'Gender', 'Age', 'Income', 'Illness']


In [10]:
# Re-run the header validation and report the outcome as a short message.
if ing.col_header_val(df, config_data) != 0:
    print("col validation passed")
else:
    print("validation failed")

column name and column length validation passed
col validation passed


In [11]:
# Size of the source CSV on disk, in bytes.
import os
os.path.getsize('Parking_Violations_Issued_-_Fiscal_Year_2015.csv')

5735570

In [19]:
import gzip
import os

# Summarise the loaded frame and its source file, then store the three
# summary lines as gzip-compressed text.
content1 = "Total number of rows:" + " " + str(df.shape[0])
content2 = "total number of columns:" + " " + str(len(df.columns))
# Measure the file that `df` was actually read from (source_file, built from
# the config) instead of repeating a hard-coded file name.
content3 = "file size:" + " " + str(os.path.getsize(source_file)) + " " + "bytes"
# NOTE(review): absolute, machine-specific output path - consider making it
# configurable or relative to the notebook.
# Explicit encoding keeps the written bytes independent of the locale.
with gzip.open('/home/jupyter/file1.gz', 'wt', encoding='utf-8') as f:
    f.write('\n'.join((content1, content2, content3)) + '\n')