In [28]:
from os import chdir, getcwd, path
from numpy import nan
from time import time
from logging import info, error
import subprocess
from yaml import YAMLError, safe_load, dump
import datetime
from re import sub
from string import punctuation, ascii_letters
from random import random, choice, randint, uniform
from psutil import virtual_memory
import gc
from graphviz import Source
from objgraph import show_refs
from pandas import DataFrame, read_csv
from dask import dataframe as dd
import ray
from modin import pandas as mpd
import grpc

In [4]:
cwd = getcwd()
print(cwd)
chdir('C:/Users/andre/Repositories/Professional.Portfolio/Sample6_Data.Validation.Pipeline')
print(cwd)


C:\Users\andre\Repositories\Professional.Portfolio\Sample6_Data.Validation.Pipeline
C:\Users\andre\Repositories\Professional.Portfolio\Sample6_Data.Validation.Pipeline


## Helper Functions

In [5]:
# function to keep track of df objects in memory
def dfs():
    """
    List all Pandas DataFrame objects currently in memory.

    This function lists all Pandas DataFrame objects present in the global namespace.

    Returns:
        list: A list of DataFrame objects.
    """
    dataframes = [var for var in globals() if isinstance(globals()[var], DataFrame)]
    print(dataframes)

In [7]:
monitor_ram()

svmem(total=34033319936, available=24460357632, percent=28.1, used=9572962304, free=24460357632)

In [8]:
dfs()

[]


In [6]:
def monitor_ram():
    """
    Get information about the system's RAM (Random Access Memory) usage.

    Returns:
        psutil._common.svmem: A named tuple representing RAM usage statistics.
    """
    memory_usage = virtual_memory()
    return memory_usage

In [6]:
def generate_ram_graph(working_dir, dot_name, objects):
    """
    Generate a reference graph of Python objects and save it as a PNG image.

    Parameters:
        working_dir (str): The directory where the DOT file and PNG image will be saved.
        dot_name (str): The base name for the DOT and PNG files (without extensions).
        objects: The Python objects for which references will be graphed.

    Returns:
        graphviz.files.Source: A Graphviz Source object representing the rendered reference graph.
    """
    # Specify the full file path where the DOT file should be saved
    dot_file_path = f'{working_dir}/{dot_name}.dot'
    
    # Generate the object references graph and save it as a DOT file
    show_refs(objects, filename=dot_file_path)
    
    # Specify the full file path where the PNG image should be saved
    png_file_path = f'{working_dir}/{dot_name}.png'
    
    # Use graphviz to render the DOT file as a PNG image
    graph = Source.from_file(dot_file_path, format='png')
    
    return graph


## Create Mock Dataset with Special Characters, Strings, Integers, & Missing Values

In [9]:
def generate_random_value():
    """
    Generate a random value with various data types and potential missing values.

    Returns:
        str, int, float, or None: A randomly generated value, which can be a string, integer, float,
        or None (representing a missing value).

    Description:
        This function generates random values with different data types and the possibility of missing values.
        - 10% chance of returning a missing value (None).
        - 20% chance of returning a string consisting of 5 random special characters.
        - 50% chance of returning a string consisting of 10 random alphanumeric characters.
        - 20% chance of returning either a random integer between 1 and 1000 (inclusive) or
          a random float between 0.1 and 1000.0 (inclusive).

    Example:
        Possible outputs:
        - 'ABc!@#' (string with special characters)
        - 123 (integer)
        - 456.789 (float)
        - None (missing value)
    """
    
    # 10% chance of missing value
    if random() < 0.1:  
        return nan
    # 20% chance of special character
    elif random() < 0.2:  
        return ''.join(choice(punctuation) for _ in range(5))
    # 50% chance of string
    elif random() < 0.5:  
        return ''.join(choice(ascii_letters) for _ in range(10))
     # 20% chance of number (integer or float)
    else: 
        return choice([randint(1, 1000), uniform(0.1, 1000.0)])
    

In [10]:
def generate_and_save_large_dataframe():
    """
    Generate a large Pandas DataFrame with random data and save it to a CSV file when it exceeds 2GB in size.

    Description:
        This function generates random data and creates a Pandas DataFrame. It keeps adding rows to the DataFrame
        until its size exceeds 2GB. Once the size limit is reached, the DataFrame is saved to a CSV file
        with a filename indicating the size of the dataset in millions of rows (e.g., 'mock_dataset_6M.csv').

    Note:
        The function uses the `generate_random_value` function to create random data.

    Returns:
        None
    """
    data = {}
    num_columns = 25
    num_rows = 6000000

    while True:
        # Generate dict substructure for a Pandas dataframe
        for i in range(num_columns):
            column_name = f'column_{i+1}'
            data[column_name] = [generate_random_value() for _ in range(num_rows)]

        # Create a Pandas DataFrame
        df = DataFrame(data)
        df_size_bytes = df.memory_usage(index=True).sum()

        # Check size of dataframe: if > than 2GB, save the file
        if df_size_bytes > 2 * 1073741824: 
            filename = f'mock_dataset_{num_rows//1000000}M.csv'
            df.to_csv(filename, index=False)
            print(f"Saved DataFrame to {filename}")
            break  
        # Else, add another 2000000 rows to the df
        else:
            del df 
            num_rows += 2000000


## Decide Library to Use to Speed Up Validation

In [11]:
## > Pandas

def load_csv_with_pandas(file_path):
    """
    Load a CSV file using Pandas and measure the loading time.

    Parameters:
        file_path (str): The path to the CSV file.

    Returns:
        float: The time taken to load the CSV file using Pandas (in seconds).
    """
    start_time1 = time()
    df = read_csv(file_path)
    end_time1 = time() - start_time1
    print(end_time1)
    return df

In [12]:
## > Dask

def load_csv_with_dask(file_path):
    """
    Load a CSV file using Dask and measure the loading time.

    Parameters:
        file_path (str): The path to the CSV file.

    Returns:
        float: The time taken to load the CSV file using Dask (in seconds).
    """
    start_time2 = time()
    ddf = dd.read_csv(file_path)
    end_time2 = time() - start_time2
    print(end_time2)
    return ddf


In [17]:
## > Ray

ray.init(runtime_env={'env_vars': {'__MODIN_AUTOIMPORT_PANDAS__': '1'}})


2024-01-13 16:26:42,586	INFO worker.py:1724 -- Started a local Ray instance.


0,1
Python version:,3.11.7
Ray version:,2.9.0


In [18]:
## > Modin

def load_csv_with_modin(file_path):
    """
    Load a CSV file using Modin and measure the loading time.

    Parameters:
        file_path (str): The path to the CSV file.

    Returns:
        float: The time taken to load the CSV file using Modin (in seconds).
    """
    start_time3 = time()
    mdf = mpd.read_csv(file_path)
    end_time3 = time() - start_time3
    print(end_time3)
    return mdf


# Automated Data Validation Pipeline: Using Dask

## Utility functions

In [19]:

def read_config_file(filepath):
    
    """
    Read and parse a YAML configuration file.

    Parameters:
        filepath (str): The path to the YAML configuration file.

    Returns:
        dict: A dictionary containing the parsed configuration data.
    """
    
    # Open specified filepath
    with open(filepath, 'r') as datacreek:
        # Try-Catch for YAMLError
        try:
            return safe_load(datacreek)
        except YAMLError as exc:
            # Logging library error sent to 'stdout'
            error(exc)
            

def replacer(string, char):
    """
    Replace consecutive instances of a character in a string with a single instance.

    Parameters:
        string (str): The input string.
        char (str): The character to be replaced.

    Returns:
        str: The input string with consecutive instances of the character replaced.
    """
    
    # Regex pattern for 2 or more instances
    pattern = char + '{2,}'
    
    # Sub function to replace specified char
    string = sub(pattern, char, string)
    
    # Returns the string with replaced char
    return string

def col_header_val(df, table_config):
    """
    Validate and standardize column names in a DataFrame based on a table configuration.

    Parameters:
        df (dd.DataFrame): The Dask DataFrame to be validated.
        table_config (dict): The table configuration dictionary.

    Returns:
        bool: True if validation passes, False otherwise.
    """
    
    # Convert all strings to lowercase
    df.columns = df.columns.str.lower()
 
    # Replace all whitespce at the start of column names
    df.columns = df.columns.str.replace('[^\w]', '_', regex=True)
    
    # Removes underscores from beginning & end of column names
    df.columns = list(map(lambda x: x.strip('_'), list(df.columns)))
   
    # Replaces 2 or more consecutive underscores with a single underscore
    df.columns = list(map(lambda x: replacer(x, '_'), list(df.columns)))
    
    # Converts expected col_names for 'table_config' to ensure case insensitivity during comaparison
    expected_col = list(map(lambda x: x.lower(), table_config['columns']))
   
  
    # Converts all column names in df once again. Ensures case insensitivity when comparing with expected col_names
    df.columns = list(map(lambda x: x.lower(), list(df.columns)))
    
    # Sort the DataFrame by multiple columns
    df = df[table_config['columns']]

    if len(df.columns) == len(expected_col) and list(expected_col) == list(df.columns):
        print('column namd and column length validation passed')
        return True

    # If the above is false, then we check what the differences are between df.col and exp_col and print them
    else:
        print('column name and column length validation failed')
        # Uses set operations for taking the difference between df.col and exp_col
        mismatched_columns_file = list(set(df.columns).difference(expected_col))
        print('The following YAML columns are not in the YAML file', mismatched_columns_file )
        # Uses set operations to check diff between exp_col and df_col
        missing_yaml_file = list(set(expected_col).difference(df.columns))
        print('The following YAML columns are not in the uploaded file', missing_yaml_file)
        # log results
        info(f'df columns: {df.columns}')
        info(f'expected columns: {expected_col}')
        return False


def generate_yaml_config(file_type, file_name, table_name, columns, in_del, out_del):
    """
    Generate a YAML configuration file for data processing and save it.

    Parameters:
        file_type (str): The type of the data file (e.g., 'csv', 'parquet').
        file_name (str): The name of the data file.
        table_name (str): The name of the data processing table.
        columns (list): A list of column names.
        in_del (str): The inbound delimiter character.
        out_del (str): The outbound delimiter character.

    Returns:
        tuple: A tuple containing the generated file_name, file_type, and table_name.
    """
    # Create a configuration dictionary
    config_data = {
        'file_type': file_type,
        'dataset_name': 'testfile',
        'file_name': file_name,
        'table_name': table_name,
        'inbound_delimiter': in_del,
        'outbound_delimiter': out_del,
        'skip_leading_rows': 1,
        'columns': columns,
    }

    # Convert the dictionary to YAML format
    yaml_config = dump(config_data, default_flow_style=False)

    # Save the YAML configuration to a file
    with open(f'{table_name}_config.yaml', 'w') as yaml_file:
        yaml_file.write(yaml_config)
        
    return file_name, file_type, table_name




def read_file(file_type, source_file):
    """
    Read data from a file into a Dask DataFrame based on the file type.

    Parameters:
        file_type (str): The type of the data file (e.g., 'csv', 'excel', 'parquet').
        source_file (str): The path to the source data file.

    Returns:
        dd.DataFrame: A Dask DataFrame containing the data from the file.
    """
    if file_type == 'csv':
        return dd.read_csv(source_file)
    elif file_type == 'excel':
        return dd.read_excel(source_file)
    elif file_type == 'parquet':
        return dd.read_parquet(source_file)
    # Add more file types as needed...
    
    


def save_file(df, file_type, target_path):
    """
    Save a Dask DataFrame to a file with the specified file type.

    Parameters:
        df (dd.DataFrame): The Dask DataFrame to be saved.
        file_type (str): The file type (e.g., 'csv', 'parquet', 'excel').
        target_path (str): The path where the file should be saved.

    Returns:
        bool: True if the file was saved successfully or already exists, False otherwise.
    """
    try:
        if file_type == 'csv':
            if path.exists(target_path):
                print(f"File already exists: {target_path}")
            else:
                df.to_csv(target_path, single_file=True)
        elif file_type == 'excel':
            if path.exists(target_path):
                print(f"File already exists: {target_path}")
            else:
                df.to_parquet(target_path)
        elif file_type == 'parquet':
            if path.exists(target_path):
                print(f"File already exists: {target_path}")
            else:
                df.to_excel(target_path, index=False)
        # Add more file types as needed
        else:
            print(f"Unsupported file type: {file_type}")
            return False

        print(f"File saved successfully: {target_path}")
        return True
    except Exception as e:
        print(f"Error saving file: {e}")
        return False



## Main Script

In [None]:
# Create 2.6GB dataset. Time to create: ~20min
generate_and_save_large_dataframe()

In [23]:
# Load 2.6Gb dataset with pandas and evaluate time taken
load_csv_with_pandas('./mock_dataset_12M.csv')

56.559773206710815

In [24]:
load_csv_with_dask('./mock_dataset_12M.csv')

0.02824115753173828

In [29]:
load_csv_with_modin('./mock_dataset_12M.csv')

33.292614698410034

In [32]:
### > Main script


if __name__ == '__main__':
    
   
    
# List of file configurations
file_configurations = [

    {
        'file_name': 'file1',
        'file_type': 'csv',
        'table_name': 'table1',
        'columns': ['col1', 'col2', 'col3'],
        'in_del': ',',
        'out_del': '|'
    },
    {
        'file_name': 'file2',
        'file_type': 'excel',
        'table_name': 'table2',
        'columns': ['colA', 'colB', 'colC'],
        'in_del': '\t',
        'out_del': ','
    }

    # Add more configurations as needed
]

# Create a loop to process each configuration
for config in file_configurations:
    file_name, file_type, table_name = generate_yaml_config(

                config['file_type'],
                config['file_name'],
                config['table_name'],
                config['columns'],
                config['in_del'],
                config['out_del']
    )

    # Read config file
    config_data = read_config_file(file_name)

    # Read file using config file
    file_type = config_data['file_type']
    source_file = './' + config_data['file_name'] + f'.{file_type}'

    # Read the file based on its type
    ddf = read_file(file_type, source_file)

    # Perform column validation
    result = col_header_val(ddf, config_data)

    if result:
        # Define the target file path and file type
        target_file_path = f'completed_{config_data["file_name"]}.{config_data["file_type"]}'

        # Save the file using the save_file function
        save_result = save_file(ddf, config_data["file_type"], target_file_path)

        if save_result:
            print(f'Successfully processed and saved: {config_data["file_name"]}')
        else:
            print(f'Error saving the file for: {config_data["file_name"]}')
    else:
        print(f'Validation failed for: {config_data["file_name"]}')

    

    

ERROR:root:expected '<document start>', but found '<scalar>'
  in "mock_dataset_12M.csv", line 17, column 1


TypeError: 'NoneType' object is not subscriptable