# Cleaning Data and Preparing DataLoader

- Look into using parquet files to shrink data files down (comes at the cost of not being human readable)

## Imports

In [2]:
import pandas as pd
import torch
import os, logging, joblib, io
from logging import Logger
from typing import List, Optional
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path
from importlib.metadata import version
from torch.utils.data import DataLoader, Dataset
from imblearn.over_sampling import RandomOverSampler

In [3]:
# Set up logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [None]:
packages = ["pandas", "importlib-metadata", "pyarrow"]
for package in packages:
    try:
        logger.info(f"{package} version: {version(package)}")
    except Exception as e:
        logger.warning(f"Could not get version for package {package}: {e}")

INFO:__main__:pandas version: 2.3.2
INFO:__main__:importlib-metadata version: 8.7.0
INFO:__main__:pyarrow version: 21.0.0




## Load Dataframe from csv file in local directory

In [5]:
DATA_ROOT = Path("../Data")
RAW_DATA_DIR_NAME = "Data"
# DATA_RAW_FILE_NAME = "credit-card-fraud.csv"

DATA_RAW_FILE_NAME = "credit-card-fraud-RAW.csv"
DATA_CLEAN_FILE_NAME = "credit-card-fraud-CLEAN.csv"

TEMP_DATA_PATH = DATA_ROOT / RAW_DATA_DIR_NAME / DATA_RAW_FILE_NAME
DATA_PATH = DATA_ROOT / RAW_DATA_DIR_NAME / DATA_CLEAN_FILE_NAME

In [120]:
df = pd.read_csv(TEMP_DATA_PATH)

## Data Cleaning

### Dropping Columns

In [121]:
drop_these =["Unnamed: 0", "trans_date_trans_time", "cc_num", "merchant", "first", "last", "street", "city", "zip", "job", "dob", "trans_num", "unix_time", "Unnamed: 23", "6006"]

In [122]:
df.drop(columns=drop_these, inplace=True)

In [123]:
df.columns

Index(['category', 'amt', 'gender', 'state', 'lat', 'long', 'city_pop',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [124]:
df.shape

(1048575, 10)

In [125]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   category    1048575 non-null  object 
 1   amt         1048575 non-null  float64
 2   gender      1048575 non-null  object 
 3   state       1048575 non-null  object 
 4   lat         1048575 non-null  float64
 5   long        1048575 non-null  float64
 6   city_pop    1048575 non-null  int64  
 7   merch_lat   1048575 non-null  float64
 8   merch_long  1048575 non-null  float64
 9   is_fraud    1048575 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 80.0+ MB


### Create Dictionaries for mapping

In [126]:
categories = df["category"].unique().tolist()

In [127]:
categories.sort()

In [128]:
categories

['entertainment',
 'food_dining',
 'gas_transport',
 'grocery_net',
 'grocery_pos',
 'health_fitness',
 'home',
 'kids_pets',
 'misc_net',
 'misc_pos',
 'personal_care',
 'shopping_net',
 'shopping_pos',
 'travel']

In [129]:
categories_dict = {category: float(idx) for idx, category in enumerate(categories)}

In [130]:
categories_dict

{'entertainment': 0.0,
 'food_dining': 1.0,
 'gas_transport': 2.0,
 'grocery_net': 3.0,
 'grocery_pos': 4.0,
 'health_fitness': 5.0,
 'home': 6.0,
 'kids_pets': 7.0,
 'misc_net': 8.0,
 'misc_pos': 9.0,
 'personal_care': 10.0,
 'shopping_net': 11.0,
 'shopping_pos': 12.0,
 'travel': 13.0}

In [None]:
genders = sorted(df["gender"].unique().tolist())

In [132]:
gender_dict = {gender: float(idx) for idx, gender in enumerate(genders)}

In [133]:
gender_dict

{'F': 0.0, 'M': 1.0}

In [None]:
states = sorted(df["state"].unique().tolist())

In [135]:
states.sort()

In [136]:
states_dict = {state: float(idx) for idx, state in enumerate(states)}

In [137]:
states_dict

{'AK': 0.0,
 'AL': 1.0,
 'AR': 2.0,
 'AZ': 3.0,
 'CA': 4.0,
 'CO': 5.0,
 'CT': 6.0,
 'DC': 7.0,
 'DE': 8.0,
 'FL': 9.0,
 'GA': 10.0,
 'HI': 11.0,
 'IA': 12.0,
 'ID': 13.0,
 'IL': 14.0,
 'IN': 15.0,
 'KS': 16.0,
 'KY': 17.0,
 'LA': 18.0,
 'MA': 19.0,
 'MD': 20.0,
 'ME': 21.0,
 'MI': 22.0,
 'MN': 23.0,
 'MO': 24.0,
 'MS': 25.0,
 'MT': 26.0,
 'NC': 27.0,
 'ND': 28.0,
 'NE': 29.0,
 'NH': 30.0,
 'NJ': 31.0,
 'NM': 32.0,
 'NV': 33.0,
 'NY': 34.0,
 'OH': 35.0,
 'OK': 36.0,
 'OR': 37.0,
 'PA': 38.0,
 'RI': 39.0,
 'SC': 40.0,
 'SD': 41.0,
 'TN': 42.0,
 'TX': 43.0,
 'UT': 44.0,
 'VA': 45.0,
 'VT': 46.0,
 'WA': 47.0,
 'WI': 48.0,
 'WV': 49.0,
 'WY': 50.0}

### Apply mapping to the columns to convert to floats

In [138]:
df["category"] = df["category"].map(categories_dict)
df["gender"] = df["gender"].map(gender_dict)
df["state"] = df["state"].map(states_dict)

### Convert all columns into float32 datatypes

In [139]:
df = df.astype('float32')

### End of Data Cleaning

In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   category    1048575 non-null  float32
 1   amt         1048575 non-null  float32
 2   gender      1048575 non-null  float32
 3   state       1048575 non-null  float32
 4   lat         1048575 non-null  float32
 5   long        1048575 non-null  float32
 6   city_pop    1048575 non-null  float32
 7   merch_lat   1048575 non-null  float32
 8   merch_long  1048575 non-null  float32
 9   is_fraud    1048575 non-null  float32
dtypes: float32(10)
memory usage: 40.0 MB


# SAVING CLEANED DATA TO FILE

In [141]:
df.to_csv(DATA_PATH, index=False)

In [None]:
# Test different file extensions
# df.to_feather("data.feather")
# df.to_parquet("data.parquet")

# read and test datatypes

In [5]:
dq = pd.read_csv(DATA_PATH, dtype='float32') # Does not convert to float32 by default, dtype has to be explicitly provided

In [None]:
#
# dq=dq.astype('float32')

In [6]:
print(dq.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   category    1048575 non-null  float32
 1   amt         1048575 non-null  float32
 2   gender      1048575 non-null  float32
 3   state       1048575 non-null  float32
 4   lat         1048575 non-null  float32
 5   long        1048575 non-null  float32
 6   city_pop    1048575 non-null  float32
 7   merch_lat   1048575 non-null  float32
 8   merch_long  1048575 non-null  float32
 9   is_fraud    1048575 non-null  float32
dtypes: float32(10)
memory usage: 40.0 MB
None


In [7]:
dq.head()

Unnamed: 0,category,amt,gender,state,lat,long,city_pop,merch_lat,merch_long,is_fraud
0,8.0,4.97,0.0,27.0,36.0788,-81.178101,3495.0,36.011292,-82.048317,0.0
1,4.0,107.230003,0.0,47.0,48.887798,-118.210503,149.0,49.159046,-118.186462,0.0
2,0.0,220.110001,1.0,13.0,42.180801,-112.262001,4154.0,43.150703,-112.15448,0.0
3,2.0,45.0,1.0,26.0,46.230598,-112.1138,1939.0,47.034332,-112.561073,0.0
4,9.0,41.959999,1.0,45.0,38.4207,-79.462898,99.0,38.674999,-78.632462,0.0


# Creating DataLoaders

### Clean Data Function

In [13]:
def clean_data(df: pd.DataFrame, logger: Logger, dropped_columns: Optional[List[str]]=None) -> pd.DataFrame:
    """Cleans the input DataFrame.
    Args:
        df (pd.DataFrame): The input DataFrame to be cleaned.
        logger (Logger): Logger object for logging information.
        dropped_columns (List[str], optional): Columns to drop from the features in original dataset.
    Returns:
        pd.DataFrame: The cleaned DataFrame.
    """
    show_dataframe_info = True  # Set to True to log DataFrame info

    # Log the initial state of the DataFrame
    logger.info(f"Initial DataFrame shape: {df.shape}")
    
    if show_dataframe_info:
        buffer = io.StringIO()  # Create a buffer to capture the info output
        df.info(buf=buffer) # Store the output into the buffer
        logger.info(f"Initial DataFrame info:\n " + buffer.getvalue())
    
    # Drop any unused columns
    try:
        df.drop(columns=dropped_columns, inplace=True)
    except Exception as e:
        logger.info(f"Problem dropping columns, {e}")


    # Create dictionaries for mapping/encoding
    categories = sorted(df["category"].unique().tolist())
    categories_dict = {category: idx for idx, category in enumerate(categories)}

    genders = sorted(df["gender"].unique().tolist())
    gender_dict = {gender: idx for idx, gender in enumerate(genders)}

    states = sorted(df["state"].unique().tolist())
    # Encode categorical variables and convert to float
    states_dict = {state: idx for idx, state in enumerate(states)}


    logger.info("Encoding categorical variables...")
    try:
        df['category'] = df['category'].map(categories_dict)
        df['gender'] = df['gender'].map(gender_dict)
        df['state'] = df['state'].map(states_dict)
    except Exception as e:
        logger.info(f"Problem encoding columns, {e}")
        
    # Handle missing values (if any)
    if df.isnull().sum().sum() > 0:
        logger.info("Handling missing values...")
        df = df.dropna()  # Example: Drop rows with missing values
        logger.info(f"DataFrame shape after dropping missing values: {df.shape}")
    
    # Convert to 'float32' to reduce memory usage
    logger.info("Converting Data Frame to 'float32'...")
    df = df.astype('float32')

    if show_dataframe_info:
        # Reinitialize the buffer to clear any previous content in order to log the final dataframe info
        buffer = io.StringIO()
        df.info(buf=buffer)
        logger.info(f"Final DataFrame info:\n " + buffer.getvalue())

    return df

### Custom Dataset Class

In [9]:
class CustomDataset(Dataset):
    def __init__(self, csv_file: str="../Data/DataSplits/test.csv", label_column: str="Label"):
        """Initializer for the Dataset class.
        Args:
            csv_file (str): Path to the CSV file containing the dataset.
            label_column (str): The name of the column indicating the label.
        """
        try:
            self.data = pd.read_csv(csv_file)   # Assign a pandas data frame
        except FileNotFoundError:   # Raise an error if the file is not found
            raise FileNotFoundError(f"File not found: {csv_file}")

        # Define feature and label columns
        self.label_column = label_column
        # Remove the Date column and the label column
        self.feature_columns = self.data.columns.drop([self.label_column])

    def __getitem__(self, idx) -> tuple[torch.tensor, torch.tensor]:
        """Returns a tuple (features, label) for the given index.
        Args:
            index (int): Index of the data sample to retrieve.
        Returns:
            tuple: (features, label) where features is a tensor of input features and label is the corresponding label.
        """
        row = self.dataframe.iloc[idx]
        features = torch.tensor(row.drop("is_fraud").values)    # Extract the features 
        label = torch.tensor(row["is_fraud"])   # Extract the label for this item
        return (features, label)

    def __len__(self) -> int:
        """Returns the amount of samples in the dataset."""
        return len(self.data)

### Data Pipeline Function

In [18]:
def data_pipeline(logger: Logger, dataset_url: str, root_data_dir: str= "../Data", data_file_path: str="Dataset.csv", data_splits_dir: str="DataSplits", scaler_dir = "Scalers", target_column: str="Target", dropped_columns: Optional[List[str]]=None, batch_size: int=64, num_workers: int=0, pin_memory: bool=False, drop_last: bool=True) -> tuple[Dataset, Dataset, Dataset, DataLoader, DataLoader, DataLoader, MinMaxScaler, MinMaxScaler]:
    """This function prepares the train, test, and validation datasets.
    Args:
        logger (Logger): The logger instance to log messages.
        dataset_url (str): The URL to download the dataset from, if not found locally.
        root_data_dir (str): The root of the Data Directory
        data_file_path (str): The name of the original dataset (with .csv file extension).
        data_splits_dir (str): Path to the train, test, and validation datasets.
        scaler_dir (str): Path to the feature and label scalers.
        target_column (str): The name of the target column to predict.
        dropped_columns (List[str], optional): Columns to drop from the features in original dataset.
        batch_size (int): The dataloader's batch_size.
        num_workers (int): The dataloader's number of workers.
        pin_memory (bool): The dataloader's pin memory option.
        drop_last (bool): The dataloader's drop_last option.

    Returns: 
        train_dataset (Dataset): Dataset Class for the training dataset.
        test_dataset (Dataset): Dataset Class for the test dataset.
        validation_dataset (Dataset): Dataset Class for the validation dataset.
        train_dataloader (DataLoader): The train dataloader.
        test_dataloader (DataLoader): The test dataloader.
        validation_dataloader (DataLoader): The validation dataloader.
        feature_scaler (MinMaxScaler): The scaler used to scale the features of the model input.
        label_scaler (MinMaxScaler): The scaler used to scale the labels of the model input.
        """
    if not root_data_dir or not data_file_path or not data_splits_dir:  # Check for empty strings at the beginning
        raise ValueError("File and directory paths cannot be empty strings.")
    DATA_ROOT = Path(root_data_dir)

    DATA_CLEAN_PATH = DATA_ROOT / data_file_path # Set the path to the complete dataset

    if DATA_CLEAN_PATH.exists():
        logger.info(f"CSV file detected, reading from '{DATA_ROOT}'")
        df = pd.read_csv(DATA_CLEAN_PATH, dtype='float32') # Convert data to float32 instead of, float64
    else:
        logger.info(f"Downloading CSV file from Internet and saving into '{DATA_ROOT}'")
        try:
            os.makedirs(DATA_ROOT, exist_ok=True)       # Create the Data Root Directory
            df = pd.read_csv(dataset_url)  # Download and read the data into a pandas dataframe

            # Clean the data before saving
            try:
                df = clean_data(df, logger, dropped_columns=dropped_columns)
            except Exception as e:
                raise RuntimeError(f"An unexpected error occurred cleaning the dataset")

            df.to_csv(DATA_CLEAN_PATH, index=False)     # Save the file, omitting saving the row index
        except Exception as e:
            raise RuntimeError(f"An unexpected error occurred when downloading or saving the dataset from '{dataset_url}' to '{DATA_CLEAN_PATH}'")

    # Define the paths for the data splits and scalers
    DATA_SPLITS_DIR = DATA_ROOT / data_splits_dir
    SCALER_DIR = DATA_ROOT / scaler_dir

    TRAIN_DATA_PATH = DATA_SPLITS_DIR / "train.csv"
    TEST_DATA_PATH = DATA_SPLITS_DIR / "test.csv"
    VALIDATION_DATA_PATH = DATA_SPLITS_DIR / "val.csv"

    FEATURE_SCALER_PATH = SCALER_DIR / "feature-scaler.joblib"
    # LABEL_SCALER_PATH = SCALER_DIR / "label-scaler.joblib"; Not used for this Classification task

    if os.path.exists(TRAIN_DATA_PATH) and os.path.exists(TEST_DATA_PATH) and os.path.exists(VALIDATION_DATA_PATH) :
        logger.info(f"Train, Test, and Validation CSV datasets detected in '{DATA_SPLITS_DIR}.' Skipping generation and loading scaler(s)")
        try:
            feature_scaler = joblib.load(FEATURE_SCALER_PATH)
            # label_scaler = joblib.load(LABEL_SCALER_PATH)
        except Exception as e:
            raise RuntimeError(f"An unexpected error occurred when loading scalers: {e}")
    else:
        logger.info(f"Datasets not found in '{DATA_SPLITS_DIR}' or incomplete. Generating datasets...")
        os.makedirs(DATA_SPLITS_DIR, exist_ok=True)     # Create the Data Splits Parent Directory
        os.makedirs(SCALER_DIR, exist_ok=True)     # Create the Scaler Parent Directory

        # Create the scaler objects
        feature_scaler = MinMaxScaler() 
        # label_scaler = MinMaxScaler(); Not for Classification
        try:
            df_features = df.drop(columns=[target_column], inplace=False)
            df_labels = df[[target_column]]     # Instead of returning a pandas Series using "[]", return a dataframe using the "[[]]" to get a shape with (-1,1)
        except Exception as e:
            raise RuntimeError(f"Error Reading from DataFrame: {e}")

        # Use OverSampling Technique to Balance out the Dataset
        ros = RandomOverSampler(random_state=42)
        df_features_resampled, df_labels_resampled = ros.fit_resample(df_features, df_labels)


        # Split into smaller DataFrames for the Train, Test, and Validation splits
        X_train, X_inter, Y_train, Y_inter = train_test_split(df_features_resampled, df_labels_resampled, test_size=0.1, random_state=42)
        X_validation, X_test, Y_validation, Y_test = train_test_split(X_inter, Y_inter, test_size=0.5, random_state=42)

        # Now Fit the data
        feature_scaler.fit(X_train)
        # label_scaler.fit(Y_train); Not for Classification

        # Save the fitted scaler object
        try:
            joblib.dump(feature_scaler, FEATURE_SCALER_PATH)
            logger.info(f"Feature scaler stored in: ({FEATURE_SCALER_PATH})")
            # joblib.dump(label_scaler, LABEL_SCALER_PATH)
            # logger.info(f"Label scaler stored in: ({LABEL_SCALER_PATH})")
        except Exception as e:
            raise RuntimeError(f"An unexpected error occurred when saving  Scalers: {e}")

        # Scale the rest of the data; returns numpy arrays
        X_train_scaled = feature_scaler.transform(X_train)
        # Y_train_scaled = label_scaler.transform(Y_train)
        X_validation_scaled = feature_scaler.transform(X_validation)
        # Y_validation_scaled = label_scaler.transform(Y_validation)
        X_test_scaled = feature_scaler.transform(X_test)
        # Y_test_scaled = label_scaler.transform(Y_test)

        logger.info(f"Train Features Scaled Shape: {X_train_scaled.shape}")
        logger.info(f"Train Labels Shape: {Y_train.shape}")
        logger.info(f"Validation Features Scaled Shape: {X_validation_scaled.shape}")
        logger.info(f"Validation Labels Shape: {Y_validation.shape}")
        logger.info(f"Test Features Scaled Shape: {X_test_scaled.shape}")
        logger.info(f"Test Labels Shape: {Y_test.shape}")
        # Define the column names of the features and label
        features_names = df_features.columns
        label_name = df_labels.columns
        # Create dataframes using the scaled data
        X_train_df = pd.DataFrame(X_train_scaled, columns=features_names)
        X_test_df = pd.DataFrame(X_test_scaled, columns=features_names)
        X_validation_df = pd.DataFrame(X_validation_scaled, columns=features_names)
        Y_train_df = pd.DataFrame(Y_train, columns=label_name)
        Y_test_df = pd.DataFrame(Y_test, columns=label_name)
        Y_validation_df = pd.DataFrame(Y_validation, columns=label_name)

        # Concatenate the features and labels back into a single DataFrame for each set
        train_data_frame = pd.concat([X_train_df, Y_train_df.reset_index(drop=True)], axis=1)
        test_data_frame = pd.concat([X_test_df, Y_test_df.reset_index(drop=True)], axis=1)
        validation_data_frame = pd.concat([X_validation_df, Y_validation_df.reset_index(drop=True)], axis=1)

        # Saving the split data to csv files
        try:
            train_data_frame.to_csv(TRAIN_DATA_PATH, index=False)
            test_data_frame.to_csv(TEST_DATA_PATH, index=False)
            validation_data_frame.to_csv(VALIDATION_DATA_PATH, index=False)
        except Exception as e:
            raise RuntimeError(f"An unexpected error occurred when saving datasets to CSV files: {e}")
    # Creating Datasets from the stored datasets
    logger.info(f"INITIALIZING DATASETS")
    train_dataset = CustomDataset(csv_file=TRAIN_DATA_PATH, label_column=target_column)
    test_dataset = CustomDataset(csv_file=TEST_DATA_PATH, label_column=target_column)
    val_dataset = CustomDataset(csv_file=VALIDATION_DATA_PATH, label_column=target_column)
    
    logger.info(f"Creating DataLoaders with 'batch_size'=({batch_size}), 'num_workers'=({num_workers}), 'pin_memory'=({pin_memory}). Training dataset 'drop_last'=({drop_last})")
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last, shuffle=True)
    validation_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last, shuffle=False)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last, shuffle=False)

    logger.info(f"Training DataLoader has ({len(train_dataloader)}) batches, Test DataLoader has ({len(test_dataloader)}) batches, Validation DataLoader has ({len(validation_dataloader)}) batches")

    # return (train_dataset, test_dataset, val_dataset, train_dataloader, test_dataloader, validation_dataloader, feature_scaler, label_scaler)
    return (train_dataset, test_dataset, val_dataset, train_dataloader, test_dataloader, validation_dataloader, feature_scaler)


# Test Data Pipeline

In [19]:
def test_data_pipeline():
    # Function input setup
    data = {
        "dataset_url":  "hf://datasets/dazzle-nu/CIS435-CreditCardFraudDetection/fraudTrain.csv",
        "root_data_dir": "../Data",
        "data_file_path": DATA_CLEAN_FILE_NAME,
        "data_splits_dir": "DataSplits",
        "scaler_dir": "Scalers",
        "target_column": "is_fraud",
        "dropped_columns": ["Unnamed: 0", "trans_date_trans_time", "cc_num", "merchant", "first", "last", "street", "city", "zip", "job", "dob", "trans_num", "unix_time", "Unnamed: 23", "6006"]
    }
    batch_size = 64
    num_workers = 0
    pin_memory = False
    drop_last = True

    logger = logging.getLogger(__name__)

    # Call the data pipeline function
    try:
        (train_dataset, test_dataset, val_dataset, train_dataloader, test_dataloader, validation_dataloader, feature_scaler) = data_pipeline(
        logger,
        **data,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=drop_last
    )
    except Exception as e:
        logger.info(f"Caught Exception: {e}", stack_info=True)

    # Basic assertions to verify the outputs
    assert isinstance(train_dataset, Dataset), "train_dataset is not an instance of Dataset"
    assert isinstance(test_dataset, Dataset), "test_dataset is not an instance of Dataset"
    assert isinstance(val_dataset, Dataset), "val_dataset is not an instance of Dataset"
    assert isinstance(train_dataloader, DataLoader), "train_dataloader is not an instance of DataLoader"
    assert isinstance(test_dataloader, DataLoader), "test_dataloader is not an instance of DataLoader"
    assert isinstance(validation_dataloader, DataLoader), "validation_dataloader is not an instance of DataLoader"
    assert isinstance(feature_scaler, MinMaxScaler), "feature_scaler is not an instance of MinMaxScaler"
    # assert isinstance(label_scaler, MinMaxScaler), "label_scaler is not an instance of MinMaxScaler"

    logger.info("All assertions passed. Data pipeline test successful.")

In [20]:
test_data_pipeline()

INFO:__main__:CSV file detected, reading from '..\Data'
INFO:__main__:Train, Test, and Validation CSV datasets detected in '..\Data\DataSplits.' Skipping generation and loading scaler(s)
INFO:__main__:INITIALIZING DATASETS
INFO:__main__:Creating DataLoaders with 'batch_size'=(64), 'num_workers'=(0), 'pin_memory'=(False). Training dataset 'drop_last'=(True)
INFO:__main__:Training DataLoader has (29322) batches, Test DataLoader has (1629) batches, Validation DataLoader has (1629) batches
INFO:__main__:All assertions passed. Data pipeline test successful.
