# Cleaning Data and Creating DataLoader Function
- Balanced out the dataset using IMBlearn's RandomOversampler
- Look into using parquet files to shrink data files down (comes at the cost of not being human readable) - Not going to implement.
- Drop unnecessary columns

## Imports

In [1]:
import os
import io
import sys
from pathlib import Path
from importlib.metadata import version
from logging import Logger
from typing import List, Optional
import logging
import joblib
import pandas as pd
from pandas.errors import ParserError
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Set up logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [3]:
packages = ["pandas", "importlib-metadata", "pyarrow"]
for package in packages:
    try:
        logger.info(f"{package} version: {version(package)}")
    except Exception as e:
        logger.warning(f"Could not get version for package {package}: {e}")

INFO:__main__:pandas version: 2.3.2
INFO:__main__:importlib-metadata version: 8.7.0
INFO:__main__:pyarrow version: 21.0.0


## Load Dataframe from csv file in local directory

In [4]:
DATA_ROOT = Path("../Data")
RAW_DATA_DIR_NAME = "Downloaded-Data"

DATA_RAW_FILE_NAME = "credit-card-fraud-RAW.csv"
DATA_INTERMEDIATE_FILE_NAME = "data-INTERMEDIATE.csv"
DATA_CLEAN_FILE_NAME = "credit-card-fraud-CLEAN.csv"

RAW_DATA_PATH = DATA_ROOT / RAW_DATA_DIR_NAME / DATA_RAW_FILE_NAME
INTERMEDIATE_DATA_PATH = DATA_ROOT / RAW_DATA_DIR_NAME / DATA_INTERMEDIATE_FILE_NAME
DATA_PATH = DATA_ROOT / RAW_DATA_DIR_NAME / DATA_CLEAN_FILE_NAME

MAPPING_DIR_NAME = "Feature-Mapping"

FEATURE_MAPPING_FILE_NAME = "feature_mappings.json"

FEATURE_MAPPING_PATH = DATA_ROOT / MAPPING_DIR_NAME / FEATURE_MAPPING_FILE_NAME

In [5]:
df = pd.read_csv(RAW_DATA_PATH)

## Data Cleaning

In [6]:
df.shape

(1048575, 25)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 25 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1048575 non-null  int64  
 1   trans_date_trans_time  1048575 non-null  object 
 2   cc_num                 1048575 non-null  float64
 3   merchant               1048575 non-null  object 
 4   category               1048575 non-null  object 
 5   amt                    1048575 non-null  float64
 6   first                  1048575 non-null  object 
 7   last                   1048575 non-null  object 
 8   gender                 1048575 non-null  object 
 9   street                 1048575 non-null  object 
 10  city                   1048575 non-null  object 
 11  state                  1048575 non-null  object 
 12  zip                    1048575 non-null  int64  
 13  lat                    1048575 non-null  float64
 14  long              

### Dropping Columns

In [8]:
drop_these = [
    "Unnamed: 0",
    "trans_date_trans_time",
    "cc_num",
    "merchant",
    "first",
    "last",
    "street",
    "city",
    "zip",
    "job",
    "dob",
    "trans_num",
    "unix_time",
    "Unnamed: 23",
    "6006",
]

In [9]:
df.drop(columns=drop_these, inplace=True)

### Filling in NaN entries, if Any

In [10]:
# df["column7"] = df["column7"].fillna("Light")

### Rearrange Columns

In [11]:
# Specify target column
target_col = "is_fraud"

# Get all columns except target
cols = [col for col in df.columns if col != target_col]

# Sort columns alphabetically
sorted_cols = sorted(cols)

# Add target column at the end
final_cols = sorted_cols + [target_col]

# Rearrange DataFrame
df = df[final_cols]

### Get final column order

In [12]:
print(df.columns.to_list())

['amt', 'category', 'city_pop', 'gender', 'lat', 'long', 'merch_lat', 'merch_long', 'state', 'is_fraud']


### SAVING CLEANED DATA TO FILE

In [13]:
df.to_csv(INTERMEDIATE_DATA_PATH, index=False)

## Load in the Clean Data File

In [14]:
df = pd.read_csv(INTERMEDIATE_DATA_PATH)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   amt         1048575 non-null  float64
 1   category    1048575 non-null  object 
 2   city_pop    1048575 non-null  int64  
 3   gender      1048575 non-null  object 
 4   lat         1048575 non-null  float64
 5   long        1048575 non-null  float64
 6   merch_lat   1048575 non-null  float64
 7   merch_long  1048575 non-null  float64
 8   state       1048575 non-null  object 
 9   is_fraud    1048575 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 80.0+ MB


## Encoding the Data
Maps any string objects into data that the model can use (floats).

In [16]:
print(df.columns.to_list())

['amt', 'category', 'city_pop', 'gender', 'lat', 'long', 'merch_lat', 'merch_long', 'state', 'is_fraud']


### Create Dictionaries for Encoding/Mapping Data

In [17]:
MAPPINGS = {}

### Create Dictionaries for Encoding/Mapping Data

In [18]:
categories = df["category"].unique().tolist()

In [19]:
categories.sort()

In [20]:
categories

['entertainment',
 'food_dining',
 'gas_transport',
 'grocery_net',
 'grocery_pos',
 'health_fitness',
 'home',
 'kids_pets',
 'misc_net',
 'misc_pos',
 'personal_care',
 'shopping_net',
 'shopping_pos',
 'travel']

In [21]:
CATEGORIES_MAPPING = {category: float(idx) for idx, category in enumerate(categories)}

In [22]:
CATEGORIES_MAPPING

{'entertainment': 0.0,
 'food_dining': 1.0,
 'gas_transport': 2.0,
 'grocery_net': 3.0,
 'grocery_pos': 4.0,
 'health_fitness': 5.0,
 'home': 6.0,
 'kids_pets': 7.0,
 'misc_net': 8.0,
 'misc_pos': 9.0,
 'personal_care': 10.0,
 'shopping_net': 11.0,
 'shopping_pos': 12.0,
 'travel': 13.0}

In [23]:
genders = sorted(df["gender"].unique().tolist())

In [24]:
GENDER_MAPPING = {gender: float(idx) for idx, gender in enumerate(genders)}

In [33]:
MAPPINGS["GENDER_MAPPING"] = GENDER_MAPPING

In [25]:
states = sorted(df["state"].unique().tolist())

In [26]:
states.sort()

In [27]:
STATES_MAPPING = {state: float(idx) for idx, state in enumerate(states)}

In [34]:
MAPPINGS["STATES_MAPPING"] = STATES_MAPPING

In [35]:
MAPPINGS

{'GENDER_MAPPING': {'F': 0.0, 'M': 1.0},
 'STATES_MAPPING': {'AK': 0.0,
  'AL': 1.0,
  'AR': 2.0,
  'AZ': 3.0,
  'CA': 4.0,
  'CO': 5.0,
  'CT': 6.0,
  'DC': 7.0,
  'DE': 8.0,
  'FL': 9.0,
  'GA': 10.0,
  'HI': 11.0,
  'IA': 12.0,
  'ID': 13.0,
  'IL': 14.0,
  'IN': 15.0,
  'KS': 16.0,
  'KY': 17.0,
  'LA': 18.0,
  'MA': 19.0,
  'MD': 20.0,
  'ME': 21.0,
  'MI': 22.0,
  'MN': 23.0,
  'MO': 24.0,
  'MS': 25.0,
  'MT': 26.0,
  'NC': 27.0,
  'ND': 28.0,
  'NE': 29.0,
  'NH': 30.0,
  'NJ': 31.0,
  'NM': 32.0,
  'NV': 33.0,
  'NY': 34.0,
  'OH': 35.0,
  'OK': 36.0,
  'OR': 37.0,
  'PA': 38.0,
  'RI': 39.0,
  'SC': 40.0,
  'SD': 41.0,
  'TN': 42.0,
  'TX': 43.0,
  'UT': 44.0,
  'VA': 45.0,
  'VT': 46.0,
  'WA': 47.0,
  'WI': 48.0,
  'WV': 49.0,
  'WY': 50.0}}

### Store Mappings for later use

In [29]:
import json

In [30]:
os.makedirs(DATA_ROOT / MAPPING_DIR_NAME, exist_ok=True)  # Create the Data Splits Parent Directory

In [36]:
# Save to JSON file
with open(FEATURE_MAPPING_PATH, "w") as f:
    json.dump(MAPPINGS, f, indent=4)  # indent for readability

In [None]:
df.head()

### Apply Encoding/Mapping to the columns

In [37]:
df["category"] = df["category"].map(CATEGORIES_MAPPING)
df["gender"] = df["gender"].map(GENDER_MAPPING)
df["state"] = df["state"].map(STATES_MAPPING)

### Check the new datatypes to make sure all string objects are converted

In [38]:
df.dtypes

amt           float64
category      float64
city_pop        int64
gender        float64
lat           float64
long          float64
merch_lat     float64
merch_long    float64
state         float64
is_fraud        int64
dtype: object

In [39]:
# Show all columns without truncation
pd.set_option("display.max_columns", None)

# Optional: Prevent column width truncation
pd.set_option("display.max_colwidth", None)

In [40]:
df.describe(include="all")  # Show all columns with different types of data

Unnamed: 0,amt,category,city_pop,gender,lat,long,merch_lat,merch_long,state,is_fraud
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,70.2791,6.229184,89057.76,0.4526209,38.53336,-90.22626,38.53346,-90.22648,26.66967,0.005727773
std,159.9518,3.914008,302435.1,0.4977504,5.076852,13.75858,5.111233,13.77093,14.3308,0.07546503
min,1.0,0.0,23.0,0.0,20.0271,-165.6723,19.02779,-166.6712,0.0,0.0
25%,9.64,3.0,743.0,0.0,34.6205,-96.798,34.72954,-96.89864,15.0,0.0
50%,47.45,6.0,2456.0,0.0,39.3543,-87.4769,39.36295,-87.43923,27.0,0.0
75%,83.05,10.0,20328.0,1.0,41.9404,-80.158,41.95602,-80.23228,38.0,0.0
max,28948.9,13.0,2906700.0,1.0,66.6933,-67.9503,67.51027,-66.9509,50.0,1.0


In [41]:
df.head()

Unnamed: 0,amt,category,city_pop,gender,lat,long,merch_lat,merch_long,state,is_fraud
0,4.97,8.0,3495,0.0,36.0788,-81.1781,36.011293,-82.048315,27.0,0
1,107.23,4.0,149,0.0,48.8878,-118.2105,49.159047,-118.186462,47.0,0
2,220.11,0.0,4154,1.0,42.1808,-112.262,43.150704,-112.154481,13.0,0
3,45.0,2.0,1939,1.0,46.2306,-112.1138,47.034331,-112.561071,26.0,0
4,41.96,9.0,99,1.0,38.4207,-79.4629,38.674999,-78.632459,45.0,0


### Convert all columns into Specific datatypes
* After conversion, it may produce NaNs. If so, try the whole process again. 

In [42]:
df = df.astype("float32")

### Print details of the Final Data Frame

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   amt         1048575 non-null  float32
 1   category    1048575 non-null  float32
 2   city_pop    1048575 non-null  float32
 3   gender      1048575 non-null  float32
 4   lat         1048575 non-null  float32
 5   long        1048575 non-null  float32
 6   merch_lat   1048575 non-null  float32
 7   merch_long  1048575 non-null  float32
 8   state       1048575 non-null  float32
 9   is_fraud    1048575 non-null  float32
dtypes: float32(10)
memory usage: 40.0 MB


### End of Data Mapping

In [44]:
df.head()

Unnamed: 0,amt,category,city_pop,gender,lat,long,merch_lat,merch_long,state,is_fraud
0,4.97,8.0,3495.0,0.0,36.0788,-81.178101,36.011292,-82.048317,27.0,0.0
1,107.230003,4.0,149.0,0.0,48.887798,-118.210503,49.159046,-118.186462,47.0,0.0
2,220.110001,0.0,4154.0,1.0,42.180801,-112.262001,43.150703,-112.15448,13.0,0.0
3,45.0,2.0,1939.0,1.0,46.230598,-112.1138,47.034332,-112.561073,26.0,0.0
4,41.959999,9.0,99.0,1.0,38.4207,-79.462898,38.674999,-78.632462,45.0,0.0


# SAVING CLEANED DATA TO FILE

In [45]:
df.to_csv(DATA_PATH, index=False)

# Read and Test datatypes

In [46]:
dq = pd.read_csv(
    DATA_PATH, dtype="float32"
)  # Does not convert to float32 by default, dtype has to be explicitly provided

In [47]:
dq.head()

Unnamed: 0,amt,category,city_pop,gender,lat,long,merch_lat,merch_long,state,is_fraud
0,4.97,8.0,3495.0,0.0,36.0788,-81.178101,36.011292,-82.048317,27.0,0.0
1,107.230003,4.0,149.0,0.0,48.887798,-118.210503,49.159046,-118.186462,47.0,0.0
2,220.110001,0.0,4154.0,1.0,42.180801,-112.262001,43.150703,-112.15448,13.0,0.0
3,45.0,2.0,1939.0,1.0,46.230598,-112.1138,47.034332,-112.561073,26.0,0.0
4,41.959999,9.0,99.0,1.0,38.4207,-79.462898,38.674999,-78.632462,45.0,0.0


In [48]:
print(dq.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   amt         1048575 non-null  float32
 1   category    1048575 non-null  float32
 2   city_pop    1048575 non-null  float32
 3   gender      1048575 non-null  float32
 4   lat         1048575 non-null  float32
 5   long        1048575 non-null  float32
 6   merch_lat   1048575 non-null  float32
 7   merch_long  1048575 non-null  float32
 8   state       1048575 non-null  float32
 9   is_fraud    1048575 non-null  float32
dtypes: float32(10)
memory usage: 40.0 MB
None


# Creating DataLoaders

### Clean Data Function

In [49]:
def clean_data(
    df: pd.DataFrame,
    logger: Logger,
    extra_dropped_columns: Optional[List[str]] = None,
    show_dataframe_info=True,
) -> pd.DataFrame:
    """Cleans the input DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame to be cleaned.
        logger (Logger): Logger object for logging information.
        extra_dropped_columns (List[str], optional): Columns to drop from the features in original dataset.
        show_dataframe_info (bool): Flag to toggle logging DataFrame info.

    Returns:
        pd.DataFrame: The cleaned DataFrame.
    """
    # Log the initial state of the DataFrame
    logger.info(f"Initial DataFrame shape: {df.shape}")

    if show_dataframe_info:
        buffer = io.StringIO()  # Create a buffer to capture the info output
        df.info(buf=buffer)  # Store the output into the buffer
        logger.info(f"Initial DataFrame info:\n " + buffer.getvalue())

    # Drop any unused columns
    try:
        df.drop(columns=extra_dropped_columns, inplace=True)
    except Exception as e:
        raise RuntimeError(f"Problem dropping columns:\n{e}")

    # Replacing any entry data (Missing Values/Misaligned values)

    # ================================
    # EXAMPLE PROCESS
    # ================================

    # df["group-7"] = df["group-7"].fillna("Light") # Not needed

    # ================================

    # Create dictionaries for mapping/encoding
    categories = sorted(df["category"].unique().tolist())
    CATEGORIES_MAPPING = {category: idx for idx, category in enumerate(categories)}

    genders = sorted(df["gender"].unique().tolist())
    GENDER_MAPPING = {gender: idx for idx, gender in enumerate(genders)}

    states = sorted(df["state"].unique().tolist())
    STATES_MAPPING = {state: idx for idx, state in enumerate(states)}

    logger.info("Encoding categorical variables...")
    try:
        df["category"] = df["category"].map(CATEGORIES_MAPPING)
        df["gender"] = df["gender"].map(GENDER_MAPPING)
        df["state"] = df["state"].map(STATES_MAPPING)
    except Exception as e:
        logger.info(f"Problem encoding columns, {e}")

    # Handle missing values (if any)
    if df.isnull().sum().sum() > 0:
        logger.info("Handling missing values...")
        df = df.dropna()  # Example: Drop rows with missing values
        logger.info(f"DataFrame shape after dropping missing values: {df.shape}")

    # Convert to 'float32' to reduce memory usage
    logger.info("Converting Entire Data Frame to 'float32'...")
    df = df.astype("float32")

    if show_dataframe_info:
        # Reinitialize the buffer to clear any previous content in order to log the final dataframe info
        buffer = io.StringIO()
        df.info(buf=buffer)
        logger.info(f"Final DataFrame info:\n " + buffer.getvalue())

    return df

### Custom Dataset Class

In [50]:
class CustomDataset(Dataset):
    """Dataset class For the Custom Dataset"""

    def __init__(self, csv_file: str = "../Data/DataSplits/test.csv", label_column: str = "Label"):
        """Initializer for the Dataset class.

        Args:
            csv_file (str): Path to the CSV file containing the dataset.
            label_column (str): The name of the column indicating the label.
        """
        try:
            self.data = pd.read_csv(csv_file)  # Assign a pandas data frame
        except FileNotFoundError:  # Raise an error if the file is not found
            raise FileNotFoundError(f"File not found: {csv_file}")

        # Define feature and label columns
        self.label_column = label_column
        # Omit the label column to create the list of feature columns
        self.feature_columns = self.data.columns.drop([self.label_column])

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Returns a tuple (features, label) for the given index.

        Args:
            index (int): Index of the data sample to retrieve.

        Returns:
            tuple: (features, label) where features is a tensor of input features and label is the corresponding label.
        """
        # Use 'iloc' instead of 'loc' for efficiency
        features = self.data.iloc[index][self.feature_columns].values
        label = self.data.iloc[index][self.label_column]  # Extract the label for the given index
        return (torch.tensor(features, dtype=torch.float32), torch.tensor(label, dtype=torch.long))

    def __len__(self) -> int:
        """Returns the amount of samples in the dataset."""
        return len(self.data)

### Data Pipeline Function

In [70]:
def data_pipeline(
    logger: Logger,
    dataset_url: str,
    root_data_dir: str = "../Data",
    data_file_path: str = "Dataset.csv",
    data_splits_dir: str = "DataSplits",
    scaler_dir="Scalers",
    target_column: str = "Target",
    use_label_scaler: bool = False,  # TOGGLE IF NEEDED
    extra_dropped_columns: Optional[List[str]] = None,
    batch_size: int = 64,
    num_workers: int = 0,
    pin_memory: bool = False,
    drop_last: bool = True,
) -> tuple[
    Dataset, Dataset, Dataset, DataLoader, DataLoader, DataLoader, MinMaxScaler, MinMaxScaler
]:
    """This function prepares the train, test, and validation datasets.

    Args:
        logger (Logger): The logger instance to log messages.
        dataset_url (str): The URL to download the dataset from, if not found locally.
        root_data_dir (str): The root of the Data Directory
        data_file_path (str): The name of the original dataset (with .csv file extension).
        data_splits_dir (str): Path to the train, test, and validation datasets.
        scaler_dir (str): Path to the feature and label scalers.
        use_label_scaler (bool): Dictates whether to use label scaler
        target_column (str): The name of the target column to predict.
        extra_dropped_columns (List[str], optional): Columns to drop from the features in original dataset.
        batch_size (int): The dataloader's batch_size.
        num_workers (int): The dataloader's number of workers.
        pin_memory (bool): The dataloader's pin memory option.
        drop_last (bool): The dataloader's drop_last option.

    Returns:
        train_dataset (Dataset): Dataset Class for the training dataset.
        test_dataset (Dataset): Dataset Class for the test dataset.
        validation_dataset (Dataset): Dataset Class for the validation dataset.
        train_dataloader (DataLoader): The train dataloader.
        test_dataloader (DataLoader): The test dataloader.
        validation_dataloader (DataLoader): The validation dataloader.
        feature_scaler (MinMaxScaler): The scaler used to scale the features of the model input.
        label_scaler (MinMaxScaler): The scaler used to scale the labels of the model input.
    """
    if (
        not root_data_dir or not data_file_path or not data_splits_dir
    ):  # Check for empty strings at the beginning
        raise ValueError("File and directory paths cannot be empty strings.")
    DATA_ROOT = Path(root_data_dir)

    DATA_CLEAN_PATH = DATA_ROOT / data_file_path  # Set the path to the complete dataset

    if DATA_CLEAN_PATH.exists():
        logger.info(f"CSV file detected, reading from '{DATA_ROOT}'")
        df = pd.read_csv(
            DATA_CLEAN_PATH, dtype="float32"
        )  # Convert data to float32 instead of, float64
    else:
        logger.info(f"Downloading CSV file from '{dataset_url}'\nand saving into '{DATA_ROOT}'")
        try:
            os.makedirs(DATA_ROOT, exist_ok=True)  # Create the Data Root Directory
            # Download and read the data into a pandas dataframe
            df = pd.read_csv(dataset_url)  # Keep data as is, may not be able to expect float32 data

            # Clean the data before saving
            try:
                df = clean_data(df, logger, extra_dropped_columns=extra_dropped_columns)
            except Exception as e:
                raise RuntimeError(f"An unexpected error occurred cleaning the dataset:\n{e}")

            df.to_csv(DATA_CLEAN_PATH, index=False)  # Save the file, omitting saving the row index
        except OSError as e:
            raise RuntimeError(f"OS error occurred: {e}")
        except ParserError:
            raise RuntimeError(f"Failed to parse CSV from '{dataset_url}'")
        except ValueError as e:
            raise RuntimeError(f"Data cleaning error:\n{e}")
        except Exception as e:
            raise RuntimeError(
                f"An unexpected error occurred when downloading or saving the "
                f"dataset from '{dataset_url}' to '{DATA_CLEAN_PATH}':\n{e}"
            )

    # Define the paths for the data splits and scalers
    DATA_SPLITS_DIR = DATA_ROOT / data_splits_dir
    SCALER_DIR = DATA_ROOT / scaler_dir

    TRAIN_DATA_PATH = DATA_SPLITS_DIR / "train.csv"
    TEST_DATA_PATH = DATA_SPLITS_DIR / "test.csv"
    VALIDATION_DATA_PATH = DATA_SPLITS_DIR / "val.csv"

    FEATURE_SCALER_PATH = SCALER_DIR / "feature-scaler.joblib"
    LABEL_SCALER_PATH = SCALER_DIR / "label-scaler.joblib"

    # Define the columns to drop from the features
    columns_to_drop = [target_column]

    # Define the Data Splits
    TRAIN_SPLIT_PERCENTAGE = 0.9
    VALIDATION_SPLIT_PERCENTAGE = 0.5

    if (
        os.path.exists(TRAIN_DATA_PATH)
        and os.path.exists(TEST_DATA_PATH)
        and os.path.exists(VALIDATION_DATA_PATH)
    ):
        logger.info(
            f"Train, Test, and Validation CSV datasets detected in '{DATA_SPLITS_DIR}.' Skipping generation and loading scaler(s)"
        )
        try:
            feature_scaler = joblib.load(FEATURE_SCALER_PATH)
            logger.info(f"Feature scaler stored in: ({FEATURE_SCALER_PATH})")
            if use_label_scaler:
                joblib.dump(
                    label_scaler, LABEL_SCALER_PATH
                )  # Not used for this classification task
                logger.info(f"Label scaler stored in: ({LABEL_SCALER_PATH})")
            else:
                label_scaler = None  # Omit the label scaler loading

        except FileNotFoundError as e:
            raise RuntimeError(f"Scaler file not found: {e}")
        except EOFError as e:
            raise RuntimeError(f"Scaler file appears to be empty or corrupted: {e}")
        except Exception as e:
            raise RuntimeError(f"An unexpected error occurred when loading scalers: {e}")
    else:
        logger.info(
            f"Datasets not found in '{DATA_SPLITS_DIR}' or incomplete. Generating datasets..."
        )
        os.makedirs(DATA_SPLITS_DIR, exist_ok=True)  # Create the Data Splits Parent Directory
        os.makedirs(SCALER_DIR, exist_ok=True)  # Create the Scaler Parent Directory

        # Create the scaler objects
        feature_scaler = MinMaxScaler()
        if use_label_scaler:
            label_scaler = MinMaxScaler()
        else:
            label_scaler = None  # Not used for this Classification task

        try:
            df_features = df.drop(columns=columns_to_drop, inplace=False)
            df_labels = df[
                [target_column]
            ]  # Instead of returning a pandas Series using "[]", return a dataframe using the "[[]]" to get a shape with (-1,1)
        except KeyError as e:
            raise KeyError(
                f"One or more specified columns to drop do not exist in the DataFrame: {e}"
            )

        # ================================
        # ADD OVERSAMPLING AND OTHER DATA BALANCING TECHNIQUES HERE
        # ================================

        # Example of using OverSampling Technique to Balance out the Dataset for an Unbalanced Dataset
        ros = RandomOverSampler(random_state=42)
        df_features_resampled, df_labels_resampled = ros.fit_resample(df_features, df_labels)

        # Split into smaller DataFrames for the Train, Test, and Validation splits
        X_train, X_inter, Y_train, Y_inter = train_test_split(
            df_features_resampled,
            df_labels_resampled,
            test_size=1 - TRAIN_SPLIT_PERCENTAGE,
            random_state=42,
        )
        X_validation, X_test, Y_validation, Y_test = train_test_split(
            X_inter, Y_inter, test_size=1 - VALIDATION_SPLIT_PERCENTAGE, random_state=42
        )

        # Fit the scalers to the data
        feature_scaler.fit(X_train)
        # Only scale the labels if required
        if use_label_scaler:
            label_scaler.fit(Y_train)  # Not used for this Classification task

        # Save the fitted scaler object
        try:
            joblib.dump(feature_scaler, FEATURE_SCALER_PATH)
            logger.info(f"Feature scaler stored in: ({FEATURE_SCALER_PATH})")
            # Save the Label Scaler if utilized
            if use_label_scaler:
                joblib.dump(
                    label_scaler, LABEL_SCALER_PATH
                )  # Not used for this Classification task
                logger.info(f"Label scaler stored in: ({LABEL_SCALER_PATH})")
        except FileNotFoundError as e:
            raise RuntimeError(f"Save path not found: {e}")
        except Exception as e:
            raise RuntimeError(f"An unexpected error occurred when saving  Scaler(s): {e}")

        # Scale all Feature Inputs
        X_train_scaled = feature_scaler.transform(X_train)
        X_validation_scaled = feature_scaler.transform(X_validation)
        X_test_scaled = feature_scaler.transform(X_test)

        if use_label_scaler:  # HANDLE EACH ON A CASE BY CASE BASIS
            Y_train = label_scaler.transform(Y_train)
            Y_validation = label_scaler.transform(Y_validation)
            Y_test = label_scaler.transform(Y_test)

        logger.info(f"Train Features (Scaled) Shape: {X_train_scaled.shape}")
        logger.info(f"Validation Features (Scaled) Shape: {X_validation_scaled.shape}")
        logger.info(f"Test Features (Scaled) Shape: {X_test_scaled.shape}")

        if use_label_scaler:
            logger.info(f"Train Labels (Scaled) Shape: {Y_train.shape}")
            logger.info(f"Validation Labels (Scaled) Shape: {Y_validation.shape}")
            logger.info(f"Test Labels (Scaled) Shape: {Y_test.shape}")
        else:
            logger.info(f"Train Labels Shape: {Y_train.shape}")
            logger.info(f"Validation Labels Shape: {Y_validation.shape}")
            logger.info(f"Test Labels Shape: {Y_test.shape}")

        # Define the column names of the features and label
        features_names = df_features.columns
        label_name = df_labels.columns

        # Create dataframes using the scaled data
        X_train_df = pd.DataFrame(X_train_scaled, columns=features_names)
        X_test_df = pd.DataFrame(X_test_scaled, columns=features_names)
        X_validation_df = pd.DataFrame(X_validation_scaled, columns=features_names)
        Y_train_df = pd.DataFrame(Y_train, columns=label_name)
        Y_test_df = pd.DataFrame(Y_test, columns=label_name)
        Y_validation_df = pd.DataFrame(Y_validation, columns=label_name)

        # Concatenate the features and labels back into a single DataFrame for each set
        train_data_frame = pd.concat([X_train_df, Y_train_df.reset_index(drop=True)], axis=1)
        test_data_frame = pd.concat([X_test_df, Y_test_df.reset_index(drop=True)], axis=1)
        validation_data_frame = pd.concat(
            [X_validation_df, Y_validation_df.reset_index(drop=True)], axis=1
        )

        # Saving the split data to csv files
        try:
            train_data_frame.to_csv(TRAIN_DATA_PATH, index=False)
            test_data_frame.to_csv(TEST_DATA_PATH, index=False)
            validation_data_frame.to_csv(VALIDATION_DATA_PATH, index=False)
        except FileNotFoundError as e:
            raise RuntimeError(f"Save path not found: {e}")
        except Exception as e:
            raise RuntimeError(
                f"An unexpected error occurred when saving datasets to CSV files:\n{e}"
            )

    # Creating Datasets from the stored datasets
    logger.info(f"INITIALIZING DATASETS")
    train_dataset = CustomDataset(csv_file=TRAIN_DATA_PATH, label_column=target_column)
    test_dataset = CustomDataset(csv_file=TEST_DATA_PATH, label_column=target_column)
    val_dataset = CustomDataset(csv_file=VALIDATION_DATA_PATH, label_column=target_column)

    logger.info(
        f"Creating DataLoaders with 'batch_size'=({batch_size}), 'num_workers'=({num_workers}), 'pin_memory'=({pin_memory}). Training dataset 'drop_last'=({drop_last})"
    )
    train_dataloader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=drop_last,
        shuffle=True,
    )
    validation_dataloader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=drop_last,
        shuffle=False,
    )
    test_dataloader = DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=drop_last,
        shuffle=False,
    )

    logger.info(
        f"Training DataLoader has ({len(train_dataloader)}) batches, Test DataLoader has ({len(test_dataloader)}) batches, Validation DataLoader has ({len(validation_dataloader)}) batches"
    )

    logger.info("==================================================================")
    for name, dataloader in [
        ("Train", train_dataloader),
        ("Validation", validation_dataloader),
        ("Test", test_dataloader),
    ]:
        features, labels = next(iter(dataloader))  # Get one batch

        logger.info(f"{name} Dataloader Batch Information")
        logger.info(f"Features Shape: '{features.shape}' |  DataTypes: '{features.dtype}'")
        logger.info(f"Labels Shape: '{labels.shape}'   |  DataTypes: '{labels.dtype}' ")
        logger.info("==================================================================")

    return (
        train_dataset,
        test_dataset,
        val_dataset,
        train_dataloader,
        test_dataloader,
        validation_dataloader,
        feature_scaler,
        label_scaler,
    )

# Testing the Data Pipeline

## Testing with the raw dataset as a fallback

In [71]:
# USED WHEN TESTING THE RAW DATASET
def test_data_pipeline():
    # Function input setup
    data = {
        "dataset_url": "hf://datasets/dazzle-nu/CIS435-CreditCardFraudDetection/fraudTrain.csv",
        "root_data_dir": "../Data",
        "data_file_path": DATA_CLEAN_FILE_NAME,
        "data_splits_dir": "DataSplits",
        "scaler_dir": "Scalers",
        "target_column": "is_fraud",
        "extra_dropped_columns": [
            "Unnamed: 0",
            "trans_date_trans_time",
            "cc_num",
            "merchant",
            "first",
            "last",
            "street",
            "city",
            "zip",
            "job",
            "dob",
            "trans_num",
            "unix_time",
            "Unnamed: 23",
            "6006",
        ],
    }
    batch_size = 64
    num_workers = 0
    pin_memory = False
    drop_last = True

    logger = logging.getLogger(__name__)

    # Call the data pipeline function
    try:
        (
            train_dataset,
            test_dataset,
            val_dataset,
            train_dataloader,
            test_dataloader,
            validation_dataloader,
            feature_scaler,
            label_scaler,
        ) = data_pipeline(
            logger,
            **data,
            batch_size=batch_size,
            num_workers=num_workers,
            pin_memory=pin_memory,
            drop_last=drop_last,
        )
    except Exception as e:
        logger.info(f"Caught Exception: {e}", stack_info=True)

    # Basic assertions to verify the outputs
    assert isinstance(train_dataset, Dataset), "train_dataset is not an instance of Dataset"
    assert isinstance(test_dataset, Dataset), "test_dataset is not an instance of Dataset"
    assert isinstance(val_dataset, Dataset), "val_dataset is not an instance of Dataset"
    assert isinstance(
        train_dataloader, DataLoader
    ), "train_dataloader is not an instance of DataLoader"
    assert isinstance(
        test_dataloader, DataLoader
    ), "test_dataloader is not an instance of DataLoader"
    assert isinstance(
        validation_dataloader, DataLoader
    ), "validation_dataloader is not an instance of DataLoader"
    assert isinstance(
        feature_scaler, MinMaxScaler
    ), "feature_scaler is not an instance of MinMaxScaler"
    # assert isinstance(label_scaler, MinMaxScaler), "label_scaler is not an instance of MinMaxScaler"

    logger.info("All assertions passed. Data pipeline test successful.")

    return (
        train_dataset,
        test_dataset,
        val_dataset,
        train_dataloader,
        test_dataloader,
        validation_dataloader,
        feature_scaler,
        label_scaler,
    )

In [72]:
(
    train_dataset,
    test_dataset,
    val_dataset,
    train_dataloader,
    test_dataloader,
    validation_dataloader,
    feature_scaler,
    label_scaler,
) = test_data_pipeline()

INFO:__main__:Downloading CSV file from 'hf://datasets/dazzle-nu/CIS435-CreditCardFraudDetection/fraudTrain.csv'
and saving into '..\Data'
INFO:__main__:Initial DataFrame shape: (1048575, 25)
INFO:__main__:Initial DataFrame info:
 <class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 25 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1048575 non-null  int64  
 1   trans_date_trans_time  1048575 non-null  object 
 2   cc_num                 1048575 non-null  float64
 3   merchant               1048575 non-null  object 
 4   category               1048575 non-null  object 
 5   amt                    1048575 non-null  float64
 6   first                  1048575 non-null  object 
 7   last                   1048575 non-null  object 
 8   gender                 1048575 non-null  object 
 9   street                 1048575 non-null  object 
 10  cit

In [73]:
len(validation_dataloader)

1629

In [74]:
logger.info("==================================================================")
for name, dataloader in [
    ("Train", train_dataloader),
    ("Validation", validation_dataloader),
    ("Test", test_dataloader),
]:
    features, labels = next(iter(dataloader))  # Get one batch

    logger.info(f"{name} Dataloader Batch Information")
    logger.info(f"Features Shape: '{features.shape}' |  DataTypes: '{features.dtype}'")
    logger.info(f"Labels Shape: '{labels.shape}'   |  DataTypes: '{labels.dtype}' ")
    logger.info(f"The labels: {labels}")  # Optional
    logger.info("==================================================================")

INFO:__main__:Train Dataloader Batch Information
INFO:__main__:Features Shape: 'torch.Size([64, 9])' |  DataTypes: 'torch.float32'
INFO:__main__:Labels Shape: 'torch.Size([64])'   |  DataTypes: 'torch.int64' 
INFO:__main__:The labels: tensor([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0])
INFO:__main__:Validation Dataloader Batch Information
INFO:__main__:Features Shape: 'torch.Size([64, 9])' |  DataTypes: 'torch.float32'
INFO:__main__:Labels Shape: 'torch.Size([64])'   |  DataTypes: 'torch.int64' 
INFO:__main__:The labels: tensor([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1])
INFO:__main__:Test Dataloader Batch Information
INFO:__main__:Features Shape: 'torch.Size([

## Testing with the cleaned dataset as a fallback

In [76]:
# USED WHEN TESTING THE CLEAN DATASET
def test_data_pipeline_2():
    # Function input setup
    data = {
        "dataset_url": "hf://datasets/MaxPrestige/credit-card-fraud-CLEAN/credit-card-fraud-CLEAN.csv",
        "root_data_dir": "../Data",
        "data_file_path": DATA_CLEAN_FILE_NAME,
        "data_splits_dir": "DataSplits",
        "scaler_dir": "Scalers",
        "target_column": "is_fraud",
        "extra_dropped_columns": [],
    }
    batch_size = 64
    num_workers = 0
    pin_memory = False
    drop_last = True

    logger = logging.getLogger(__name__)

    # Call the data pipeline function
    try:
        (
            train_dataset,
            test_dataset,
            val_dataset,
            train_dataloader,
            test_dataloader,
            validation_dataloader,
            feature_scaler,
            label_scaler,
        ) = data_pipeline(
            logger,
            **data,
            batch_size=batch_size,
            num_workers=num_workers,
            pin_memory=pin_memory,
            drop_last=drop_last,
        )
    except Exception as e:
        logger.info(f"Caught Exception: {e}", stack_info=True)
        sys.exit(1)

    # Basic assertions to verify the outputs
    assert isinstance(train_dataset, Dataset), "train_dataset is not an instance of Dataset"
    assert isinstance(test_dataset, Dataset), "test_dataset is not an instance of Dataset"
    assert isinstance(val_dataset, Dataset), "val_dataset is not an instance of Dataset"
    assert isinstance(
        train_dataloader, DataLoader
    ), "train_dataloader is not an instance of DataLoader"
    assert isinstance(
        test_dataloader, DataLoader
    ), "test_dataloader is not an instance of DataLoader"
    assert isinstance(
        validation_dataloader, DataLoader
    ), "validation_dataloader is not an instance of DataLoader"
    assert isinstance(
        feature_scaler, MinMaxScaler
    ), "feature_scaler is not an instance of MinMaxScaler"
    # assert isinstance(label_scaler, MinMaxScaler), "label_scaler is not an instance of MinMaxScaler" # Label Scaler Not Utilized

    logger.info("All assertions passed. Data pipeline test successful.")

    return (
        train_dataset,
        test_dataset,
        val_dataset,
        train_dataloader,
        test_dataloader,
        validation_dataloader,
        feature_scaler,
        label_scaler,
    )

In [77]:
(
    train_dataset,
    test_dataset,
    val_dataset,
    train_dataloader,
    test_dataloader,
    validation_dataloader,
    feature_scaler,
    label_scaler,
) = test_data_pipeline_2()

INFO:__main__:CSV file detected, reading from '..\Data'
INFO:__main__:Train, Test, and Validation CSV datasets detected in '..\Data\DataSplits.' Skipping generation and loading scaler(s)
INFO:__main__:Feature scaler stored in: (..\Data\Scalers\feature-scaler.joblib)
INFO:__main__:INITIALIZING DATASETS
INFO:__main__:Creating DataLoaders with 'batch_size'=(64), 'num_workers'=(0), 'pin_memory'=(False). Training dataset 'drop_last'=(True)
INFO:__main__:Training DataLoader has (29322) batches, Test DataLoader has (1629) batches, Validation DataLoader has (1629) batches
INFO:__main__:Train Dataloader Batch Information
INFO:__main__:Features Shape: 'torch.Size([64, 9])' |  DataTypes: 'torch.float32'
INFO:__main__:Labels Shape: 'torch.Size([64])'   |  DataTypes: 'torch.int64' 
INFO:__main__:Validation Dataloader Batch Information
INFO:__main__:Features Shape: 'torch.Size([64, 9])' |  DataTypes: 'torch.float32'
INFO:__main__:Labels Shape: 'torch.Size([64])'   |  DataTypes: 'torch.int64' 
INFO:_

In [78]:
len(validation_dataloader)

1629

In [79]:
logger.info("==================================================================")
for name, dataloader in [
    ("Train", train_dataloader),
    ("Validation", validation_dataloader),
    ("Test", test_dataloader),
]:
    features, labels = next(iter(dataloader))  # Get one batch

    logger.info(f"{name} Dataloader Batch Information")
    logger.info(f"Features Shape: '{features.shape}' |  DataTypes: '{features.dtype}'")
    logger.info(f"Labels Shape: '{labels.shape}'   |  DataTypes: '{labels.dtype}' ")
    logger.info(f"The labels: {labels}")  # Optional
    logger.info("==================================================================")

INFO:__main__:Train Dataloader Batch Information
INFO:__main__:Features Shape: 'torch.Size([64, 9])' |  DataTypes: 'torch.float32'
INFO:__main__:Labels Shape: 'torch.Size([64])'   |  DataTypes: 'torch.int64' 
INFO:__main__:The labels: tensor([1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0,
        1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
        0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0])
INFO:__main__:Validation Dataloader Batch Information
INFO:__main__:Features Shape: 'torch.Size([64, 9])' |  DataTypes: 'torch.float32'
INFO:__main__:Labels Shape: 'torch.Size([64])'   |  DataTypes: 'torch.int64' 
INFO:__main__:The labels: tensor([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1])
INFO:__main__:Test Dataloader Batch Information
INFO:__main__:Features Shape: 'torch.Size([

# End