# Creating Dataloaders

## Imports

In [1]:
from importlib.metadata import version
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
import os
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import torch
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
from importlib.metadata import version

In [3]:
list = ['pandas', 'numpy', 'seaborn', 'matplotlib', 'torch', 'joblib']
for package in list:
    try:
        print(f"{package} version: {version(package)}") # Raises PackageNotFoundError if not found
    except:
         print(f"❌ Package '{package}' not found. Please install it.")

pandas version: 2.3.1
numpy version: 1.23.5
seaborn version: 0.13.2
matplotlib version: 3.10.5
torch version: 2.5.1
joblib version: 1.5.1


## Data Preparation

### Prepare Paths

In [None]:
DATA_ROOT = Path("../Data")
# OIL_PATH_ORIGINAL = DATA_ROOT / "OIL_Dataset_1984-2025.csv"     # Set the data source path
DATA_CLEAN = "OIL_DATASET.csv"
DATA_CLEAN_PATH = DATA_ROOT / DATA_CLEAN

# MODEL_ROOT = Path("../Models")  # To store trained models and scalers

# TRAIN_FEATURE_SCALER = "train_feature_scaler.joblib"
# TRAIN_LABEL_SCALER = "train_label_scaler.joblib"

# TRAIN_FEATURE_SCALER_PATH = MODEL_ROOT / TRAIN_FEATURE_SCALER
# TRAIN_LABEL_SCALER_PATH = MODEL_ROOT / TRAIN_LABEL_SCALER


In [5]:
# Login using e.g. `huggingface-cli login` to access this dataset

if DATA_CLEAN_PATH.exists():
    print(f"CSV file detected, reading from '{DATA_ROOT}'")
    df = pd.read_csv(DATA_CLEAN_PATH)

else:
    print(f"Downloading CSV file from HuggingFace")
    os.makedirs(DATA_ROOT, exist_ok=True)
    df = pd.read_csv("hf://datasets/MaxPrestige/CRUDE_OIL_PRICES/Data/OIL_DATASET.csv")
    df.to_csv(DATA_CLEAN_PATH, index=False)


CSV file detected, reading from '..\Data'


#### File Verification

In [6]:
paths = [DATA_CLEAN_PATH]

try:
    for path in paths:
        if not path.exists():
            raise FileNotFoundError(f"The file '{path}' does not exist.")
except FileNotFoundError as e:
    print(e)

#### Reading File to DataFrame

In [23]:
df = pd.read_csv(DATA_CLEAN_PATH, parse_dates=['Date'])


In [26]:
type(df.columns)

pandas.core.indexes.base.Index

In [7]:
label_col = "Close"

In [22]:
df_labels.head(5)

Unnamed: 0,Close
0,66.63
1,67.31
2,67.82
3,67.61
4,67.82


In [9]:
print(f"shape of df_features: {df_features.shape}")
print(f"shape of df_labels: {df_labels.shape}")

shape of df_features: (6364, 22)
shape of df_labels: (6364, 1)


In [15]:
df_features.head()

Unnamed: 0,Open,High,Low,California_Crude_Oil_First_Purchase_Price_$/bbl,Texas_Crude_Oil_First_Purchase_Price_$/bbl,US_Crude_Oil_First_Purchase_Price_$/bbl,US_Imports_from_Canada_of_Crude_Oil_Mbbl/d,US_Imports_from_Colombia_of_Crude_Oil_Mbbl/d,US_Imports_from_United_Kingdom_of_Crude_Oil_Mbbl/d,US_Imports_from_Mexico_of_Crude_Oil_Mbbl/d,...,US_Imports_of_Crude_Oil_Mbbl/d,US_Exports_to_Canada_of_Crude_Oil_Mbbl/d,US_Exports_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Canada_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Colombia_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Mexico_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_United_Kingdom_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_OPEC_Countries_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Non-OPEC_Countries_of_Crude_Oil_Mbbl/d,US_Net_Imports_of_Crude_Oil_Mbbl/d
0,67.33,67.2,65.92,62.41,60.56,59.94,3814.0,223.0,81.0,431.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0
1,67.91,68.42,67.2,62.41,60.56,59.94,3814.0,223.0,81.0,431.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0
2,67.77,69.05,67.38,62.41,60.56,59.94,3814.0,223.0,81.0,431.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0
3,67.88,68.78,67.32,62.41,60.56,59.94,3814.0,223.0,81.0,431.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0
4,68.37,70.2,66.82,62.41,60.56,59.94,3814.0,223.0,81.0,431.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0


In [16]:
df_labels.head()

0    66.63
1    67.31
2    67.82
3    67.61
4    67.82
Name: Close, dtype: float64

## Data Splitting

Train Features: (5727, 22)
Train Labels: (5727, 1)
validation Features: (318, 22)
validation Labels: (318, 1)
test Features: (319, 22)
test Labels: (319, 1)


## Scaling The Data

In [20]:
data_splits_dir = "DataSplits"
scaler_dir = "Scalers"
DATA_SPLITS_DIR = DATA_ROOT / data_splits_dir
SCALER_DIR = DATA_ROOT / scaler_dir

TRAIN_DATA_PATH = DATA_SPLITS_DIR / "train.csv"
TEST_DATA_PATH = DATA_SPLITS_DIR / "test.csv"
VALIDATION_DATA_PATH = DATA_SPLITS_DIR / "val.csv"
FEATURE_SCALER_PATH = SCALER_DIR / "feature-scaler.joblib"
LABEL_SCALER_PATH = SCALER_DIR / "label-scaler.joblib"

In [12]:
label_col = "Close"
extra_dropped_cols = 'Date'

### Notes
Only fit the scalers on the training data to prevent any data leakage.

In [32]:
if os.path.exists(TRAIN_DATA_PATH) and os.path.exists(TEST_DATA_PATH) and os.path.exists(VALIDATION_DATA_PATH) :
    print(f"Train, Test, and Validation csv datasets detected in '{DATA_SPLITS_DIR}', skipping generation")
    try:
        feature_scaler = joblib.load(FEATURE_SCALER_PATH)
        label_scaler = joblib.load(LABEL_SCALER_PATH)
    except Exception as e:
        raise RuntimeError(f"An unexpected error occurred when loading scalers: {e}")
else:
    print(f"Datasets not found in '{DATA_SPLITS_DIR}' or incomplete. Generating datasets...")
    os.makedirs(MODEL_ROOT, exist_ok=True)
    os.makedirs(DATA_SPLITS_DIR, exist_ok=True)     # Create the Data Splits Parent Directory
    os.makedirs(SCALER_DIR, exist_ok=True)     # Create the Data Splits Parent Directory

    
    df_features = df.drop(columns=[label_col, extra_dropped_cols], inplace=False)
    df_labels = df[[label_col]]     # Instead of returning a pandas Series using "[]", return a dataframe using the "[[]]" to get a shape with (-1,1)

    feature_scaler = MinMaxScaler()
    label_scaler = MinMaxScaler()

    # Split the whole pandas DataFrame into smaller DataFrames
    X_train, X_inter, Y_train, Y_inter = train_test_split(df_features, df_labels, test_size=0.1, random_state=42)
    X_validation, X_test, Y_validation, Y_test = train_test_split(X_inter, Y_inter, test_size=0.5, random_state=42)

    print(f"Train Features: {X_train.shape}")
    print(f"Train Labels: {Y_train.shape}")
    print(f"validation Features: {X_validation.shape}")
    print(f"validation Labels: {Y_validation.shape}")
    print(f"test Features: {X_test.shape}")
    print(f"test Labels: {Y_test.shape}")

    feature_scaler.fit(X_train)
    label_scaler.fit(Y_train)

    # Save the fitted scaler object
    try:
        joblib.dump(feature_scaler, FEATURE_SCALER_PATH)
        print(f"Feature scaler stored in: ({FEATURE_SCALER_PATH})")
    except Exception as e:
        raise RuntimeError(f"An unexpected error occurred when saving Feature Scaler: {e}")
    try:
        joblib.dump(label_scaler, LABEL_SCALER_PATH)
        print(f"Label scaler stored in: ({LABEL_SCALER_PATH})")
    except Exception as e:
        raise RuntimeError(f"An unexpected error occurred when saving Label Scaler: {e}")


# Scale the rest of the data; returns numpy arrays
X_train_scaled = feature_scaler.transform(X_train)
Y_train_scaled = label_scaler.transform(Y_train)
X_validation_scaled = feature_scaler.transform(X_validation)
Y_validation_scaled = label_scaler.transform(Y_validation)
X_test_scaled = feature_scaler.transform(X_test)
Y_test_scaled = label_scaler.transform(Y_test)


print(f"Train Features Scaled Shape: {X_train_scaled.shape}")
print(f"Train Labels Scaled Shape: {Y_test_scaled.shape}")
print(f"validation Features Scaled Shape: {X_validation_scaled.shape}")
print(f"validation Labels: {Y_validation_scaled.shape}")
print(f"test Features Scaled Shape: {X_test_scaled.shape}")
print(f"test Labels Scaled Shape: {Y_test_scaled.shape}")



Datasets not found in '..\Data\DataSplits' or incomplete. Generating datasets...
Train Features: (5727, 22)
Train Labels: (5727, 1)
validation Features: (318, 22)
validation Labels: (318, 1)
test Features: (319, 22)
test Labels: (319, 1)
Feature scaler stored in: (..\Data\Scalers\feature-scaler.joblib)
Label scaler stored in: (..\Data\Scalers\label-scaler.joblib)
Train Features Scaled Shape: (5727, 22)
Train Labels Scaled Shape: (319, 1)
validation Features Scaled Shape: (318, 22)
validation Labels: (318, 1)
test Features Scaled Shape: (319, 22)
test Labels Scaled Shape: (319, 1)


In [33]:
features = df_features.columns
labels = df_labels.columns

In [35]:
X_train_df = pd.DataFrame(X_train_scaled, columns=features)
X_test_df = pd.DataFrame(X_test_scaled, columns=features)
X_validation_df = pd.DataFrame(X_validation_scaled, columns=features)
Y_train_df = pd.DataFrame(Y_train_scaled, columns=labels)
Y_test_df = pd.DataFrame(Y_test_scaled, columns=labels)
Y_validation_df = pd.DataFrame(Y_validation_scaled, columns=labels)

In [36]:
X_test_df.head()

Unnamed: 0,Open,High,Low,California_Crude_Oil_First_Purchase_Price_$/bbl,Texas_Crude_Oil_First_Purchase_Price_$/bbl,US_Crude_Oil_First_Purchase_Price_$/bbl,US_Imports_from_Canada_of_Crude_Oil_Mbbl/d,US_Imports_from_Colombia_of_Crude_Oil_Mbbl/d,US_Imports_from_United_Kingdom_of_Crude_Oil_Mbbl/d,US_Imports_from_Mexico_of_Crude_Oil_Mbbl/d,...,US_Imports_of_Crude_Oil_Mbbl/d,US_Exports_to_Canada_of_Crude_Oil_Mbbl/d,US_Exports_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Canada_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Colombia_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Mexico_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_United_Kingdom_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_OPEC_Countries_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Non-OPEC_Countries_of_Crude_Oil_Mbbl/d,US_Net_Imports_of_Crude_Oil_Mbbl/d
0,0.816492,0.818934,0.827768,0.89213,0.740693,0.769353,0.404693,0.757519,0.016026,0.465409,...,0.597405,0.113924,0.015243,0.4319,0.757519,0.465409,0.445348,0.629321,0.764043,0.750305
1,0.338057,0.338331,0.337164,0.406034,0.329044,0.340567,0.813565,0.528195,0.091346,0.255066,...,0.301718,0.791139,0.591026,0.734409,0.490602,0.255066,0.247516,0.206516,0.382313,0.274167
2,0.180256,0.19204,0.185122,0.1743,0.215389,0.204074,0.094503,0.174812,0.472756,0.844864,...,0.878655,0.045886,0.006011,0.101434,0.174812,0.844864,0.7028,0.833303,0.852478,0.925479
3,0.069299,0.068315,0.077419,0.068157,0.077029,0.065633,0.065574,0.304511,0.969551,0.790356,...,0.736659,0.006329,0.000644,0.078136,0.304511,0.790356,0.982836,0.624796,0.930224,0.84221
4,0.721642,0.720556,0.725268,0.762177,0.619317,0.659522,0.32594,0.731203,0.014423,0.468903,...,0.528143,0.113924,0.015243,0.344086,0.731203,0.468903,0.444444,0.664796,0.652284,0.708338


In [37]:
Y_test_df

Unnamed: 0,Close
0,0.818527
1,0.345100
2,0.198231
3,0.073939
4,0.729071
...,...
314,0.466599
315,0.689891
316,0.642408
317,0.467763


In [38]:
# Concatenate the features and labels back into a single DataFrame for each set
train_data_frame = pd.concat([X_train_df, Y_train_df.reset_index(drop=True)], axis=1)
test_data_frame = pd.concat([X_test_df, Y_test_df.reset_index(drop=True)], axis=1)
validation_data_frame = pd.concat([X_validation_df, Y_validation_df.reset_index(drop=True)], axis=1)

In [39]:
# Saving the split data to csv files
train_data_frame.to_csv(TRAIN_DATA_PATH, index=False)
test_data_frame.to_csv(TEST_DATA_PATH, index=False)
validation_data_frame.to_csv(VALIDATION_DATA_PATH, index=False)

## Creating Dataset

### Creating Dataset Class

In [43]:
class OilDataset(Dataset):
    """Dataset class For the OIL_DATASET"""
    def __init__(self, csv_file="../Data/DataSplits/test.csv"):
        try:
            self.data = pd.read_csv(csv_file)   # Assign a pandas data frame
            
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found: {csv_file}")

        # Define feature and label columns
        self.label_column = "Close"
        # Remove the Date column and the label column
        self.feature_columns = self.data.columns.drop([self.label_column])
        

    def __getitem__(self, index):
        features = self.data.loc[index, self.feature_columns].values
        
        label = self.data.loc[index, self.label_column] # Extract the label for the given index
        return (
            torch.tensor(features, dtype=torch.float),
            torch.tensor(label, dtype=torch.float)
        )

    def __len__(self):
        return len(self.data)

### Initializing Datasets for different splits

In [44]:
train_dataset = OilDataset(TRAIN_DATA_PATH)
test_dataset = OilDataset(TEST_DATA_PATH)
val_dataset = OilDataset(VALIDATION_DATA_PATH)

In [45]:
len(train_dataset)

5727

## Creating the DataLoaders

In [46]:
batch_size, workers, pin_memory, drop_last = 64, 0, True, True

In [50]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, num_workers=workers, pin_memory=pin_memory, drop_last=drop_last, shuffle=True)
validation_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, num_workers=workers, pin_memory=pin_memory, drop_last=drop_last, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, num_workers=workers, pin_memory=pin_memory, drop_last=drop_last, shuffle=False)


In [51]:

print(f"Training DataLoader has ({len(train_dataloader)}) batches, Test DataLoader has ({len(test_dataloader)}) batches, Validation DataLoader has ({len(validation_dataloader)}) batches")

Training DataLoader has (89) batches, Test DataLoader has (4) batches, Validation DataLoader has (4) batches


# Complete Version

In [64]:
def data_pipeline(root_data_dir: str= "../Data", data_file_path: str="OIL_DATASET.csv", data_splits_dir: str="DataSplits", scaler_dir = "Scalers", batch_size: int=64, num_workers=0, pin_memory: bool=False, drop_last: bool=True) -> tuple[Dataset, Dataset, Dataset, DataLoader, DataLoader, DataLoader, MinMaxScaler, MinMaxScaler]:
    """This function prepares the train, test, and validation datasets.
    Args:
        root_data_dir (str): The root of the Data Directory
        data_file_path (str): The name of the original dataset (with .csv file extension).
        data_splits_dir (str): Path to the train, test, and validation datasets.
        scaler_dir (str): Path to the feature and label scalers.
        batch_size (int): The dataloader's batch_size.
        num_workers (int): The dataloader's number of workers.
        pin_memory (bool): The dataloader's pin memory option.
        drop_last (bool): The dataloader's drop_last option.

    Returns: 
        train_dataset (Dataset): Dataset Class for the training dataset.
        test_dataset (Dataset): Dataset Class for the test dataset.
        validation_dataset (Dataset): Dataset Class for the validation dataset.
        train_dataloader (DataLoader): The train dataloader.
        test_dataloader (DataLoader): The test dataloader.
        validation_dataloader (DataLoader): The validation dataloader.
        feature_scaler (MinMaxScaler): The scaler used to scale the features of the model input.
        label_scaler (MinMaxScaler): The scaler used to scale the labels of the model input.
        """
    if not root_data_dir or not data_file_path or not data_splits_dir:  # Check for empty strings at the beginning
        raise ValueError("File and directory paths cannot be empty strings.")
    DATA_ROOT = Path(root_data_dir)
    # OIL_PATH_ORIGINAL = DATA_ROOT / "OIL_Dataset_1984-2025.csv"     # Set the data source path

    DATA_CLEAN_PATH = DATA_ROOT / data_file_path # Set the path to the complete dataset

    if DATA_CLEAN_PATH.exists():
        print(f"CSV file detected, reading from '{DATA_ROOT}'")
        df = pd.read_csv(DATA_CLEAN_PATH)
    else:
        print(f"Downloading CSV file from HuggingFace")
        os.makedirs(DATA_ROOT, exist_ok=True)       # Create the Data Root Directory
        df = pd.read_csv("hf://datasets/MaxPrestige/CRUDE_OIL_PRICES/Data/OIL_DATASET.csv")  # Download and read the data into a pandas dataframe
        df.to_csv(DATA_CLEAN_PATH, index=False)     # Save the file, omitting saving the index

    DATA_SPLITS_DIR = DATA_ROOT / data_splits_dir
    SCALER_DIR = DATA_ROOT / scaler_dir

    TRAIN_DATA_PATH = DATA_SPLITS_DIR / "train.csv"
    TEST_DATA_PATH = DATA_SPLITS_DIR / "test.csv"
    VALIDATION_DATA_PATH = DATA_SPLITS_DIR / "val.csv"

    FEATURE_SCALER_PATH = SCALER_DIR / "feature-scaler.joblib"
    LABEL_SCALER_PATH = SCALER_DIR / "label-scaler.joblib"

    label_col = "Close"
    extra_dropped_cols = 'Date'

    if os.path.exists(TRAIN_DATA_PATH) and os.path.exists(TEST_DATA_PATH) and os.path.exists(VALIDATION_DATA_PATH) :
        print(f"Train, Test, and Validation csv datasets detected in '{DATA_SPLITS_DIR}.' Skipping generation and loading scaler(s)")
        try:
            feature_scaler = joblib.load(FEATURE_SCALER_PATH)
            label_scaler = joblib.load(LABEL_SCALER_PATH)
        except Exception as e:
            raise RuntimeError(f"An unexpected error occurred when loading scalers: {e}")
    else:
        print(f"Datasets not found in '{DATA_SPLITS_DIR}' or incomplete. Generating datasets...")
        # os.makedirs(MODEL_ROOT, exist_ok=True)
        os.makedirs(DATA_SPLITS_DIR, exist_ok=True)     # Create the Data Splits Parent Directory
        os.makedirs(SCALER_DIR, exist_ok=True)     # Create the Data Splits Parent Directory

        feature_scaler = MinMaxScaler()
        label_scaler = MinMaxScaler()
        # Split the Dataframe into separate features and labels DataFrames
        df_features = df.drop(columns=[label_col, extra_dropped_cols], inplace=False)
        df_labels = df[[label_col]]     # Instead of returning a pandas Series using "[]", return a dataframe using the "[[]]" to get a shape with (-1,1)

        # Split into smaller DataFrames for the Train, Test, and Validation splits
        X_train, X_inter, Y_train, Y_inter = train_test_split(df_features, df_labels, test_size=0.1, random_state=42)
        X_validation, X_test, Y_validation, Y_test = train_test_split(X_inter, Y_inter, test_size=0.5, random_state=42)

        # print(f"Train Features: {X_train.shape}")
        # print(f"Train Labels: {Y_train.shape}")
        # print(f"validation Features: {X_validation.shape}")
        # print(f"validation Labels: {Y_validation.shape}")
        # print(f"test Features: {X_test.shape}")
        # print(f"test Labels: {Y_test.shape}")

        feature_scaler.fit(X_train)
        label_scaler.fit(Y_train)

        # Save the fitted scaler object
        try:
            joblib.dump(feature_scaler, FEATURE_SCALER_PATH)
            print(f"Feature scaler stored in: ({FEATURE_SCALER_PATH})")
            joblib.dump(label_scaler, LABEL_SCALER_PATH)
            print(f"Label scaler stored in: ({LABEL_SCALER_PATH})")
        except Exception as e:
            raise RuntimeError(f"An unexpected error occurred when saving  Scalers: {e}")

        # Scale the rest of the data; returns numpy arrays
        X_train_scaled = feature_scaler.transform(X_train)
        Y_train_scaled = label_scaler.transform(Y_train)
        X_validation_scaled = feature_scaler.transform(X_validation)
        Y_validation_scaled = label_scaler.transform(Y_validation)
        X_test_scaled = feature_scaler.transform(X_test)
        Y_test_scaled = label_scaler.transform(Y_test)

        print(f"Train Features Scaled Shape: {X_train_scaled.shape}")
        print(f"Train Labels Scaled Shape: {Y_test_scaled.shape}")
        print(f"validation Features Scaled Shape: {X_validation_scaled.shape}")
        print(f"validation Labels: {Y_validation_scaled.shape}")
        print(f"test Features Scaled Shape: {X_test_scaled.shape}")
        print(f"test Labels Scaled Shape: {Y_test_scaled.shape}")
        # Define the column names of the features and label
        features_names = df_features.columns
        label_name = df_labels.columns
        # Create dataframes using the scaled data
        X_train_df = pd.DataFrame(X_train_scaled, columns=features_names)
        X_test_df = pd.DataFrame(X_test_scaled, columns=features_names)
        X_validation_df = pd.DataFrame(X_validation_scaled, columns=features_names)
        Y_train_df = pd.DataFrame(Y_train_scaled, columns=label_name)
        Y_test_df = pd.DataFrame(Y_test_scaled, columns=label_name)
        Y_validation_df = pd.DataFrame(Y_validation_scaled, columns=label_name)

        # Concatenate the features and labels back into a single DataFrame for each set
        train_data_frame = pd.concat([X_train_df, Y_train_df.reset_index(drop=True)], axis=1)
        test_data_frame = pd.concat([X_test_df, Y_test_df.reset_index(drop=True)], axis=1)
        validation_data_frame = pd.concat([X_validation_df, Y_validation_df.reset_index(drop=True)], axis=1)

        # Saving the split data to csv files
        train_data_frame.to_csv(TRAIN_DATA_PATH, index=False)
        test_data_frame.to_csv(TEST_DATA_PATH, index=False)
        validation_data_frame.to_csv(VALIDATION_DATA_PATH, index=False)
    # Creating Datasets from the stored datasets
    print(f"Initializing Datasets")
    train_dataset = OilDataset(TRAIN_DATA_PATH)
    test_dataset = OilDataset(TEST_DATA_PATH)
    val_dataset = OilDataset(VALIDATION_DATA_PATH)
    
    print(f"Creating DataLoaders with batch_size ({batch_size}), num_workers ({num_workers}), pin_memory ({pin_memory}). Training dataset drop_last: ({drop_last})")
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, num_workers=workers, pin_memory=pin_memory, drop_last=drop_last, shuffle=True)
    validation_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, num_workers=workers, pin_memory=pin_memory, drop_last=drop_last, shuffle=False)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, num_workers=workers, pin_memory=pin_memory, drop_last=drop_last, shuffle=False)

    print(f"Training DataLoader has ({len(train_dataloader)}) batches, Test DataLoader has ({len(test_dataloader)}) batches, Validation DataLoader has ({len(validation_dataloader)}) batches")

    return (train_dataset, test_dataset, val_dataset, train_dataloader, test_dataloader, validation_dataloader, feature_scaler, label_scaler)

In [66]:
try:
    data_pipeline(root_data_dir="../Data", data_file_path="OIL_DATASET.csv", data_splits_dir="DataSplits")
except Exception as e:
    raise RuntimeError(f"An unexpected error occurred when running the data pipeline function:{e}")

CSV file detected, reading from '..\Data'
Datasets not found in '..\Data\DataSplits' or incomplete. Generating datasets...
Feature scaler stored in: (..\Data\Scalers\feature-scaler.joblib)
Label scaler stored in: (..\Data\Scalers\label-scaler.joblib)
Train Features Scaled Shape: (5727, 22)
Train Labels Scaled Shape: (319, 1)
validation Features Scaled Shape: (318, 22)
validation Labels: (318, 1)
test Features Scaled Shape: (319, 22)
test Labels Scaled Shape: (319, 1)
Initializing Datasets
Creating DataLoaders with batch_size (64), num_workers (0), pin_memory (False). Training dataset drop_last: (True)
Training DataLoader has (89) batches, Test DataLoader has (4) batches, Validation DataLoader has (4) batches
