# Creating Dataloaders

## Imports

In [1]:
import os
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset, DataLoader

In [2]:
from importlib.metadata import version

In [3]:
# Report the installed version of each third-party package checked by this notebook.
# The collection is called `packages` so the built-in `list` stays usable in later cells.
packages = ['pandas', 'numpy', 'seaborn', 'matplotlib', 'torch', 'joblib']
for package in packages:
    try:
        print(f"{package} version: {version(package)}")
    except PackageNotFoundError:
        # Only a genuinely missing distribution is reported here; any other
        # error is allowed to surface instead of being silently swallowed.
        print(f"❌ Package '{package}' not found. Please install it.")

pandas version: 2.3.1
numpy version: 1.23.5
seaborn version: 0.13.2
matplotlib version: 3.10.5
torch version: 2.5.1
joblib version: 1.5.1


## Data Preparation

### Prepare Paths

In [5]:
# --- Dataset location -------------------------------------------------------
DATA_ROOT = Path("../Data")
# OIL_PATH_ORIGINAL = DATA_ROOT / "OIL_Dataset_1984-2025.csv"     # Set the data source path
DATA_CLEAN = "OIL_DATASET.csv"
DATA_CLEAN_PATH = DATA_ROOT.joinpath(DATA_CLEAN)

# --- Model artefacts --------------------------------------------------------
MODEL_ROOT = Path("../Models")

# File names of the scalers, and their full paths under MODEL_ROOT.
TRAIN_FEATURE_SCALER, TRAIN_LABEL_SCALER = (
    "train_feature_scaler.joblib",
    "train_label_scaler.joblib",
)
TRAIN_FEATURE_SCALER_PATH = MODEL_ROOT.joinpath(TRAIN_FEATURE_SCALER)
TRAIN_LABEL_SCALER_PATH = MODEL_ROOT.joinpath(TRAIN_LABEL_SCALER)


In [9]:
# Load the cleaned dataset into `df`. A local copy at DATA_CLEAN_PATH is used when
# present; otherwise the CSV is read from the HuggingFace Hub and saved locally.
# Login using e.g. `huggingface-cli login` to access this dataset
if not DATA_CLEAN_PATH.exists():
    print("Downloading CSV file from HuggingFace")
    os.makedirs(DATA_ROOT, exist_ok=True)
    df = pd.read_csv("hf://datasets/MaxPrestige/CRUDE_OIL_PRICES/Data/OIL_DATASET.csv")
    # Write without the index so a later read does not gain an extra column.
    df.to_csv(DATA_CLEAN_PATH, index=False)
else:
    print(f"CSV file detected, reading from '{DATA_ROOT}'")
    df = pd.read_csv(DATA_CLEAN_PATH)


CSV file detected, reading from '..\Data'


#### File Verification

In [10]:
# Files this notebook expects on disk.
paths = [DATA_CLEAN_PATH]

# Report the first missing file, then stop checking. The cell only prints;
# it never raises, so execution continues either way.
for path in paths:
    if not path.exists():
        print(f"The file '{path}' does not exist.")
        break

#### Reading File to DataFrame

In [None]:
# df = pd.read_csv(DATA_CLEAN_PATH, parse_dates=['Date'])


In [None]:
# df

Unnamed: 0,Date,Open,Close,High,Low,California_Crude_Oil_First_Purchase_Price_$/bbl,Texas_Crude_Oil_First_Purchase_Price_$/bbl,US_Crude_Oil_First_Purchase_Price_$/bbl,US_Imports_from_Canada_of_Crude_Oil_Mbbl/d,US_Imports_from_Colombia_of_Crude_Oil_Mbbl/d,...,US_Imports_of_Crude_Oil_Mbbl/d,US_Exports_to_Canada_of_Crude_Oil_Mbbl/d,US_Exports_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Canada_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Colombia_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Mexico_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_United_Kingdom_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_OPEC_Countries_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Non-OPEC_Countries_of_Crude_Oil_Mbbl/d,US_Net_Imports_of_Crude_Oil_Mbbl/d
0,2025-06-30,67.33,66.63,67.20,65.92,62.41,60.56,59.94,3814.0,223.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0
1,2025-06-27,67.91,67.31,68.42,67.20,62.41,60.56,59.94,3814.0,223.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0
2,2025-06-26,67.77,67.82,69.05,67.38,62.41,60.56,59.94,3814.0,223.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0
3,2025-06-25,67.88,67.61,68.78,67.32,62.41,60.56,59.94,3814.0,223.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0
4,2025-06-24,68.37,67.82,70.20,66.82,62.41,60.56,59.94,3814.0,223.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6359,2000-09-27,31.95,31.48,32.23,31.42,26.31,29.64,27.91,1318.0,299.0,...,9939.0,17.0,17.0,1301.0,299.0,1381.0,343.0,5347.0,4574.0,9922.0
6360,2000-09-26,32.00,31.47,32.05,31.40,26.31,29.64,27.91,1318.0,299.0,...,9939.0,17.0,17.0,1301.0,299.0,1381.0,343.0,5347.0,4574.0,9922.0
6361,2000-09-25,31.40,31.51,32.20,30.86,26.31,29.64,27.91,1318.0,299.0,...,9939.0,17.0,17.0,1301.0,299.0,1381.0,343.0,5347.0,4574.0,9922.0
6362,2000-09-22,34.00,32.60,34.40,32.50,26.31,29.64,27.91,1318.0,299.0,...,9939.0,17.0,17.0,1301.0,299.0,1381.0,343.0,5347.0,4574.0,9922.0


In [11]:
# Name of the column used as the prediction label.
label_col = "Close"

In [None]:
# Model inputs: every column except the date and the label.
df_features = df.drop(columns=["Date", label_col])
# Selecting with a one-element list keeps the labels as a single-column DataFrame
# (shape (n_rows, 1)) rather than a Series.
df_labels = df.loc[:, [label_col]]


In [22]:
# Preview the first five label rows.
df_labels.head(5)

Unnamed: 0,Close
0,66.63
1,67.31
2,67.82
3,67.61
4,67.82


In [23]:
# Print the (rows, columns) shape of the feature frame and the label frame.
for frame_name, frame in (("df_features", df_features), ("df_labels", df_labels)):
    print(f"shape of {frame_name}: {frame.shape}")


shape of df_features: (6364, 22)
shape of df_labels: (6364, 1)


In [15]:
# Preview the first rows of the feature frame.
df_features.head()

Unnamed: 0,Open,High,Low,California_Crude_Oil_First_Purchase_Price_$/bbl,Texas_Crude_Oil_First_Purchase_Price_$/bbl,US_Crude_Oil_First_Purchase_Price_$/bbl,US_Imports_from_Canada_of_Crude_Oil_Mbbl/d,US_Imports_from_Colombia_of_Crude_Oil_Mbbl/d,US_Imports_from_United_Kingdom_of_Crude_Oil_Mbbl/d,US_Imports_from_Mexico_of_Crude_Oil_Mbbl/d,...,US_Imports_of_Crude_Oil_Mbbl/d,US_Exports_to_Canada_of_Crude_Oil_Mbbl/d,US_Exports_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Canada_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Colombia_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Mexico_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_United_Kingdom_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_OPEC_Countries_of_Crude_Oil_Mbbl/d,US_Net_Imports_from_Non-OPEC_Countries_of_Crude_Oil_Mbbl/d,US_Net_Imports_of_Crude_Oil_Mbbl/d
0,67.33,67.2,65.92,62.41,60.56,59.94,3814.0,223.0,81.0,431.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0
1,67.91,68.42,67.2,62.41,60.56,59.94,3814.0,223.0,81.0,431.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0
2,67.77,69.05,67.38,62.41,60.56,59.94,3814.0,223.0,81.0,431.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0
3,67.88,68.78,67.32,62.41,60.56,59.94,3814.0,223.0,81.0,431.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0
4,68.37,70.2,66.82,62.41,60.56,59.94,3814.0,223.0,81.0,431.0,...,6259.0,218.0,3629.0,3596.0,207.0,431.0,-93.0,697.0,1934.0,2631.0


In [16]:
# NOTE(review): this repeats the earlier `df_labels.head(5)` preview. The saved output
# below is a Series repr (`Name: Close, dtype: float64`), which looks stale now that
# `df_labels` is built as a single-column DataFrame — re-run or remove this cell.
df_labels.head()

0    66.63
1    67.31
2    67.82
3    67.61
4    67.82
Name: Close, dtype: float64

## Data Splitting

In [24]:
# Two-stage random split: first hold out 10% of the rows, then halve that hold-out
# into validation and test. The same fixed seed is used for both stages.
SPLIT_SEED = 42
X_train, X_inter, Y_train, Y_inter = train_test_split(
    df_features,
    df_labels,
    test_size=0.1,
    random_state=SPLIT_SEED,
)
X_validation, X_test, Y_validation, Y_test = train_test_split(
    X_inter,
    Y_inter,
    test_size=0.5,
    random_state=SPLIT_SEED,
)


In [25]:
# Print the feature and label shapes of every split, in train / validation / test order.
for split_name, split_X, split_Y in (
    ("Train", X_train, Y_train),
    ("validation", X_validation, Y_validation),
    ("test", X_test, Y_test),
):
    print(f"{split_name} Features: {split_X.shape}")
    print(f"{split_name} Labels: {split_Y.shape}")

Train Features: (5727, 22)
Train Labels: (5727, 1)
validation Features: (318, 22)
validation Labels: (318, 1)
test Features: (319, 22)
test Labels: (319, 1)


## Scaling The Data

In [26]:
# Two independent MinMaxScaler instances: one for the feature columns, one for the label column.
feature_scaler = MinMaxScaler()
label_scaler = MinMaxScaler()

In [27]:
# Create the model directory if needed; exist_ok=True makes re-running this cell harmless.
os.makedirs(MODEL_ROOT, exist_ok=True)

### Notes
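Fit the scalers on the training split only. The validation and test splits must only be transformed with the already-fitted scalers; fitting on them would leak their statistics into training.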

In [28]:
# Fit each scaler on the training split only, so no statistics from the validation
# or test rows leak into the transformation.
X_train_scaled = feature_scaler.fit_transform(X_train)
Y_train_scaled = label_scaler.fit_transform(Y_train)

# Apply the already-fitted scalers to the held-out splits: transform only, never refit.
X_validation_scaled = feature_scaler.transform(X_validation)
Y_validation_scaled = label_scaler.transform(Y_validation)
X_test_scaled = feature_scaler.transform(X_test)
# `Y_test_scaled` previously held the scaled *training* labels; it now holds the
# scaled test labels, as its name says.
Y_test_scaled = label_scaler.transform(Y_test)


In [None]:
# Locations of the persisted train/test/validation CSVs and of the fitted scalers.
data_splits_dir = "DataSplits"
DATA_SPLITS_DIR = DATA_ROOT / data_splits_dir
TRAIN_DATA_PATH = DATA_SPLITS_DIR / "train.csv"
TEST_DATA_PATH = DATA_SPLITS_DIR / "test.csv"
VALIDATION_DATA_PATH = DATA_SPLITS_DIR / "val.csv"
FEATURE_SCALER_PATH = DATA_SPLITS_DIR / "feature-scaler.joblib"
LABEL_SCALER_PATH = DATA_SPLITS_DIR / "label-scaler.joblib"

# The previous value, 'MAX_TEMP', does not match the label column used by this notebook.
target = label_col

split_files = (TRAIN_DATA_PATH, TEST_DATA_PATH, VALIDATION_DATA_PATH)
scaler_files = (FEATURE_SCALER_PATH, LABEL_SCALER_PATH)

# The scaler files are part of the check because they are loaded right below;
# otherwise leftover CSVs without scalers would make joblib.load fail.
if all(artifact.exists() for artifact in split_files + scaler_files):
    print(f"Train, Test, and Validation csv datasets detected in '{DATA_SPLITS_DIR}', skipping generation")
    # NOTE(review): joblib.load unpickles arbitrary objects; only load scaler files
    # that this notebook itself produced.
    f_scaler = joblib.load(FEATURE_SCALER_PATH)
    l_scaler = joblib.load(LABEL_SCALER_PATH)
else:
    print(f"Generating Train, Test, and Validation csv datasets in '{DATA_SPLITS_DIR}'")
    os.makedirs(DATA_SPLITS_DIR, exist_ok=True)

    # Each CSV holds the original rows of one split with every column (including
    # 'Date' and the label), which is the layout OilDataset reads.
    # NOTE(review): rows are written unscaled; confirm whether downstream code
    # expects the scalers to be applied before or after loading.
    for split_frame, split_path in (
        (X_train, TRAIN_DATA_PATH),
        (X_test, TEST_DATA_PATH),
        (X_validation, VALIDATION_DATA_PATH),
    ):
        df.loc[split_frame.index].to_csv(split_path, index=False)

    # Persist the scalers fitted on the training split so that a later run can
    # take the "skipping generation" branch.
    joblib.dump(feature_scaler, FEATURE_SCALER_PATH)
    joblib.dump(label_scaler, LABEL_SCALER_PATH)
    f_scaler, l_scaler = feature_scaler, label_scaler


## Creating Dataset

### Creating Dataset Class

In [21]:
class OilDataset(Dataset):
    """Map-style dataset over a crude-oil price CSV.

    Each item is a ``(features, label)`` pair of float32 tensors. The label is the
    value of the ``Close`` column; the features are all remaining columns except
    ``Date``.

    Args:
        csv_file: Path (string or path-like) of the CSV file to read.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If the CSV has no ``Close`` column.
    """
    def __init__(self, csv_file="../Data/OIL_DATASET.csv"):
        try:
            self.data = pd.read_csv(csv_file)   # Whole file as a pandas DataFrame
        except FileNotFoundError as err:
            # Keep the original exception chained for easier debugging.
            raise FileNotFoundError(f"File not found: {csv_file}") from err

        # Define feature and label columns
        self.label_column = "Close"
        if self.label_column not in self.data.columns:
            raise KeyError(f"Label column '{self.label_column}' not found in: {csv_file}")

        # Remove the Date column and the label column. `Index.drop` returns a new
        # Index and accepts no `inplace` argument (passing one raises TypeError).
        # errors="ignore" lets a file without a Date column load as well.
        self.feature_columns = self.data.columns.drop(
            ["Date", self.label_column], errors="ignore"
        )

    def __getitem__(self, index):
        """Return ``(features, label)`` for the row labelled ``index``.

        Rows are looked up by index label, which equals the row position for the
        default index produced by ``pd.read_csv``.
        """
        # Explicit float32 conversion fails loudly if a feature column is not numeric.
        features = self.data.loc[index, self.feature_columns].to_numpy(dtype="float32")
        label = self.data.loc[index, self.label_column] # Extract the label for the given index
        return (
            torch.tensor(features, dtype=torch.float),
            torch.tensor(label, dtype=torch.float)
        )

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

### Initializing Datasets for different splits

In [None]:
# Build one OilDataset per split, in train / test / validation order, from the split CSV paths.
train_dataset, test_dataset, val_dataset = (
    OilDataset(split_csv)
    for split_csv in (TRAIN_DATA_PATH, TEST_DATA_PATH, VALIDATION_DATA_PATH)
)