# Creating DataLoaders

### Versions
- 01:
    - Normalizing the Inputs
- 00:
    - Initial creation

## Imports

In [2]:
from importlib.metadata import version
import pandas as pd
import numpy as np
import seaborn as sn
from pathlib import Path
import os
import torch
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Data Preparation

In [7]:
# Login using e.g. `huggingface-cli login` to access this dataset
# df = pd.read_csv("hf://datasets/MaxPrestige/CA_Weather_Fire_Dataset_Cleaned/Data/CA_Weather_Fire_Dataset_Cleaned.csv")

WEATHER_DATA_ROOT = Path("../Data")
WEATHER_PATH_ORIGINAL = WEATHER_DATA_ROOT / "CA_Weather_Fire_Dataset_1984-2025.csv"     # Set the data source path
WEATHER_DATA_CLEAN="CA_Weather_Fire_Dataset_Cleaned.csv"
WEATHER_DATA_CLEAN_PATH = WEATHER_DATA_ROOT / WEATHER_DATA_CLEAN
df = pd.read_csv(WEATHER_DATA_CLEAN_PATH, sep=',',header=0)


In [8]:
df

Unnamed: 0,DAY_OF_YEAR,PRECIPITATION,LAGGED_PRECIPITATION,AVG_WIND_SPEED,MIN_TEMP,MAX_TEMP
0,1.0,0.0,0.0,4.70,51.0,79.0
1,2.0,0.0,0.0,5.59,46.0,71.0
2,3.0,0.0,0.0,5.37,47.0,70.0
3,4.0,0.0,0.0,4.70,45.0,76.0
4,5.0,0.0,0.0,5.14,49.0,74.0
...,...,...,...,...,...,...
14983,8.0,0.0,0.0,10.51,53.0,73.0
14984,9.0,0.0,0.0,4.92,46.0,68.0
14985,10.0,0.0,0.0,3.58,46.0,70.0
14986,11.0,0.0,0.0,,46.0,66.0


In [9]:
DATA_SPLITS_DIR = WEATHER_DATA_ROOT / "DataSplits"

In [10]:
TRAIN_DATA_PATH = DATA_SPLITS_DIR / "train.csv"
TEST_DATA_PATH = DATA_SPLITS_DIR / "test.csv"
VALIDATION_DATA_PATH = DATA_SPLITS_DIR / "val.csv"


In [11]:
os.makedirs(DATA_SPLITS_DIR, exist_ok=True)

In [16]:
features = ["DAY_OF_YEAR", "PRECIPITATION", "LAGGED_PRECIPITATION", "AVG_WIND_SPEED", "MIN_TEMP"]
target = "MAX_TEMP"
X = df[features]
y = df[target]

# split your data before scaling, shuffling the data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_validation, y_test, y_validation = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data ONLY. Need to use the scaler on all inputs that the model receives.
# This means the mean and standard deviation are calculated from the training set.
scaler.fit(X_train)

# Transform the training, validation, and test data using the fitted scaler
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_validation)

In [17]:
X_train_df = pd.DataFrame(X_train_scaled, columns=features)
X_test_df = pd.DataFrame(X_test_scaled, columns=features)
X_val_df = pd.DataFrame(X_val_scaled, columns=features)

# Concatenate the features and labels back into a single DataFrame for each set
train_data_frame = pd.concat([X_train_df, y_train.reset_index(drop=True)], axis=1)
test_data_frame = pd.concat([X_test_df, y_test.reset_index(drop=True)], axis=1)
val_data_frame = pd.concat([X_val_df, y_validation.reset_index(drop=True)], axis=1)

In [18]:
train_data_frame.to_csv(TRAIN_DATA_PATH, index=False)
test_data_frame.to_csv(TEST_DATA_PATH, index=False)
val_data_frame.to_csv(VALIDATION_DATA_PATH, index=False)

In [19]:
print(f"length of train: {len(train_data_frame)}")
print(f"length of test: {len(test_data_frame)}")
print(f"length of val: {len(val_data_frame)}")
print(f"Sum: {len(train_data_frame) +len(test_data_frame) + len(val_data_frame)}")


length of train: 11990
length of test: 1499
length of val: 1499
Sum: 14988


In [20]:
class WeatherDataset(Dataset):
    """Dataset class For the CA Weather Fire Dataset"""
    def __init__(self, csv_file="../Data/CA_Weather_Fire_Dataset_Cleaned.csv"):
        try:
            self.data = pd.read_csv(csv_file)   # Assign a pandas data frame
        except FileNotFoundError:
            raise FileNotFoundError(f"File not found: {csv_file}")

        # Define feature and label columns
        self.feature_columns = self.data.columns.drop("MAX_TEMP")
        self.label_column = "MAX_TEMP"
        

    def __getitem__(self, index):
        features = self.data.loc[index, self.feature_columns].values
        
        label = self.data.loc[index, self.label_column] # Extract the label for the given index
        return (
            torch.tensor(features, dtype=torch.float),
            torch.tensor(label, dtype=torch.float)
        )

    def __len__(self):
        return len(self.data)

In [21]:
train_dataset = WeatherDataset(TRAIN_DATA_PATH)
test_dataset = WeatherDataset(TEST_DATA_PATH)
val_dataset = WeatherDataset(VALIDATION_DATA_PATH)


In [22]:
batch_size=64
num_workers=0
pin_memory=True
drop_last=True

In [23]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last)