# Creating DataLoaders

## Imports

In [1]:
from importlib.metadata import version
import pandas as pd
import numpy as np
import seaborn as sn
from pathlib import Path
import os
import torch
from torch.utils.data import Dataset, DataLoader

## Data Preparation

In [2]:
# Reading from the Hugging Face Hub requires an authenticated session
# (e.g. run `huggingface-cli login` beforehand).
df = pd.read_csv(
    "hf://datasets/MaxPrestige/CA_Weather_Fire_Dataset_Cleaned/Data/CA_Weather_Fire_Dataset_Cleaned.csv"
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Display the loaded frame as the cell output to eyeball columns and values.
df

Unnamed: 0,DAY_OF_YEAR,MONTH,YEAR,SEASON,PRECIPITATION,LAGGED_PRECIPITATION,AVG_WIND_SPEED,LAGGED_AVG_WIND_SPEED,WIND_TEMP_RATIO,MIN_TEMP,TEMP_RANGE,FIRE_START_DAY,MAX_TEMP
0,1.0,1.0,1984.0,1.0,0.0,0.0,4.70,4.700000,0.059494,51.0,28.0,0.0,79.0
1,2.0,1.0,1984.0,1.0,0.0,0.0,5.59,5.145000,0.078732,46.0,25.0,0.0,71.0
2,3.0,1.0,1984.0,1.0,0.0,0.0,5.37,5.220000,0.076714,47.0,23.0,0.0,70.0
3,4.0,1.0,1984.0,1.0,0.0,0.0,4.70,5.090000,0.061842,45.0,31.0,0.0,76.0
4,5.0,1.0,1984.0,1.0,0.0,0.0,5.14,5.100000,0.069459,49.0,25.0,0.0,74.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14983,8.0,1.0,2025.0,1.0,0.0,0.0,10.51,6.485714,0.143973,53.0,20.0,0.0,73.0
14984,9.0,1.0,2025.0,1.0,0.0,0.0,4.92,6.550000,0.072353,46.0,22.0,0.0,68.0
14985,10.0,1.0,2025.0,1.0,0.0,0.0,3.58,6.327143,0.051143,46.0,24.0,0.0,70.0
14986,11.0,1.0,2025.0,1.0,0.0,0.0,,6.561667,,46.0,20.0,0.0,66.0


In [4]:
# Kept for reference only -- path of the original (pre-cleaning) data source:
# WEATHER_PATH_ORIGINAL = Path("../Data/CA_Weather_Fire_Dataset_1984-2025.csv")

# File name of the cleaned dataset and its location relative to this notebook.
WEATHER_DATA_CLEAN = "CA_Weather_Fire_Dataset_Cleaned.csv"
WEATHER_DATA_CLEAN_PATH = Path("../Data", WEATHER_DATA_CLEAN)

In [11]:
# Directory that receives the train/test/validation CSV files.
DATA_SPLITS_DIR = Path("..") / "Data" / "DataSplits"

In [12]:
# One CSV per split, all stored under DATA_SPLITS_DIR.
TRAIN_DATA_PATH = DATA_SPLITS_DIR.joinpath("train.csv")
TEST_DATA_PATH = DATA_SPLITS_DIR.joinpath("test.csv")
VALIDATION_DATA_PATH = DATA_SPLITS_DIR.joinpath("val.csv")


In [13]:
# Create the splits directory (and any missing parents); a no-op if it already exists.
DATA_SPLITS_DIR.mkdir(parents=True, exist_ok=True)

In [5]:
# Shuffle every row with a fixed seed so the split is reproducible,
# then renumber the index from 0 so positional slicing reads naturally.
shuffled_data = df.sample(frac=1, random_state=42)
shuffled_data = shuffled_data.reset_index(drop=True)

In [6]:
num_samples = len(shuffled_data)

# Fractions of the data assigned to each split.
# val_size is not used in the index arithmetic below.
train_size, test_size, val_size = .80, .10, .10

# Row positions where the train block ends and where the test block ends.
train_index = int(num_samples * train_size)
test_end_index = int(num_samples * (train_size + test_size))

In [None]:
# Carve the shuffled frame into three contiguous, non-overlapping row ranges.
train_rows = slice(None, train_index)
test_rows = slice(train_index, test_end_index)
val_rows = slice(test_end_index, None)

train_data_frame = shuffled_data.iloc[train_rows]
test_data_frame = shuffled_data.iloc[test_rows]
val_data_frame = shuffled_data.iloc[val_rows]

In [15]:
# Persist each split to its CSV; the pandas index is not written.
for split_frame, split_path in (
    (train_data_frame, TRAIN_DATA_PATH),
    (test_data_frame, TEST_DATA_PATH),
    (val_data_frame, VALIDATION_DATA_PATH),
):
    split_frame.to_csv(split_path, index=False)

In [8]:
# Sanity check: report each split's row count and their total.
n_train = len(train_data_frame)
n_test = len(test_data_frame)
n_val = len(val_data_frame)

print(f"length of train: {n_train}")
print(f"length of test: {n_test}")
print(f"length of val: {n_val}")
print(f"Sum: {n_train + n_test + n_val}")


length of train: 11990
length of test: 1499
length of val: 1499
Sum: 14988


In [9]:
class WeatherDataset(Dataset):
    """Map-style PyTorch dataset backed by a CA Weather Fire CSV file.

    The CSV is read once with pandas and converted to float32 tensors up
    front, so fetching a sample is a plain tensor index instead of a pandas
    row lookup on every call.

    Every column except ``MAX_TEMP`` (and anything listed in ``drop_columns``)
    is used as an input feature; ``MAX_TEMP`` is the target.

    NOTE(review): in the rows previewed earlier in this notebook,
    ``MIN_TEMP + TEMP_RANGE`` equals ``MAX_TEMP``, so the target looks
    recoverable from the default feature set -- confirm, and pass
    ``drop_columns`` if that is unintended.

    Args:
        csv_file: Path of the CSV file to load.
        drop_columns: Extra column name(s) to leave out of the features
            (e.g. a leftover index column or a column that leaks the target).
            Defaults to none, which keeps the original feature set.

    Attributes:
        data: The pandas DataFrame read from ``csv_file``.
        feature_columns: Index of the column names used as features.
        label_column: Name of the target column (``"MAX_TEMP"``).
        features: float32 tensor of shape ``(len(data), len(feature_columns))``.
        labels: float32 tensor of shape ``(len(data),)``.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If ``MAX_TEMP`` or a name in ``drop_columns`` is not a column.
    """

    def __init__(self, csv_file="../Data/CA_Weather_Fire_Dataset_Cleaned.csv", drop_columns=()):
        import warnings  # local import keeps this class self-contained

        try:
            self.data = pd.read_csv(csv_file)
        except FileNotFoundError as err:
            # Same exception type and message as before, with the cause chained explicitly.
            raise FileNotFoundError(f"File not found: {csv_file}") from err

        # A lone string would otherwise be unpacked character by character.
        if isinstance(drop_columns, str):
            drop_columns = [drop_columns]

        # Define feature and label columns
        self.label_column = "MAX_TEMP"
        self.feature_columns = self.data.columns.drop([self.label_column, *drop_columns])

        # Missing values survive the tensor conversion as NaN, so surface
        # them here rather than letting them show up later during training.
        used_columns = self.data[[*self.feature_columns, self.label_column]]
        rows_with_nan = int(used_columns.isna().any(axis=1).sum())
        if rows_with_nan:
            warnings.warn(
                f"{csv_file}: {rows_with_nan} row(s) contain missing values; "
                "they are passed through as NaN.",
                RuntimeWarning,
                stacklevel=2,
            )

        # Convert once; __getitem__ then only indexes into these tensors.
        self.features = torch.tensor(self.data[self.feature_columns].to_numpy(dtype="float32"))
        self.labels = torch.tensor(self.data[self.label_column].to_numpy(dtype="float32"))

    def __getitem__(self, index):
        """Return ``(features, label)`` for the row at position ``index``.

        ``features`` is a 1-D float32 tensor and ``label`` a 0-D float32
        tensor. An out-of-range position raises ``IndexError``, which is what
        lets ``for sample in dataset`` stop at the end. The returned tensors
        are views into ``self.features`` / ``self.labels``.
        """
        return self.features[index], self.labels[index]

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.data)

In [16]:
# Build one dataset per split from the CSV files written above.
train_dataset = WeatherDataset(csv_file=TRAIN_DATA_PATH)
test_dataset = WeatherDataset(csv_file=TEST_DATA_PATH)
val_dataset = WeatherDataset(csv_file=VALIDATION_DATA_PATH)


In [20]:
# DataLoader settings.
batch_size = 64
num_workers = 0  # 0 = batches are loaded in the main process
# Pinned host memory only speeds up host-to-GPU copies. Enabling it without
# CUDA does nothing useful and makes DataLoader emit a warning, so follow
# the hardware instead of hard-coding True.
# NOTE(review): this checks CUDA only -- revisit if another accelerator is used.
pin_memory = torch.cuda.is_available()
drop_last = True  # discard a final batch that has fewer than batch_size samples

In [None]:
# Settings common to all three loaders.
loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory)

# Training: reshuffle every epoch; drop_last controls whether a ragged final
# batch is discarded.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, drop_last=drop_last, **loader_kwargs)

# Evaluation: keep every sample. Dropping the last partial batch here would
# silently exclude rows from the test/validation metrics
# (len(split) % batch_size samples per split).
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, drop_last=False, **loader_kwargs)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, drop_last=False, **loader_kwargs)