# Sandbox

## Imports

In [18]:
from importlib.metadata import version
import pandas as pd
import numpy as np
import seaborn as sn
from pathlib import Path
import os
import torch
from torch.utils.data import Dataset, DataLoader

## Data Preparation

In [2]:
# Where the raw dataset is read from, and where the cleaned copy is written to.
WEATHER_PATH_ORIGINAL = Path("./Data/CA_Weather_Fire_Dataset_1984-2025.csv")
WEATHER_DATA_CLEAN = "CA_Weather_Fire_Dataset_Cleaned.csv"
WEATHER_DATA_CLEAN_PATH = Path("./Data", WEATHER_DATA_CLEAN)

In [48]:
# Load the raw weather/fire CSV (comma-separated, first row is the header).
data = pd.read_csv(filepath_or_buffer=WEATHER_PATH_ORIGINAL, sep=',', header=0)

# All cleaning below happens on a deep copy, so `data` keeps the rows as loaded.
test_df = data.copy(deep=True)

In [49]:
test_df.shape

(14988, 14)

In [50]:
test_df.head(5)

Unnamed: 0,DATE,PRECIPITATION,MAX_TEMP,MIN_TEMP,AVG_WIND_SPEED,FIRE_START_DAY,YEAR,TEMP_RANGE,WIND_TEMP_RATIO,MONTH,SEASON,LAGGED_PRECIPITATION,LAGGED_AVG_WIND_SPEED,DAY_OF_YEAR
0,1984-01-01,0.0,79.0,51.0,4.7,False,1984,28.0,0.059494,1,Winter,0.0,4.7,1
1,1984-01-02,0.0,71.0,46.0,5.59,False,1984,25.0,0.078732,1,Winter,0.0,5.145,2
2,1984-01-03,0.0,70.0,47.0,5.37,False,1984,23.0,0.076714,1,Winter,0.0,5.22,3
3,1984-01-04,0.0,76.0,45.0,4.7,False,1984,31.0,0.061842,1,Winter,0.0,5.09,4
4,1984-01-05,0.0,74.0,49.0,5.14,False,1984,25.0,0.069459,1,Winter,0.0,5.1,5


In [51]:
test_df.tail(5)

Unnamed: 0,DATE,PRECIPITATION,MAX_TEMP,MIN_TEMP,AVG_WIND_SPEED,FIRE_START_DAY,YEAR,TEMP_RANGE,WIND_TEMP_RATIO,MONTH,SEASON,LAGGED_PRECIPITATION,LAGGED_AVG_WIND_SPEED,DAY_OF_YEAR
14983,2025-01-08,0.0,73.0,53.0,10.51,False,2025,20.0,0.143973,1,Winter,0.0,6.485714,8
14984,2025-01-09,0.0,68.0,46.0,4.92,False,2025,22.0,0.072353,1,Winter,0.0,6.55,9
14985,2025-01-10,0.0,70.0,46.0,3.58,False,2025,24.0,0.051143,1,Winter,0.0,6.327143,10
14986,2025-01-11,0.0,66.0,46.0,,False,2025,20.0,,1,Winter,0.0,6.561667,11
14987,2025-01-12,0.0,69.0,46.0,,False,2025,23.0,,1,Winter,0.0,7.068,12


In [None]:
# Encode the SEASON labels as float codes.
season_map = {'Winter': 1.0, 'Spring': 2.0, 'Summer': 3.0, 'Fall': 4.0}
# Series.map turns every value that is not a key of the dict into NaN, so
# re-running this cell on the already-encoded column would replace all of it
# with NaN. Only map while the column still holds the text labels.
if not pd.api.types.is_numeric_dtype(test_df['SEASON']):
    test_df['SEASON'] = test_df['SEASON'].map(season_map)


In [53]:

# Cast the boolean fire flag and the integer calendar columns to float.
float_columns = ['FIRE_START_DAY', 'YEAR', 'MONTH', 'DAY_OF_YEAR']
test_df[float_columns] = test_df[float_columns].astype(float)


In [55]:
# Remove the DATE column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: once DATE is gone, a second
# run is a no-op rather than a KeyError.
test_df = test_df.drop(columns=['DATE'], errors='ignore')

In [56]:
test_df.head()

Unnamed: 0,PRECIPITATION,MAX_TEMP,MIN_TEMP,AVG_WIND_SPEED,FIRE_START_DAY,YEAR,TEMP_RANGE,WIND_TEMP_RATIO,MONTH,SEASON,LAGGED_PRECIPITATION,LAGGED_AVG_WIND_SPEED,DAY_OF_YEAR
0,0.0,79.0,51.0,4.7,0.0,1984.0,28.0,0.059494,1.0,1.0,0.0,4.7,1.0
1,0.0,71.0,46.0,5.59,0.0,1984.0,25.0,0.078732,1.0,1.0,0.0,5.145,2.0
2,0.0,70.0,47.0,5.37,0.0,1984.0,23.0,0.076714,1.0,1.0,0.0,5.22,3.0
3,0.0,76.0,45.0,4.7,0.0,1984.0,31.0,0.061842,1.0,1.0,0.0,5.09,4.0
4,0.0,74.0,49.0,5.14,0.0,1984.0,25.0,0.069459,1.0,1.0,0.0,5.1,5.0


In [57]:
test_df.tail()

Unnamed: 0,PRECIPITATION,MAX_TEMP,MIN_TEMP,AVG_WIND_SPEED,FIRE_START_DAY,YEAR,TEMP_RANGE,WIND_TEMP_RATIO,MONTH,SEASON,LAGGED_PRECIPITATION,LAGGED_AVG_WIND_SPEED,DAY_OF_YEAR
14983,0.0,73.0,53.0,10.51,0.0,2025.0,20.0,0.143973,1.0,1.0,0.0,6.485714,8.0
14984,0.0,68.0,46.0,4.92,0.0,2025.0,22.0,0.072353,1.0,1.0,0.0,6.55,9.0
14985,0.0,70.0,46.0,3.58,0.0,2025.0,24.0,0.051143,1.0,1.0,0.0,6.327143,10.0
14986,0.0,66.0,46.0,,0.0,2025.0,20.0,,1.0,1.0,0.0,6.561667,11.0
14987,0.0,69.0,46.0,,0.0,2025.0,23.0,,1.0,1.0,0.0,7.068,12.0


In [58]:
test_df.shape

(14988, 13)

In [66]:

# Desired column order; the target (MAX_TEMP) is kept as the right-most column.
new_order = [
    'DAY_OF_YEAR',
    'MONTH',
    'YEAR',
    'SEASON',
    'PRECIPITATION',
    'LAGGED_PRECIPITATION',
    'AVG_WIND_SPEED',
    'LAGGED_AVG_WIND_SPEED',
    'WIND_TEMP_RATIO',
    'MIN_TEMP',
    'TEMP_RANGE',
    'FIRE_START_DAY',
    'MAX_TEMP',
]
test_df = test_df[new_order]


In [67]:
# Keep only the rows with a strictly positive PRECIPITATION value.
has_precipitation = test_df['PRECIPITATION'] > 0.0
prep_greater_than_zero_2 = test_df[has_precipitation]

In [60]:
prep_greater_than_zero_2.shape

(4939, 13)

In [68]:
prep_greater_than_zero_2.head(20)

Unnamed: 0,DAY_OF_YEAR,MONTH,YEAR,SEASON,PRECIPITATION,LAGGED_PRECIPITATION,AVG_WIND_SPEED,LAGGED_AVG_WIND_SPEED,WIND_TEMP_RATIO,MIN_TEMP,TEMP_RANGE,FIRE_START_DAY,MAX_TEMP
15,16.0,1.0,1984.0,1.0,0.39,0.39,6.71,6.007143,0.122,45.0,10.0,0.0,55.0
40,41.0,2.0,1984.0,1.0,0.01,0.01,11.18,6.871429,0.172,51.0,14.0,0.0,65.0
73,74.0,3.0,1984.0,2.0,0.14,0.14,12.3,8.597143,0.189231,56.0,9.0,0.0,65.0
96,97.0,4.0,1984.0,2.0,0.87,0.87,10.51,9.522857,0.159242,53.0,13.0,0.0,66.0
108,109.0,4.0,1984.0,2.0,0.04,0.04,10.29,7.925714,0.155909,55.0,11.0,1.0,66.0
109,110.0,4.0,1984.0,2.0,0.1,0.14,19.91,10.035714,0.311094,55.0,9.0,0.0,64.0
117,118.0,4.0,1984.0,2.0,0.15,0.15,8.5,11.472857,0.132812,50.0,14.0,0.0,64.0
227,228.0,8.0,1984.0,3.0,0.29,0.29,7.83,8.372857,0.097875,65.0,15.0,0.0,80.0
253,254.0,9.0,1984.0,4.0,0.02,0.02,7.38,7.318571,0.088916,73.0,10.0,1.0,83.0
254,255.0,9.0,1984.0,4.0,0.02,0.04,5.82,7.191429,0.0776,71.0,4.0,0.0,75.0


In [64]:
# Create the folder that will hold the cleaned CSV (no error if it is already there).
clean_data_dir = WEATHER_DATA_CLEAN_PATH.parent
os.makedirs(clean_data_dir, exist_ok=True)


In [75]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
test_df.to_csv(WEATHER_DATA_CLEAN_PATH, index=False)

In [None]:

# Show the final column order (the target, MAX_TEMP, is expected to be last).
# The list literal that had been pasted below the print was removed: it was a
# hard-coded copy of an earlier output and would go stale if new_order changes.
print(test_df.columns.tolist())


['DAY_OF_YEAR', 'MONTH', 'YEAR', 'SEASON', 'PRECIPITATION', 'LAGGED_PRECIPITATION', 'AVG_WIND_SPEED', 'LAGGED_AVG_WIND_SPEED', 'WIND_TEMP_RATIO', 'MIN_TEMP', 'TEMP_RANGE', 'FIRE_START_DAY', 'MAX_TEMP']


In [76]:
# Read the cleaned CSV back from disk (comma-separated, first row is the header).
# NOTE(review): presumably a round-trip check of the file just written — confirm.
data2 = pd.read_csv(WEATHER_DATA_CLEAN_PATH, sep=',',header=0)

In [77]:
data2.head()

Unnamed: 0,DAY_OF_YEAR,MONTH,YEAR,SEASON,PRECIPITATION,LAGGED_PRECIPITATION,AVG_WIND_SPEED,LAGGED_AVG_WIND_SPEED,WIND_TEMP_RATIO,MIN_TEMP,TEMP_RANGE,FIRE_START_DAY,MAX_TEMP
0,1.0,1.0,1984.0,1.0,0.0,0.0,4.7,4.7,0.059494,51.0,28.0,0.0,79.0
1,2.0,1.0,1984.0,1.0,0.0,0.0,5.59,5.145,0.078732,46.0,25.0,0.0,71.0
2,3.0,1.0,1984.0,1.0,0.0,0.0,5.37,5.22,0.076714,47.0,23.0,0.0,70.0
3,4.0,1.0,1984.0,1.0,0.0,0.0,4.7,5.09,0.061842,45.0,31.0,0.0,76.0
4,5.0,1.0,1984.0,1.0,0.0,0.0,5.14,5.1,0.069459,49.0,25.0,0.0,74.0


In [3]:
# Load the cleaned CSV for the train/test/validation split.
# Note: this rebinds `data`, which earlier in the notebook held the raw frame.
data = pd.read_csv(WEATHER_DATA_CLEAN_PATH)

In [None]:
# Separate features (X) and target (y)
# X = data.drop('MAX_TEMP', axis=1)
# y = data['MAX_TEMP']

In [None]:
# print(f"X data shape: {X.shape[0]}")
# print(f"y data shape: {y.shape}")


X data shape: 14988
y data shape: (14988,)


In [None]:
# samples= X.shape[0]

In [8]:
# Shuffle all rows (frac=1) with a fixed seed so the split is reproducible,
# then renumber the index from 0 and discard the old one.
shuffled_rows = data.sample(frac=1, random_state=42)
shuffled_data = shuffled_rows.reset_index(drop=True)

In [9]:
# Fractions of the shuffled rows assigned to each split.
train_size = .80
test_size = .10
val_size = .10

# The three fractions must cover the whole dataset.
assert abs(train_size + test_size + val_size - 1.0) < 1e-9, "split fractions must sum to 1"

# Number of rows to split. The only other assignment to `samples` in this
# notebook is commented out, so it is defined here to keep the cell runnable
# on a fresh kernel.
samples = len(shuffled_data)

# Row positions where the train slice ends and where the test slice ends;
# everything after test_end_index is the validation slice.
train_index = int(samples * train_size)
test_end_index = int(samples * (train_size + test_size))

In [14]:
# Cut the shuffled rows into three consecutive slices: train, then test, then validation.
train_data_frame, test_data_frame, val_data_frame = (
    shuffled_data.iloc[:train_index],
    shuffled_data.iloc[train_index:test_end_index],
    shuffled_data.iloc[test_end_index:],
)


In [16]:
# Report the size of each split and their total (should equal the dataset size).
n_train = len(train_data_frame)
n_test = len(test_data_frame)
n_val = len(val_data_frame)

print(f"length of train: {n_train}")
print(f"length of test: {n_test}")
print(f"length of val: {n_val}")
print(f"Sum: {n_train + n_test + n_val}")


length of train: 11990
length of test: 1499
length of val: 1499
Sum: 14988


In [None]:
class WeatherDataset(Dataset):
    """Dataset class For the CA Weather Fire Dataset

    Reads an all-numeric CSV and serves one row per sample as a
    ``(features, target)`` pair of ``float32`` tensors, where ``features``
    holds every column except the target column.

    Missing values in the CSV are not imputed; they are passed through as NaN.

    Args:
        csv_file: Path of the CSV file to load.
        tokenizer: Unused. Kept (now optional) so existing call sites that
            pass it positionally keep working.
        max_length: Unused. Kept for signature compatibility.
        pad_token_id: Unused. Kept for signature compatibility.
        target_column: Name of the column returned as the target.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If ``target_column`` is not a column of the CSV.
        ValueError: If a column cannot be converted to float32.
    """
    def __init__(self, csv_file, tokenizer=None, max_length=None, pad_token_id=50256,
                 target_column="MAX_TEMP"):
        try:
            self.data = pd.read_csv(csv_file)
        except FileNotFoundError as err:
            raise FileNotFoundError(f"File not found: {csv_file}") from err

        if target_column not in self.data.columns:
            raise KeyError(f"Target column not found: {target_column}")

        self.target_column = target_column

        # Convert once up front so __getitem__ is a cheap tensor index.
        feature_frame = self.data.drop(columns=[target_column])
        self.features = torch.tensor(feature_frame.to_numpy(dtype="float32"))
        self.targets = torch.tensor(self.data[target_column].to_numpy(dtype="float32"))

    def __getitem__(self, index):
        """Return the ``(features, target)`` tensors of the row at ``index``."""
        return (
            self.features[index],
            self.targets[index]
        )

    def __len__(self):
        """Return the number of rows loaded from the CSV."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Return the length of the longest feature row, or 0 when there are no rows."""
        return max((len(row) for row in self.features), default=0)