# Cleaning 

Dropping any Row that has 'NaN' entries (incomplete data)

Removing columns: "DATE", "TEMP_RANGE", "YEAR", "LAGGED_AVG_WIND_SPEED", "MONTH", "SEASON", "WIND_TEMP_RATIO", "FIRE_START_DAY"

## Imports

In [1]:
from importlib.metadata import version
import pandas as pd
import numpy as np
import seaborn as sn
from pathlib import Path
import os
import torch
from torch.utils.data import Dataset, DataLoader

## Data Preparation

### Set Paths

In [2]:
WEATHER_DATA_ROOT = Path("../Data")
WEATHER_PATH_ORIGINAL = WEATHER_DATA_ROOT / "CA_Weather_Fire_Dataset_1984-2025.csv"     # Set the data source path
WEATHER_DATA_CLEAN="CA_Weather_Fire_Dataset_Cleaned.csv"
WEATHER_DATA_CLEAN_PATH = WEATHER_DATA_ROOT / WEATHER_DATA_CLEAN

### Read Raw Dataset

In [4]:
# Replace with the actual path to your CSV file
data = pd.read_csv(WEATHER_PATH_ORIGINAL, sep=',',header=0)

test_df = data.copy()
print(len(test_df))

14988


In [80]:
test_df.head(5)

Unnamed: 0,DATE,PRECIPITATION,MAX_TEMP,MIN_TEMP,AVG_WIND_SPEED,FIRE_START_DAY,YEAR,TEMP_RANGE,WIND_TEMP_RATIO,MONTH,SEASON,LAGGED_PRECIPITATION,LAGGED_AVG_WIND_SPEED,DAY_OF_YEAR
0,1984-01-01,0.0,79.0,51.0,4.7,False,1984,28.0,0.059494,1,Winter,0.0,4.7,1
1,1984-01-02,0.0,71.0,46.0,5.59,False,1984,25.0,0.078732,1,Winter,0.0,5.145,2
2,1984-01-03,0.0,70.0,47.0,5.37,False,1984,23.0,0.076714,1,Winter,0.0,5.22,3
3,1984-01-04,0.0,76.0,45.0,4.7,False,1984,31.0,0.061842,1,Winter,0.0,5.09,4
4,1984-01-05,0.0,74.0,49.0,5.14,False,1984,25.0,0.069459,1,Winter,0.0,5.1,5


In [5]:
test_df = test_df.dropna()
test_df

Unnamed: 0,DATE,PRECIPITATION,MAX_TEMP,MIN_TEMP,AVG_WIND_SPEED,FIRE_START_DAY,YEAR,TEMP_RANGE,WIND_TEMP_RATIO,MONTH,SEASON,LAGGED_PRECIPITATION,LAGGED_AVG_WIND_SPEED,DAY_OF_YEAR
0,1984-01-01,0.0,79.0,51.0,4.70,False,1984,28.0,0.059494,1,Winter,0.0,4.700000,1
1,1984-01-02,0.0,71.0,46.0,5.59,False,1984,25.0,0.078732,1,Winter,0.0,5.145000,2
2,1984-01-03,0.0,70.0,47.0,5.37,False,1984,23.0,0.076714,1,Winter,0.0,5.220000,3
3,1984-01-04,0.0,76.0,45.0,4.70,False,1984,31.0,0.061842,1,Winter,0.0,5.090000,4
4,1984-01-05,0.0,74.0,49.0,5.14,False,1984,25.0,0.069459,1,Winter,0.0,5.100000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14981,2025-01-06,0.0,71.0,47.0,2.91,False,2025,24.0,0.040986,1,Winter,0.0,4.294000,6
14982,2025-01-07,0.0,73.0,49.0,13.42,False,2025,24.0,0.183836,1,Winter,0.0,5.815000,7
14983,2025-01-08,0.0,73.0,53.0,10.51,False,2025,20.0,0.143973,1,Winter,0.0,6.485714,8
14984,2025-01-09,0.0,68.0,46.0,4.92,False,2025,22.0,0.072353,1,Winter,0.0,6.550000,9


In [77]:
print(len(test_df))

14976


In [6]:
# Show all entries that are null (empty)
empty_entries_avg_wnd = test_df[test_df['AVG_WIND_SPEED'].isnull()]
empty_entries_avg_wnd

Unnamed: 0,DATE,PRECIPITATION,MAX_TEMP,MIN_TEMP,AVG_WIND_SPEED,FIRE_START_DAY,YEAR,TEMP_RANGE,WIND_TEMP_RATIO,MONTH,SEASON,LAGGED_PRECIPITATION,LAGGED_AVG_WIND_SPEED,DAY_OF_YEAR


In [71]:
print(len(empty_entries_avg_wnd))

12


In [7]:
empty_entries = test_df[test_df.isnull().any(axis=1)]

empty_entries

Unnamed: 0,DATE,PRECIPITATION,MAX_TEMP,MIN_TEMP,AVG_WIND_SPEED,FIRE_START_DAY,YEAR,TEMP_RANGE,WIND_TEMP_RATIO,MONTH,SEASON,LAGGED_PRECIPITATION,LAGGED_AVG_WIND_SPEED,DAY_OF_YEAR


In [73]:
print(len(empty_entries))

12


In [82]:
test_df.shape

(14976, 14)

In [None]:
test_df.head(5)

Unnamed: 0,DATE,PRECIPITATION,MAX_TEMP,MIN_TEMP,AVG_WIND_SPEED,FIRE_START_DAY,YEAR,TEMP_RANGE,WIND_TEMP_RATIO,MONTH,SEASON,LAGGED_PRECIPITATION,LAGGED_AVG_WIND_SPEED,DAY_OF_YEAR
0,1984-01-01,0.0,79.0,51.0,4.7,False,1984,28.0,0.059494,1,Winter,0.0,4.7,1
1,1984-01-02,0.0,71.0,46.0,5.59,False,1984,25.0,0.078732,1,Winter,0.0,5.145,2
2,1984-01-03,0.0,70.0,47.0,5.37,False,1984,23.0,0.076714,1,Winter,0.0,5.22,3
3,1984-01-04,0.0,76.0,45.0,4.7,False,1984,31.0,0.061842,1,Winter,0.0,5.09,4
4,1984-01-05,0.0,74.0,49.0,5.14,False,1984,25.0,0.069459,1,Winter,0.0,5.1,5


In [None]:
test_df.tail(5)

Unnamed: 0,DATE,PRECIPITATION,MAX_TEMP,MIN_TEMP,AVG_WIND_SPEED,FIRE_START_DAY,YEAR,TEMP_RANGE,WIND_TEMP_RATIO,MONTH,SEASON,LAGGED_PRECIPITATION,LAGGED_AVG_WIND_SPEED,DAY_OF_YEAR
14983,2025-01-08,0.0,73.0,53.0,10.51,False,2025,20.0,0.143973,1,Winter,0.0,6.485714,8
14984,2025-01-09,0.0,68.0,46.0,4.92,False,2025,22.0,0.072353,1,Winter,0.0,6.55,9
14985,2025-01-10,0.0,70.0,46.0,3.58,False,2025,24.0,0.051143,1,Winter,0.0,6.327143,10
14986,2025-01-11,0.0,66.0,46.0,,False,2025,20.0,,1,Winter,0.0,6.561667,11
14987,2025-01-12,0.0,69.0,46.0,,False,2025,23.0,,1,Winter,0.0,7.068,12


In [8]:

# Convert int to floats
test_df[['DAY_OF_YEAR']] = test_df[['DAY_OF_YEAR']].astype(float)


In [9]:
test_df.drop(columns=["DATE", "TEMP_RANGE", "YEAR", "LAGGED_AVG_WIND_SPEED", "MONTH", "SEASON", "WIND_TEMP_RATIO", "FIRE_START_DAY"], inplace=True, errors='ignore')

In [10]:
test_df.shape

(14976, 6)

In [11]:

new_order = ['DAY_OF_YEAR', 'PRECIPITATION', 'LAGGED_PRECIPITATION', 'AVG_WIND_SPEED', 'MIN_TEMP', 'MAX_TEMP']  # Put your desired column names here, leaving the target on the furthest on the right
test_df = test_df[new_order]


In [12]:
test_df.head()

Unnamed: 0,DAY_OF_YEAR,PRECIPITATION,LAGGED_PRECIPITATION,AVG_WIND_SPEED,MIN_TEMP,MAX_TEMP
0,1.0,0.0,0.0,4.7,51.0,79.0
1,2.0,0.0,0.0,5.59,46.0,71.0
2,3.0,0.0,0.0,5.37,47.0,70.0
3,4.0,0.0,0.0,4.7,45.0,76.0
4,5.0,0.0,0.0,5.14,49.0,74.0


In [88]:
prep_greater_than_zero_2 = test_df[test_df['PRECIPITATION'] > 0.0]

In [89]:
prep_greater_than_zero_2.shape

(1381, 6)

In [None]:
prep_greater_than_zero_2.head(20)

Unnamed: 0,DAY_OF_YEAR,PRECIPITATION,LAGGED_PRECIPITATION,AVG_WIND_SPEED,MIN_TEMP,MAX_TEMP
15,16.0,0.39,0.39,6.71,45.0,55.0
40,41.0,0.01,0.01,11.18,51.0,65.0
73,74.0,0.14,0.14,12.3,56.0,65.0
96,97.0,0.87,0.87,10.51,53.0,66.0
108,109.0,0.04,0.04,10.29,55.0,66.0
109,110.0,0.1,0.14,19.91,55.0,64.0
117,118.0,0.15,0.15,8.5,50.0,64.0
227,228.0,0.29,0.29,7.83,65.0,80.0
253,254.0,0.02,0.02,7.38,73.0,83.0
254,255.0,0.02,0.04,5.82,71.0,75.0


### Save Cleaned Dataset

In [13]:
# Ensure the directory exists
os.makedirs(WEATHER_DATA_CLEAN_PATH.parent, exist_ok=True)

In [14]:
test_df.to_csv(WEATHER_DATA_CLEAN_PATH, index=False)

In [15]:

print(test_df.columns.tolist())
# output
# ['DAY_OF_YEAR', 'MONTH', 'YEAR', 'SEASON', 'PRECIPITATION', 'LAGGED_PRECIPITATION', 'AVG_WIND_SPEED', 'LAGGED_AVG_WIND_SPEED', 'WIND_TEMP_RATIO', 'MIN_TEMP', 'TEMP_RANGE', 'FIRE_START_DAY', 'MAX_TEMP']


['DAY_OF_YEAR', 'PRECIPITATION', 'LAGGED_PRECIPITATION', 'AVG_WIND_SPEED', 'MIN_TEMP', 'MAX_TEMP']


In [16]:
data = pd.read_csv(WEATHER_DATA_CLEAN_PATH, sep=',',header=0)

In [17]:
data.head()

Unnamed: 0,DAY_OF_YEAR,PRECIPITATION,LAGGED_PRECIPITATION,AVG_WIND_SPEED,MIN_TEMP,MAX_TEMP
0,1.0,0.0,0.0,4.7,51.0,79.0
1,2.0,0.0,0.0,5.59,46.0,71.0
2,3.0,0.0,0.0,5.37,47.0,70.0
3,4.0,0.0,0.0,4.7,45.0,76.0
4,5.0,0.0,0.0,5.14,49.0,74.0


In [18]:
shuffled_data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [19]:
num_samples = len(shuffled_data)
train_size=.80
test_size=.10
val_size=.10

train_index = int(num_samples * train_size)
test_end_index = int(num_samples * (train_size + test_size))

In [20]:
train_data_frame = shuffled_data.iloc[:train_index]
test_data_frame = shuffled_data.iloc[train_index:test_end_index]
val_data_frame = shuffled_data.iloc[test_end_index:]


In [21]:
print(f"length of train: {len(train_data_frame)}")
print(f"length of test: {len(test_data_frame)}")
print(f"length of val: {len(val_data_frame)}")
print(f"Sum: {len(train_data_frame) +len(test_data_frame) + len(val_data_frame)}")


length of train: 11980
length of test: 1498
length of val: 1498
Sum: 14976
