In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Load the raw fire-detection export. low_memory=False makes pandas parse each
# column in a single pass so dtype inference is consistent across the whole file.
df = pd.read_csv('raw_fire_data.csv', low_memory=False)
# Show (rows, columns) as a quick sanity check on the load.
df.shape

(5307011, 16)

In [3]:
# Preview the first rows to eyeball column names and value formats.
df.head()

Unnamed: 0,latitude,longitude,bright_ti4,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_ti5,frp,daynight,type,year
0,-41.56622,147.95204,346.47,0.37,0.58,2020-01-01,315,N,VIIRS,n,2,289.89,48.52,D,0,2020
1,-41.56641,147.94772,352.94,0.37,0.58,2020-01-01,315,N,VIIRS,n,2,295.35,48.52,D,0,2020
2,-41.56716,147.93088,353.88,0.37,0.58,2020-01-01,315,N,VIIRS,n,2,283.05,22.18,D,0,2020
3,-41.56773,147.91777,340.81,0.37,0.58,2020-01-01,315,N,VIIRS,n,2,285.54,6.93,D,0,2020
4,-41.56793,147.91336,339.01,0.37,0.58,2020-01-01,315,N,VIIRS,n,2,281.39,6.93,D,0,2020


In [4]:
# Percentage of missing values in each column.
df.isna().mean().mul(100)

latitude      0.0
longitude     0.0
bright_ti4    0.0
scan          0.0
track         0.0
acq_date      0.0
acq_time      0.0
satellite     0.0
instrument    0.0
confidence    0.0
version       0.0
bright_ti5    0.0
frp           0.0
daynight      0.0
type          0.0
year          0.0
dtype: float64

In [5]:
# Number of rows that repeat an earlier row in every column.
df.duplicated().sum()

0

In [6]:
# Summarise column dtypes and the frame's memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5307011 entries, 0 to 5307010
Data columns (total 16 columns):
 #   Column      Dtype  
---  ------      -----  
 0   latitude    float64
 1   longitude   float64
 2   bright_ti4  float64
 3   scan        float64
 4   track       float64
 5   acq_date    object 
 6   acq_time    int64  
 7   satellite   object 
 8   instrument  object 
 9   confidence  object 
 10  version     int64  
 11  bright_ti5  float64
 12  frp         float64
 13  daynight    object 
 14  type        int64  
 15  year        int64  
dtypes: float64(7), int64(4), object(5)
memory usage: 647.8+ MB


In [7]:
# Frequency of each code in the `type` column, most common first.
df['type'].value_counts()

type
0    5237904
2      64622
3       4485
Name: count, dtype: int64

In [8]:
# Keep only detections whose `type` code is 0, by far the most frequent code.
# NOTE(review): in FIRMS products type 0 is presumably "vegetation fire" -
# confirm against the dataset documentation.
# .copy() detaches the filtered frame from the original so that the column
# assignments in later cells operate on an independent frame instead of
# raising SettingWithCopyWarning on a slice.
df = df[df['type'] == 0].copy()

### Data Type Conversion:

In [9]:
# Parse the acquisition-date strings into datetimes; values that cannot be
# parsed become NaT instead of raising.
df['acq_date'] = df['acq_date'].pipe(pd.to_datetime, errors='coerce')

In [10]:
# Filter Australia: keep detections inside the bounding box
# latitude [-44, -10] x longitude [113, 154], both ends inclusive.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning on a slice.
df = df[
    (df['latitude'] >= -44) & (df['latitude'] <= -10) &
    (df['longitude'] >= 113) & (df['longitude'] <= 154)
].copy()

In [11]:
# Convert confidence: translate the categorical flags into numeric scores
# (l -> 50, n -> 75, h -> 90). Any value outside the mapping becomes NaN.
confidence_map = dict(l=50, n=75, h=90)
df['confidence'] = df['confidence'].map(confidence_map)

In [12]:
# Ensure numeric types: coerce the measurement columns column-by-column,
# turning anything that cannot be parsed as a number into NaN.
numeric_cols = ['frp', 'bright_ti4', 'bright_ti5', 'scan', 'track']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

### Temporal Feature Engineering:

In [13]:
# Calendar features derived from the acquisition date.
df['month'] = df['acq_date'].dt.month
df['day_of_year'] = df['acq_date'].dt.day_of_year
# ISO-8601 week number (the `week` column of the isocalendar frame).
df['week_of_year'] = df['acq_date'].dt.isocalendar()['week']

In [14]:
# Southern-hemisphere seasons keyed by calendar month.
# A dictionary lookup through Series.map is vectorised, whereas the previous
# row-wise lambda executed Python code once per row. It also leaves a missing
# month as NaN instead of silently labelling it 'Spring'.
season_by_month = {
    12: 'Summer', 1: 'Summer', 2: 'Summer',
    3: 'Autumn', 4: 'Autumn', 5: 'Autumn',
    6: 'Winter', 7: 'Winter', 8: 'Winter',
    9: 'Spring', 10: 'Spring', 11: 'Spring',
}
df['season'] = df['month'].map(season_by_month)

In [15]:
# Peak fire season indicator (Nov-Feb in Australia): 1 inside the window, 0 otherwise.
df['is_fire_season'] = df['month'].isin((1, 2, 11, 12)).astype(int)

In [16]:
# Day/night indicator: 1 when the detection is flagged 'D', 0 for anything else.
df['is_daytime'] = df['daynight'].eq('D').astype(int)

In [17]:
# Time of acquisition.
# acq_time is an HHMM integer in UTC, so the hour extracted here is a UTC hour.
df['acq_hour'] = (df['acq_time'] // 100).astype(int)

# "Afternoon" has to be judged in local time, not UTC. The first preview row
# shows acq_time=315 at ~148°E flagged as a daytime detection, i.e. 03:15 UTC
# is early afternoon locally. Thresholding the UTC hour would mark Australian
# afternoons as 0 and nights as 1.
# Local solar time is approximated by shifting UTC by longitude / 15 hours
# (the Earth rotates 15 degrees of longitude per hour).
utc_decimal_hour = df['acq_time'] // 100 + (df['acq_time'] % 100) / 60
local_solar_hour = (utc_decimal_hour + df['longitude'] / 15) % 24
df['is_afternoon'] = (local_solar_hour >= 12).astype(int)

### Spatial Feature Engineering:

In [18]:

# Split the study area into three latitude bands:
# South [-44, -28], Central (-28, -20], North (-20, -10].
# include_lowest=True closes the first bin on its left edge. Without it a
# latitude of exactly -44, which the bounding-box filter (>= -44) keeps, falls
# outside every bin and gets a NaN region.
df['region'] = pd.cut(
    df['latitude'],
    bins=[-44, -28, -20, -10],
    labels=['South', 'Central', 'North'],
    include_lowest=True,
)

In [19]:
# Target: bucket FRP (fire radiative power) into four ordinal severity classes.
def create_severity_classes(df, thresholds=(10, 50, 100), missing_label=1):
    """Label each detection with an ordinal severity class derived from ``frp``.

    With the default thresholds the classes are:

    * 0 (Low):       frp < 10
    * 1 (Medium):    10 <= frp < 50
    * 2 (High):      50 <= frp < 100
    * 3 (Very High): frp >= 100

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``frp`` column. It is not modified.
    thresholds : sequence of three ascending numbers, optional
        Lower bounds of classes 1, 2 and 3 respectively.
    missing_label : int, optional
        Class assigned to rows whose ``frp`` is NaN (no comparison matches a
        NaN). The default of 1 preserves the historical behaviour; a warning
        is emitted whenever this fallback is actually used so the
        mislabelling does not go unnoticed.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an added integer ``severity_class`` column.
    """
    import warnings

    medium_min, high_min, very_high_min = thresholds
    frp = df['frp']

    n_missing = int(frp.isna().sum())
    if n_missing:
        warnings.warn(
            f"{n_missing} row(s) have missing frp; "
            f"assigning severity_class={missing_label}",
            stacklevel=2,
        )

    conditions = [
        frp < medium_min,
        (frp >= medium_min) & (frp < high_min),
        (frp >= high_min) & (frp < very_high_min),
        frp >= very_high_min,
    ]
    labels = [0, 1, 2, 3]  # Low, Medium, High, Very High

    # Work on a copy: the caller's frame stays untouched, and writing the new
    # column never hits a filtered slice (no SettingWithCopyWarning).
    result = df.copy()
    result['severity_class'] = np.select(conditions, labels, default=missing_label)
    return result

# Attach the severity target to every detection.
df = create_severity_classes(df)

# Report how many detections fall into each class, in ascending class order.
severity_labels = {0: 'Low', 1: 'Medium', 2: 'High', 3: 'Very High'}
severity_dist = df['severity_class'].value_counts().sort_index()

print("Severity class distribution:")
total_rows = len(df)
for severity, count in severity_dist.items():
    # Share of all rows that belong to this class, as a percentage.
    pct = (count / total_rows) * 100
    print(f"  {severity_labels[severity]:>10}: {count:>7,} ({pct:>5.2f}%)")

Severity class distribution:
         Low: 3,816,318 (72.86%)
      Medium: 1,241,844 (23.71%)
        High: 121,985 ( 2.33%)
   Very High:  57,697 ( 1.10%)


### Sampling

In [20]:
# Sampling for computational efficiency: cap the working set at SAMPLE_SIZE
# rows while preserving the severity-class proportions.
SAMPLE_SIZE = 200_000

if len(df) > SAMPLE_SIZE:
    # Split row positions rather than the frame itself, so the much larger
    # discarded remainder is never materialised as a DataFrame.
    # The unused half gets a real name because binding it to `_` would shadow
    # IPython's "last output" variable and keep the data alive in the kernel.
    sampled_positions, discarded_positions = train_test_split(
        np.arange(len(df)),
        train_size=SAMPLE_SIZE,
        stratify=df['severity_class'],
        random_state=42
    )
    del discarded_positions
    df_sampled = df.iloc[sampled_positions]
    print(f"Sampled {len(df_sampled):,} records (stratified by severity)")
else:
    df_sampled = df.copy()
    print(f"Using full dataset ({len(df_sampled):,} records)")

Sampled 200,000 records (stratified by severity)


In [21]:
# Feature Selection: remove columns that should not reach the model input.
columns_to_drop = (
    # frp defines the severity target.
    # NOTE(review): the brightness channels are presumably dropped as leakage
    # too - confirm.
    ['frp', 'bright_ti4', 'bright_ti5']
    # Raw timestamps, already replaced by the derived calendar/hour features.
    + ['acq_date', 'acq_time']
    # Constant after the type == 0 filter.
    + ['type']
)

# errors='ignore' skips any listed column that is not present in the frame.
df_final = df_sampled.drop(columns_to_drop, axis=1, errors='ignore')


In [22]:
# Persist the preprocessed table; index=False keeps the row index out of the CSV.
output_file = 'preprocessed_fire_data_clean.csv'
df_final.to_csv(path_or_buf=output_file, index=False)
print(f"Saved to: {output_file}")

Saved to: preprocessed_fire_data_clean.csv
