#### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import utils
import model_utils
from importlib import reload

#### Index

# Introduction

## Data Loading

In [2]:
# Load the two pickled DataFrames that are combined later in the notebook.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df_1, df_2 = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'sample_data/30k_engineered.pkl',
        'sample_data/large_fires_cleaned.pkl',
    )
)

In [3]:
# Preview the first five rows of the first dataset
df_1.head(n=5)

Unnamed: 0,DATE,FIRE_YEAR,DISCOVERY_DOY,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,STATE,avg_tempmax,avg_temp,...,precip_variance,precip_delta,dew_variance,dew_delta,windspeed_variance,windspeed_delta,winddir_variance,winddir_delta,pressure_variance,pressure_delta
0,1992-01-01,1992,1,0.1,A,43.325,-101.0185,SD,5.257143,-2.928571,...,0.0,0.0,5.399184,3.5,47.062041,-1.8,1889.074286,-48.6,28.913469,-4.3
1,1992-01-01,1992,1,1.0,B,33.058333,-79.979167,SC,13.7,9.885714,...,55.589796,-4.4,7.196327,0.1,5.57551,2.0,15648.559592,-14.9,19.07102,2.4
2,1992-01-02,1992,2,0.25,A,40.775,-74.85416,NJ,6.385714,1.585714,...,9.54651,0.0,19.73102,4.0,34.516735,-10.6,16492.948163,-254.6,75.711429,-0.9
3,1992-01-03,1992,3,1.91,B,31.0185,-83.2973,GA,14.457143,11.828571,...,59.321224,-9.7,9.142041,-4.1,11.228571,0.2,16679.204898,223.6,21.585306,-8.4
4,1992-01-03,1992,3,2.0,B,30.7289,-87.2381,FL,14.428571,10.985714,...,0.213355,-1.32,10.262857,-7.9,27.382857,6.7,19587.093469,-24.8,13.107755,-2.7


In [4]:
# Preview the first five rows of the second dataset
df_2.head(n=5)

Unnamed: 0,FIRE_YEAR,DISCOVERY_DOY,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,STATE,avg_tempmax,avg_temp,avg_humidity,...,precip_variance,precip_delta,dew_variance,dew_delta,windspeed_variance,windspeed_delta,winddir_variance,winddir_delta,pressure_variance,pressure_delta
0,2003,104,232.0,D,41.363889,-88.173056,IL,14.057143,8.014286,54.885714,...,0.0,0.0,7.864898,5.5,31.552653,11.3,4393.119592,160.9,16.827755,-9.5
1,1992,52,150.0,D,34.587299,-95.611298,OK,18.571429,11.485714,53.085714,...,0.0,0.0,5.124898,-3.0,12.656327,3.6,5591.173469,-160.7,28.450612,6.2
2,2010,166,277.0,D,27.0012,-81.4362,FL,33.828571,27.514286,75.757143,...,0.556367,0.25,1.770612,2.8,8.504082,-4.2,6388.276735,-65.9,0.568163,-0.6
3,1992,120,125.0,D,45.966667,-68.466668,ME,8.757143,3.514286,62.514286,...,0.0,0.0,12.290612,-7.2,17.136735,12.9,4668.631429,141.8,4.49551,-2.9
4,2011,71,285.0,D,36.27996,-93.94546,AR,13.471429,7.214286,64.271429,...,0.188278,0.0,12.262041,3.8,45.43551,2.2,8314.153878,-39.5,24.979184,-6.0


In [5]:
# Compare the number of columns in the two frames (row counts are not compared here)
len(df_1.columns) == len(df_2.columns)

False

In [6]:
# Drop the DATE column, which is present in df_1 but not in df_2, so that both
# frames end up with the same number of columns.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
df_1 = df_1.drop(columns='DATE', errors='ignore')

# Check that the column counts now match
df_1.shape[1] == df_2.shape[1]

True

In [7]:
# Stack the two datasets row-wise into one frame.
# ignore_index=True gives the result a fresh 0..n-1 row index straight away, so
# row labels from df_1 and df_2 never collide in the combined frame.
df = pd.concat([df_1, df_2], ignore_index=True)
df.shape

(29734, 34)

In [8]:
# Sanity check: the combined frame must hold every row from both inputs
len(df) == len(df_1) + len(df_2)

True

In [9]:
# Replace the row labels with a fresh default integer index, discarding the old ones
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,FIRE_YEAR,DISCOVERY_DOY,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,STATE,avg_tempmax,avg_temp,avg_humidity,...,precip_variance,precip_delta,dew_variance,dew_delta,windspeed_variance,windspeed_delta,winddir_variance,winddir_delta,pressure_variance,pressure_delta
0,1992,1,0.1,A,43.325,-101.0185,SD,5.257143,-2.928571,81.142857,...,0.0,0.0,5.399184,3.5,47.062041,-1.8,1889.074286,-48.6,28.913469,-4.3
1,1992,1,1.0,B,33.058333,-79.979167,SC,13.7,9.885714,81.428571,...,55.589796,-4.4,7.196327,0.1,5.57551,2.0,15648.559592,-14.9,19.07102,2.4
2,1992,2,0.25,A,40.775,-74.85416,NJ,6.385714,1.585714,67.185714,...,9.54651,0.0,19.73102,4.0,34.516735,-10.6,16492.948163,-254.6,75.711429,-0.9
3,1992,3,1.91,B,31.0185,-83.2973,GA,14.457143,11.828571,78.957143,...,59.321224,-9.7,9.142041,-4.1,11.228571,0.2,16679.204898,223.6,21.585306,-8.4
4,1992,3,2.0,B,30.7289,-87.2381,FL,14.428571,10.985714,79.971429,...,0.213355,-1.32,10.262857,-7.9,27.382857,6.7,19587.093469,-24.8,13.107755,-2.7


## Preprocessing

In [10]:
# Split the combined frame by dtype into independent copies:
# numeric columns in one frame, object-dtype (string) columns in the other.
numeric_df = df.select_dtypes(include='number').copy()
categorical_df = df.select_dtypes(include='object').copy()

# List the column names of each group, one per line
numeric_listing = ',\n '.join(numeric_df.columns)
categorical_listing = ', \n '.join(categorical_df.columns)
print("Numeric columns: \n", numeric_listing)
print("Categorical columns: \n", categorical_listing)

Numeric columns: 
 FIRE_YEAR,
 DISCOVERY_DOY,
 FIRE_SIZE,
 LATITUDE,
 LONGITUDE,
 avg_tempmax,
 avg_temp,
 avg_humidity,
 avg_precip,
 avg_dew,
 avg_windspeed,
 avg_winddir,
 avg_pressure,
 ch4,
 co2,
 n2o,
 tempmax_variance,
 tempmax_delta,
 temp_variance,
 temp_delta,
 humidity_variance,
 humidity_delta,
 precip_variance,
 precip_delta,
 dew_variance,
 dew_delta,
 windspeed_variance,
 windspeed_delta,
 winddir_variance,
 winddir_delta,
 pressure_variance,
 pressure_delta
Categorical columns: 
 FIRE_SIZE_CLASS, 
 STATE


### Preprocessing Categorical Data

In [11]:
# Number of distinct values per categorical column, smallest first
categorical_df.nunique().sort_values(ascending=True)

FIRE_SIZE_CLASS     7
STATE              52
dtype: int64

#### Processing `STATE`

While we previously chose to numerically encode `FIRE_SIZE_CLASS`, in this model it serves as our target variable, so it needs no further processing here. Instead, we will focus solely on one-hot encoding the `STATE` column using `pd.get_dummies()`.

In [12]:
# One-hot encode STATE into one indicator column per state, named `state_<code>`.
# The dtype is pinned explicitly: the default changed from uint8 to bool in
# pandas 2.0, and pinning it keeps the 0/1 integer indicators on every version.
state_dummies = pd.get_dummies(categorical_df['STATE'], prefix='state', dtype='uint8')

# Check the DataFrame
state_dummies.head()

Unnamed: 0,state_AK,state_AL,state_AR,state_AZ,state_CA,state_CO,state_CT,state_DC,state_DE,state_FL,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Attach the dummy columns to the categorical frame.
# Any dummy columns already present (e.g. from a previous run of this cell) are
# dropped first, so re-running the cell never produces duplicated state_* columns.
categorical_df = pd.concat(
    [
        categorical_df.drop(columns=state_dummies.columns, errors='ignore'),
        state_dummies,
    ],
    axis=1,
)

# Check output of the new DataFrame
categorical_df.head(2)

Unnamed: 0,FIRE_SIZE_CLASS,STATE,state_AK,state_AL,state_AR,state_AZ,state_CA,state_CO,state_CT,state_DC,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
0,A,SD,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,B,SC,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Drop the original STATE column, which is redundant once its dummy columns exist.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
categorical_df = categorical_df.drop(columns='STATE', errors='ignore')

# Check output
categorical_df.head(2)

Unnamed: 0,FIRE_SIZE_CLASS,state_AK,state_AL,state_AR,state_AZ,state_CA,state_CO,state_CT,state_DC,state_DE,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
0,A,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
