In [55]:
#Load Libraries
import pandas as pd
import sqlite3 as sqlite3
import numpy as np
from sklearn.model_selection import train_test_split

In [39]:
# Open the SQLite database that holds the wildfire records.
con = sqlite3.connect("FPA_FOD_20170508.sqlite")
try:
    # Load the selected columns of the `fires` table into a DataFrame.
    # CONT_TIME is computed in SQL as CONT_DATE - DISCOVERY_DATE.
    # NOTE(review): OBJECTID is still selected as ID although an earlier note
    # on this cell said ID is not needed -- TODO confirm and drop it if unused.
    Fires = pd.read_sql_query(
        "select OBJECTID as ID, NWCG_REPORTING_AGENCY,CONT_DATE,CONT_DOY,CONT_DATE - DISCOVERY_DATE as CONT_TIME,LONGITUDE,LATITUDE,NWCG_REPORTING_UNIT_ID,SOURCE_REPORTING_UNIT,OWNER_CODE,SOURCE_SYSTEM_TYPE,DISCOVERY_DATE,DISCOVERY_DOY,STAT_CAUSE_DESCR,FIRE_SIZE from fires", con)
finally:
    # Always release the database handle, even when the query raises.
    # (Using the connection in a `with` block would only commit/rollback the
    # transaction; it does not close the connection.)
    con.close()

In [40]:
# Preview the first six rows of the loaded frame
Fires.iloc[:6]

Unnamed: 0,ID,NWCG_REPORTING_AGENCY,CONT_DATE,CONT_DOY,CONT_TIME,LONGITUDE,LATITUDE,NWCG_REPORTING_UNIT_ID,SOURCE_REPORTING_UNIT,OWNER_CODE,SOURCE_SYSTEM_TYPE,DISCOVERY_DATE,DISCOVERY_DOY,STAT_CAUSE_DESCR,FIRE_SIZE
0,1,FS,2453403.5,33.0,0.0,-121.005833,40.036944,USCAPNF,511,5.0,FED,2453403.5,33,Miscellaneous,0.1
1,2,FS,2453137.5,133.0,0.0,-120.404444,38.933056,USCAENF,503,5.0,FED,2453137.5,133,Lightning,0.25
2,3,FS,2453156.5,152.0,0.0,-120.735556,38.984167,USCAENF,503,13.0,FED,2453156.5,152,Debris Burning,0.1
3,4,FS,2453189.5,185.0,5.0,-119.913333,38.559167,USCAENF,503,5.0,FED,2453184.5,180,Lightning,0.1
4,5,FS,2453189.5,185.0,5.0,-119.933056,38.559167,USCAENF,503,5.0,FED,2453184.5,180,Lightning,0.1
5,6,FS,2453187.5,183.0,1.0,-120.103611,38.635278,USCAENF,503,5.0,FED,2453186.5,182,Lightning,0.1


In [41]:
# Keep only the rows in which every column has a value (no NaN anywhere)
Fires = Fires[Fires.notna().all(axis=1)]

In [42]:
# Drop every fire whose cause is labelled "Miscellaneous"
Fires = Fires.loc[Fires["STAT_CAUSE_DESCR"].ne("Miscellaneous")]

In [43]:
# Summarise the filtered frame: row count, column dtypes and non-null counts
Fires.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 825771 entries, 1 to 1880460
Data columns (total 15 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   ID                      825771 non-null  int64  
 1   NWCG_REPORTING_AGENCY   825771 non-null  object 
 2   CONT_DATE               825771 non-null  float64
 3   CONT_DOY                825771 non-null  float64
 4   CONT_TIME               825771 non-null  float64
 5   LONGITUDE               825771 non-null  float64
 6   LATITUDE                825771 non-null  float64
 7   NWCG_REPORTING_UNIT_ID  825771 non-null  object 
 8   SOURCE_REPORTING_UNIT   825771 non-null  object 
 9   OWNER_CODE              825771 non-null  float64
 10  SOURCE_SYSTEM_TYPE      825771 non-null  object 
 11  DISCOVERY_DATE          825771 non-null  float64
 12  DISCOVERY_DOY           825771 non-null  int64  
 13  STAT_CAUSE_DESCR        825771 non-null  object 
 14  FIRE_SIZE          

In [47]:
# Split the Fires data into 90% train data and 10% test data.
# A fixed random_state pins the shuffle, so the same rows land in the train
# and test sets every time the notebook is re-run from a fresh kernel.
SPLIT_SEED = 42
Fires_train, Fires_test = train_test_split(
    Fires, test_size=0.1, random_state=SPLIT_SEED)

In [51]:
# Report the container type of the test split and the (rows, columns) of both splits
print("Display the datatype of the test dataset: {}".format(type(Fires_test)))
print(" Train dataset      : {}".format(Fires_train.shape))
print(" Test dataset       : {}".format(Fires_test.shape))

Display the datatype of the test dataset: <class 'pandas.core.frame.DataFrame'>
 Train dataset      : (743193, 15)
 Test dataset       : (82578, 15)


## Random Undersampling the Training Data
### Random undersampling is used for now; a more advanced resampling method may replace it later

In [53]:
# Count the training rows whose cause is "Structure"; this count is used as
# the per-class sample size for the undersampling below.
minority_class_len = int((Fires_train["STAT_CAUSE_DESCR"] == "Structure").sum())
minority_class_len

2712

In [94]:
# Select unique values from the response column
CAUSES = Fires_train["STAT_CAUSE_DESCR"].unique()
print(CAUSES)
# Delete the minority class: Structure.
# Remove it by value rather than by position: unique() lists values in order
# of first appearance, which depends on how the training rows were shuffled,
# so "Structure" is not guaranteed to be the last element.
CAUSES = CAUSES[CAUSES != "Structure"]
CAUSES

['Campfire' 'Missing/Undefined' 'Lightning' 'Arson' 'Equipment Use'
 'Debris Burning' 'Smoking' 'Children' 'Railroad' 'Powerline' 'Fireworks'
 'Structure']


array(['Campfire', 'Missing/Undefined', 'Lightning', 'Arson',
       'Equipment Use', 'Debris Burning', 'Smoking', 'Children',
       'Railroad', 'Powerline', 'Fireworks'], dtype=object)

In [97]:
# Randomly undersample the training data: keep every "Structure" row and draw
# `minority_class_len` rows, without replacement, from each class in CAUSES.
# A seeded generator makes the drawn sample identical on every re-run.
UNDERSAMPLE_SEED = 42
undersample_rng = np.random.default_rng(UNDERSAMPLE_SEED)

# Start with the index labels of all "Structure" rows.
sampled_index_parts = [
    Fires_train[Fires_train.STAT_CAUSE_DESCR == 'Structure'].index.to_numpy()
]
# Generate the indices of the undersampled dataset
for cause in CAUSES:
    cause_index = Fires_train[Fires_train.STAT_CAUSE_DESCR == cause].index.to_numpy()
    # Raises ValueError if a class has fewer than `minority_class_len` rows.
    undersampled_cause_index = undersample_rng.choice(cause_index,
                                                      size=minority_class_len,
                                                      replace=False)
    sampled_index_parts.append(undersampled_cause_index)
# Concatenate once instead of re-allocating the array on every iteration.
undersampled_indices = np.concatenate(sampled_index_parts)

# Generate the undersampled training dataset
train_undersampled = Fires_train.loc[undersampled_indices]

In [98]:
train_undersampled.shape

(32544, 15)

In [100]:
train_undersampled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32544 entries, 263399 to 1736395
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      32544 non-null  int64  
 1   NWCG_REPORTING_AGENCY   32544 non-null  object 
 2   CONT_DATE               32544 non-null  float64
 3   CONT_DOY                32544 non-null  float64
 4   CONT_TIME               32544 non-null  float64
 5   LONGITUDE               32544 non-null  float64
 6   LATITUDE                32544 non-null  float64
 7   NWCG_REPORTING_UNIT_ID  32544 non-null  object 
 8   SOURCE_REPORTING_UNIT   32544 non-null  object 
 9   OWNER_CODE              32544 non-null  float64
 10  SOURCE_SYSTEM_TYPE      32544 non-null  object 
 11  DISCOVERY_DATE          32544 non-null  float64
 12  DISCOVERY_DOY           32544 non-null  int64  
 13  STAT_CAUSE_DESCR        32544 non-null  object 
 14  FIRE_SIZE               32544 n