In [1]:
import pandas as pd
from datetime import datetime as dt

In [2]:
us_wildfires = pd.read_csv('us_wildfires_filled.csv')

In [3]:
us_wildfires['DISCOVERY_DATE'] = pd.to_datetime(us_wildfires["DISCOVERY_DATE"], format = "%Y-%m-%d")
us_wildfires['CONT_DATE'] = pd.to_datetime(us_wildfires["CONT_DATE"], format = "%Y-%m-%d")

In [4]:
us_wildfires.isna().sum()

OBJECTID                     0
NWCG_REPORTING_AGENCY        0
NWCG_REPORTING_UNIT_ID       0
FIRE_NAME                    0
FIRE_YEAR                    0
DISCOVERY_DATE               0
NWCG_CAUSE_CLASSIFICATION    0
NWCG_GENERAL_CAUSE           0
NWCG_CAUSE_AGE_CATEGORY      0
CONT_DATE                    0
FIRE_SIZE                    0
FIRE_SIZE_CLASS              0
LATITUDE                     0
LONGITUDE                    0
OWNER_DESCR                  0
DURATION                     0
COUNTY                       0
STATE                        0
PRECIPITATION                0
TEMPERATURE                  0
WIND_SPEED                   0
ELEVATION                    0
dtype: int64

In [5]:
to_drop = ['OBJECTID', 'LATITUDE', 'LONGITUDE', 'FIRE_NAME']
model_data = us_wildfires.drop(to_drop, axis=1)

In [6]:
model_data['DAY_OF_WEEK'] = model_data['DISCOVERY_DATE'].dt.dayofweek
model_data['DAY_OF_YEAR'] = model_data['DISCOVERY_DATE'].dt.dayofyear

In [7]:
date_drop = ['DISCOVERY_DATE', 'CONT_DATE']
model_data = model_data.drop(model_data[date_drop], axis=1)

more_drops = ['NWCG_REPORTING_UNIT_ID', 'FIRE_SIZE_CLASS',
             'NWCG_CAUSE_CLASSIFICATION', 'NWCG_CAUSE_AGE_CATEGORY']
model_data = model_data.drop(more_drops, axis=1)

model_data['COUNTY_STATE'] = model_data['COUNTY'] + str(" ") + model_data['STATE']

In [8]:
model_data[['COUNTY_STATE', 'COUNTY', 'STATE']].head()

Unnamed: 0,COUNTY_STATE,COUNTY,STATE
0,Plumas County CA,Plumas County,CA
1,El Dorado County CA,El Dorado County,CA
2,Placer County CA,Placer County,CA
3,Alpine County CA,Alpine County,CA
4,Alpine County CA,Alpine County,CA


In [9]:
model_data = model_data.drop('COUNTY', axis=1)

In [10]:
model_data['COUNTY_STATE'] = model_data['COUNTY_STATE'].astype('category')
model_data['COUNTY_STATE'] = model_data['COUNTY_STATE'].cat.codes.astype(int)

model_data['STATE'] = model_data['STATE'].astype('category')
model_data['STATE'] = model_data['STATE'].cat.codes.astype(int)

model_data['NWCG_REPORTING_AGENCY'] = model_data['NWCG_REPORTING_AGENCY'].astype('category')
model_data['NWCG_REPORTING_AGENCY'] = model_data['NWCG_REPORTING_AGENCY'].cat.codes.astype(int)

model_data['OWNER_DESCR'] = model_data['OWNER_DESCR'].astype('category')
model_data['OWNER_DESCR'] = model_data['OWNER_DESCR'].cat.codes.astype(int)

In [11]:
# Random Forest / Grid Search 

In [12]:
from sklearn import tree

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
#from imblearn.under_sampling import RandomUnderSampler
import time

In [13]:
model_data['NWCG_GENERAL_CAUSE'] = model_data['NWCG_GENERAL_CAUSE'].replace({'Other causes': 'Undetermined',
                                                                             'Undetermined': 'Undetermined'})

In [14]:
# Dataset 5 - above but with 'eratic' category of minor & smoking

data_2 = model_data.drop(['OWNER_DESCR','NWCG_REPORTING_AGENCY'], axis=1)
data_5 = data_2[data_2['NWCG_GENERAL_CAUSE']!= 'Undetermined']


data_5['NWCG_GENERAL_CAUSE'] = data_5['NWCG_GENERAL_CAUSE'].replace({'Railroad operations and maintenance': 'Infrastructure-related causes',
                                                                          'Power supply': 'Infrastructure-related causes',
                                                                          'Equipment and vehicle use': 'Infrastructure-related causes'})
data_5['NWCG_GENERAL_CAUSE'] = data_5['NWCG_GENERAL_CAUSE'].replace({'Smoking': 'Erratic',
                                                                     'Misuse of fire by a minor': 'Erratic'})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_5['NWCG_GENERAL_CAUSE'] = data_5['NWCG_GENERAL_CAUSE'].replace({'Railroad operations and maintenance': 'Infrastructure-related causes',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_5['NWCG_GENERAL_CAUSE'] = data_5['NWCG_GENERAL_CAUSE'].replace({'Smoking': 'Erratic',


In [15]:
X5 = data_5.drop("NWCG_GENERAL_CAUSE", axis=1)
y5 = data_5["NWCG_GENERAL_CAUSE"]


In [16]:
# Dataset 6 - dataset 5 but oversampled 

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X5_resampled, y5_resampled = ros.fit_resample(X5, y5)

In [17]:
X6_train, X6_test, y6_train, y6_test = train_test_split(X5_resampled,y5_resampled, test_size=0.2, shuffle=True, random_state=2)

In [39]:
# param_grid = { 
#     'n_estimators': [100, 200], # try lower n_estimators 
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth' : [4,5,6,7,8],
#     'criterion' :['gini', 'entropy'] #remove this
# }

In [18]:
param_grid = {'max_features': ["auto", "sqrt", "log2"]}

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

rf6 = RandomForestClassifier()
CV_rfc = GridSearchCV(estimator=rf6, param_grid=param_grid, cv= 5)
CV_rfc.fit(X6_train, y6_train)

300 fits failed out of a total of 900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParame

In [20]:
CV_rfc.best_params_

{'max_depth': 8,
 'max_features': 'log2',
 'max_samples': 100,
 'min_samples_split': 6,
 'n_estimators': 40}

In [25]:
start_time = time.time()

rf7 = RandomForestClassifier(max_features='log2',
                             min_samples_split=6)
rf7.fit(X6_train, y6_train)
rf7_predictions = rf7.predict(X6_test)
rf7_acc = accuracy_score(y6_test, rf6_predictions)
rf7_f1 = f1_score(y6_test, rf6_predictions, average='weighted')


print("--- %s seconds ---" % round(time.time() - start_time, 2))

--- 1909.87 seconds ---


In [24]:
# param_grid = {
#    'n_estimators': range(10, 50, 10),
#    'max_depth': [2,4, 6, 8, 10, 20],
#    'min_samples_split': range(2, 8, 2),
#    'max_features': ["auto", "sqrt", "log2"],
#     'max_samples': [100]
# }
print(f"Random Forest Accuracy Score: {rf6_acc}, F1-Score: {rf6_f1}")

Random Forest Accuracy Score: 0.47302911592783536, F1-Score: 0.46480475139805505


In [26]:
print(f"Random Forest Accuracy Score: {rf7_acc}, F1-Score: {rf7_f1}")

Random Forest Accuracy Score: 0.47302911592783536, F1-Score: 0.46480475139805505
