In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
import pandas as pd
import tensorflow as tf

# Import our input dataset
fire_df = pd.read_csv('../Resources/wildfire_v2_clean_data.csv')
fire_df.head()

Unnamed: 0,fire_size,fire_size_class,fire_cause,latitude,longitude,state,discovery_month,discovery_date,discovery_year,Vegetation,...,Wind_cont,Hum_pre_30,Hum_pre_15,Hum_pre_7,Hum_cont,Prec_pre_30,Prec_pre_15,Prec_pre_7,Prec_cont,remoteness
0,10.0,C,Missing/Undefined,18.105072,-66.753044,PR,Feb,1/12/2007,2007,12,...,3.250413,78.21659,76.79375,76.381579,78.72437,0.0,0.0,0.0,0.0,0.017923
1,3.0,B,Arson,35.03833,-87.61,TN,Dec,11/11/2006,2006,15,...,2.12232,70.84,65.858911,55.505882,81.682678,59.8,8.4,0.0,86.8,0.184355
2,60.0,C,Arson,34.9478,-88.7225,MS,Feb,1/30/2004,2004,16,...,3.36905,75.531629,75.868613,76.812834,65.0638,168.8,42.2,18.1,124.5,0.194544
3,1.0,B,Debris Burning,39.6414,-119.3083,NV,Jun,5/7/2005,2005,0,...,0.0,44.778429,37.140811,35.353846,0.0,10.4,7.2,0.0,0.0,0.487447
4,2.0,B,Miscellaneous,30.7006,-90.5914,LA,Sep,8/23/1999,1999,12,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.214633


In [2]:
fire_clean_df=fire_df.drop(columns=["discovery_date","fire_size_class","fire_mag","state","discovery_month","discovery_year"])
fire_clean_df

Unnamed: 0,fire_size,fire_cause,latitude,longitude,Vegetation,Temp_pre_30,Temp_pre_15,Temp_pre_7,Temp_cont,Wind_pre_30,...,Wind_cont,Hum_pre_30,Hum_pre_15,Hum_pre_7,Hum_cont,Prec_pre_30,Prec_pre_15,Prec_pre_7,Prec_cont,remoteness
0,10.0,Missing/Undefined,18.105072,-66.753044,12,24.480974,24.716923,24.902597,24.527961,4.341807,...,3.250413,78.216590,76.793750,76.381579,78.724370,0.0,0.0,0.0,0.0,0.017923
1,3.0,Arson,35.038330,-87.610000,15,7.553433,7.010000,0.343529,10.448298,2.709764,...,2.122320,70.840000,65.858911,55.505882,81.682678,59.8,8.4,0.0,86.8,0.184355
2,60.0,Arson,34.947800,-88.722500,16,4.971930,5.782766,5.558750,13.696600,3.364499,...,3.369050,75.531629,75.868613,76.812834,65.063800,168.8,42.2,18.1,124.5,0.194544
3,1.0,Debris Burning,39.641400,-119.308300,0,16.275967,18.996181,18.142564,0.000000,4.054982,...,0.000000,44.778429,37.140811,35.353846,0.000000,10.4,7.2,0.0,0.0,0.487447
4,2.0,Miscellaneous,30.700600,-90.591400,12,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,0.214633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55362,6289.0,Missing/Undefined,39.180000,-96.784167,0,19.720799,16.891282,18.918994,13.242324,3.259176,...,3.804803,65.671410,61.839572,54.625698,55.042092,35.4,8.2,0.0,249.0,0.331501
55363,70868.0,Missing/Undefined,38.342719,-120.695967,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,0.124683
55364,5702.0,Arson,37.262607,-119.511139,0,28.425403,28.425403,28.166667,27.646067,2.649395,...,2.529158,43.755556,43.755556,44.443975,35.924406,0.0,0.0,0.0,0.0,0.097682
55365,3261.0,Miscellaneous,40.604300,-123.080450,15,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,...,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.0,-1.0,-1.0,-1.0,0.178206


In [3]:
# Generate our categorical variable list
fire_cat = fire_clean_df.dtypes[fire_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
fire_clean_df[fire_cat].nunique()

fire_cause    13
dtype: int64

In [4]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(fire_clean_df[fire_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(fire_cat)
encode_df.head()

Unnamed: 0,fire_cause_Arson,fire_cause_Campfire,fire_cause_Children,fire_cause_Debris Burning,fire_cause_Equipment Use,fire_cause_Fireworks,fire_cause_Lightning,fire_cause_Miscellaneous,fire_cause_Missing/Undefined,fire_cause_Powerline,fire_cause_Railroad,fire_cause_Smoking,fire_cause_Structure
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Merge one-hot encoded features and drop the originals
x_df = fire_clean_df.merge(encode_df,left_index=True, right_index=True)
x_df = x_df.drop(fire_cat,1)
x_df.head()

Unnamed: 0,fire_size,latitude,longitude,Vegetation,Temp_pre_30,Temp_pre_15,Temp_pre_7,Temp_cont,Wind_pre_30,Wind_pre_15,...,fire_cause_Debris Burning,fire_cause_Equipment Use,fire_cause_Fireworks,fire_cause_Lightning,fire_cause_Miscellaneous,fire_cause_Missing/Undefined,fire_cause_Powerline,fire_cause_Railroad,fire_cause_Smoking,fire_cause_Structure
0,10.0,18.105072,-66.753044,12,24.480974,24.716923,24.902597,24.527961,4.341807,3.492857,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,3.0,35.03833,-87.61,15,7.553433,7.01,0.343529,10.448298,2.709764,2.881707,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,60.0,34.9478,-88.7225,16,4.97193,5.782766,5.55875,13.6966,3.364499,2.92383,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,39.6414,-119.3083,0,16.275967,18.996181,18.142564,0.0,4.054982,3.398329,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,30.7006,-90.5914,12,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Remove target from features data
import numpy as np
y = np.ravel(fire_clean_df.fire_size.values)
X = pd.DataFrame(x_df.drop(columns=["fire_size"]))
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [28]:
# Create a random forest regressor.
rf_model = RandomForestRegressor(n_estimators=100, random_state=2)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)


In [29]:
# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)


ValueError: continuous is not supported

In [30]:
rf_model.score(X_test_scaled, y_test)

0.17723637566731787

In [31]:
y_pred

array([2.0280000e+01, 1.4443000e+00, 5.6920000e+00, ..., 2.9272701e+04,
       1.3200000e+01, 5.0600000e+00])

In [32]:
y_test

array([1.50e+00, 1.48e+00, 3.00e+00, ..., 3.31e+03, 1.00e+00, 8.00e+00])

In [33]:
y

array([1.0000e+01, 3.0000e+00, 6.0000e+01, ..., 5.7020e+03, 3.2610e+03,
       7.6067e+04])

In [34]:
pd.DataFrame({"Predicted": y_pred, "Actual": y_test, "Error": y_pred - y_test})[["Predicted", "Actual", "Error"]]

Unnamed: 0,Predicted,Actual,Error
0,20.2800,1.50,18.7800
1,1.4443,1.48,-0.0357
2,5.6920,3.00,2.6920
3,700.7587,2.00,698.7587
4,22.0000,1.00,21.0000
...,...,...,...
13837,7.0220,6.00,1.0220
13838,2.7640,11.00,-8.2360
13839,29272.7010,3310.00,25962.7010
13840,13.2000,1.00,12.2000
