In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
import pandas as pd
import tensorflow as tf

In [2]:
# Import our input dataset
fire_data_df = pd.read_csv('../Resources/wildfire_data_final.csv')
fire_data_df

Unnamed: 0,Fire_Size,Fire_Size_Class,Fire_Cause,Latitude,Longitude,State,Discovery_Month,Discovery_Date,Discovery_Year,Vegetation,...,Wind_Pre_7,Wind_Present,Hum_Pre_30,Hum_Pre_15,Hum_Pre_7,Hum_Present,Prec_Pre_30,Prec_Pre_15,Prec_Pre_7,Prec_Present
0,60.0,C,Arson,34.947800,-88.722500,MS,Feb,1/30/2004,2004,16,...,2.695833,3.369050,75.531629,75.868613,76.812834,65.063800,168.8,42.2,18.1,124.5
1,1.0,B,Campfire,30.904720,-93.557500,TX,Nov,10/13/2005,2005,12,...,1.424783,2.148857,72.899478,75.061381,77.924623,70.732911,28.4,27.5,1.2,55.4
2,8.3,B,Debris Burning,30.845339,-83.127987,GA,Mar,2/4/2010,2010,12,...,2.224500,1.750701,71.260870,69.281030,64.797980,73.072072,76.3,26.2,8.4,40.5
3,1.0,B,Miscellaneous,42.731934,-77.905976,NY,Apr,3/4/2010,2010,4,...,3.744928,2.872771,68.640553,69.556263,63.966184,59.956798,52.9,38.4,2.3,30.5
4,20.0,C,Arson,31.122200,-88.099400,AL,Jun,5/5/2000,2000,12,...,2.899537,2.623313,73.717979,74.603325,69.440594,77.471227,93.7,85.3,41.4,154.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7279,3082.0,F,Lightning,48.383600,-117.891900,WA,Aug,7/15/2015,2015,15,...,2.026590,2.918241,37.301713,36.085561,40.526012,49.762009,2.0,2.0,2.0,9.2
7280,4800.0,F,Equipment Use,45.135700,-99.908700,SD,Apr,3/2/2015,2015,9,...,6.023590,5.456159,58.984906,61.466667,60.994872,56.239420,4.4,4.4,4.1,12.2
7281,5100.0,G,Equipment Use,45.069700,-99.821100,SD,Apr,3/3/2015,2015,9,...,5.436216,5.316949,57.976101,59.909524,58.129730,56.989091,4.4,4.4,4.1,12.7
7282,17823.0,G,Campfire,44.834600,-117.220600,OR,Sep,8/13/2015,2015,15,...,1.835821,2.292956,55.009259,62.368700,67.266304,58.917679,10.4,10.4,9.4,8.6


In [3]:
fire_data_df.columns

Index(['Fire_Size', 'Fire_Size_Class', 'Fire_Cause', 'Latitude', 'Longitude',
       'State', 'Discovery_Month', 'Discovery_Date', 'Discovery_Year',
       'Vegetation', 'Fire_Magnitude', 'Temp_Pre_30', 'Temp_Pre_15',
       'Temp_Pre_7', 'Temp_Present', 'Wind_Pre_30', 'Wind_Pre_15',
       'Wind_Pre_7', 'Wind_Present', 'Hum_Pre_30', 'Hum_Pre_15', 'Hum_Pre_7',
       'Hum_Present', 'Prec_Pre_30', 'Prec_Pre_15', 'Prec_Pre_7',
       'Prec_Present'],
      dtype='object')

In [36]:
fire_clean_df=fire_data_df.drop(columns=["Fire_Size","Fire_Size_Class","Discovery_Date","Discovery_Year",
                                          "Fire_Magnitude","Fire_Cause",
                                           "Temp_Pre_30","Temp_Pre_15","Temp_Pre_7",
                                           "Wind_Pre_30","Wind_Pre_15","Wind_Pre_7",
                                           "Hum_Pre_30", "Hum_Pre_15", "Hum_Pre_7",
                                           "Prec_Pre_30", "Prec_Pre_15","Prec_Pre_7",
                                           "Vegetation"
                                           ])
fire_clean_df

Unnamed: 0,Latitude,Longitude,State,Discovery_Month,Temp_Present,Wind_Present,Hum_Present,Prec_Present
0,34.947800,-88.722500,MS,Feb,13.696600,3.369050,65.063800,124.5
1,30.904720,-93.557500,TX,Nov,11.985560,2.148857,70.732911,55.4
2,30.845339,-83.127987,GA,Mar,14.328047,1.750701,73.072072,40.5
3,42.731934,-77.905976,NY,Apr,11.375329,2.872771,59.956798,30.5
4,31.122200,-88.099400,AL,Jun,25.610425,2.623313,77.471227,154.3
...,...,...,...,...,...,...,...,...
7279,48.383600,-117.891900,WA,Aug,17.337247,2.918241,49.762009,9.2
7280,45.135700,-99.908700,SD,Apr,8.710361,5.456159,56.239420,12.2
7281,45.069700,-99.821100,SD,Apr,8.686747,5.316949,56.989091,12.7
7282,44.834600,-117.220600,OR,Sep,12.713333,2.292956,58.917679,8.6


In [37]:
# Generate our categorical variable list
fire_cat = fire_clean_df.dtypes[fire_clean_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
fire_clean_df[fire_cat].nunique()

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(fire_clean_df[fire_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(fire_cat)
encode_df.head()

Unnamed: 0,State_AK,State_AL,State_AR,State_AZ,State_CA,State_CO,State_CT,State_DE,State_FL,State_GA,...,Discovery_Month_Dec,Discovery_Month_Feb,Discovery_Month_Jan,Discovery_Month_Jul,Discovery_Month_Jun,Discovery_Month_Mar,Discovery_Month_May,Discovery_Month_Nov,Discovery_Month_Oct,Discovery_Month_Sep
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Merge one-hot encoded features and drop the originals
x_df = fire_clean_df.merge(encode_df,left_index=True, right_index=True)
x_df = x_df.drop(fire_cat,1)
x_df.head()

Unnamed: 0,Latitude,Longitude,Temp_Present,Wind_Present,Hum_Present,Prec_Present,State_AK,State_AL,State_AR,State_AZ,...,Discovery_Month_Dec,Discovery_Month_Feb,Discovery_Month_Jan,Discovery_Month_Jul,Discovery_Month_Jun,Discovery_Month_Mar,Discovery_Month_May,Discovery_Month_Nov,Discovery_Month_Oct,Discovery_Month_Sep
0,34.9478,-88.7225,13.6966,3.36905,65.0638,124.5,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30.90472,-93.5575,11.98556,2.148857,70.732911,55.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,30.845339,-83.127987,14.328047,1.750701,73.072072,40.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,42.731934,-77.905976,11.375329,2.872771,59.956798,30.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31.1222,-88.0994,25.610425,2.623313,77.471227,154.3,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Remove target from features data
import numpy as np
y = np.reshape(fire_data_df.Fire_Size_Class.values,(-1,1))
X = x_df
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [40]:
    # Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=2)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [23]:
# Evaluate the model with "Temp_Present","Wind_Present","Hum_Present","Prec_Present","Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
# score 0.6727073036792971

0.6727073036792971

In [29]:
    # Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=2)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [30]:
# Evaluate the model without "Temp_Present","Wind_Present","Hum_Present","Prec_Present","Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score 0.6490939044481054

0.6490939044481054

In [41]:
# Evaluate the model without "Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score 0.6688632619439868

0.6688632619439868

In [42]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=2,max_depth=15)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [43]:
# Evaluate the model without "Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score 0.6842394288852279 max_depth=15

0.6842394288852279

In [44]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=2,max_depth=10)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [45]:
# Evaluate the model without "Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score  0.6825919824272377 max_depth=10

0.6825919824272377

In [46]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=2,max_depth=20)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [47]:
# Evaluate the model without "Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score  0.6809445359692476 max_depth=20

0.6809445359692476

In [48]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=50, random_state=2,max_depth=15)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [49]:
# Evaluate the model without "Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score  0.6809445359692476 max_depth=15 n_estimators=50

0.6836902800658978

In [50]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=150, random_state=2,max_depth=15)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [51]:
# Evaluate the model without "Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score  0.6831411312465678 max_depth=15 n_estimators=150

0.6831411312465678

In [54]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=500, random_state=2,max_depth=15)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [55]:
# Evaluate the model without "Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score  0.6842394288852279 max_depth=15 n_estimators=500

0.6842394288852279

In [64]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=600, random_state=1,max_depth=15)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [65]:
# Evaluate the model without "Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score  0.6842394288852279 max_depth=15 n_estimators=500,min_samples_split=5

0.6820428336079077

In [67]:
from sklearn.ensemble import GradientBoostingClassifier
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
    learning_rate=learning_rate,
    max_features=5,
    max_depth=3,
    random_state=0)
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(classifier.score(X_train_scaled,y_train)))
    print("Accuracy score (validation): {0:.3f}".format(classifier.score(X_test_scaled,y_test)))

  return f(*args, **kwargs)


Learning rate:  0.05
Accuracy score (training): 0.670
Accuracy score (validation): 0.660


  return f(*args, **kwargs)


Learning rate:  0.1
Accuracy score (training): 0.689
Accuracy score (validation): 0.673


  return f(*args, **kwargs)


Learning rate:  0.25
Accuracy score (training): 0.712
Accuracy score (validation): 0.676


  return f(*args, **kwargs)


Learning rate:  0.5
Accuracy score (training): 0.724
Accuracy score (validation): 0.680


  return f(*args, **kwargs)


Learning rate:  0.75
Accuracy score (training): 0.716
Accuracy score (validation): 0.651


  return f(*args, **kwargs)


Learning rate:  1
Accuracy score (training): 0.720
Accuracy score (validation): 0.650


In [68]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',max_iter=200,random_state=1)
classifier.fit(X_train_scaled, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

  return f(*args, **kwargs)


0.008786381109280615
