In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
import pandas as pd
import tensorflow as tf

In [2]:
# Import our input dataset
fire_data_df = pd.read_csv('../Resources/wildfire_data_final.csv')
fire_data_df

Unnamed: 0,Fire_Size,Fire_Size_Class,Fire_Cause,Latitude,Longitude,State,Discovery_Month,Discovery_Date,Discovery_Year,Vegetation,...,Wind_Pre_7,Wind_Present,Hum_Pre_30,Hum_Pre_15,Hum_Pre_7,Hum_Present,Prec_Pre_30,Prec_Pre_15,Prec_Pre_7,Prec_Present
0,60.0,C,Arson,34.947800,-88.722500,MS,Feb,1/30/2004,2004,16,...,2.695833,3.369050,75.531629,75.868613,76.812834,65.063800,168.8,42.2,18.1,124.5
1,1.0,B,Campfire,30.904720,-93.557500,TX,Nov,10/13/2005,2005,12,...,1.424783,2.148857,72.899478,75.061381,77.924623,70.732911,28.4,27.5,1.2,55.4
2,8.3,B,Debris Burning,30.845339,-83.127987,GA,Mar,2/4/2010,2010,12,...,2.224500,1.750701,71.260870,69.281030,64.797980,73.072072,76.3,26.2,8.4,40.5
3,1.0,B,Miscellaneous,42.731934,-77.905976,NY,Apr,3/4/2010,2010,4,...,3.744928,2.872771,68.640553,69.556263,63.966184,59.956798,52.9,38.4,2.3,30.5
4,20.0,C,Arson,31.122200,-88.099400,AL,Jun,5/5/2000,2000,12,...,2.899537,2.623313,73.717979,74.603325,69.440594,77.471227,93.7,85.3,41.4,154.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7279,3082.0,F,Lightning,48.383600,-117.891900,WA,Aug,7/15/2015,2015,15,...,2.026590,2.918241,37.301713,36.085561,40.526012,49.762009,2.0,2.0,2.0,9.2
7280,4800.0,F,Equipment Use,45.135700,-99.908700,SD,Apr,3/2/2015,2015,9,...,6.023590,5.456159,58.984906,61.466667,60.994872,56.239420,4.4,4.4,4.1,12.2
7281,5100.0,G,Equipment Use,45.069700,-99.821100,SD,Apr,3/3/2015,2015,9,...,5.436216,5.316949,57.976101,59.909524,58.129730,56.989091,4.4,4.4,4.1,12.7
7282,17823.0,G,Campfire,44.834600,-117.220600,OR,Sep,8/13/2015,2015,15,...,1.835821,2.292956,55.009259,62.368700,67.266304,58.917679,10.4,10.4,9.4,8.6


In [3]:
fire_data_df.columns

Index(['Fire_Size', 'Fire_Size_Class', 'Fire_Cause', 'Latitude', 'Longitude',
       'State', 'Discovery_Month', 'Discovery_Date', 'Discovery_Year',
       'Vegetation', 'Fire_Magnitude', 'Temp_Pre_30', 'Temp_Pre_15',
       'Temp_Pre_7', 'Temp_Present', 'Wind_Pre_30', 'Wind_Pre_15',
       'Wind_Pre_7', 'Wind_Present', 'Hum_Pre_30', 'Hum_Pre_15', 'Hum_Pre_7',
       'Hum_Present', 'Prec_Pre_30', 'Prec_Pre_15', 'Prec_Pre_7',
       'Prec_Present'],
      dtype='object')

In [4]:
fire_clean_df=fire_data_df.drop(columns=["Fire_Size_Class","Discovery_Month","Discovery_Year",
                                          "Fire_Magnitude",
                                           "Temp_Pre_30","Temp_Pre_15","Temp_Pre_7",
                                           "Wind_Pre_30","Wind_Pre_15","Wind_Pre_7",
                                           "Hum_Pre_30", "Hum_Pre_15", "Hum_Pre_7",
                                           "Prec_Pre_30", "Prec_Pre_15","Prec_Pre_7",
                                           "Vegetation","Temp_Present","Wind_Present","Hum_Present","Prec_Present"
                                           ])
fire_clean_df

Unnamed: 0,Fire_Size,Fire_Cause,Latitude,Longitude,State,Discovery_Date
0,60.0,Arson,34.947800,-88.722500,MS,1/30/2004
1,1.0,Campfire,30.904720,-93.557500,TX,10/13/2005
2,8.3,Debris Burning,30.845339,-83.127987,GA,2/4/2010
3,1.0,Miscellaneous,42.731934,-77.905976,NY,3/4/2010
4,20.0,Arson,31.122200,-88.099400,AL,5/5/2000
...,...,...,...,...,...,...
7279,3082.0,Lightning,48.383600,-117.891900,WA,7/15/2015
7280,4800.0,Equipment Use,45.135700,-99.908700,SD,3/2/2015
7281,5100.0,Equipment Use,45.069700,-99.821100,SD,3/3/2015
7282,17823.0,Campfire,44.834600,-117.220600,OR,8/13/2015


In [5]:
# Generate our categorical variable list
fire_cat = fire_clean_df.dtypes[fire_clean_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
fire_clean_df[fire_cat].nunique()

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(fire_clean_df[fire_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(fire_cat)
encode_df.head()

Unnamed: 0,Fire_Cause_Arson,Fire_Cause_Campfire,Fire_Cause_Children,Fire_Cause_Debris Burning,Fire_Cause_Equipment Use,Fire_Cause_Fireworks,Fire_Cause_Lightning,Fire_Cause_Miscellaneous,Fire_Cause_Missing/Undefined,Fire_Cause_Powerline,...,Discovery_Date_9/8/1999,Discovery_Date_9/8/2005,Discovery_Date_9/8/2006,Discovery_Date_9/8/2008,Discovery_Date_9/8/2013,Discovery_Date_9/9/2000,Discovery_Date_9/9/2005,Discovery_Date_9/9/2006,Discovery_Date_9/9/2010,Discovery_Date_9/9/2015
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Merge one-hot encoded features and drop the originals
x_df = fire_clean_df.merge(encode_df,left_index=True, right_index=True)
x_df = x_df.drop(fire_cat,1)
x_df.head()

Unnamed: 0,Fire_Size,Latitude,Longitude,Fire_Cause_Arson,Fire_Cause_Campfire,Fire_Cause_Children,Fire_Cause_Debris Burning,Fire_Cause_Equipment Use,Fire_Cause_Fireworks,Fire_Cause_Lightning,...,Discovery_Date_9/8/1999,Discovery_Date_9/8/2005,Discovery_Date_9/8/2006,Discovery_Date_9/8/2008,Discovery_Date_9/8/2013,Discovery_Date_9/9/2000,Discovery_Date_9/9/2005,Discovery_Date_9/9/2006,Discovery_Date_9/9/2010,Discovery_Date_9/9/2015
0,60.0,34.9478,-88.7225,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,30.90472,-93.5575,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8.3,30.845339,-83.127987,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,42.731934,-77.905976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20.0,31.1222,-88.0994,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Remove target from features data
import numpy as np
y = np.reshape(fire_data_df.Fire_Size_Class.values,(-1,1))
X = x_df
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=2,max_depth=15)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [9]:
# Evaluate the model with "discovery date"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
# score 0.6529379461834157

0.6760021965952773

In [10]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=2,max_depth=15)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [11]:
# Evaluate the model with no weather
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score 0.6584294343767161

0.6760021965952773

In [12]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=2,max_depth=15)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [13]:
# Evaluate the model with fir size 
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score 0.6694124107633168

0.6760021965952773

In [14]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=2,max_depth=35)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [15]:
# Evaluate the model with Fire_Cause,Fire_Size,Latitude,Longitude,State,Discovery_Date
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score  0.9110378912685337 max_depth=35

0.9110378912685337

In [16]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=2,max_depth=20)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [17]:
# Evaluate the model without "Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score  0.6809445359692476 max_depth=20

0.6891817682591982

In [18]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=50, random_state=2,max_depth=15)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [19]:
# Evaluate the model without "Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score  0.6809445359692476 max_depth=15 n_estimators=50

0.6699615595826469

In [20]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=150, random_state=2,max_depth=15)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [21]:
# Evaluate the model without "Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score  0.6831411312465678 max_depth=15 n_estimators=150

0.6754530477759473

In [22]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=500, random_state=2,max_depth=15)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [23]:
# Evaluate the model without "Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score  0.6842394288852279 max_depth=15 n_estimators=500

0.6721581548599671

In [24]:
# Create a random forest Classifier.
rf_model = RandomForestClassifier(n_estimators=600, random_state=1,max_depth=15)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

  """


In [25]:
# Evaluate the model without "Vegetation"
y_pred = rf_model.predict(X_test_scaled)
from sklearn.metrics import accuracy_score
rf_model.score(X_test_scaled, y_test)
#score  0.6842394288852279 max_depth=15 n_estimators=500,min_samples_split=5

0.6721581548599671

In [26]:
from sklearn.ensemble import GradientBoostingClassifier
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
    learning_rate=learning_rate,
    max_features=5,
    max_depth=3,
    random_state=0)
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(classifier.score(X_train_scaled,y_train)))
    print("Accuracy score (validation): {0:.3f}".format(classifier.score(X_test_scaled,y_test)))

  return f(*args, **kwargs)


Learning rate:  0.05
Accuracy score (training): 0.664
Accuracy score (validation): 0.651


  return f(*args, **kwargs)


Learning rate:  0.1
Accuracy score (training): 0.665
Accuracy score (validation): 0.651


  return f(*args, **kwargs)


Learning rate:  0.25
Accuracy score (training): 0.671
Accuracy score (validation): 0.652


  return f(*args, **kwargs)


Learning rate:  0.5
Accuracy score (training): 0.681
Accuracy score (validation): 0.650


  return f(*args, **kwargs)


Learning rate:  0.75
Accuracy score (training): 0.685
Accuracy score (validation): 0.647


  return f(*args, **kwargs)


Learning rate:  1
Accuracy score (training): 0.683
Accuracy score (validation): 0.646


In [27]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',max_iter=200,random_state=1)
classifier.fit(X_train_scaled, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

  return f(*args, **kwargs)


0.06644700713893466


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
