In [53]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor, VotingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, ridge_regression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from datetime import datetime, timedelta

In [54]:
df = pd.read_csv('../../data/processed/wildfire_weather.csv')
df.head()

Unnamed: 0,x,y,containmentdatetime,controldatetime,dailyacres,discoveryacres,firecause,firediscoverydatetime,incidenttypecategory,incidenttypekind,...,wind_speed_10m_max_mean_x,humidity_mean_x,temp_2m_mean_x,wind_speed_2m_mean_y,wind_speed_2m_max_mean_y,wind_speed_10m_mean_y,wind_speed_10m_max_mean_y,humidity_mean_y,temp_2m_mean_y,rain_sum
0,-111.348611,33.195755,2020-07-23 05:29:59+00:00,2020-07-23 05:29:59+00:00,8.0,2.5,Human,2020-07-22 21:51:00+00:00,WF,FI,...,4.18,13.21,29.9,1.41,3.09,1.895,4.18,13.21,29.9,2.6
1,-115.748812,40.617506,2020-08-03 23:00:00+00:00,2020-09-02 15:00:00+00:00,5985.9,5.0,Natural,2020-07-19 23:00:00+00:00,WF,FI,...,5.237174,5.657391,23.396304,1.933913,3.914783,2.865217,5.237174,5.657391,23.396304,12.62
2,-108.193611,39.858486,2020-08-30 00:00:00+00:00,2020-09-10 14:00:00+00:00,0.1,1.0,Natural,2020-08-29 21:46:00+00:00,WF,FI,...,4.754615,4.731538,16.121538,1.733846,2.986154,2.993846,4.754615,4.731538,16.121538,27.78
3,-109.703111,40.227646,2020-10-28 20:15:00+00:00,2020-10-28 20:15:00+00:00,0.1,0.1,Human,2020-10-28 19:37:00+00:00,WF,FI,...,4.02,2.56,5.48,1.25,2.98,1.88,4.02,2.56,5.48,0.0
4,-110.385511,31.961145,2020-07-10 18:14:59+00:00,2020-07-10 18:14:59+00:00,0.1,0.1,Human,2020-07-09 16:34:59+00:00,WF,FI,...,6.8,9.95,31.235,3.09,5.055,4.265,6.8,9.95,31.235,0.58


In [55]:
df['firediscoverydatetime'] = pd.to_datetime(df['firediscoverydatetime'])
df['controldatetime'] = pd.to_datetime(df['controldatetime'])
df['containmentdatetime'] = pd.to_datetime(df['containmentdatetime'])

In [56]:
df['firecause'] = df['firecause'].map({'Undetermined': 0, 'Unknown': 0, 'Natural': 1, 'Human' : 2})

In [57]:
filtered_df = df[df['dailyacres'] >1.0]
filtered_df = filtered_df[((filtered_df['controldatetime'] - filtered_df['firediscoverydatetime']) > timedelta(days=1))]
filtered_df.shape

(2862, 43)

In [39]:
df = df[(df['controldatetime'] - df['firediscoverydatetime']).map(lambda x: x.total_seconds() > (60*60*24))]

In [40]:
df.shape

(2862, 43)

In [14]:
cols = ['dailyacres', 'discoveryacres', 'firecause', 'temp_2m', 'temp_2m_max', 'humidity', 'rain', 'wind_speed_2m', 
                'wind_speed_2m_max', 'wind_speed_10m', 'wind_speed_10m_max', 'surface_soil_wetness_5cm_below', 'surface_soil_wetness_to_bedrock', 
                'rain_mean', 'wind_speed_2m_mean_x', 'wind_speed_2m_max_mean_x', 'wind_speed_10m_mean_x', 'wind_speed_10m_max_mean_x', 
                'humidity_mean_x', 'temp_2m_mean_x', 'wind_speed_2m_mean_y', 'wind_speed_2m_max_mean_y', 'wind_speed_10m_mean_y', 
                'wind_speed_10m_max_mean_y', 'humidity_mean_y', 'temp_2m_mean_y', 'rain_sum']

In [15]:
pca = PCA(random_state = 42)
df = df[cols]

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = 'dailyacres'), df['dailyacres'], random_state= 42)

In [16]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [17]:
Z_train = pca.fit_transform(X_train)
Z_test = pca.transform(X_test)

In [18]:
explained = pca.explained_variance_ratio_
cumulative = np.cumsum(explained)
np.round(cumulative[:20], 3)

array([0.736, 0.826, 0.901, 0.94 , 0.965, 0.989, 0.997, 0.999, 0.999,
       1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   ,
       1.   , 1.   ])

In [19]:
Z_train = Z_train[:,:10]
Z_test = Z_test[:,:10]

In [25]:
np.savetxt('../../data/processed/Z_train.csv', Z_train, delimiter=',')
np.savetxt('../../data/processed/Z_test.csv', Z_test, delimiter=',')
np.savetxt('../../data/processed/y_train.csv', y_train, delimiter=',')
np.savetxt('../../data/processed/y_test.csv', y_test, delimiter=',')

In [56]:
#Linear Regression - Baseline
# With X data
lr = LinearRegression()

lr.fit(X_train, y_train)
print(f'Training R-Squared: {lr.score(X_train, y_train)}')
print(f'Testing R-Squared: {lr.score(X_test, y_test)}')
print('='*50)
print(f'RMSE: {mean_squared_error(y_test, lr.predict(X_test), squared = False)}')

Training R-Squared: 0.0492391111018341
Testing R-Squared: 0.04661002884838039
RMSE: 6105.022697614408


In [55]:
#Linear Regression - Baseline
# With PCA Data
lr = LinearRegression()

lr.fit(Z_train, y_train)
print(f'Training R-Squared: {lr.score(Z_train, y_train)}')
print(f'Testing R-Squared: {lr.score(Z_test, y_test)}')
print('='*50)
print(f'RMSE: {mean_squared_error(y_test, lr.predict(Z_test), squared = False)}')

Training R-Squared: 0.02599914527915359
Testing R-Squared: 0.010760861012771183
RMSE: 6218.743422107511


In [74]:
def model_eval(model, param_dict, training_data):
    '''
    d
    '''
    
    gs_model = GridSearchCV(model,
                            param_grid = param_dict,
                            n_jobs = -1)
    if training_data == 'X_train':
        gs_model.fit(X_train, y_train)
        print(f'Best Score: {gs_model.best_score_}')
        print(f'Best Params: {gs_model.best_params_}')
        print('='*50)
        print(f'RMSE: {mean_squared_error(y_test, gs_model.predict(X_test), squared = False)}')
        print('='*50)
        print(f'Training R-Squared: {gs_model.score(X_train, y_train)}')
        print(f'Testing R-Squared: {gs_model.score(X_test, y_test)}')
        
    elif training_data == 'Z_train':
        gs_model.fit(Z_train, y_train)
        print(f'Best Score: {gs_model.best_score_}')
        print(f'Best Params: {gs_model.best_params_}')
        print('='*50)
        print(f'RMSE: {mean_squared_error(y_test, gs_model.predict(Z_test), squared = False)}')
        print('='*50)
        print(f'Training R-Squared: {gs_model.score(Z_train, y_train)}')
        print(f'Testing R-Squared: {gs_model.score(Z_test, y_test)}')
    else:
        return 'Try Again'

In [80]:
#KNN Regressor with X Data

knn_model = KNeighborsRegressor(n_jobs=-1)

knn_params = {
    'n_neighbors': range(1000, 2001, 100),
    'weights': ['uniform', 'distance']
}

model_eval(model = knn_model,
           param_dict = knn_params,
           training_data = 'X_train')

Best Score: 0.019112073018724286
Best Params: {'n_neighbors': 1000, 'weights': 'distance'}
RMSE: 6179.017464619431
Training R-Squared: 0.9956659831913355
Testing R-Squared: 0.02335921023841725


In [81]:
#KNN Regressor with PCA Data

knn_model = KNeighborsRegressor(n_jobs=-1)

knn_params = {
    'n_neighbors': range(1000, 2001, 100),
    'weights': ['uniform', 'distance']
}

model_eval(model = knn_model,
           param_dict = knn_params,
           training_data = 'Z_train')

Best Score: 0.02003454974637453
Best Params: {'n_neighbors': 1000, 'weights': 'distance'}
RMSE: 6176.462505383479
Training R-Squared: 0.9956659742921743
Testing R-Squared: 0.024166704806837136


In [82]:
# Lasso Regression with X Data

lasso = Lasso()
lasso_params = {
    'alpha': range(5, 101, 2)
}

model_eval(model = lasso,
           param_dict = lasso_params,
           training_data = 'X_train')

Best Score: 0.01859308701054958
Best Params: {'alpha': 7}
RMSE: 6163.003211356567
Training R-Squared: 0.04068128966678297
Testing R-Squared: 0.028414999644036154


  model = cd_fast.enet_coordinate_descent(


In [83]:
# Lasso Regression with PCA Data

lasso = Lasso()
lasso_params = {
    'alpha': range(5, 101, 2)
}

model_eval(model = lasso,
           param_dict = lasso_params,
           training_data = 'Z_train')

Best Score: 0.011162574978104578
Best Params: {'alpha': 35}
RMSE: 6233.225615938861
Training R-Squared: 0.02040531819976854
Testing R-Squared: 0.006148020903463114


In [None]:
# Adaboost with X Data

ada = AdaBoostRegressor()
ada_params = {
    'n_estimators': range(40, 101, 10),
    'learning_rate': [0.5, 0.75, 1, 1.25, 1.5] 
}

model_eval(model = ada,
           param_dict = ada_params,
           training_data = 'X_train')

In [None]:
# Adaboost with PCA Data

ada = AdaBoostRegressor()
ada_params = {
    'n_estimators': range(40, 101, 10),
    'learning_rate': [0.5, 0.75, 1, 1.25, 1.5] 
}

model_eval(model = ada,
           param_dict = ada_params,
           training_data = 'Z_train')

In [57]:
# Stacking Regressor with Z_train

lvl1_est = ([
    ('rf', RandomForestRegressor(n_jobs=-1)),
    ('ada', AdaBoostRegressor()),
    ('lasso', Lasso())
])

stack = StackingRegressor(estimators=lvl1_est, 
                          final_estimator=DecisionTreeRegressor(),
                         n_jobs=-1)

stack.fit(Z_train, y_train)

print(f'Training R-Squared: {stack.score(Z_train, y_train)}')
print(f'Testing R-Squared: {stack.score(Z_test, y_test)}')
print('='*50)
print(f'RMSE: {mean_squared_error(y_test, stack.predict(Z_test), squared = False)}')

Training R-Squared: -0.665183675674186
Testing R-Squared: -1.329365828862259
RMSE: 9542.690608388644


In [58]:
# Stacking Regressor with X_train

lvl1_est = ([
    ('rf', RandomForestRegressor(n_jobs=-1)),
    ('ada', AdaBoostRegressor()),
    ('lasso', Lasso())
])

stack = StackingRegressor(estimators=lvl1_est, 
                          final_estimator=DecisionTreeRegressor(),
                         n_jobs=-1)

stack.fit(X_train, y_train)

print(f'Training R-Squared: {stack.score(X_train, y_train)}')
print(f'Testing R-Squared: {stack.score(X_test, y_test)}')
print('='*50)
print(f'RMSE: {mean_squared_error(y_test, stack.predict(X_test), squared = False)}')

Training R-Squared: -0.3294699362845459
Testing R-Squared: -0.7911446500665642
RMSE: 8367.916221278168


In [59]:
# Stacking Regressor

lvl1_est = ([
    ('rf', RandomForestRegressor(n_jobs=-1)),
    ('ada', AdaBoostRegressor()),
    ('lasso', Lasso())
])

stack = StackingRegressor(estimators=lvl1_est, 
                          final_estimator=LinearRegression(),
                         n_jobs=-1)

stack.fit(Z_train, y_train)

print(f'Training R-Squared: {stack.score(Z_train, y_train)}')
print(f'Testing R-Squared: {stack.score(Z_test, y_test)}')
print('='*50)
print(f'RMSE: {mean_squared_error(y_test, stack.predict(Z_test), squared = False)}')

Training R-Squared: 0.388392632271471
Testing R-Squared: 0.10308886313885379
RMSE: 5921.431390717258


In [60]:
# Stacking Regressor

lvl1_est = ([
    ('rf', RandomForestRegressor(n_jobs=-1)),
    ('ada', AdaBoostRegressor()),
    ('lasso', Lasso())
])

stack = StackingRegressor(estimators=lvl1_est, 
                          final_estimator=Lasso(),
                         n_jobs=-1)

stack.fit(Z_train, y_train)

print(f'Training R-Squared: {stack.score(Z_train, y_train)}')
print(f'Testing R-Squared: {stack.score(Z_test, y_test)}')
print('='*50)
print(f'RMSE: {mean_squared_error(y_test, stack.predict(Z_test), squared = False)}')

Training R-Squared: 0.36829782661161214
Testing R-Squared: 0.10263671420713849
RMSE: 5922.923752348845


In [61]:
# Stacking Regressor

lvl1_est = ([
    ('ada', AdaBoostRegressor()),
    ('lasso', Lasso())
])

stack = StackingRegressor(estimators=lvl1_est, 
                          final_estimator=Lasso(),
                         n_jobs=-1)

stack.fit(Z_train, y_train)

print(f'Training R-Squared: {stack.score(Z_train, y_train)}')
print(f'Testing R-Squared: {stack.score(Z_test, y_test)}')
print('='*50)
print(f'RMSE: {mean_squared_error(y_test, stack.predict(Z_test), squared = False)}')

Training R-Squared: 0.16917662957589852
Testing R-Squared: 0.043167416258513835
RMSE: 6116.035132160507


In [62]:
#Voting Classifier

lvl1_est = ([
    ('rf', RandomForestRegressor(n_jobs=-1)),
    ('ada', AdaBoostRegressor()),
    ('lasso', Lasso())
])

vote = VotingRegressor(estimators = lvl1_est, n_jobs = -1)

vote.fit(Z_train, y_train)
print(f'Training R-Squared: {vote.score(Z_train, y_train)}')
print(f'Testing R-Squared: {vote.score(Z_test, y_test)}')
print('='*50)
print(f'RMSE: {mean_squared_error(y_test, vote.predict(Z_test), squared = False)}')

Training R-Squared: 0.19924171963242732
Testing R-Squared: -0.4346389554736585
RMSE: 7488.990901646875


<module 'xgboost' from 'C:\\Users\\jeffr\\anaconda3\\lib\\site-packages\\xgboost\\__init__.py'>