In [None]:
# Data
import numpy as np
import pandas as pd
import geopandas as gpd

# Dates
import datetime
import time
import julian

# Database
import sqlite3
import spatialite
import shapely

# Plotting
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import multilabel_confusion_matrix, classification_report


In [None]:
%%time
conn_spa = spatialite.connect('../FPA_FOD_20170508.sqlite')
sql = '''
SELECT *, ST_AsBinary(Shape) as geom
FROM Fires 
'''
fpa = gpd.read_postgis(sql, conn_spa)
fpa.head()
conn_spa.close()

### CLEANING DATA

In [None]:
# Owner codes are cast to integers.
fpa['OWNER_CODE'] = fpa['OWNER_CODE'].astype(int)

# Convert the Julian-day values of both date columns via julian.from_jd.
# CONT_DATE may hold NaN; those entries are passed through untouched.
# NOTE(review): this cell overwrites the raw columns, so it is not meant to be
# re-run without reloading fpa.
fpa['DISCOVERY_DATE'] = fpa['DISCOVERY_DATE'].map(julian.from_jd)
fpa['CONT_DATE'] = fpa['CONT_DATE'].map(
    lambda jd: jd if np.isnan(jd) else julian.from_jd(jd)
)

In [None]:
# Show the column names available on the loaded frame.
fpa.columns

## Modeling - Creating X and y

In [None]:
# Columns used as model inputs. A wider candidate list (additionally holding
# DISCOVERY_TIME, COUNTY and FIPS_COMB) used to be assigned here and was
# immediately overwritten; only the list below was ever in effect.
features = ['FIRE_YEAR','DISCOVERY_DATE','DISCOVERY_DOY','LATITUDE','LONGITUDE','OWNER_CODE', 'STATE']

# Take an explicit copy so later column assignments on X never act on a slice
# of fpa (avoids pandas' SettingWithCopyWarning).
X = fpa[features].copy()

# Target: the cause code as an integer.
y = fpa['STAT_CAUSE_CODE'].astype(int)

In [None]:
# Preview the first rows of the selected feature columns.
X.head()

In [None]:
# List the distinct cause codes present in the target.
y.unique()

In [None]:
# One-hot encode the two categorical columns. The first level of each is
# dropped; owner columns get the "owner_code" prefix, state columns keep the
# bare state value as their name.
owner_dummies = pd.get_dummies(X['OWNER_CODE'], drop_first = True, prefix = "owner_code")
state_dummies = pd.get_dummies(X['STATE'], drop_first = True)

# Build the model matrix as a new frame instead of assigning into / dropping
# from X in place: this avoids SettingWithCopyWarning when X is a slice of fpa
# and keeps the same column order as before
# (numeric columns, FIRE_MONTH, owner dummies, state dummies).
X = pd.concat(
    [
        X.assign(FIRE_MONTH = X['DISCOVERY_DATE'].map(lambda x: x.month))
         .drop(columns = ['OWNER_CODE', 'STATE', 'DISCOVERY_DATE']),
        owner_dummies,
        state_dummies,
    ],
    axis = 1,
)

X.head()

In [None]:
# Summarise the encoded feature frame (columns and dtypes).
X.info()

## Modeling - Decision Tree Classifiers

In [None]:
# Hold out a test set using train_test_split's default proportions.
# A fixed random_state makes the split, and every score derived from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [None]:
# Decision tree on the 13 individual cause codes.
# random_state is fixed (matching the seeded estimator used in the grid search)
# so repeated fits give the same tree.
# NOTE(review): with min_samples_leaf = 100 a node needs at least 200 samples
# to be split, so min_samples_split = 25 is never the binding constraint.
dt = DecisionTreeClassifier(criterion = 'gini', 
                            max_depth = 120, 
                            min_samples_split = 25, 
                            min_samples_leaf = 100,
                            random_state = 42)
dt.fit(X_train, y_train)

In [None]:
# Compare the tree's score on the data it was fitted on with its score on the
# held-out rows.
train_score = dt.score(X_train, y_train)
test_score = dt.score(X_test, y_test)
print(f'Score on training set: {train_score}')
print(f'Score on testing set: {test_score}')

In [None]:
# Predicted cause codes for the held-out rows.
preds = dt.predict(X_test)

In [None]:
# One confusion matrix per cause code, with the label order pinned to 1..13.
cause_labels = list(range(1, 14))
MCM = multilabel_confusion_matrix(y_test, preds, labels = cause_labels)

In [None]:
# Display the per-label confusion matrices.
MCM

In [None]:
# Number of held-out rows whose true cause code is 12.
(y_test == 12).sum()

In [None]:
# NOTE(review): second display of the same MCM object; likely removable.
MCM

### Switching to Cause by category

In [None]:
# Cause code -> cause name. The trailing comment on each entry names the broad
# category that cause is grouped into.
cause_dict = {
    1:  'Lightning',          # Natural
    2:  'Equipment Use',      # Infrastructure
    3:  'Smoking',            # Human
    4:  'Campfire',           # Human
    5:  'Debris Burning',     # Human
    6:  'Railroad',           # Infrastructure
    7:  'Arson',              # Human
    8:  'Children',           # Human
    9:  'Miscellaneous',      # Other
    10: 'Fireworks',          # Human
    11: 'Powerline',          # Infrastructure
    12: 'Structure',          # Infrastructure
    13: 'Missing/Undefined',  # Other
}

# Broad category id -> the cause codes it contains
# (1 = Natural, 2 = Infrastructure, 3 = Human, 4 = Other).
category_members = {
    1: (1,),
    2: (2, 6, 11, 12),
    3: (3, 4, 5, 7, 8, 10),
    4: (9, 13),
}

# Inverted lookup: cause code -> category id, keyed in ascending code order.
cat_dict = dict(sorted(
    (code, category)
    for category, codes in category_members.items()
    for code in codes
))

In [None]:
# Translate every cause code in y to its broad category id; a code missing
# from cat_dict raises KeyError.
y_cat = [cat_dict[i] for i in y]

In [None]:
# Re-split with the category target; this rebinds X_train / X_test / y_train /
# y_test for every cell that follows. A fixed random_state keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, random_state = 42)

In [None]:
# Decision tree on the broad cause categories. random_state is fixed (matching
# the seeded estimator used in the grid search) so repeated fits give the same
# tree and the same scores.
dt_cat = DecisionTreeClassifier(criterion = 'gini', 
                            max_depth = 200, 
                            min_samples_split = 100, 
                            min_samples_leaf = 10,
                            random_state = 42)
dt_cat.fit(X_train, y_train)

# Score on the fitted data versus the held-out rows.
print(f'Score on training set: {dt_cat.score(X_train, y_train)}')
print(f'Score on testing set: {dt_cat.score(X_test, y_test)}')

In [None]:
# Predicted categories for the held-out rows.
preds_cat = dt_cat.predict(X_test)
# Per-label confusion matrices for the category model. No explicit `labels`
# list is passed here, and the name MCM is rebound from the per-cause result.
MCM = multilabel_confusion_matrix(y_test, preds_cat)
MCM

In [None]:
# Hyper-parameter values to be searched for the decision tree.
params = dict(
    max_depth = [100, 90, 110],
    min_samples_split = [100, 150, 180],
    min_samples_leaf = [30, 40, 50],
)

In [None]:
# Grid search over `params` with a seeded decision tree, 5 cross-validation
# folds and verbose progress output.
base_tree = DecisionTreeClassifier(random_state = 42)
grid = GridSearchCV(base_tree, params, cv = 5, verbose = 2)

In [None]:


# Time the grid search over the training data.
t0 = time.time()
grid.fit(X_train, y_train)
elapsed = time.time() - t0

# Report how long the fit took.
print(f'This took {elapsed}')

In [None]:
# Estimator selected by the grid search.
grid.best_estimator_

In [None]:
# Parameter combination selected by the grid search.
grid.best_params_

In [None]:
# Cross-validation score attained by the selected combination.
grid.best_score_

In [None]:
# Score the grid-search result on the training rows and on the held-out rows.
grid_train_score = grid.score(X_train, y_train)
grid_test_score = grid.score(X_test, y_test)
print(f'Score on training set: {grid_train_score}')
print(f'Score on testing set: {grid_test_score}')

In [None]:
# The parameter grid the search was configured with.
grid.param_grid

In [None]:
# Names of the result entries recorded by the search.
grid.cv_results_.keys()

In [None]:
# Every parameter combination that was evaluated.
grid.cv_results_['params']

In [None]:
# Mean test score recorded for each evaluated combination.
grid.cv_results_['mean_test_score']

In [None]:


def collect_gridCV_results(model, name):
    """Tabulate a fitted search's parameter combinations and their scores.

    Parameters
    ----------
    model : object with a ``cv_results_`` mapping
        Must provide the ``'params'`` and ``'mean_test_score'`` entries
        (for example the fitted grid search in this notebook).
    name : str
        Label written to the ``model`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination: one column per parameter,
        followed by ``mean_test_score`` and ``model``.
    """
    results = model.cv_results_
    # Both entries are positionally aligned (one item per combination), so the
    # scores can be attached directly as a column.
    model_history = pd.DataFrame(results['params']).assign(
        mean_test_score = results['mean_test_score'],
        model = name,
    )
    return model_history




## Random Forest

In [None]:
%%time
rf = RandomForestClassifier(n_estimators = 80, 
                            max_depth = 100, 
                            min_samples_split = 100, 
                            min_samples_leaf =40)

rf.fit(X_train, y_train)

In [None]:
# Random forest score on the rows it was fitted on.
rf.score(X_train, y_train)

In [None]:
# Random forest score on the held-out rows.
rf.score(X_test, y_test)

In [None]:
# Report the random forest's scores. This cell previously scored `grid` (the
# decision-tree search), so the numbers printed in this section did not belong
# to the forest.
print(f'Score on training set: {rf.score(X_train, y_train)}')
print(f'Score on testing set: {rf.score(X_test, y_test)}')

## KNN Classifier

## Using a Neural Network for Classification