In [48]:
import numpy as np, pandas as pd
import warnings
import psutil, os
warnings.filterwarnings('ignore')


# NN imports
from tensorflow.keras.callbacks import Callback, ModelCheckpoint
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, LSTM
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import tensorflow.keras.backend as K

# X_train = pd.read_csv('X_train.csv')
# y_train = pd.read_csv('y_train.csv')
# X_test = pd.read_csv('X_test.csv')
# y_test = pd.read_csv('y_test.csv')

# X_train = pd.read_csv('X_train_small.csv')
# y_train = pd.read_csv('y_train_small.csv')
# X_test = pd.read_csv('X_test_small.csv')
# y_test = pd.read_csv('y_test_small.csv')

X_train = pd.read_pickle('Data/X_train_small.pkl')
y_train = pd.read_pickle('Data/y_train_small.pkl')
X_test = pd.read_pickle('Data/X_test_small.pkl')
y_test = pd.read_pickle('Data/y_test_small.pkl')

In [60]:
X_train.head()

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_time,satellite,confidence,bright_t31,frp,daynight,type,FIRE_YEAR,MONTH,WEEK,DAY
9,19.3454,-155.046906,368.200012,1.0,1.0,849,Terra,100,309.299988,110.300003,N,2,2001,1,1,1
19,19.3552,-155.055206,366.200012,1.3,1.1,2100,Terra,100,314.799988,137.899994,D,2,2001,1,1,1
29,31.2537,-84.508698,310.399994,1.0,1.0,1643,Terra,70,278.5,13.6,D,0,2001,1,1,2
39,30.9461,-84.917099,304.399994,1.0,1.0,1643,Terra,59,280.899994,8.6,D,0,2001,1,1,2
49,26.872299,-81.117996,342.600006,1.6,1.3,1644,Terra,93,290.5,96.800003,D,0,2001,1,1,2


In [49]:
y_train['isFire'].value_counts()[1] / y_train['isFire'].value_counts()[0]

0.41251901925013057

In [50]:
y_test['isFire'].value_counts()[1] / y_test['isFire'].value_counts()[0]

0.5169574072434251

In [51]:
# Now I'll set up pipelines

# scikit-learn pipelines
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# feature processing
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

# pre-processing pipeline
column_trans = ColumnTransformer(
    [('onehot', ce.OneHotEncoder(), ['satellite', 'daynight', 'type']),
    ('scale', StandardScaler(), ['brightness', 'track', 'scan', 'acq_time', 'confidence', 'bright_t31', 'frp'])],
    remainder='passthrough')


In [52]:
X_train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 186598 entries, 9 to 1865979
Data columns (total 16 columns):
latitude      186598 non-null float32
longitude     186598 non-null float32
brightness    186598 non-null float32
scan          186598 non-null float32
track         186598 non-null float32
acq_time      186598 non-null uint16
satellite     186598 non-null object
confidence    186598 non-null uint8
bright_t31    186598 non-null float32
frp           186598 non-null float32
daynight      186598 non-null object
type          186598 non-null uint8
FIRE_YEAR     186598 non-null uint16
MONTH         186598 non-null uint8
WEEK          186598 non-null uint8
DAY           186598 non-null uint8
dtypes: float32(7), object(2), uint16(2), uint8(5)
memory usage: 30.7 MB


In [58]:
# try to tune RNN with timeseries split
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Random Seed
seed = 2001
np.random.seed(seed)

# Important Hyperparameters
inputs = column_trans.fit_transform(X_train).shape[1]
epochs = 50
batch_size = 128

# Create our model
model = Sequential()

# input and hidden
model.add(Dense(32, input_shape = (None, inputs), activation='sigmoid'))
model.add(Dropout(0.3))
model.add(Dense(16, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))

#compile
model.compile(loss='binary_crossentropy',
               optimizer = 'adam',
               metrics=['acc'])

history = model.fit(column_trans.fit_transform(X_train), y_train, batch_size=batch_size, epochs = epochs, validation_split=.1, verbose =10)
scores = model.evaluate(column_trans.transform(X_test), y_test)
print(f'{model.metrics_names[1]}: {scores[1]*100}')

ValueError: Error when checking input: expected lstm_8_input to have 3 dimensions, but got array with shape (186598, 20)

In [54]:
from sklearn.metrics import roc_auc_score

y_pred_proba = model.predict_proba(column_trans.transform(X_test))#[:,1]

# y_pred_proba
roc_auc_score(y_test, y_pred_proba)

ValueError: Error when checking input: expected lstm_input to have 3 dimensions, but got array with shape (17131, 20)

In [21]:
# pretty tight ROC_AUC
#f1 score
from sklearn.metrics import f1_score

y_pred = model.predict_classes(column_trans.transform(X_test))

# y_pred
f1_score(y_test, y_pred)

0.0

In [32]:
# f1 score is garbage, build a custom estimator to optomize for f1

def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

## Define Model

def mlp_v2():    
    mdl = Sequential()
    mdl.add(Dense(512, activation='relu',input_dim=inputs))
    mdl.add(Dropout(0.5))
    mdl.add(Dense(128, activation='relu'))
    mdl.add(Dropout(0.5))
    mdl.add(Dense(1, activation='sigmoid'))
    mdl.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=[get_f1])
    mdl.summary()
    return mdl

mode_path = '../models/mlp_v2.h5'
callbacks = [ModelCheckpoint(filepath=mode_path, save_best_only=True)]

In [33]:
## Run Model
estimator = KerasClassifier(build_fn=mlp_v2, epochs=5, batch_size=128)

history = estimator.fit(column_trans.fit_transform(X_train), y_train, 
                        batch_size=batch_size, epochs = epochs, validation_split=.1, verbose =10)
scores = model.evaluate(column_trans.transform(X_test), y_test)
print(f'{model.metrics_names[1]}: {scores[1]*100}')

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_21 (Dense)             (None, 512)               10752     
_________________________________________________________________
dropout_8 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 128)               65664     
_________________________________________________________________
dropout_9 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 129       
Total params: 76,545
Trainable params: 76,545
Non-trainable params: 0
_________________________________________________________________
Train on 167938 samples, validate on 18660 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

In [34]:
y_pred_proba = model.predict_proba(column_trans.transform(X_test))#[:,1]

# y_pred_proba
roc_auc_score(y_test, y_pred_proba)

0.5821190791228575

In [35]:
y_pred = model.predict_classes(column_trans.transform(X_test))

# y_pred
f1_score(y_test, y_pred)

0.0

In [36]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[11293,     0],
       [ 5838,     0]])

In [40]:
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax


np.set_printoptions(precision=2)

class_names = [0, 1]

# Plot non-normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()


TypeError: only integer scalar arrays can be converted to a scalar index