# Imports

In [1]:
from tensorflow.keras import regularizers

from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation

import matplotlib.pyplot as plt

%matplotlib inline

import numpy as np
from sklearn.model_selection import train_test_split


#!pip install tensorflow-hub
#!pip install tensorflow-datasets
import tensorflow_hub as hub

import tensorflow as tf

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.experimental.list_physical_devices("GPU") else "NOT AVAILABLE")

import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


Version:  2.2.0
Eager mode:  True
Hub version:  0.10.0
GPU is available


# Reading Data

In [2]:
#load text and target data
df = pd.read_csv('/home/aschharwood/notebooks/NST/nst_ml/data/master_text/boko_haram_model/final_nst_acled_augmented_data_12.18.20.csv')

#convert Boko Haram to binary
df['bh_binary'] = np.where(df['boko haram']=='Boko Haram', 1, 0)

#check balance of classes
df['bh_binary'].value_counts()

#remove the "real data" from training data set to be used as test set
nst_sample = df[df['source']=='nst'].sample(n=10000)

#check balance of classes in test set
nst_sample['boko haram'].value_counts()

#create training data set
df_text_removed = df.drop(nst_sample.index)

#shuffle training data
df_text_removed = df_text_removed.sample(frac=1)

#shuffled test data
nst_sample = nst_sample.sample(frac=1)

#create feature and target train and test sets
X_train = df_text_removed['tokens'].astype(str)
y_train = df_text_removed['bh_binary']

X_test = nst_sample['tokens'].astype(str)
y_test = nst_sample['bh_binary']

# Build the model

In [3]:
#download and set word vector pretrained model
embedding = "https://tfhub.dev/google/nnlm-en-dim128/2"
hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=True)

print('Google news model downlownded')

Google news model downlownded


In [6]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(4, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1))

#compile model
model.compile(optimizer='adam',
             loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
             metrics=['accuracy'])
print('model defined and compiled')

model defined and compiled


In [6]:
#check the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 128)               124642688 
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 36        
_________________________________________________________________
dropout (Dropout)            (None, 4)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 5         
Total params: 124,644,929
Trainable params: 124,644,929
Non-trainable params: 0
__________________________________________

In [7]:
#train model
print('training model')
history = model.fit(X_train, y_train, epochs=5, verbose=1, validation_data=(X_test, y_test))
print('model training complete')

training model
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
model training complete


In [8]:
#save the model
model.save('/home/aschharwood/notebooks/NST/nst_ml/nst_boko_haram/')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Assets written to: /home/aschharwood/notebooks/NST/nst_ml/nst_boko_haram/assets


INFO:tensorflow:Assets written to: /home/aschharwood/notebooks/NST/nst_ml/nst_boko_haram/assets


In [9]:
#generate predictions from test set
test_pred = (model.predict(X_test) > 0.5).astype("int32")

In [11]:
#generate classification report on test data
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

           0       0.97      0.85      0.91      6783
           1       0.76      0.95      0.84      3217

    accuracy                           0.88     10000
   macro avg       0.86      0.90      0.87     10000
weighted avg       0.90      0.88      0.89     10000



In [13]:
#check confusion matrix of predictions
confusion_matrix(y_test, test_pred)

array([[5798,  985],
       [ 175, 3042]])

In [14]:
#load saved model
reloaded = tf.keras.models.load_model('/home/aschharwood/notebooks/NST/nst_ml/nst_boko_haram/')

In [15]:
#check summary
reloaded.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 128)               124642688 
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 68        
_________________________________________________________________
dropout_1 (Dropout)          (None, 4)                 0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 5         
Total params: 124,644,825
Trainable params: 124,644,825
Non-trainable params: 0
_________________________________________________________________


In [17]:
#generate predictions from saved model
test_pred = (reloaded.predict(X_test) > 0.5).astype("int32")

In [18]:
#check classification report
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

           0       0.97      0.85      0.91      6783
           1       0.76      0.95      0.84      3217

    accuracy                           0.88     10000
   macro avg       0.86      0.90      0.87     10000
weighted avg       0.90      0.88      0.89     10000



In [19]:
#generate predictions from saved model for train set
train_pred = (reloaded.predict(X_train) > 0.5).astype("int32")

In [20]:
print(classification_report(y_train, train_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     19325
           1       1.00      0.98      0.99     27633

    accuracy                           0.98     46958
   macro avg       0.98      0.99      0.98     46958
weighted avg       0.98      0.98      0.98     46958

