# Data Processing with BERT

Bert Model Download: https://github.com/google-research/bert#pre-trained-models

inspired by: https://github.com/llSourcell/bert-as-service

DOC: https://bert-as-service.readthedocs.io/en/latest/section/get-start.html#start-the-bert-service

Install BERT:
pip install bert-serving-server
pip install bert-serving-client
Python 3.5 and Tensorflow 1.10 required

Download pretrained model:

https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

Unzip the archive to /tmp/uncased_L-12_H-768_A-12/

then start the service by : bert-serving-start -model_dir /tmp/uncased_L-12_H-768_A-12/ -num_worker=1

In [18]:
from bert_serving.client import BertClient
import pandas as pd
import numpy as np
import h5py
from nltk import tokenize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Dropout, LeakyReLU, Activation
from keras.regularizers import l2, l1
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras.models import load_model
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sn

from hyperas import optim
from hyperas.distributions import choice, uniform, normal, qlognormal, randint
from hyperopt import Trials, STATUS_OK, tpe

In [None]:
# Load the raw news table and keep only rows with a usable title and body.
raw_news = pd.read_csv('../data/mixed_news/news_dataset.csv')
complete_rows = raw_news.dropna(subset=['title', 'content'])
# A single blank character counts as "no text" for either column.
has_text = (complete_rows['content'] != ' ') & (complete_rows['title'] != ' ')
news = complete_rows[has_text]
news.shape

# Vergleichsanalyse:

## Klassifizierung anhand eines Splits:

In [None]:
# Compare the distribution of article lengths (in characters) per class.
is_real = news['label'] == 'real'
is_fake = news['label'] == 'fake'
text_len_real = list(map(len, news.loc[is_real, 'content']))
text_len_fake = list(map(len, news.loc[is_fake, 'content']))

# Both histograms share binning and normalisation so they can be overlaid.
hist_style = dict(bins=500, density=True, alpha=0.4)
plt.hist(text_len_real, color="b", label="real", **hist_style)
plt.hist(text_len_fake, color="r", label="fake", **hist_style)
plt.xlim(0, 20000)
plt.xlabel("Textlänge")
plt.legend()
plt.show()

In [None]:
# Length threshold used by the baseline classifier in the cells below.
split = 2500

# One row per article: text length plus ground truth (0 = fake, 1 = real).
lengths = text_len_fake + text_len_real
truth_flags = [0] * len(text_len_fake) + [1] * len(text_len_real)
data = pd.DataFrame({"x": lengths, "truth": truth_flags})

# Same overlay as before, with the threshold drawn as a vertical line.
plt.vlines(split, 0, 0.00025)
for lens, colour, name in ((text_len_real, "b", "real"),
                           (text_len_fake, "r", "fake")):
    plt.hist(lens, color=colour, bins=500, density=True, alpha=0.4, label=name)
plt.xlim(0, 20000)
plt.xlabel("Textlänge")
plt.legend()
plt.show()

In [None]:
from sklearn.utils.multiclass import unique_labels
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels; forwarded to ``confusion_matrix``.
    classes : sequence
        Tick labels for both axes.  They are used for display only and are
        not passed to ``confusion_matrix``, so they have to be given in the
        row/column order of the computed matrix.
    normalize : bool
        If true, every row is divided by its sum, so each cell shows the
        fraction of that true class.
    title : str, optional
        Plot title; a default depending on ``normalize`` is used when empty.
    cmap : matplotlib colormap
        Colormap for the matrix image.

    Returns
    -------
    The matplotlib axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A label that occurs only in y_pred produces an all-zero row.  Keep
        # such rows at 0 instead of dividing by zero, which would turn the
        # row (and the colour threshold below) into NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; use white text on dark cells for readability.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
from sklearn.metrics import confusion_matrix

# Length-threshold baseline: predict "real" for texts longer than `split`.
data["predict"] = data.x > split
# Translate the codes column by column.  A frame-wide replace would also
# rewrite text lengths of 0 or 1 in column `x` into label strings.
data["truth"] = data["truth"].replace({0: "fake", 1: "real"})
data["predict"] = data["predict"].map({False: "fake", True: "real"})
# The tick labels have to name the two classes that occur in the data.
plot_confusion_matrix(data.truth, data.predict, ["fake", "real"])

# Using BERT for Datapreprocessing
## Encode Titles:

Bei den Titeln gibt es das Problem, dass einige Titel den Publisher beinhalten (z. B. Washington Post oder 100percentfedUp.com).
Das ist jedoch durchaus ein Indiz; daher ist es zwar fragwürdig, den ganzen Titel zu nutzen, aber durchaus legitim.

In [None]:
# Client handle for the BERT service (see the start command in the setup
# notes at the top of this notebook).
bc = BertClient()

In [None]:
# Send all titles to the BERT service and collect their encodings.
title_enc = bc.encode(news['title'].tolist())

In [None]:
# Persist the title encodings so the BERT service is not needed again.
with h5py.File('encoded_data/encode_news.h5', 'w') as hf:
    # The array has to be passed as `data=`: the second positional parameter
    # of create_dataset is the dataset shape, not its content.
    hf.create_dataset("title_encode", data=title_enc)

## Encode Text:

In [None]:
# Disabled cell: the code is wrapped in a string literal and never runs.
# It would split every article of `news_train` into sentences, record the
# sentence count per article and encode all sentences with the BERT client.
# NOTE(review): `news_train` is not defined in the visible part of this notebook.
'''
content = []
num_sentences_train = []
for c in news_train['content']:
    tok = tokenize.sent_tokenize(c)
    content  = content + tok
    num_sentences_train.append(len(tok))

train_content_encode = bc.encode(content)
'''

In [None]:
# Disabled cell (string literal): would append the sentence encodings of the
# training articles and their per-article sentence counts to 'encode_news.h5'.
'''with h5py.File('encode_news.h5', 'a') as hf:
    hf.create_dataset("train_content_encode",  data = train_content_encode)
    hf.create_dataset("train_num_sentences", data = np.array(num_sentences_train))'''

In [None]:
# Disabled cell (string literal): same sentence-level encoding as for the
# training articles, applied to `news_test`.
# NOTE(review): `news_test` is not defined in the visible part of this notebook.
'''content = []
num_sentences_test = []
for c in news_test['content']:
    tok = tokenize.sent_tokenize(c)
    content  = content + tok
    num_sentences_test.append(len(tok))

test_content_encode = bc.encode(content)'''

In [None]:
# Disabled cell (string literal): would append the sentence encodings of the
# test articles and their per-article sentence counts to 'encode_news.h5'.
'''with h5py.File('encode_news.h5', 'a') as hf:
    hf.create_dataset("test_content_encode",  data = test_content_encode)
    hf.create_dataset("test_num_sentences", data = np.array(num_sentences_test))'''

## Read encoded data:

In [None]:
# Load the stored title encodings back into memory.
# NOTE(review): the writing cell earlier in this notebook stores them in
# 'encoded_data/encode_news.h5', while this cell opens
# 'encoded_data/title_encode.h5' - confirm which file is the intended one.
with h5py.File('encoded_data/title_encode.h5', 'r') as hf:
    title_encode = hf['title_encode'][:]

preparing for NN:

In [None]:
# Wrap the title encodings in a frame and make a stratified 67/33 split.
# Assumes the rows of `title_encode` are in the same order as `news`.
title_NN = pd.DataFrame(title_encode)
X_train, X_test, y_train, y_test = train_test_split(
    title_NN, news.label,
    stratify=news.label, test_size=0.33, random_state=42,
)

In [None]:
# Encode the string labels as integers; the encoder is fitted on the two
# class names explicitly rather than on the data.
LE = LabelEncoder().fit(["fake", "real"])
y_train_enc, y_test_enc = (LE.transform(labels) for labels in (y_train, y_test))

In [None]:
# Store features together with their encoded label in one HDF file
# ('train' creates the file, 'test' is appended); the in-memory feature
# frames stay free of the label column.
hdf_path = '../data/BERT/Bert_title_NN.h5'
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
labelled_train = X_train.assign(label=y_train_enc)
labelled_test = X_test.assign(label=y_test_enc)
labelled_train.to_hdf(hdf_path, key='train', mode='w')
labelled_test.to_hdf(hdf_path, key='test', mode='a')

## Hyperparameter Optimierung

### Data creation

In [2]:
def data_BERT():
    """Load the stored BERT title features, split into inputs and labels.

    Reads the 'test' and 'train' tables from the HDF store and separates the
    'label' column from the feature columns of each table.

    Yields the tuple (x_train, y_train, x_test, y_test).
    """
    store_path = "../data/BERT/Bert_title_NN.h5"
    test_table = pd.read_hdf(store_path, key="test")
    train_table = pd.read_hdf(store_path, key="train")
    x_test, y_test = test_table.drop(columns="label"), test_table["label"]
    x_train, y_train = train_table.drop(columns="label"), train_table["label"]
    return x_train, y_train, x_test, y_test

In [3]:
# Load the stored train/test features and labels into the notebook namespace.
x_train, y_train, x_test, y_test = data_BERT()

### Model creation

In [9]:
def model_structure(x_train, y_train, x_test, y_test):
    """Hyperas model template for the architecture search (depth and width).

    The double-brace expressions are hyperas placeholders for search
    parameters, so this function is meant to be handed to ``optim.minimize``
    rather than called directly.

    Builds a dense ReLU network with a sigmoid output unit, trains it for 30
    epochs (batch size 64, 30 % of the training data held out for validation)
    with Adam and binary cross-entropy.  ``x_test`` and ``y_test`` are not
    used in the body.

    Result: a dict with ``loss`` set to the negative best validation accuracy
    over all epochs, ``status`` and the trained ``model``.
    """
    dim = x_train.shape[1]
    model = Sequential()
    # Width of the first hidden layer is a search parameter.
    model.add(Dense(int({{qlognormal(7,0.5,10)}}), input_dim=dim))
    model.add(Activation('relu'))
    
    # Optional additional hidden layer, added only for the choice 'four'.
    if {{choice(['three', 'four'])}} == 'four':
        model.add(Dense(int({{qlognormal(7,0.5,10)}})))
        model.add(Activation('relu'))
        
    # Last hidden layer, sampled from a smaller width range.
    model.add(Dense(int({{qlognormal(4,0.5,1)}})))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', metrics=['accuracy'],
                  optimizer='adam')

    result = model.fit(x_train, y_train,
              batch_size=64,
              epochs=30,
              verbose=2,
              validation_split=0.3)
    # Score a trial by its best epoch, not by its final epoch.
    validation_acc = np.amax(result.history['val_acc']) 
    print('Best validation acc of epoch:', validation_acc)
    # The optimiser minimises the loss, hence the negated accuracy.
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}

In [5]:
def model_training(x_train, y_train, x_test, y_test):
    """Hyperas model template for the training search (loss and optimizer).

    The architecture is fixed (hidden widths 1187, 1475 and 261, sigmoid
    output); only the loss function and the optimizer are search parameters.
    Training uses 30 epochs, batch size 64 and a 30 % validation split.
    ``x_test`` and ``y_test`` are not used in the body.

    Result: a dict with ``loss`` set to the negative best validation accuracy
    over all epochs, ``status`` and the trained ``model``.
    """
    dim = x_train.shape[1]
    model = Sequential()
    # Fixed layer widths; presumably taken from the result of the structure
    # search - TODO confirm.  abs/int turn the stored floats into widths.
    model.add(Dense(int(np.abs(1187.5872913047178)), input_dim=dim))
    model.add(Activation('relu'))
    model.add(Dense(int(np.abs(-1475.2916969518506))))
    model.add(Activation('relu'))    
    model.add(Dense(261))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # NOTE(review): hinge-type losses are usually defined for -1/+1 targets,
    # while the label cells of this notebook produce 0/1 - confirm that the
    # Keras version in use handles this.
    model.compile(loss={{choice(['hinge','binary_crossentropy','squared_hinge'])}}, metrics=['accuracy'],
                  optimizer={{choice(['adam','AdaDelta','Adagrad'])}})

    result = model.fit(x_train, y_train,
              batch_size=64,
              epochs=30,
              verbose=2,
              validation_split=0.3)
    # Score a trial by its best epoch, not by its final epoch.
    validation_acc = np.amax(result.history['val_acc']) 
    print('Best validation acc of epoch:', validation_acc)
    # The optimiser minimises the loss, hence the negated accuracy.
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}

In [19]:
def model_regularization(x_train, y_train, x_test, y_test):
    """Hyperas model template for the regularisation search.

    Architecture, loss (binary cross-entropy) and optimizer (Adagrad) are
    fixed.  Search parameters are the L2 kernel penalty of each of the three
    hidden layers (uniform between 0 and 0.1) and the dropout rate after the
    first and second hidden layer (uniform between 0 and 0.7).  Training uses
    30 epochs, batch size 64 and a 30 % validation split.  ``x_test`` and
    ``y_test`` are not used in the body.

    Result: a dict with ``loss`` set to the negative best validation accuracy
    over all epochs, ``status`` and the trained ``model``.
    """
    dim = x_train.shape[1]
    model = Sequential()
    model.add(Dense(int(np.abs(1187.5872913047178)), kernel_regularizer=l2({{uniform(0,0.1)}}), input_dim=dim))
    model.add(Activation('relu'))
    model.add(Dropout({{uniform(0,0.7)}}))
    model.add(Dense(int(np.abs(-1475.2916969518506)),kernel_regularizer=l2({{uniform(0,0.1)}})))
    model.add(Activation('relu'))
    model.add(Dropout({{uniform(0,0.7)}}))
    # The third hidden layer has a kernel penalty but no dropout after it.
    model.add(Dense(261,kernel_regularizer=l2({{uniform(0,0.1)}})))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', metrics=['accuracy'],
                  optimizer='Adagrad')

    result = model.fit(x_train, y_train,
              batch_size=64,
              epochs=30,
              verbose=2,
              validation_split=0.3)
    # Score a trial by its best epoch, not by its final epoch.
    validation_acc = np.amax(result.history['val_acc']) 
    print('Best validation acc of epoch:', validation_acc)
    # The optimiser minimises the loss, hence the negated accuracy.
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}

## Optimization with hyperopt
Algorithm: Tree of Parzen Estimators. Optimierung in 3 Schritten:

- Struktur (Tiefe (2 oder 3 hidden Layers) und Breite)
- Training (loss function und optimizer)
- Regularisierung (L2 für alle drei Hidden Layers sowie Dropout nach der ersten und zweiten)

In [10]:
# Step 1 of the search: network structure, 50 TPE evaluations.
trials = Trials()
structure_search = dict(
    model=model_structure,
    data=data_BERT,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    notebook_name='Bert_NN',
)
best_run, best_model = optim.minimize(**structure_search)
print("Best performing model chosen hyper-parameters:", best_run, sep="\n")
best_model.save('../model/best_Hyperopt_NN_BERT_struct.hdf5')

>>> Imports:
#coding=utf-8

try:
    from bert_serving.client import BertClient
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import h5py
except:
    pass

try:
    from nltk import tokenize
except:
    pass

try:
    from sklearn.decomposition import PCA
except:
    pass

try:
    from sklearn.model_selection import train_test_split
except:
    pass

try:
    from sklearn.preprocessing import LabelEncoder
except:
    pass

try:
    import matplotlib.pyplot as plt
except:
    pass

try:
    from keras.models import Sequential
except:
    pass

try:
    from keras.layers import Dense, Dropout, LeakyReLU, Activation
except:
    pass

try:
    from keras.callbacks import TensorBoard, ModelCheckpoint
except:
    pass

try:
    from keras.models import load_model
except:
    pass

try:
    from sklearn.metrics import confusion_matrix, classification_report
except:
    pass

try:
    import seaborn as sn
except:
    pa

KeyboardInterrupt: 

In [12]:
# Step 2 of the search: loss function and optimizer, 15 TPE evaluations.
trials = Trials()
training_search = dict(
    model=model_training,
    data=data_BERT,
    algo=tpe.suggest,
    max_evals=15,
    trials=trials,
    notebook_name='Bert_NN',
)
best_run, best_model = optim.minimize(**training_search)
print("Best performing model chosen hyper-parameters:", best_run, sep="\n")
best_model.save('../model/best_Hyperopt_NN_BERT_training.hdf5')

>>> Imports:
#coding=utf-8

try:
    from bert_serving.client import BertClient
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import h5py
except:
    pass

try:
    from nltk import tokenize
except:
    pass

try:
    from sklearn.decomposition import PCA
except:
    pass

try:
    from sklearn.model_selection import train_test_split
except:
    pass

try:
    from sklearn.preprocessing import LabelEncoder
except:
    pass

try:
    import matplotlib.pyplot as plt
except:
    pass

try:
    from keras.models import Sequential
except:
    pass

try:
    from keras.layers import Dense, Dropout, LeakyReLU, Activation
except:
    pass

try:
    from keras.callbacks import TensorBoard, ModelCheckpoint
except:
    pass

try:
    from keras.models import load_model
except:
    pass

try:
    from sklearn.metrics import confusion_matrix, classification_report
except:
    pass

try:
    import seaborn as sn
except:
    pa

KeyboardInterrupt: 

In [20]:
# Step 3 of the search: regularisation strengths, 80 TPE evaluations.
trials = Trials()
regularization_search = dict(
    model=model_regularization,
    data=data_BERT,
    algo=tpe.suggest,
    max_evals=80,
    trials=trials,
    notebook_name='Bert_NN',
)
best_run, best_model = optim.minimize(**regularization_search)
print("Best performing model chosen hyper-parameters:", best_run, sep="\n")
best_model.save('../model/best_Hyperopt_NN_BERT_regularization.hdf5')

>>> Imports:
#coding=utf-8

try:
    from bert_serving.client import BertClient
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import h5py
except:
    pass

try:
    from nltk import tokenize
except:
    pass

try:
    from sklearn.decomposition import PCA
except:
    pass

try:
    from sklearn.model_selection import train_test_split
except:
    pass

try:
    from sklearn.preprocessing import LabelEncoder
except:
    pass

try:
    import matplotlib.pyplot as plt
except:
    pass

try:
    from keras.models import Sequential
except:
    pass

try:
    from keras.layers import Dense, Dropout, LeakyReLU, Activation
except:
    pass

try:
    from keras.regularizers import l2, l1
except:
    pass

try:
    from keras.callbacks import TensorBoard, ModelCheckpoint
except:
    pass

try:
    from keras.models import load_model
except:
    pass

try:
    from sklearn.metrics import confusion_matrix, classification_repo

KeyboardInterrupt: 

## Evaluation of best model

### Train best model
Neues Training des besten Modells, welches bezüglich der Hyperparameter optimiert ist

In [None]:
def plot_history(network_history):
    """Save the training and validation loss curves of a fit to a PDF.

    ``network_history`` must expose a ``history`` mapping with the entries
    'loss' and 'val_loss'.  The figure is written to a fixed path and closed
    afterwards; nothing is returned.
    """
    fig = plt.figure()
    ax = fig.gca()
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    # Training curve first, so it matches the legend order below.
    for series_key in ('loss', 'val_loss'):
        ax.plot(network_history.history[series_key])
    ax.legend(['Training', 'Validation'])
    fig.savefig("../build/plots/BERT/history_best.pdf")
    plt.close(fig)

In [None]:
# Take only the layer configuration of the best search result and build a
# new model from it, so it can be trained again in the next cell.
best_model = load_model('../model/best_Hyperopt_NN_BERT_regularization.hdf5')
architecture = best_model.get_config()
model = Sequential.from_config(architecture)
model.summary()

In [None]:
# Retrain for 100 epochs; the checkpoint writes the model to `filepath`
# whenever the validation loss improves (save_best_only).
filepath = '../model/best_Hyperopt_NN_BERT_trained.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)
model.compile(optimizer='Adagrad', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=100,
    validation_split=0.3,
    callbacks=[checkpoint],
)

In [None]:
# Write the loss curves of the retraining run to the plot directory.
plot_history(history)

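Es muss beachtet werden, dass Dropout verwendet wurde (nur während des Trainings aktiv) und die Validierung erst am Ende jeder Epoche geschieht, während der Trainings-Loss über die Batches der Epoche gemittelt wird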

## Evaluierung des Besten Modells

In [None]:
# Load the checkpoint written during the retraining above.  The original cell
# loaded the 'bow' checkpoint, which is not produced by this notebook.
best_model = load_model('../model/best_Hyperopt_NN_BERT_trained.hdf5')
# Predicted scores for both splits; the training scores are only needed for
# the overtraining comparison further down.
y_pred = best_model.predict(x_test, batch_size=64, verbose=1)
y_pred_train = best_model.predict(x_train, batch_size=64, verbose=1)
# Hard class decision at a threshold of 0.5.
y_pred_bool = np.round(y_pred)

In [None]:
# roc_curve and auc are used below but are not part of the notebook imports.
from sklearn.metrics import auc, roc_curve

# Text summary of the test-set performance.
print(classification_report(y_test, y_pred_bool))
print(confusion_matrix(y_test, y_pred_bool,labels=[0,1]))

# Confusion matrix as a heatmap, expressed as a fraction of all test samples.
cnfn_matrix = pd.crosstab(y_test, y_pred_bool[:,0], rownames=['Actual'], colnames=['Predicted'])
print(cnfn_matrix)
cnfn_matrix.columns = ['fake','real']
cnfn_matrix = cnfn_matrix.rename_axis("Predicted", axis="columns")
cnfn_matrix.rename(index = {0.0: "fake", 1.0:'real'}, inplace = True)
cnfn_matrix = cnfn_matrix/y_test.shape[0]
sn.heatmap(cnfn_matrix, annot=True , cmap='viridis')
plt.savefig("../build/plots/BERT/cnfsn_mtx_best_nn.pdf")
plt.close()

# Overtraining test: compare the score distributions of train and test
# samples per class (filled = test, outline = train).
plt.hist(y_pred[y_test == 0],label="fake test", alpha = 0.4, color = "r",density=True)
plt.hist(y_pred_train[y_train == 0],label='fake train', alpha = 0.4, color = 'r', histtype='step',density=True)
plt.hist(y_pred[y_test == 1],label = "real test",alpha = 0.4, color = "b",density=True)
plt.hist(y_pred_train[y_train == 1],label='real train', alpha = 0.4, color = 'b', histtype='step',density=True)

plt.xlabel("Prediction Probability")
plt.ylabel("density")
plt.legend(loc='upper center')
plt.savefig("../build/plots/BERT/prob_best_nn.pdf")
plt.close()

# ROC curve of the test-set scores, with the diagonal as chance level.
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.savefig("../build/plots/BERT/roc_Hyperopt_best_nn.pdf")
plt.close()

# Vergleich mit BOW mit den Titeln