# Data Processing with BERT

Bert Model Download: https://github.com/google-research/bert#pre-trained-models

inspired by: https://github.com/llSourcell/bert-as-service

DOC: https://bert-as-service.readthedocs.io/en/latest/section/get-start.html#start-the-bert-service

Install BERT:
pip install bert-serving-server
pip install bert-serving-client
Requires Python >= 3.5 and TensorFlow >= 1.10.

Download pretrained model:

https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

Unzip the archive to /tmp/uncased_L-12_H-768_A-12/

Then start the service with: bert-serving-start -model_dir /tmp/uncased_L-12_H-768_A-12/ -num_worker=1

In [1]:
from bert_serving.client import BertClient
import pandas as pd
import numpy as np
import h5py
from nltk import tokenize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Load the news corpus and keep only rows with a usable title and content.
news = pd.read_csv('../data/mixed_news/news_dataset.csv')
news = news.dropna(subset=['title', 'content'])
has_text = (news.content != ' ') & (news.title != ' ')
news = news[has_text]

# Hold out 33 % of the articles for testing; the fixed seed keeps the split reproducible.
features = news[["title", "content"]]
news_train, news_test, label_train, label_test = train_test_split(
    features, news["label"], test_size=0.33, random_state=42
)

# Report the split sizes and the class balance of the full dataset.
n_fake = news[news.label == 'fake'].shape[0]
n_real = news[news.label == "real"].shape[0]
print(" Train Dataset of size: %i \n Test Dataset of size: %i" % (news_train.shape[0], news_test.shape[0]))
print("Size of FakeNews: %i \n Size of Real News: %i" % (n_fake, n_real))
print("FakeNews Anteil:", n_fake / len(news))
news_train.head()

 Train Dataset of size: 18678 
 Test Dataset of size: 9201
Size of FakeNews: 12167 
 Size of Real News: 15712
FakeNews Anteil: 0.43642167940026544


Unnamed: 0,title,content
4714,'D**k-Waving Berlusconi Knockoff': Late-Night ...,Share on Twitter \nDuring the third and final ...
2858,Ikea crea un carril rápido para solteros,Ikea crea un carril rápido para solteros LOS C...
25695,Congo forces kill 26 protesters against leader...,Security forces shot dead at least 26 protest...
631,Get Ready For Civil Unrest,Get Ready For Civil Unrest 10/31/2016 \nECONOM...
2028,Review: Brit Bennett’s “The Mothers”,Email \nThe mothers referred to in the title o...


# Vergleichsanalyse:

## Klassifizierung anhand eines Splits:

In [None]:
# Character length of every article body per class, kept as plain lists
# because a later cell concatenates them.
text_len_real = news.loc[news['label'] == 'real', 'content'].map(len).tolist()
text_len_fake = news.loc[news['label'] == 'fake', 'content'].map(len).tolist()

# Overlay the two normalised length distributions (real first, then fake).
for lengths, colour, name in ((text_len_real, "b", "real"), (text_len_fake, "r", "fake")):
    plt.hist(lengths, color=colour, bins=500, density=True, alpha=0.4, label=name)
plt.xlim(0, 20000)
plt.xlabel("Textlänge")
plt.legend()
plt.show()

In [None]:
# Length threshold (in characters) used by the baseline classifier.
split = 2500

# One row per article: its text length and the true class (0 = fake, 1 = real).
data = pd.DataFrame({
    "x": text_len_fake + text_len_real,
    "truth": [0] * len(text_len_fake) + [1] * len(text_len_real),
})

# Same histograms as before, with the threshold drawn as a vertical line.
plt.vlines(split, 0, 0.00025)
for lengths, colour, name in ((text_len_real, "b", "real"), (text_len_fake, "r", "fake")):
    plt.hist(lengths, color=colour, bins=500, density=True, alpha=0.4, label=name)
plt.xlim(0, 20000)
plt.xlabel("Textlänge")
plt.legend()
plt.show()

In [None]:
from sklearn.utils.multiclass import unique_labels
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of ``y_true`` versus ``y_pred``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, handed to
        ``sklearn.metrics.confusion_matrix``.
    classes : sequence of str
        Tick labels for both axes. They are used for display only, so they
        must be listed in the order in which ``confusion_matrix`` arranges
        the labels (sorted order of the labels occurring in the data).
    normalize : bool, default False
        If True, every row is divided by its sum so the cells show
        per-true-class fractions instead of counts.
    title : str or None
        Plot title; a default depending on ``normalize`` is used when empty.
    cmap : matplotlib colormap
        Colormap for the matrix image.

    Returns
    -------
    The matplotlib axes holding the plot.
    """
    # Imported locally so the function works even when no other cell has
    # imported confusion_matrix yet.
    from sklearn.metrics import confusion_matrix

    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A label that only occurs in y_pred has an all-zero row; divide it
        # by 1 so the row stays 0 instead of turning into NaN.
        row_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; switch to white text on dark cells for contrast.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
from sklearn.metrics import confusion_matrix

# Baseline classifier: predict "real" when the text is longer than `split`
# characters, otherwise "fake".
data["predict"] = np.where(data.x > split, "real", "fake")
# Relabel only the target column; a frame-wide replace of 0/1 would also
# rewrite text lengths of 0 or 1 in column `x`.
data["truth"] = data["truth"].replace({0: "fake", 1: "real"})
data.head()
# The tick labels must name the classes that occur in the data: fake / real.
plot_confusion_matrix(data.truth, data.predict, ["fake", "real"])

# Using BERT for Data Preprocessing
## Encode Titles:

In [4]:
# Client used by the cells below to encode text; presumably requires the
# bert-serving server from the setup notes to be running already.
bc = BertClient()

In [3]:
# Encode every training title with the BERT client.
train_title_enc = bc.encode(news_train['title'].tolist())

KeyboardInterrupt: 

In [None]:
# Store the encoded training titles. Mode 'w' creates (or truncates) the
# file, so this has to be the first write to encode_news.h5.
with h5py.File('encode_news.h5', 'w') as hf:
    # The array must be passed as `data=`: the second positional parameter
    # of create_dataset is `shape`.
    hf.create_dataset("train_title_encode", data=train_title_enc)

In [None]:
# Encode every test title with the BERT client.
test_title_enc = bc.encode(news_test['title'].tolist())

In [None]:
# Add the encoded test titles to the existing file. Mode 'a' keeps what is
# already stored; 'w' would truncate the file and drop the training titles.
with h5py.File('encode_news.h5', 'a') as hf:
    # The array must be passed as `data=`: the second positional parameter
    # of create_dataset is `shape`.
    hf.create_dataset("test_title_encode", data=test_title_enc)

## Encode Text:

In [None]:
# Split every training article into sentences, collect all sentences in one
# flat list and remember how many sentences each article contributed.
content = []
num_sentences_train = []
for c in news_train['content']:
    tok = tokenize.sent_tokenize(c)
    # extend() appends in place; `content = content + tok` would copy the
    # whole list on every iteration.
    content.extend(tok)
    num_sentences_train.append(len(tok))

# Encode all sentences in a single call.
train_content_encode = bc.encode(content)



In [None]:
# Append the training sentence encodings and the per-article sentence counts
# to the HDF5 file.
with h5py.File('encode_news.h5', 'a') as store:
    for key, values in (("train_content_encode", train_content_encode),
                        ("train_num_sentences", np.array(num_sentences_train))):
        store.create_dataset(key, data=values)

In [None]:
# Split every test article into sentences, collect all sentences in one flat
# list and remember how many sentences each article contributed.
content = []
num_sentences_test = []
for c in news_test['content']:
    tok = tokenize.sent_tokenize(c)
    # extend() appends in place; `content = content + tok` would copy the
    # whole list on every iteration.
    content.extend(tok)
    num_sentences_test.append(len(tok))

# Encode all sentences in a single call.
test_content_encode = bc.encode(content)

In [None]:
# Append the test sentence encodings and the per-article sentence counts to
# the HDF5 file.
with h5py.File('encode_news.h5', 'a') as store:
    for key, values in (("test_content_encode", test_content_encode),
                        ("test_num_sentences", np.array(num_sentences_test))):
        store.create_dataset(key, data=values)

## Read encoded data:

In [None]:
# Read the complete 'title_encode' dataset from title_encode.h5.
# NOTE(review): the cells above write to 'encode_news.h5' under the keys
# 'train_title_encode' / 'test_title_encode', while this cell reads
# 'title_encode.h5' / 'title_encode' - confirm which file is intended.
with h5py.File('title_encode.h5', 'r') as hf:
    title_encode = hf['title_encode'][:]
    #text_encode = hf['text_encode'][:]
    #num_sentences = hf['num_sentences'][:]

Preparing the data for the neural network (NN):

In [None]:
# Wrap the title encodings in a DataFrame and show its dimensions.
title_NN = pd.DataFrame(title_encode)
title_NN.shape

# Verkleinerung des Designspace mithilfe einer PCA:

In [None]:
# Fit a PCA (constructed without arguments) on the title encodings; the
# fitted model is reused by the following cells.
modell = PCA()
modell.fit(title_NN)

In [None]:
import matplotlib.pyplot as plt

# Cumulative share of variance explained by the first k principal components.
cumulative_ratio = np.cumsum(modell.explained_variance_ratio_)
# Take the x range from the fitted model instead of hard-coding 768, so the
# plot also works when the PCA yields a different number of components.
plt.plot(range(len(cumulative_ratio)), cumulative_ratio)
plt.xlabel("Hauptkomponente")
plt.ylabel("Kummulierte Varianz")
plt.show()

Es gibt drei Methoden, um zu entscheiden, wie viele Hauptkomponenten ausreichend sind:

1. Methode: Ein bestimmter Prozentsatz der Varianz soll erklärt sein:

In [None]:
# Count the leading components whose cumulative explained variance is still
# below 0.8; one more component is needed to reach the threshold.
cumulative_ratio = np.cumsum(modell.explained_variance_ratio_)
n_below_threshold = np.count_nonzero(cumulative_ratio < 0.8)
print("Anzahl der Hauptkomponenten für erklärte Varianz größer 0.8: ", n_below_threshold + 1)

2. Methode: Die Hauptkomponenten, die mehr als die mittlere Varianz erklären:

In [None]:
# Count the components that explain more variance than the average component.
variance_ratio = modell.explained_variance_ratio_
n_above_mean = np.count_nonzero(variance_ratio > np.mean(variance_ratio))
print("Anzahl der Hauptkomponenten größer der mittleren erklärten Varainz: ", n_above_mean)

3. Methode: Nutzen des Scree-Plots:

In [None]:
# Scree plot: explained variance per principal component, zoomed to the
# first 200 components.
variance_ratio = modell.explained_variance_ratio_
# Take the x range from the fitted model instead of hard-coding 768, so the
# plot also works when the PCA yields a different number of components.
plt.plot(range(len(variance_ratio)), variance_ratio)
plt.xlabel("Hauptkomponente")
plt.ylabel("erklärte Varianz")
plt.xlim(0, 200)
plt.show()

Der Scree-Plot legt nahe, $50$–$75$ Hauptkomponenten zu nutzen. Danach flacht der Graph sehr stark ab.