In [1]:
# GLOVE download reference https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer/blob/master/README.md

import pandas as pd
import numpy as np
import os
import nltk
import re
from datetime import date

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense ,Dropout,LSTM, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.losses import sparse_categorical_crossentropy

from tensorflow.keras.losses import sparse_categorical_crossentropy

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# is a gpu available for training ?
tf.test.is_gpu_available()

True

In [3]:
tf.__version__

'2.0.0-alpha0'

In [4]:
# list of unwanted characters
badlist = pd.read_csv('data/badlist_de', header=None)[0].tolist()

In [5]:
import re

# add spaces to certain special characters
unspaced = r'[\[\]\'\'\”\(\)\.\,\/\?\-\!\"\;|:]'
spaced =  ' \g<0> '



In [6]:
def clean_line(line):
    """
    remove stray characters
    """
    for badchar in badlist:
        if badchar in line:
            line = re.sub(badchar,'',line)
        line = re.sub(unspaced, spaced, line)
        line = re.sub("\n" ," \n", line)
        for n in range(2):
            line = re.sub('  ',' ',line)
        
    return line

def tokenize(sentence):
    """
    simple tokenization keeping the line seperators etc.
    """
    sentence = clean_line(sentence)
    return sentence.lower().split(' ') #sentence.lower().split(' ')

In [7]:
clean_line('Warum die royale Hochzeit zum Desaster werden könnte')

'Warum die royale Hochzeit zum Desaster werden könnte'

In [10]:
# read the raw text files and clean the lines

all_episodes_by_sentence = []

seperators =  [' \t\n', '\t\n']

with open('Newssnippets.txt',encoding='utf-8' ) as in_raw:
    # start token
    for (i, line) in enumerate(in_raw):
        if not line in seperators:
            all_episodes_by_sentence.append(clean_line(line) )
        

In [11]:
all_episodes_by_sentence[243]

'Herzogin Meghan Prinzessin Beatrice : Showdown in London : Warum die royale Hochzeit zum Desaster werden könnte \n'

In [12]:
# combine text to create a single string for sliceshifting
word_dict = {}
used_words = []
text = []
for sent in all_episodes_by_sentence:
    for word in tokenize(sent):
        #if word in word2idx :
        if not word in word_dict:
            word_dict[word] = 0
        word_dict[word] +=1
        text.append(word)
#text = [tokenize(sent) for sent in all_episodes_by_sentence]

In [13]:
# total number of words
len(word_dict )

20502

In [14]:
# test if we can reduce the number of words
data_quant = pd.DataFrame.from_dict(word_dict, orient='index').sort_values(by=0, ascending=False)

In [15]:
data_quant.shape

(20502, 1)

In [16]:
#data_quant.tail(50) # looking clean, but many words with only one occurance

In [17]:
# only 5760 words occur more than 3 times
data_quant.loc[data_quant[0] > 2].shape

(7581, 1)

In [19]:
allowed_vocabulary = data_quant.loc[data_quant[0] > 2].index.tolist()


word2idx = {}
idx2word = {}
for count, word in enumerate(allowed_vocabulary):
    word2idx[word] = count
    idx2word[count] = word

In [20]:
word2idx

{'.': 0,
 '\n': 1,
 '-': 2,
 ':': 3,
 'der': 4,
 ',': 5,
 '2020': 6,
 'die': 7,
 'und': 8,
 '02': 9,
 'in': 10,
 'de': 11,
 'mit': 12,
 'den': 13,
 'im': 14,
 '03': 15,
 '(': 16,
 'das': 17,
 ')': 18,
 'von': 19,
 'ist': 20,
 'für': 21,
 'auf': 22,
 'bundesliga': 23,
 'sich': 24,
 'ein': 25,
 'zu': 26,
 'dem': 27,
 'am': 28,
 'hat': 29,
 'freitag': 30,
 'schauspieler': 31,
 'es': 32,
 'bei': 33,
 'er': 34,
 'film': 35,
 'des': 36,
 'nach': 37,
 '?': 38,
 'eine': 39,
 'donnerstag': 40,
 'nicht': 41,
 'als': 42,
 'mittwoch': 43,
 '15': 44,
 'auch': 45,
 'an': 46,
 '!': 47,
 'sänger': 48,
 'sie': 49,
 'aus': 50,
 'gegen': 51,
 'dienstag': 52,
 'wird': 53,
 '14': 54,
 '17': 55,
 'vor': 56,
 'montag': 57,
 'samstag': 58,
 'zum': 59,
 'florian': 60,
 '20': 61,
 'wie': 62,
 'über': 63,
 '16': 64,
 'sonntag': 65,
 '22': 66,
 '19': 67,
 'einem': 68,
 '18': 69,
 'gala': 70,
 'einen': 71,
 'stern': 72,
 '10': 73,
 'tv': 74,
 'um': 75,
 'so': 76,
 'seine': 77,
 '11': 78,
 '13': 79,
 'war': 80,
 '1

In [21]:
# compare readability, create text from only allowed words
text = []
for sent in all_episodes_by_sentence:
    for word in tokenize(sent):
        #if word in word2idx :
        if word in allowed_vocabulary:
            word_dict[word] +=1
            text.append(word)

In [22]:
# original text, first 10 sentences
all_episodes_by_sentence[0:10]

['Donnerstag 08 . 08 . 2019 17 : 07 - Übermedien \n',
 'Klatschpresse macht Klatschpresse - Opfer zum Jammerlappen \n',
 'Interview und beschwerte sich über die außerordentlichen Zumutungen der Boulevardpresse . Die nahm das als neue Munition , um gegen ihn zu schießen . Weiterlesen auf Übermedien : Klatschpresse macht Klatschpresse - Opfer zum \n',
 'Donnerstag 25 . 07 . 2019 15 : 30 - Übermedien \n',
 'Thomas Seitel im Lügentornado \n',
 'Seit einem halben Jahr ist Thomas Seitel mit Helene Fischer liiert , seitdem wird er Woche für Woche von der Klatschpresse durch den Dreck gezogen . Im Zeit Magazin hat er sich erstmals darüber geäußert , wie belastend die Berichterstattung für ihn \n',
 ' Klatschpresse . \n',
 'Donnerstag 16 . 01 . 2020 7 : 16 - GMX \n',
 'Das sagt Hugh Grant über den royalen Rückzug \n',
 'Vor allem die britische Klatschpresse kritisiert Prinz Harry und Herzogin Meghan für ihre Entscheidung . Der Schauspieler hat deswegen eine eindeutige Meinung . \n']

In [25]:
' '.join(text[0:200]) # missing a few proper nouns like Radiohead, but this should do

'donnerstag 08 . 08 . 2019 17 : 07 - übermedien \n klatschpresse macht klatschpresse - opfer zum \n interview und sich über die der . die nahm das als neue , um gegen ihn zu schießen . weiterlesen auf übermedien : klatschpresse macht klatschpresse - opfer zum \n donnerstag 25 . 07 . 2019 15 : 30 - übermedien \n thomas seitel im \n seit einem halben jahr ist thomas seitel mit helene fischer liiert , seitdem wird er woche für woche von der klatschpresse durch den gezogen . im zeit magazin hat er sich erstmals darüber geäußert , wie die für ihn \n  klatschpresse . \n donnerstag 16 . 01 . 2020 7 : 16 - gmx \n das sagt hugh grant über den rückzug \n vor allem die britische klatschpresse kritisiert prinz harry und herzogin meghan für ihre entscheidung . der schauspieler hat deswegen eine meinung . \n freitag 30 . 08 . 2019 18 : 04 - spiegel online \n und : im hause schweinsteiger - \n hugh grant ärgert sich über den britischen und helene fischer über die klatschpresse . grund zur freude gibt

In [24]:
# The unique characters in the file
vocab = sorted(allowed_vocabulary)
print ('{} unique words'.format(len(vocab)))

7581 unique words


In [27]:
# Creating a mapping from unique characters to indices
# Here we are acutally using the pretrained words, so this is not needed
word2idx = {u:i for i, u in enumerate(allowed_vocabulary)}
idx2word = np.array(vocab)

In [28]:
text_as_int = np.array([word2idx[c] for c in text])
text_as_int

array([ 40, 368,   0, ...,  20,   4,   1])

In [29]:
# The maximum length sentence we want for a single input in words
seq_length = 30
examples_per_epoch = len(text)//(seq_length+1)

In [30]:
text_as_int

array([ 40, 368,   0, ...,  20,   4,   1])

In [31]:
# Create training examples / targets
word_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [32]:
# extract sequences from character dataset
sequences = word_dataset.batch(seq_length+1, drop_remainder=True)

In [33]:

def split_shift_input(segment):
    """
    Creates the teaching data by shifting the training data on off to create labeled data
    """
    input_segment = segment[:-1]
    target_segment = segment[1:]
    return input_segment, target_segment

dataset = sequences.map(split_shift_input)

In [34]:
# set up dataset as prebatched
BATCH_SIZE = 35

# shuffle the dataset and set batch size
dataset = dataset.shuffle(10000).batch(BATCH_SIZE, drop_remainder=True)

In [35]:


# Length of the vocabulary
vocab_size = len(vocab)

# hyperparameters

# embedding dimension
embedding_dim = 300

# RNN units
rnn_units = 800


In [36]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    
    i = Input(shape=(None,), batch_size=batch_size, dtype=tf.int64 )
    # use pretrained embedding now
    
    # here is the new embedding layer

    # preset the weights as untrainable
    x = Embedding(vocab_size, embedding_dim,  trainable=True)(i)
    x = LSTM(rnn_units, 
             return_sequences=True,
             stateful=True)(x)
    x = Dense(vocab_size)(x)
    x = Dense(vocab_size)(x)

    model = Model(i,x)
    
    return model
    

In [47]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE )



In [48]:
# simple model
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(35, None)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (35, None, 300)           2274300   
_________________________________________________________________
unified_lstm_2 (UnifiedLSTM) (35, None, 800)           3523200   
_________________________________________________________________
dense_4 (Dense)              (35, None, 7581)          6072381   
_________________________________________________________________
dense_5 (Dense)              (35, None, 7581)          57479142  
Total params: 69,349,023
Trainable params: 69,349,023
Non-trainable params: 0
_________________________________________________________________


In [49]:
def loss(labels, logits):
    """
    Define loss function 
    """
    return sparse_categorical_crossentropy(labels, logits, from_logits=True)


In [50]:
model.compile(optimizer='adam', loss =loss)# loss='sparse_categorical_crossentropy' )

In [51]:
# Directory where the checkpoints will be saved
today = date.today()

checkpoint_dir = './pretrained_word_training_checkpoints_{today}'.format(today=today)


checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt") # only save last checkpoint

# define callbacks
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [54]:
EPOCHS=50

In [55]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [56]:
# for prediction, batch size has to be changed
# So reload the model and set shape to [1, None]

tf.train.latest_checkpoint(checkpoint_dir)

model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]),)

model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(1, None)]               0         
_________________________________________________________________
embedding_3 (Embedding)      (1, None, 300)            2274300   
_________________________________________________________________
unified_lstm_3 (UnifiedLSTM) (1, None, 800)            3523200   
_________________________________________________________________
dense_6 (Dense)              (1, None, 7581)           6072381   
_________________________________________________________________
dense_7 (Dense)              (1, None, 7581)           57479142  
Total params: 69,349,023
Trainable params: 69,349,023
Non-trainable params: 0
_________________________________________________________________


In [57]:
def text_gen(model, start_string, freedom=1.0, num_generate=1000):
    """
    generate text with the trained model
    
    start_string (STR):  Basis for the model to start prediction on. 
    freedom (FLOAT): Multiplier for predictions. The lower it is the lower the impact of predictive variance
    num_generate (INT): Desired text length
    """
    
    text_generated = []
    

    # vectorization of starting string
    input_eval = [word2idx[s] for s in tokenize(start_string)]
    input_eval = tf.expand_dims(input_eval, 0)

    model.reset_states()
    
    
    for i in range(num_generate):
        predictions = model(input_eval)
        
        
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        
        # using a categorical distribution to predict the character returned by the model
        predictions = predictions / freedom
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # We pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(' ') # spacing
        text_generated.append(idx2word[predicted_id])

    return (start_string + ''.join(text_generated))

In [62]:
print(text_gen(model, start_string=u"Heidi", freedom=2, num_generate=100))

Heidi 37 ) agenten auto " dietmar ! agency 1962 dau 
 07 1991 ! amigos 02 19 09 favoritin alvaro nachrichtenagentur 120 comeback hausarbeit 
 2008 110 . guardians +++ #metoo abgesagte 05 
 1933 0  0  #s04tsg 75 " abwehrspieler ! all 
 1994 mio kündigt jedem 2015 37 1 2001  94 #s04tsg 1837 " abspeck ! 54  , 
 besorgt 03 5 alleine 
 bericht ! 62 02 diverser - zufolge groß 1916 ( 53  ( besser ! achtelfinal 8 06 13 #metoo 
 100 25  0  #s04tsg 75 " 21


In [146]:
word2idx['knnte']

513