# Session 8 - Language modelling with RNNs (Text Generation)

In [1]:
# data processing tools
import string, os 
import pandas as pd
import numpy as np
np.random.seed(42)

# keras module for building LSTM 
import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.utils as ku 
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# suppress warnings: silence every Python warning so the cell outputs stay readable
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning) # explicitly ignore FutureWarning raised by libraries as well

2023-03-24 16:37:16.281199: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-24 16:37:16.413027: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-24 16:37:16.413044: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-24 16:37:17.086356: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directo

## Some helper functions

In [2]:
def clean_text(txt):
    """Normalise one headline before tokenisation.

    The steps run in this order: delete every character listed in
    ``string.punctuation``, lower-case what is left, then discard every
    character that cannot be represented in ASCII.

    Args:
        txt: The raw text (a ``str``).

    Returns:
        The cleaned, lower-cased, ASCII-only string.
    """
    # Translation table whose third argument lists the characters to delete.
    drop_punctuation = str.maketrans("", "", string.punctuation)
    lowered = txt.translate(drop_punctuation).lower()
    # Round-trip through bytes: non-ASCII characters become multi-byte UTF-8
    # sequences, and the ASCII decoder skips those bytes because of 'ignore'.
    utf8_bytes = lowered.encode("utf8")
    return utf8_bytes.decode("ascii", 'ignore')

def get_sequence_of_tokens(tokenizer, corpus):
    """Expand every text in ``corpus`` into its growing token prefixes.

    Each text is converted to a list of integer tokens with
    ``tokenizer.texts_to_sequences``. For a text of ``n`` tokens the prefixes
    of length 2, 3, ..., n are emitted, so a text with fewer than two tokens
    contributes nothing.

    Args:
        tokenizer: Object exposing ``texts_to_sequences(list_of_texts)``.
        corpus: Iterable of texts.

    Returns:
        A list of token lists (the n-gram prefixes), in corpus order.
    """
    prefixes = []
    for text in corpus:
        tokens = tokenizer.texts_to_sequences([text])[0]
        # The shortest prefix holds two tokens: one word of context plus the
        # word that follows it.
        prefixes.extend(tokens[:end] for end in range(2, len(tokens) + 1))
    return prefixes

def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the n-gram sequences and split them into inputs and one-hot labels.

    Every sequence is left-padded with zeros to the length of the longest
    one. The last token of each padded row becomes the label (the word to
    predict); all tokens before it become the predictors.

    Args:
        input_sequences: Non-empty list of integer token lists.
        num_classes: Width of the one-hot label vectors. When omitted, the
            notebook-level ``total_words`` variable is used, so existing
            one-argument calls keep working. Passing it explicitly removes
            the dependency on that global.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: the 2-D array of
        padded inputs without their last column, the one-hot encoded last
        column, and the padded sequence length.

    Raises:
        ValueError: If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        # Fail with a clear message instead of the bare "max() arg is an
        # empty sequence" error.
        raise ValueError("input_sequences is empty; there is nothing to pad")
    if num_classes is None:
        num_classes = total_words

    # get the length of the longest sequence
    max_sequence_len = max(len(seq) for seq in input_sequences)
    # make every sequence the length of the longest one (zeros go in front)
    padded = np.asarray(pad_sequences(input_sequences,
                                      maxlen=max_sequence_len,
                                      padding='pre'))

    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label,
                              num_classes=num_classes)
    return predictors, label, max_sequence_len

def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    Layers, in order: a trainable 10-dimensional embedding, an LSTM with 100
    units, 10% dropout, and a softmax layer with one output per vocabulary
    index.

    Args:
        max_sequence_len: Length of the padded sequences including the label
            position; the model input is one token shorter.
        total_words: Vocabulary size, used for both the embedding input
            dimension and the number of softmax outputs.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer).
    """
    # The last token of every padded sequence is the label, not an input.
    context_len = max_sequence_len - 1
    layers = [
        # Input embedding layer: token index -> 10-dimensional vector
        Embedding(total_words, 10, input_length=context_len),
        # Hidden layer: long short-term memory with 100 units
        LSTM(100),
        # During training, randomly zero 10% of the LSTM outputs on each
        # step, which makes the task slightly harder and reduces overfitting.
        Dropout(0.1),
        # Output layer: probability distribution over the vocabulary
        Dense(total_words, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(loss='categorical_crossentropy',
                    optimizer='adam')
    return network

def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    On every step the text generated so far is tokenised, left-padded to the
    model's input length, and the word with the highest predicted score is
    appended. Relies on the notebook-level ``tokenizer`` that was fitted on
    the training corpus.

    Args:
        seed_text: The prompt to continue.
        next_words: Number of words to append.
        model: Trained model whose ``predict`` returns one score per
            vocabulary index for each input row.
        max_sequence_len: Padded sequence length used in training; the model
            input is ``max_sequence_len - 1`` tokens long.

    Returns:
        The prompt followed by the generated words, in title case.
    """
    generated = seed_text
    for _ in range(next_words):
        # Tokenise the text so far and pad it (zeros in front) to the input length.
        token_list = tokenizer.texts_to_sequences([generated])[0]
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len-1,
                                   padding='pre')
        scores = model.predict(token_list)
        # The batch holds a single row, so take its arg-max as a plain int
        # instead of comparing a whole array against each vocabulary index.
        predicted_index = int(np.argmax(scores, axis=1)[0])
        # index_word is the tokenizer's reverse mapping (index -> word), so
        # no scan over word_index is needed. Index 0 (padding) has no word
        # and yields an empty string, as before.
        output_word = tokenizer.index_word.get(predicted_index, "")
        generated += " " + output_word
    return generated.title()

## Load the data

In [5]:
# Folder with the news CSV files, relative to the notebook's working directory.
# NOTE(review): the path hard-codes the repository folder name, so it only resolves
# when the notebook is run from the expected location - adjust it if the repo moves.
data_dir = os.path.join("..","..","assignment-3---rnns-for-text-generation-Augustseverinandersen","data","news_data")

We're then going to load the data one at a time and append *only* the headlines to our list of data.

In [6]:
# Collect the "headline" column of every CSV whose file name contains 'Articles'.
all_headlines = []
# os.listdir returns entries in arbitrary order; sorting keeps the corpus order
# (and therefore everything derived from it) stable between runs and machines.
for filename in sorted(os.listdir(data_dir)):
    if 'Articles' in filename:
        # os.path.join builds the path portably instead of concatenating "/".
        article_df = pd.read_csv(os.path.join(data_dir, filename))
        # Keep only the headline column and append its values to the running list.
        all_headlines.extend(article_df["headline"].tolist())

We then clean up a little bit and see how many data points we have.

In [7]:
# Drop every entry whose headline is exactly the string "Unknown", then show how many remain
all_headlines = [headline for headline in all_headlines if headline != "Unknown"]
len(all_headlines)

8603

We call our ```clean_text()``` function and then inspect the first 10 texts.

In [8]:
# Run every raw headline through clean_text, then preview the first ten results
corpus = list(map(clean_text, all_headlines))
corpus[:10]

['my beijing the sacred city',
 '6 million riders a day 1930s technology',
 'seeking a crossborder conference',
 'questions for despite the yuck factor leeches are big in russian medicine',
 'who is a criminal',
 'an antidote to europes populism',
 'the cost of a speech',
 'degradation of the language',
 'on the power of being awful',
 'trump garbles pitch on a revised health bill']

## Tokenize

We're then going to tokenize our data, using the ```Tokenizer()``` class from ```TensorFlow```, about which you can read more [here](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer).

We then use the ```get_sequence_of_tokens()``` function we defined above, which turns every text into a sequence of tokens based on the vocabulary from the tokenizer.

In [9]:
tokenizer = Tokenizer()
## tokenization
tokenizer.fit_on_texts(corpus) # builds the vocabulary: every distinct word in the corpus gets an integer index
total_words = len(tokenizer.word_index) + 1 # vocabulary size. The + 1 is needed because word indices start at 1; index 0 is reserved and is the value used for padding, so index-based arrays need one extra slot. (No out-of-vocabulary token is configured on this Tokenizer.)

In [10]:
tokenizer.word_index # word -> index mapping, ordered by frequency (index 1 is the most frequent word)

{'the': 1,
 'a': 2,
 'to': 3,
 'of': 4,
 'in': 5,
 'for': 6,
 'and': 7,
 'on': 8,
 'is': 9,
 'trump': 10,
 'with': 11,
 'new': 12,
 'at': 13,
 'how': 14,
 'what': 15,
 'you': 16,
 'an': 17,
 'from': 18,
 'as': 19,
 'it': 20,
 'trumps': 21,
 'its': 22,
 'your': 23,
 'are': 24,
 'be': 25,
 'not': 26,
 'us': 27,
 'season': 28,
 'that': 29,
 'by': 30,
 'about': 31,
 'but': 32,
 'can': 33,
 'episode': 34,
 'do': 35,
 'this': 36,
 'when': 37,
 'york': 38,
 'up': 39,
 'over': 40,
 'why': 41,
 'no': 42,
 'i': 43,
 'out': 44,
 'more': 45,
 'my': 46,
 'after': 47,
 'will': 48,
 'may': 49,
 'we': 50,
 'or': 51,
 'war': 52,
 'who': 53,
 'his': 54,
 'health': 55,
 'teaching': 56,
 'questions': 57,
 'now': 58,
 'president': 59,
 'was': 60,
 'one': 61,
 'house': 62,
 'get': 63,
 'today': 64,
 'have': 65,
 'should': 66,
 'into': 67,
 'home': 68,
 'all': 69,
 'dont': 70,
 'life': 71,
 'our': 72,
 'has': 73,
 'plan': 74,
 'good': 75,
 'first': 76,
 'gop': 77,
 '1': 78,
 'says': 79,
 'like': 80,
 'white'

In [11]:
# Expand each headline into its growing token prefixes (n-grams).
inp_sequences = get_sequence_of_tokens(tokenizer, corpus)
inp_sequences[:10] # Each headline yields several rows: tokens 1-2, 1-2-3, 1-2-3-4, ... (n-grams)
# Prefixes of increasing length teach the model to use both short and long contexts.

[[46, 1601],
 [46, 1601, 1],
 [46, 1601, 1, 1951],
 [46, 1601, 1, 1951, 120],
 [122, 331],
 [122, 331, 1952],
 [122, 331, 1952, 2],
 [122, 331, 1952, 2, 125],
 [122, 331, 1952, 2, 125, 2484],
 [122, 331, 1952, 2, 125, 2484, 812]]

We then want to *pad* our input sequences to make them all the same length.

In [13]:
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences) 
# All inputs need to be the same length,
# so zeros are added to the start of the shorter sequences.
# predictors = input vectors (every token of a padded sequence except the last)
# label = one-hot encoding of the last token, i.e. the word to predict

In [None]:
max_sequence_len # 24: 23 predictor positions plus 1 label position

## Create model

We then use the ```create_model()``` function created above to initialize a model, telling the model the length of sequences and the total size of the vocabulary.

In [14]:
# Build the LSTM language model and print its layer-by-layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

2023-03-22 10:30:28.139544: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 23, 10)            112650    
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 11265)             1137765   
                                                                 
Total params: 1,294,815
Trainable params: 1,294,815
Non-trainable params: 0
_________________________________________________________________


Model training is exactly the same as last week, but instead of document labels, we're fitting the model to predict the next word.

*NB!* This will take some time to train! It took me 35 minutes on UCloud 32xCPU.

In [16]:
history = model.fit(predictors, 
                    label, 
                    epochs=100,
                    batch_size=128, # weights are updated after each batch of 128 training sequences
                    verbose=1)

# The model object keeps its trained weights for as long as the kernel is alive. Re-running this cell
# therefore continues training from the current weights (another 100 epochs, 200 in total) instead of starting over.
# To train from scratch, re-create the model first by re-running the create_model cell above
# (optionally after tf.keras.backend.clear_session() to release the old Keras state).

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
 13/405 [..............................] - ETA: 29s - loss: 3.6229

KeyboardInterrupt: 

When the model has trained, we can then use this to generate *new text*.

In [27]:
# Arguments: the prompt, how many words to generate, the trained model, and the padded sequence length (24).
print (generate_text("Nerual Networks", 10, model, max_sequence_len)) # append 10 predicted words to the prompt

Nerual Networks Says A Columnist Of A Backup Quarterback Rape Case Dies


## Using pre-trained word embeddings

Instead of having the embedding layer as a trainable parameter, we can instead use a *pretrained word embedding* model like ```word2vec```.

In the following examples, we're using [GloVe embeddings](https://nlp.stanford.edu/projects/glove/). These are trained a little differently from ```word2vec``` but they behave in the same way.

In [None]:
# Placeholder path - point this at the GloVe vectors file before running.
path_to_glove_file = os.path.join("path/to/glove/vectors")

# Map every word in the GloVe file to its vector.
embeddings_index = {}
# The GloVe files are UTF-8 text. Passing the encoding explicitly avoids
# decode errors on platforms whose default encoding is something else.
with open(path_to_glove_file, encoding="utf8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ...": split off the word, parse the
        # remainder as a float vector.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

We can define some variables that we're going to use later.

With hits and misses, we're counting how many words in the corpus vocabulary have a corresponding GloVe embedding; misses are the words which appear in our vocabulary but which do not have a GloVe embedding.

In [None]:
num_tokens = total_words # number of rows in the embedding matrix: one per token index, including index 0
embedding_dim = 100 # NOTE(review): presumably must equal the vector length of the GloVe file loaded above - confirm against the file used
hits = 0 # vocabulary words that have a GloVe vector
misses = 0 # vocabulary words without a GloVe vector

In [None]:
# Prepare embedding matrix
# Reset the counters in this cell so that re-running it does not add to the
# totals left over from a previous run.
hits = 0
misses = 0
# Every row starts as zeros. Rows for words without a GloVe vector are never
# assigned and stay all-zeros, as does row 0, which no word index maps to.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Copy the pretrained vector into the row for this word's index.
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word model on top of frozen pretrained embeddings.

    This definition replaces the ``create_model`` defined earlier in the
    notebook. It reads the notebook-level ``embedding_dim`` and
    ``embedding_matrix`` variables, so the embedding-matrix cell must have
    been run first.

    Args:
        max_sequence_len: Length of the padded sequences including the label
            position; the model input is one token shorter.
        total_words: Vocabulary size, used for both the embedding input
            dimension and the number of softmax outputs.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer).
    """
    # The last token of every padded sequence is the label, not an input.
    context_len = max_sequence_len - 1

    # Input embedding layer - unlike the first model, its weights are
    # initialised from the pretrained matrix and excluded from training.
    pretrained_embedding = Embedding(
        total_words,
        embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
        input_length=context_len,
    )

    network = Sequential([
        pretrained_embedding,
        # Hidden layer: LSTM with 500 units, followed by 20% dropout
        LSTM(500),
        Dropout(0.2),
        # Output layer: probability distribution over the vocabulary
        Dense(total_words, activation='softmax'),
    ])
    network.compile(loss='categorical_crossentropy',
                    optimizer='adam')
    return network

In [None]:
# Build the model again, this time with the create_model defined just above (pretrained embeddings).
# Note that this rebinds `model`, replacing the model trained earlier in the notebook.
model = create_model(max_sequence_len, total_words)
model.summary()

In [None]:
# Train the pretrained-embedding model with the same settings as the first model.
history = model.fit(predictors, 
                    label, 
                    epochs=100,
                    batch_size=128, 
                    verbose=1)

In [None]:
# Generate 30 words after the prompt "china" with the most recently trained model.
print (generate_text("china", 30, model, max_sequence_len))