In [28]:
# data processing tools
import string, os 
import pandas as pd
import numpy as np
np.random.seed(42)

import random 

# keras module for building LSTM 
import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.utils as ku 
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# surpress warnings
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning) # Ignore warnings from libraries. 

#import sys
#sys.path.append("..")
#import utils.requirement_functions as rf

## Ross functions

In [38]:
def clean_text(txt): # return vocab if it is not part of string.punctuation 
    # string.punctuation is a python model. ( a list of all string characters that er punctuations /%&¤#";:_-.,*")
    txt = "".join(v for v in txt if v not in string.punctuation).lower() # Making lower case 
    txt = txt.encode("utf8").decode("ascii",'ignore') # encoding utf8
    return txt 

def get_sequence_of_tokens(tokenizer, corpus):
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus: # every head 
        token_list = tokenizer.texts_to_sequences([line])[0] # list of tokens 
        for i in range(1, len(token_list)): # order dem sequentialy
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences

def generate_padded_sequences(input_sequences):
    # get the length of the longest sequence
    max_sequence_len = max([len(x) for x in input_sequences])
    # make every sequence the length of the longest on
    input_sequences = np.array(pad_sequences(input_sequences, 
                                            maxlen=max_sequence_len, 
                                            padding='pre'))

    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, 
                            num_classes=total_words)
    return predictors, label, max_sequence_len

def create_model(max_sequence_len, total_words): # model initilisation 
    input_len = max_sequence_len - 1
    model = Sequential() # sequential model
    # Add Input Embedding Layer
    model.add(Embedding(total_words, #
                        10, 
                        input_length=input_len))
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100)) # long short term memory
    model.add(Dropout(0.1)) # drop out layer, during training everytime you make an iteration 10% of the weights should be removed. 
    # so every iteration is only 90 %. Making things a bit more diffiuclt for the model 
    # Add Output Layer
    model.add(Dense(total_words, 
                    activation='softmax')) # Softmax prediction.
    model.compile(loss='categorical_crossentropy', 
                    optimizer='adam')
    
    return model

def generate_text(seed_text, next_words, model, max_sequence_len): # seed_text = prompt.
    for _ in range(next_words): # for how ever many in next_word.
        token_list = tokenizer.texts_to_sequences([seed_text])[0] # get vocab 
        token_list = pad_sequences([token_list],  # pad it (zeros)
                                    maxlen=max_sequence_len-1, 
                                    padding='pre')
        predicted = np.argmax(model.predict(token_list), # predict the next words with higest score.
                                            axis=1)
        
        output_word = ""
        for word,index in tokenizer.word_index.items(): # appending words together. 
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [39]:
# Loading data 
print("Loading data")
data_dir = os.path.join("..","data", "news_data")


Loading data


In [40]:
# Appending columns 
all_comments = []
for filename in os.listdir(data_dir):
    if 'Comments' in filename:
        comment_df = pd.read_csv(data_dir + "/" + filename) # joining data_dir / filename. ( Creating dataframe)
        all_comments.extend(list(comment_df["commentBody"].values)) # Creating a list of only comments. 

In [41]:
len(all_comments)

2176364

## MAking a new list with 1000 random

In [42]:
thousand_comments = random.sample(all_comments, 1000)

In [43]:
len(thousand_comments)

1000

# Cleaning text

In [44]:
corpus = [clean_text(x) for x in thousand_comments]
corpus[:10]

['hi iq explain then why wyoming has 2 senators and california has 2 senators brbrif you were a high iq foreign country say which state senators would be easiest to buy with your money and which presidential candidate would you supportbrbras an exercise for the highest iq cabinet ever study the campaign contributions to the low population states visavis the high population states look at the  per vote by state by foreign interest by foreign aid ok smarty pants',
 'memo to the trump administratiionbrbrif you cant learn to live with reality reality will come to live with you',
 'neither you nor i have much power to do anything about this manchild but congressional republicans do when will they put country before party and stop protecting the president',
 'they tell us if we see something say something  but they didnt say what would happen afterwards apparently nothing  this is shatteringly sad',
 'actually no  their competition in china actually beat uber at this game and forcing uber to

## Tokenize

In [45]:
tokenizer = Tokenizer()
## tokenization
tokenizer.fit_on_texts(corpus) # tokenizing the text, and gives every word an index. Creating a vocab.
total_words = len(tokenizer.word_index) + 1 # how many total words are there. The reason for + 1 is to account for  = out of vocabulary token. if the tensorflow does not know the word. <unk> unknown word.
total_words

10957

In [46]:
inp_sequences = get_sequence_of_tokens(tokenizer, corpus)
inp_sequences[:10] # Each document has multiple rows. 1-2, 1-2-3, 1-2-3-4 words (n-grams)
# Teaching the model to account to longer distances. 

[[4588, 2139],
 [4588, 2139, 718],
 [4588, 2139, 718, 91],
 [4588, 2139, 718, 91, 76],
 [4588, 2139, 718, 91, 76, 4589],
 [4588, 2139, 718, 91, 76, 4589, 37],
 [4588, 2139, 718, 91, 76, 4589, 37, 509],
 [4588, 2139, 718, 91, 76, 4589, 37, 509, 809],
 [4588, 2139, 718, 91, 76, 4589, 37, 509, 809, 3],
 [4588, 2139, 718, 91, 76, 4589, 37, 509, 809, 3, 581]]

## Pad

In [47]:
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences) 
# All inputs need to be same lenght. 
# adding zeros to the start of shorted sequences 
# predictors = input vectors 
# labels = words 

In [48]:
max_sequence_len # 264

264

## Model

In [49]:
model = create_model(max_sequence_len, total_words)
model.summary()

2023-03-24 16:42:37.168958: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-24 16:42:37.169022: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-24 16:42:37.169071: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (j-688397-job-0): /proc/driver/nvidia/version does not exist
2023-03-24 16:42:37.170068: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 263, 10)           109570    
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 10957)             1106657   
                                                                 
Total params: 1,260,627
Trainable params: 1,260,627
Non-trainable params: 0
_________________________________________________________________


In [52]:
history = model.fit(predictors, 
                    label, 
                    epochs=10, # prev. 100
                    batch_size=128, # prev. 128 # Updates weights after 128 
                    verbose=1)

# In notebooks, a models history is saved. So if the model has run one time with 100 epoch and you start it again it will run for 200 intotal.
# You either need to create the model again ( Above chunck) or use tensor flow functiion clear history.

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [54]:
print (generate_text("Hello", 10, model, max_sequence_len)) # word you want, words to come after, model, make the sequence 24 in total.

Hello Is The Same Of The Us Of The Us Of
