In [1]:
# data processing tools
import string, os 
import pandas as pd
import numpy as np
np.random.seed(42)

import random 

# keras module for building LSTM 
import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.utils as ku 
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# surpress warnings
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning) # Ignore warnings from libraries. 

#import sys
#sys.path.append("..")
#import utils.requirement_functions as rf

2023-03-25 19:52:58.825576: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-25 19:52:58.970638: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-25 19:52:58.970655: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-25 19:52:59.687268: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directo

## Ross functions

In [2]:
def clean_text(txt): # return vocab if it is not part of string.punctuation 
    # string.punctuation is a python model. ( a list of all string characters that er punctuations /%&¤#";:_-.,*")
    txt = "".join(v for v in txt if v not in string.punctuation).lower() # Making lower case 
    txt = txt.encode("utf8").decode("ascii",'ignore') # encoding utf8
    return txt 

def get_sequence_of_tokens(tokenizer, corpus):
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus: # every head 
        token_list = tokenizer.texts_to_sequences([line])[0] # list of tokens 
        for i in range(1, len(token_list)): # order dem sequentialy
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences

def generate_padded_sequences(input_sequences):
    # get the length of the longest sequence
    max_sequence_len = max([len(x) for x in input_sequences])
    # make every sequence the length of the longest on
    input_sequences = np.array(pad_sequences(input_sequences, 
                                            maxlen=max_sequence_len, 
                                            padding='pre'))

    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, 
                            num_classes=total_words)
    return predictors, label, max_sequence_len

def create_model(max_sequence_len, total_words): # model initilisation 
    input_len = max_sequence_len - 1
    model = Sequential() # sequential model
    # Add Input Embedding Layer
    model.add(Embedding(total_words, #
                        10, 
                        input_length=input_len))
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100)) # long short term memory
    model.add(Dropout(0.1)) # drop out layer, during training everytime you make an iteration 10% of the weights should be removed. 
    # so every iteration is only 90 %. Making things a bit more diffiuclt for the model 
    # Add Output Layer
    model.add(Dense(total_words, 
                    activation='softmax')) # Softmax prediction.
    model.compile(loss='categorical_crossentropy', 
                    optimizer='adam')
    
    return model

def generate_text(seed_text, next_words, model, max_sequence_len): # seed_text = prompt.
    for _ in range(next_words): # for how ever many in next_word.
        token_list = tokenizer.texts_to_sequences([seed_text])[0] # get vocab 
        token_list = pad_sequences([token_list],  # pad it (zeros)
                                    maxlen=max_sequence_len-1, 
                                    padding='pre')
        predicted = np.argmax(model.predict(token_list), # predict the next words with higest score.
                                            axis=1)
        
        output_word = ""
        for word,index in tokenizer.word_index.items(): # appending words together. 
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [3]:
# Loading data 
def filepath ():
    print("Loading data")
    data_dir = os.path.join("..","data", "news_data")
    return data_dir

In [5]:
# Appending columns
def creating_list(file_path): 
    all_comments = []
    for filename in os.listdir(file_path):
        if 'Comments' in filename:
            comment_df = pd.read_csv(data_dir + "/" + filename) # joining data_dir / filename. ( Creating dataframe)
            all_comments.extend(list(comment_df["commentBody"].values)) # Creating a list of only comments. 
    print("Amount of comments: " + len(all_comments))
    return all_comments

## MAking a new list with 1000 random

In [7]:
def data_sampling(comments_list):
    thousand_comments = random.sample(comments_list, 1000)
    print("Sample size: " + len(thousand_comments))
    return thousand_comments

# Cleaning text

In [9]:
def cleaning_comments(sample_list):
    print("Cleaning text")
    corpus = [clean_text(x) for x in sample_list]
    
    return corpus

## Tokenize

In [10]:
def tokenization(clean_data):
    print("Tokenizing")
    tokenizer = Tokenizer()
    ## tokenization
    tokenizer.fit_on_texts(clean_data) # tokenizing the text, and gives every word an index. Creating a vocab.
    total_words = len(tokenizer.word_index) + 1 # how many total words are there. The reason for + 1 is to account for  = out of vocabulary token. if the tensorflow does not know the word. <unk> unknown word.
    
    return tokenizer, total_words

In [46]:
def input_sequence_function(tokenizer, clean_data):
    print("Input sequence")
    inp_sequences = get_sequence_of_tokens(tokenizer, clean_data)
    # Each document has multiple rows. 1-2, 1-2-3, 1-2-3-4 words (n-grams)
    # Teaching the model to account to longer distances. 
    return inp_sequences

[[4588, 2139],
 [4588, 2139, 718],
 [4588, 2139, 718, 91],
 [4588, 2139, 718, 91, 76],
 [4588, 2139, 718, 91, 76, 4589],
 [4588, 2139, 718, 91, 76, 4589, 37],
 [4588, 2139, 718, 91, 76, 4589, 37, 509],
 [4588, 2139, 718, 91, 76, 4589, 37, 509, 809],
 [4588, 2139, 718, 91, 76, 4589, 37, 509, 809, 3],
 [4588, 2139, 718, 91, 76, 4589, 37, 509, 809, 3, 581]]

## Pad

In [11]:
def padded_sequences(input_sequence):
    print("Padding sequences")
    predictors, label, max_sequence_len = generate_padded_sequences(input_sequence) 
    # All inputs need to be same lenght. 
    # adding zeros to the start of shorted sequences 
    # predictors = input vectors 
    # labels = words 
    print("Max sequence length: " + max_sequence_len)
    return predictors, label, max_sequence_len

## Model

In [12]:
def create_model(sequnece_length, total_words):
    print("Creating model")
    model = create_model(sequnece_length, total_words)
    print(model.summary())
    return model

In [52]:
def training_model(model):
    print("Training model")
    history = model.fit(predictors, 
                        label, 
                        epochs=1, # prev. 100
                        batch_size=128, # Updates weights after 128 
                        verbose=1)
return history

# In notebooks, a models history is saved. So if the model has run one time with 100 epoch and you start it again it will run for 200 intotal.
# You either need to create the model again ( Above chunck) or use tensor flow functiion clear history.

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [54]:
print (generate_text("Hello", 10, model, max_sequence_len)) # word you want, words to come after, model, make the sequence 24 in total.

Hello Is The Same Of The Us Of The Us Of


In [13]:
def main_function():
    data_dir = filepath()
    all_comments = creating_list(data_dir)
    thousand_comments = data_sampling(allcomments)
    corpus = cleaning_comments(clean_text, thousand_comments)
    tokenizer, total_words = tokenization(corpus)
    inp_sequences = input_sequence_function(tokenizer, corpus)
    predictors, label, max_sequence_len = padded_sequences(inp_sequences)
    model = create_model(max_sequence_len, total_words)
    history = training_model(model)

In [14]:
main_function()

Loading data


NameError: name 'data_dir' is not defined