In [1]:
#import

# data processing tools
import string, os 
import pandas as pd
import numpy as np
np.random.seed(42)


# keras module for building LSTM 
import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.utils as ku 
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# surpress warnings
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

2023-04-16 13:11:32.555437: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-16 13:11:35.516187: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-16 13:11:35.518302: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#helper functions
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

def get_sequence_of_tokens(tokenizer, corpus):
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    total_words = len(tokenizer.word_index) + 1
    return input_sequences, total_words

def generate_padded_sequences(input_sequences):
    # get the length of the longest sequence
    max_sequence_len = max([len(x) for x in input_sequences])
    # make every sequence the length of the longest on
    input_sequences = np.array(pad_sequences(input_sequences, 
                                            maxlen=50, 
                                            padding='pre'))

    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, 
                            num_classes=total_words)
    return predictors, label, max_sequence_len

def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 
                        10, 
                        input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, 
                    activation='softmax'))

    model.compile(loss='categorical_crossentropy', 
                    optimizer='adam')
    
    return model

def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], 
                                    maxlen=max_sequence_len-1, 
                                    padding='pre')
        predicted = np.argmax(model.predict(token_list),
                                            axis=1)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [3]:
data_dir = os.path.join("/work/cds-lang/data/")

In [4]:
all_comments = []
for filename in os.listdir(data_dir):
    if 'Comments' in filename:
        article_df = pd.read_csv(data_dir + filename)
        all_comments.extend(list(article_df["commentBody"].values))

In [5]:
all_comments = [h for h in all_comments if h != "Unknown"]
len(all_comments)
all_comments[:10]


['If the choice is between mining for bitcoin - which wastes hydroelectric power and takes up empty office space -- or mining for gold &amp; diamonds - which is hugely destructive to people and the environment in developing countries - I think we can live with bitcoin mining.',
 "<br/>To me, Bitcoin (et al) appears to be an expensive game a number of speculative people, many who want to hide their money, are playing. <br/><br/>The problem: it is only etherial math. Neat, but not worth anything if people get turned off the game by losing money. A matter of time, I believe, because it's too much like a Ponzi scheme.",
 'Bitcoin is a pyramid scheme backed by nothing and meaning nothing.  It is useful to criminal enterprises, terrorists and those bent on evading taxes. It is not a generally accepted form of currency and it will never be.  Yes, we are wasting huge amounts of economic potential on financial engineering instead of building infrastructure, devising new architectural methods, a

In [6]:
corpus = [clean_text(x) for x in all_comments]
corpus[:10]

['if the choice is between mining for bitcoin  which wastes hydroelectric power and takes up empty office space  or mining for gold amp diamonds  which is hugely destructive to people and the environment in developing countries  i think we can live with bitcoin mining',
 'brto me bitcoin et al appears to be an expensive game a number of speculative people many who want to hide their money are playing brbrthe problem it is only etherial math neat but not worth anything if people get turned off the game by losing money a matter of time i believe because its too much like a ponzi scheme',
 'bitcoin is a pyramid scheme backed by nothing and meaning nothing  it is useful to criminal enterprises terrorists and those bent on evading taxes it is not a generally accepted form of currency and it will never be  yes we are wasting huge amounts of economic potential on financial engineering instead of building infrastructure devising new architectural methods and rising to the challenges of global 

In [7]:
## tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1


In [8]:
inp_sequences, total_words = get_sequence_of_tokens(tokenizer, corpus)
inp_sequences[:10]

[[37, 1],
 [37, 1, 591],
 [37, 1, 591, 6],
 [37, 1, 591, 6, 279],
 [37, 1, 591, 6, 279, 3680],
 [37, 1, 591, 6, 279, 3680, 9],
 [37, 1, 591, 6, 279, 3680, 9, 13732],
 [37, 1, 591, 6, 279, 3680, 9, 13732, 79],
 [37, 1, 591, 6, 279, 3680, 9, 13732, 79, 16598],
 [37, 1, 591, 6, 279, 3680, 9, 13732, 79, 16598, 27307]]

In [9]:
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [1]:
#CREATE MODEL
model = create_model(max_sequence_len, total_words)
model.summary()

history = model.fit(predictors, 
                    label, 
                    epochs=10,
                    batch_size=8, 
                    verbose=1)

print (generate_text("danish", 5, model, max_sequence_len))


NameError: name 'create_model' is not defined