In [1]:
import nltk
import pandas as pd
import pickle
import re
import keras
import tensorflow as tf

# Let TensorFlow allocate GPU memory on demand instead of configuring only the
# first device. Iterating over the device list (rather than indexing element 0)
# keeps this cell runnable on CPU-only machines, where the list is empty, and
# applies the same setting to every visible GPU.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [2]:
def fix_this(text):
    """
    Return the first double-quoted span found in ``text``.

    A string that does not already begin with a double quote gets one
    prepended before searching, so its leading characters up to the next
    quote are treated as the quoted span. The returned value keeps its
    surrounding quotes; ``None`` is returned when no closing quote is found.
    """
    candidate = text if text.startswith('"') else '"' + text
    match = re.search('".*?"', candidate)
    return match.group(0) if match is not None else None

    
class NLPiper:
    """
    Takes in an iterable with strings (i.e. list of sentences)
    and, once 'hadouken' has been called, exposes each level of
    preprocessing: 'cleaned', 'tagged', 'lemmat', 'tk_lemm', 'tk_norm'.
    You can access the tokenizer inversed dictionary through 'tk_map'
    and the 'tokenizer' object. 'max_in' and 'max_out' hold the padded
    lengths of 'tk_lemm' and 'tk_norm'.

    The methods use the module-level imports of keras, nltk and
    pandas (as pd); they must be imported before the class is used.

    :param: txt: iterable with strings to preprocess.
    :param: level: 'word' or 'char', if the preprocessing is
                    word level or character level.
    """

    # Ordered (old, new) substitutions applied by 'clean_this'. The order
    # matters: specific contractions must be expanded before the generic
    # suffix rules ("'s", "n't", "n'") and before apostrophes are stripped.
    _CLEAN_RULES = (
        ('-', ' '), ('`', ''),
        ('[', ''), (']', ''),
        (',', ''),
        ("i'm", "i am"),
        ("'re", " are"),
        ("gotta", "got to"),
        ("he's", "he is"),
        ("she's", "she is"),
        ("it's", "it is"),
        ("that's", "that is"),
        ("what's", "what is"),
        ("how's", "how is"),
        ("here's", "here is"),
        ("there's", "there is"),
        ("'s", ""),
        ("'ve", " have"),
        ("'d", " would"),
        ("'ll", " will"),
        ("can't", "can not"),
        ("won't", "will not"),
        ("n't", " not"),
        ("'bout", "about"),
        ("'til", "until"),
        ("'cause", "because"),
        ("gonna", "going to"),
        ("kinda", "kind of"),
        ("n'", "ng"),
        ('"', ''), ("'", ""), (':', ''),
        ('...', ''), ('!', ''), ('?', ""),
        ('.', ""),
        ("  ", " "),
    )

    def __init__(self, txt, level='word'):
        """ Stores the texts as a pandas Series together with the level. """
        self.txt = pd.Series(txt)
        self.level = level

    def clean_this(self, x):
        """
        Takes in a string and returns it cleaned, as a list of words.

        The text is lowercased, punctuation is removed and common
        contractions are expanded. The result is also stored in
        'self.clean' (overwritten on every call).
        """
        x = x.lower().strip()
        for old, new in self._CLEAN_RULES:
            x = x.replace(old, new)
        self.clean = x.strip().split()
        return self.clean

    def pos_align(self, x):
        """
        Maps a POS tag (as produced by nltk.pos_tag) to the matching
        WordNet part-of-speech constant, based on the tag's first letter.
        Any other tag falls back to 'n'.
        """
        if x.startswith('J'):
            return nltk.corpus.wordnet.ADJ
        elif x.startswith('V'):
            return nltk.corpus.wordnet.VERB
        elif x.startswith('N'):
            return nltk.corpus.wordnet.NOUN
        elif x.startswith('R'):
            return nltk.corpus.wordnet.ADV
        else:
            return 'n'

    def lemm_this(self, x):
        """
        Takes in a list of (word, tag) pairs and returns the list of
        lemmatized words. The result is also stored in 'self.lemmas'
        (overwritten on every call).
        """
        lemmatizer = nltk.stem.WordNetLemmatizer()
        lemmas = [lemmatizer.lemmatize(word, pos=self.pos_align(tag))
                  for word, tag in x]
        self.lemmas = lemmas
        return self.lemmas

    def char_level(self, x):
        """
        Takes in a list of words and returns the list of characters of
        the words joined by single spaces (spaces included). The result
        is also stored in 'self.chars' (overwritten on every call).
        """
        self.chars = list(' '.join(x))
        return self.chars

    def hadouken(self):
        """
        Runs the whole pipeline over 'self.txt' and stores every level
        of preprocessing as an attribute. Returns nothing.
        """
        # returns cleaned texts
        self.cleaned = self.txt.apply(self.clean_this)

        # start tokenizer
        tokenizer = keras.preprocessing.text.Tokenizer()
        self.tokenizer = tokenizer

        # add tags for correct lemmatization
        self.tagged = self.cleaned.apply(nltk.pos_tag)

        # lemmatizes the texts
        self.lemmat = self.tagged.apply(self.lemm_this)

        # maintains the word level split or splits by characters
        if self.level == 'char':
            self.cleaned = self.cleaned.apply(self.char_level)
            self.lemmat = self.lemmat.apply(self.char_level)

        # fits the tokenizer on both versions so they share one vocabulary
        tokenizer.fit_on_texts(list(self.lemmat) + list(self.cleaned))

        # tokenized and lemmatized
        self.tk_lemm = keras.preprocessing.sequence.pad_sequences(
                        tokenizer.texts_to_sequences(self.lemmat),
                        padding='post')

        # tokenized but not lemmatized
        self.tk_norm = keras.preprocessing.sequence.pad_sequences(
                        tokenizer.texts_to_sequences(self.cleaned),
                        padding='post')

        # inverse map the tokenizer
        self.tk_map = {v: k for k, v in tokenizer.word_index.items()}

        # maximum input (lemmatized) and output (not lemmatized) length
        self.max_in = max(len(row) for row in self.tk_lemm)
        self.max_out = max(len(row) for row in self.tk_norm)
        

In [3]:
# Load the quotes file, keep only its first column, and reduce every row to
# its first quoted span via fix_this; rows that are missing or that contain
# no quoted span are dropped.
corpora = (
    pd.read_csv(
        '../data/quotesdrivedb.txt',
        sep=',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,',
        engine='python',
        header=None,
    )[0]
    .dropna()
    .apply(fix_this)
    .dropna()
)

# Brown corpus sentences, each re-joined from its tokens into one string.
brown = list(map(' '.join, nltk.corpus.brown.sents()))

In [4]:
# Run the full preprocessing pipeline on the Brown sentences at character
# level; the results are stored as attributes on `piper_brown`.
piper_brown = NLPiper(brown, level='char')
piper_brown.hadouken()

# NOTE(review): the quotes pipeline below is disabled, so `piper_quotes` is
# never defined in this notebook, yet the pickle-dump cell further down
# references it.
#piper_quotes = NLPiper(corpora)
#piper_quotes.hadouken()

In [42]:
# Model input is the tokenized lemmatized text; the target is the tokenized
# non-lemmatized text.
X, y = piper_brown.tk_lemm, piper_brown.tk_norm

In [43]:
# Inspect the lemmatized sentences; with level='char' each entry is a list of
# single characters (spaces included).
piper_brown.lemmat

0        [t, h, e,  , f, u, l, t, o, n,  , c, o, u, n, ...
1        [t, h, e,  , j, u, r, y,  , f, a, r,  , s, a, ...
2        [t, h, e,  , s, e, p, t, e, m, b, e, r,  , o, ...
3        [o, n, l, y,  , a,  , r, e, l, a, t, i, v, e, ...
4        [t, h, e,  , j, u, r, y,  , s, a, y,  , i, t, ...
                               ...                        
57335                 [s,  , j,  , p, e, r, e, l, m, a, n]
57336    [r, e, v, u, l, s, i, o, n,  , i, n,  , t, h, ...
57337    [t, h, e,  , d, o, o, r,  , o, f,  , t, h, e, ...
57338    [s, h, e,  , b, e,  , a,  , l, i, v, i, n, g, ...
57339    [f, r, o, m,  , w, h, a, t,  , i,  , b, e,  , ...
Length: 57340, dtype: object

In [44]:
# Padded sequence lengths of the lemmatized (input) and non-lemmatized
# (output) token arrays, as computed by NLPiper.hadouken.
max_in = piper_brown.max_in
max_out = piper_brown.max_out
# Vocabulary size used for the embedding and softmax layers.
# NOTE(review): the +1 presumably accounts for index 0, which the padding
# uses and the tokenizer's word_index does not contain — confirm.
uniques = len(piper_brown.tokenizer.word_index)+1
max_in, max_out, uniques

(966, 1004, 49)

In [45]:
# Sequence-to-sequence model mapping lemmatized character sequences (length
# max_in) to non-lemmatized character sequences (length max_out).
# NOTE(review): LSTM and Dropout are imported but not used by the active code
# in this cell (Dropout only appears in a commented-out line).
# NOTE(review): layers from the standalone `keras` package are combined with
# CuDNNLSTM from `tensorflow.compat.v1.keras`; verify that the installed
# versions allow mixing the two. CuDNNLSTM presumably requires a GPU — confirm.
from keras.layers import Embedding, Bidirectional, LSTM, Dropout, \
                         TimeDistributed, RepeatVector, Dense
from tensorflow.compat.v1.keras.layers import CuDNNLSTM

model = keras.Sequential()
# Encoder: embed each token index into 100 dimensions, then summarise the
# whole input sequence into a single vector with a bidirectional LSTM.
model.add(Embedding(uniques, 100, input_length=max_in))
model.add(Bidirectional(CuDNNLSTM(128)))
# Decoder: repeat the summary vector once per output step and decode it with
# two recurrent layers that return the full sequence.
model.add(RepeatVector(max_out))
model.add(Bidirectional(CuDNNLSTM(128, return_sequences=True)))
model.add(CuDNNLSTM(128, return_sequences=True))
#model.add(Dense(max_out/2, activation='selu'))
#model.add(Dense(uniques, activation='selu'))
#model.add(CuDNNLSTM(128, return_sequences=True))
#model.add(Dropout(0.2))
# Per-step probability distribution over the `uniques` token indices.
model.add(TimeDistributed(Dense(uniques, activation='softmax')))

model.summary()

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 966, 100)          4900      
_________________________________________________________________
bidirectional_27 (Bidirectio (None, 256)               235520    
_________________________________________________________________
repeat_vector_15 (RepeatVect (None, 1004, 256)         0         
_________________________________________________________________
bidirectional_28 (Bidirectio (None, 1004, 256)         395264    
_________________________________________________________________
cu_dnnlstm_38 (CuDNNLSTM)    (None, 1004, 128)         197632    
_________________________________________________________________
time_distributed_9 (TimeDist (None, 1004, 49)          6321      
Total params: 839,637
Trainable params: 839,637
Non-trainable params: 0
_______________________________________________

In [41]:
# Compile and train the model on integer targets (hence the sparse loss).
# NOTE(review): the optimizer argument is commented out, so the Keras default
# optimizer is used — confirm this is intended.
model.compile(#optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
    
# 20% of the samples are held out for validation.
# NOTE(review): use_multiprocessing/workers presumably only affect generator
# or Sequence inputs, not in-memory arrays like X and y — verify.
model.fit(X, y, batch_size=128, epochs=5,
          validation_split=0.2, shuffle=True,
          use_multiprocessing=True, workers=8)


Epoch 1/5

KeyboardInterrupt: 

In [11]:
# Save the current `model` to 'dislemma.h5' in the working directory.
# NOTE(review): this cell's execution count is lower than that of the cell
# defining the model, so the saved file may come from an earlier model — verify.
model.save('dislemma.h5')




In [57]:
# Encode one lemmatized sample sentence character by character, pad it to the
# width of the training input and run it through the model.
sentence = list('he good be ready')
sentence = piper_brown.tokenizer.texts_to_sequences([sentence])
sentence = keras.preprocessing.sequence.pad_sequences(
    sentence,
    padding='post',
    maxlen=X.shape[-1],
)
predictions = model.predict(sentence)
predictions.shape

(1, 1004, 49)

In [68]:
import numpy as np

# Pick the most probable token index at every output step (one argmax over
# the last axis), skip index 0 and map the remaining indices back to tokens.
pred = [
    piper_brown.tk_map[token_id]
    for token_id in np.argmax(predictions, axis=-1).ravel()
    if token_id > 0
]

In [69]:
# Show the decoded tokens (index-0 predictions were already filtered out).
pred

['h',
 'e',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 ' ',
 'e']

In [70]:
# Encoder-decoder training model built with the functional API.
# NOTE(review): `latent_dim` and `num_decoder_tokens` are not defined in any
# visible cell of this notebook; they must exist before this cell runs.
# NOTE(review): this cell rebinds `model`, replacing the Sequential model
# built earlier.
from keras.models import Model
from keras.layers import Input, LSTM, Dense


# Encoder: only the final hidden and cell states are kept; the per-step
# outputs are discarded.
# NOTE(review): the feature dimension is set to max_in (a sequence length);
# presumably it should be the number of encoder tokens — confirm.
encoder_inputs = Input(shape=(None, max_in))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder states and returns the full output
# sequence; its own final states are ignored here.
# NOTE(review): same concern as above for max_out as feature dimension.
decoder_inputs = Input(shape=(None, max_out))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Run training
# NOTE(review): `encoder_input_data`, `decoder_input_data`,
# `decoder_target_data`, `batch_size` and `epochs` are not defined in any
# visible cell of this notebook; they must exist before this cell runs.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
# 20% of the samples are held out for validation.
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)


In [None]:
# Inference models that reuse the layers of the training model.
# The encoder model maps an input sequence to its final [h, c] states.
encoder_model = Model(encoder_inputs, encoder_states)

# The decoder model takes the decoder input plus explicit initial states and
# returns the token probabilities together with the updated states, so it can
# be called step by step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [None]:
def decode_sequence(input_seq):
    """
    Decode one input sequence into a string, one character per step.

    The encoder model turns ``input_seq`` into state vectors; the decoder
    model is then called repeatedly (batch of size 1), each time picking the
    highest-scoring token and feeding it back as the next one-hot input.
    Decoding stops after a '\\n' is produced or once the decoded string is
    longer than ``max_decoder_seq_length``.

    Relies on the module-level names ``encoder_model``, ``decoder_model``,
    ``num_decoder_tokens``, ``target_token_index``,
    ``reverse_target_char_index`` and ``max_decoder_seq_length``.

    :param: input_seq: encoder input passed to ``encoder_model.predict``.
    :return: the decoded string (including the stop character, if produced).
    """
    # Initial decoder state comes from the encoder.
    states_value = encoder_model.predict(input_seq)

    # One-step target sequence holding the start character.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Take the most probable token of the last step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Stop on the stop character or when the maximum length is exceeded.
        too_long = len(decoded_sentence) > max_decoder_seq_length
        if sampled_char == '\n' or too_long:
            break

        # Feed the chosen token and the new states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return decoded_sentence

In [None]:
# `piper_quotes` only exists when the quotes pipeline has been run, and its
# construction is commented out in the cell that builds `piper_brown`. Fall
# back to None instead of failing with NameError; the two-element tuple is
# kept so the loading cell can still unpack it.
try:
    quotes_pipeline = piper_quotes
except NameError:
    quotes_pipeline = None

with open('../data/piped_corpora.pkl', 'wb') as file:
    pickle.dump((quotes_pipeline, piper_brown), file)

In [None]:
# Restore the two preprocessed pipelines written by the pickle-dump cell.
# NOTE: unpickling can execute arbitrary code; only load this file if it was
# produced by this notebook or comes from a trusted source.
with open('../data/piped_corpora.pkl', 'rb') as file:
    piper_quotes, piper_brown = pickle.load(file)

In [None]:
# Split sentences of (word, tag) pairs into two parallel nested lists: the
# words and the lowercased tags. Each tag is lowercased individually; calling
# .lower() on the per-sentence list raises AttributeError.
# NOTE(review): `y` is assigned from `piper_brown.tk_norm` earlier in this
# notebook, which holds padded integer arrays rather than (word, tag) pairs;
# presumably a tagged series (e.g. the `tagged` attribute) was meant — confirm.
text = [[word for word, _tag in sent] for sent in y]

tags = [[tag.lower() for _word, tag in sent] for sent in y]

48