<a href="https://colab.research.google.com/github/WomenInDataScience-Seattle/FortuneCookie/blob/master/FortuneCookieModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TODO: Add the pre-trained embedding layer to our model following this guide: https://keras.io/examples/pretrained_word_embeddings/



In [17]:
from io import StringIO

import os

import numpy as np
import pandas as pd
import requests
# Download the fortune-cookie training CSV and parse it into a DataFrame.
url='https://raw.githubusercontent.com/WomenInDataScience-Seattle/Machine_Learning_Projects/master/FortuneCookie/training_data/data.csv'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of feeding an HTML error page to read_csv.
response.raise_for_status()
s = response.text

c = pd.read_csv(StringIO(s))

In [18]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
from scipy.spatial.distance import cdist

In [42]:

# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding, LSTM, Dropout
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant
import keras.utils as ku 

In [20]:
# Directory that holds the GloVe vectors. Defaults to the original hard-coded
# path, but can be redirected with the FORTUNE_COOKIE_DATA_DIR environment
# variable so the notebook also runs on machines with a different layout.
BASE_DIR = os.environ.get('FORTUNE_COOKIE_DATA_DIR', '/home/jovyan/training_data/')
GLOVE_DIR = os.path.join(BASE_DIR, '')
# NOTE(review): not read by any of the visible notebook cells.
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
# NOTE(review): far longer than any padded fortune sequence; confirm before using it.
MAX_SEQUENCE_LENGTH = 1000
# Upper bound on the number of vocabulary rows in the embedding matrix.
MAX_NUM_WORDS = 20000
# Width of each word vector; matches the glove.6B.100d file read by the notebook.
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [21]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup.
print('Indexing word vectors.')

embeddings_index = {}
# The GloVe file is UTF-8; stating the encoding avoids decode errors on
# platforms whose default text encoding is something else.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <c1> <c2> ...": split off the word, parse the rest.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


Indexing word vectors.
Found 400000 word vectors.


Prepare embedding matrix

In [22]:
# random-word used to generate the first word in the sequence
# NOTE(review): the install is unpinned; the captured output of this cell shows
# random-word 1.0.4 being installed.
!pip install random-word
from random_word import RandomWords

Collecting random-word
  Using cached https://files.pythonhosted.org/packages/95/0d/c672ff7d6e36f88e60cbdd669f1247041fb7a45f3e2368d314f65b1dd933/Random_Word-1.0.4-py3-none-any.whl
Collecting nose (from random-word)
  Using cached https://files.pythonhosted.org/packages/15/d8/dd071918c040f50fa1cf80da16423af51ff8ce4a0f2399b7bf8de45ac3d9/nose-1.3.7-py3-none-any.whl
Installing collected packages: nose, random-word
Successfully installed nose-1.3.7 random-word-1.0.4


In [23]:
c.head(5)

Unnamed: 0,Fortune Cookie Quotes,Unnamed: 1
0,Be a generous friend and a fair enemy,
1,Never quit!,
2,"Old friends, old wines and old gold are best",
3,"If your desires are not extravagant, they will...",
4,Every Friend Joys in your Success,


In [24]:
# Keep only the quote text column of the downloaded CSV.
fortune_data = c['Fortune Cookie Quotes']

In [25]:
fortune_data.head(5)

0                Be a generous friend and a fair enemy
1                                         Never quit! 
2        Old friends, old wines and old gold are best 
3    If your desires are not extravagant, they will...
4                   Every Friend Joys in your Success 
Name: Fortune Cookie Quotes, dtype: object

In [26]:
fortune_data[1]

'Never quit! '

In [27]:
fortune_data[36]

'Let your heart make your decisions - it does not get as confused as your head. '

In [28]:
# Normalise the quotes: lower-case first, then trim leading/trailing whitespace.
cleaned_df = fortune_data.str.lower()
cleaned_df2 = cleaned_df.str.strip()

In [29]:
# Remove rows with a missing quote; the remaining rows keep their original index labels.
dropped = cleaned_df2.dropna()

In [30]:
dropped.tail(5)

1188    your quick wits will get you out of a tough si...
1189                       your reputation is your wealth
1190                  your success will astonish everyone
1191    your talents will be recognized and suitably r...
1192    your work interests can capture the highest st...
Name: Fortune Cookie Quotes, dtype: object

In [31]:
# Alias for the lower-cased, stripped, NaN-free quotes (no copy is made).
cleaned_fortunes = dropped

In [32]:
cleaned_fortunes.head(5)

0                be a generous friend and a fair enemy
1                                          never quit!
2         old friends, old wines and old gold are best
3    if your desires are not extravagant, they will...
4                    every friend joys in your success
Name: Fortune Cookie Quotes, dtype: object

In [33]:
cleaned_fortunes[3]

'if your desires are not extravagant, they will be granted'

In [34]:
cleaned_fortunes[0]

'be a generous friend and a fair enemy'

In [35]:
# The cleaned quotes are the training corpus passed to the tokenizer.
corpus = cleaned_fortunes

In [38]:
# Shared module-level tokenizer: fitted inside get_sequence_of_tokens and read
# again by the text-generation functions.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer on ``corpus`` and expand it into n-gram prefixes.

    Every line is encoded to token ids, and each prefix of that line with at
    least two tokens becomes one training sequence (the final token is the
    word to predict later).

    Args:
        corpus: iterable of text lines.

    Returns:
        (prefix_sequences, vocabulary_size, tokenizer), where vocabulary_size
        is ``len(tokenizer.word_index) + 1`` and ``tokenizer`` is the
        module-level tokenizer that was fitted.
    """
    # Learn the vocabulary from the whole corpus before encoding anything.
    tokenizer.fit_on_texts(corpus)
    vocab_size = 1 + len(tokenizer.word_index)

    # Encode lazily, one line at a time, in corpus order.
    encoded_lines = (tokenizer.texts_to_sequences([text])[0] for text in corpus)
    prefixes = [
        tokens[:end]
        for tokens in encoded_lines
        for end in range(2, len(tokens) + 1)
    ]
    return prefixes, vocab_size, tokenizer

# Build the n-gram training prefixes. `tokenizer1` is the same object as the
# module-level `tokenizer`, which the function returns after fitting it.
inp_sequences, total_words, tokenizer1 = get_sequence_of_tokens(corpus)
# Show the first ten prefixes as a sanity check.
inp_sequences[:10]


[[10, 6],
 [10, 6, 345],
 [10, 6, 345, 128],
 [10, 6, 345, 128, 8],
 [10, 6, 345, 128, 8, 6],
 [10, 6, 345, 128, 8, 6, 598],
 [10, 6, 345, 128, 8, 6, 598, 289],
 [23, 961],
 [109, 85],
 [109, 85, 109]]

Build an embedding matrix sized to our fortune cookie vocabulary: each word found in the pre-trained GloVe index gets its pre-trained vector, so the embedding layer matches our training data.

In [39]:
# Build the weight matrix for the embedding layer: row i receives the
# pre-trained vector of the word whose tokenizer index is i.
word_index = tokenizer1.word_index
num_words = min(MAX_NUM_WORDS, total_words)
# Rows start as zeros, so any row that is never assigned (for example a word
# missing from the embedding index) stays all-zero.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # Only indices below the vocabulary cap get a row.
    if i < MAX_NUM_WORDS:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [40]:
print(embedding_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.49886     0.76602     0.89750999 ... -0.41179001  0.40538999
   0.78504002]
 [-0.038194   -0.24487001  0.72812003 ... -0.1459      0.82779998
   0.27061999]
 ...
 [-0.46709001 -0.59401     0.29403999 ...  0.042159    0.012113
  -0.08221   ]
 [ 0.38521001  0.099276    0.81708997 ... -0.55181003  0.65854001
  -0.43109   ]
 [ 0.59025002 -0.0016107   0.42640001 ... -0.36791     0.54280001
   0.30950999]]


In [43]:
# Frozen embedding layer initialised from the pre-trained matrix.
# `input_length` is deliberately left out: the previous value
# (MAX_SEQUENCE_LENGTH = 1000) does not match the padded fortune sequences,
# which are max_sequence_len - 1 tokens long, and the argument is optional when
# the layer feeds a recurrent layer.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            trainable=False)

In [None]:
def generate_padded_sequences(input_sequences):
    """Left-pad the n-gram sequences and split them into inputs and targets.

    Args:
        input_sequences: list of token-id lists of varying length.

    Returns:
        (predictors, label, max_sequence_len): every padded sequence minus its
        last token, the last token one-hot encoded over the module-level
        ``total_words`` classes, and the length of the longest input sequence.
    """
    longest = max(len(seq) for seq in input_sequences)
    padded = np.array(pad_sequences(input_sequences, maxlen=longest, padding='pre'))

    # The final column is the word to predict; everything before it is the context.
    features = padded[:, :-1]
    targets = ku.to_categorical(padded[:, -1], num_classes=total_words)
    return features, targets, longest

# Turn the n-gram prefixes into model inputs, one-hot targets and the padded length.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [None]:
predictors[60]

In [None]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    Layers, in order: a trainable 50-dimensional embedding, a 100-unit GRU
    with ReLU activation, 20% dropout, and a softmax layer over the vocabulary.

    Args:
        max_sequence_len: length of the padded sequences including the target
            token; the network input is one token shorter.
        total_words: vocabulary size (number of output classes).

    Returns:
        The Sequential model, compiled with categorical cross-entropy and Adam.
    """
    network = Sequential()
    stack = (
        # Input embedding; the last token of every sequence is the label, not an input.
        Embedding(total_words, 50, input_length=max_sequence_len - 1),
        # Recurrent hidden layer.
        GRU(100, activation='relu'),
        Dropout(0.2),
        # One probability per vocabulary entry.
        Dense(total_words, activation='softmax'),
    )
    for layer in stack:
        network.add(layer)

    network.compile(loss='categorical_crossentropy', optimizer='adam')
    return network

# Instantiate the network for our padded length and vocabulary size, then
# print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

In [None]:
# Train for 100 epochs on the (prefix -> next word) pairs.
# NOTE(review): verbose=5 is outside the documented 0/1/2 settings — confirm the intended log level.
model.fit(predictors, label, epochs=100, verbose=5)


In [None]:
# the original generate text function from https://www.kaggle.com/shivamb/beginners-guide-to-text-generation-using-lstms

def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` with the model's most likely next words.

    Args:
        seed_text: starting text; it is re-tokenized with the module-level
            ``tokenizer`` on every step.
        next_words: number of prediction steps to run.
        model: trained model whose ``predict`` returns one probability per
            vocabulary index for a padded token sequence.
        max_sequence_len: padded training length including the target token;
            the model input is one token shorter.

    Returns:
        The seed text followed by the generated words, title-cased.
    """
    # Invert the vocabulary once instead of scanning it for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # predict + argmax is the portable form of `predict_classes`, which
        # newer TensorFlow releases no longer provide.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # An index with no vocabulary entry has no word; skip it rather than
        # appending a bare space.
        output_word = index_to_word.get(predicted, "")
        if output_word:
            seed_text += " " + output_word
    return seed_text.title()
  

In [None]:
# tweaked generate text function that uses np.random.choice to sample from the probability distribution of the predicted word

def generate_text_prob(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by sampling each next word from the model's distribution.

    Unlike a greedy arg-max, every step draws the next vocabulary index at
    random, weighted by the predicted probabilities, so repeated calls can
    produce different text.

    Args:
        seed_text: starting text; it is re-tokenized with the module-level
            ``tokenizer`` on every step.
        next_words: number of sampling steps to run.
        model: trained model whose ``predict`` returns one probability per
            vocabulary index for a padded token sequence.
        max_sequence_len: padded training length including the target token;
            the model input is one token shorter.

    Returns:
        The seed text followed by the sampled words, title-cased.
    """
    # Invert the vocabulary once instead of scanning it for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # `predict` is the portable form of `predict_proba`, which newer
        # TensorFlow releases no longer provide.
        predicted = model.predict(token_list, verbose=0)
        sampled_index = int(np.random.choice(predicted.shape[1], 1, p=predicted[0])[0])

        # An index with no vocabulary entry has no word; skip it rather than
        # appending a bare space.
        output_word = index_to_word.get(sampled_index, "")
        if output_word:
            seed_text += " " + output_word
    return seed_text.title()
  

In [None]:
# Scratch check of a single sampling step for the seed word "you".
# texts_to_sequences expects a list of texts; a bare string would be iterated
# character by character, so the word is wrapped in a list.
token_list = tokenizer.texts_to_sequences(['you'])[0]
token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
# `predict` is the portable form of `predict_proba`, which newer TensorFlow
# releases no longer provide.
predicted = model.predict(token_list, verbose=0)
# Draw one vocabulary index, weighted by the predicted probabilities.
random = np.random.choice(predicted.shape[1],1, p=predicted[0])

print(random)
predicted[0].shape

In [None]:
# Generate a seven-word continuation from a fixed seed word.
# NOTE(review): `r` is created but never used in this cell; the seed is the
# hard-coded string below — confirm whether a random seed word was intended.
r = RandomWords()
random_word = 'Dreams'
text = generate_text_prob(random_word, 7, model, max_sequence_len)
print(text)

What we did today:
- We changed the recurrent layer to a GRU
- We increased the word embedding length
- We increased the dropout
- We changed the activation from tanh to ReLU
- We randomly sampled from the probability distribution of word predictions

Next time:
- Use a pre-trained word embedding applied to our corpus
- get more data
- try training
