<a href="https://colab.research.google.com/github/WomenInDataScience-Seattle/FortuneCookie/blob/master/FortuneCookieModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
url='https://raw.githubusercontent.com/WomenInDataScience-Seattle/Machine_Learning_Projects/master/FortuneCookie/training_data/data.csv'
s=requests.get(url).text

c=pd.read_csv(StringIO(s))

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
from scipy.spatial.distance import cdist

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:

# from tf.keras.models import Sequential  # This does not work!
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding, LSTM, Dropout
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import keras.utils as ku 

Using TensorFlow backend.


In [4]:
# random-word used to generate the first word in the sequence
!pip install random-word
from random_word import RandomWords

Collecting random-word
  Using cached https://files.pythonhosted.org/packages/95/0d/c672ff7d6e36f88e60cbdd669f1247041fb7a45f3e2368d314f65b1dd933/Random_Word-1.0.4-py3-none-any.whl
Collecting nose (from random-word)
  Using cached https://files.pythonhosted.org/packages/15/d8/dd071918c040f50fa1cf80da16423af51ff8ce4a0f2399b7bf8de45ac3d9/nose-1.3.7-py3-none-any.whl
Installing collected packages: nose, random-word
Successfully installed nose-1.3.7 random-word-1.0.4


In [5]:
c.head(5)

Unnamed: 0,Fortune Cookie Quotes,Unnamed: 1
0,Be a generous friend and a fair enemy,
1,Never quit!,
2,"Old friends, old wines and old gold are best",
3,"If your desires are not extravagant, they will...",
4,Every Friend Joys in your Success,


In [6]:
fortune_data = c['Fortune Cookie Quotes']

In [7]:
fortune_data.head(5)

0                Be a generous friend and a fair enemy
1                                         Never quit! 
2        Old friends, old wines and old gold are best 
3    If your desires are not extravagant, they will...
4                   Every Friend Joys in your Success 
Name: Fortune Cookie Quotes, dtype: object

In [8]:
fortune_data[1]

'Never quit! '

In [9]:
fortune_data[36]

'Let your heart make your decisions - it does not get as confused as your head. '

In [10]:
cleaned_df = fortune_data.str.lower()
cleaned_df2 = cleaned_df.str.strip()

In [11]:
dropped = cleaned_df2.dropna()

In [12]:
dropped.tail(5)

1188    your quick wits will get you out of a tough si...
1189                       your reputation is your wealth
1190                  your success will astonish everyone
1191    your talents will be recognized and suitably r...
1192    your work interests can capture the highest st...
Name: Fortune Cookie Quotes, dtype: object

In [13]:
cleaned_fortunes = dropped

In [14]:
cleaned_fortunes.head(5)

0                be a generous friend and a fair enemy
1                                          never quit!
2         old friends, old wines and old gold are best
3    if your desires are not extravagant, they will...
4                    every friend joys in your success
Name: Fortune Cookie Quotes, dtype: object

In [15]:
cleaned_fortunes[3]

'if your desires are not extravagant, they will be granted'

In [16]:
cleaned_fortunes[0]

'be a generous friend and a fair enemy'

In [17]:
corpus = cleaned_fortunes

In [18]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[10, 6],
 [10, 6, 345],
 [10, 6, 345, 128],
 [10, 6, 345, 128, 8],
 [10, 6, 345, 128, 8, 6],
 [10, 6, 345, 128, 8, 6, 598],
 [10, 6, 345, 128, 8, 6, 598, 289],
 [23, 961],
 [109, 85],
 [109, 85, 109]]

In [19]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [20]:
predictors[60]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  60, 963, 221,
         8,  33], dtype=int32)

In [21]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 50, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(GRU(100, activation='relu'))
    model.add(Dropout(0.2))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 41, 50)            104350    
_________________________________________________________________
gru (GRU)                    (None, 100)               45300     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 2087)              210787    
Total params: 360,437
Trainable params: 360,437
Non-trainable params: 0
_________________________________________________________________


In [22]:
model.fit(predictors, label, epochs=1, verbose=5)


Instructions for updating:
Use tf.cast instead.


<tensorflow.python.keras.callbacks.History at 0x7fd5485d9e10>

In [23]:
# the original generate text function from https://www.kaggle.com/shivamb/beginners-guide-to-text-generation-using-lstms

def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            print(predicted)
            print(np.sum(predicted))
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()
  

In [24]:
# tweaked generate text function that uses np.random.choice to sample of the probaility distribution of the predicted word

def generate_text_prob(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_proba(token_list, verbose=0)
        random = np.random.choice(predicted.shape[1],1, p=predicted[0])
        
        output_word = ""
        for word,index in  x :
            if index == random:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()
  

In [25]:
token_list = tokenizer.texts_to_sequences('you')[0]
token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
predicted = model.predict_proba(token_list, verbose=0)
random = np.random.choice(predicted.shape[1],1, p=predicted[0])

print(random)
predicted[0].shape

[20]


(2087,)

In [26]:
r = RandomWords()
random_word = 'Dreams'
text = generate_text_prob(random_word, 7, model, max_sequence_len)
print(text)

Dreams Fear Grief Notice Proven Won’T Just You


In [29]:
from random import randint


In [30]:
random_word = randint(0, 2087)
random_word

340

In [41]:
Tokenizer.sequences_to_texts_generator([1])


TypeError: sequences_to_texts_generator() missing 1 required positional argument: 'sequences'

In [33]:
token_list = tokenizer[340]


TypeError: 'Tokenizer' object is not subscriptable

In [56]:
tokenizer.index_word

SyntaxError: invalid syntax (<ipython-input-56-6397aa8bcad5>, line 1)

What we did today: 
- we changed to gru 
- we increased the word embedding length
- we increased the dropout
- we changed the activation from tanh to relu
- we randomly sampled our probaility distribution of word predictions

Next time:
- Use a pre-trained word embedding applied to our corpus
- get more data
- try training
