In [23]:
import numpy as np
import pandas
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.callbacks import TensorBoard
from gensim.parsing.preprocessing import strip_short, remove_stopwords, preprocess_string, strip_tags, strip_punctuation
from time import time
from sklearn.model_selection import train_test_split
import keras.backend as K

from keras.callbacks import ModelCheckpoint

In [2]:
df = pandas.read_pickle('dad_jokes.pkl')
df['joke_text_raw'] = df['title'] + " " + df['selftext']
df['joke_text_process'] = df['joke_text_raw'].str.lower().apply(strip_punctuation).apply(strip_tags)
df['joke_text_process'] = df['joke_text_process'].replace(r'\n',' ', regex=True) 

In [3]:
df['joke_text_process'].head()
df = df.drop_duplicates(subset = 'joke_text_process')

In [4]:
t = Tokenizer(num_words = 20000)
df['length']= df['joke_text_process'].str.split().apply(len)
t.fit_on_texts(df['joke_text_process'])

In [12]:
X= t.texts_to_sequences(df['joke_text_process'])
X[0]
word_index = t.word_index


In [6]:

df.score.describe()
df['score_bucket'] = np.where(df['score']<10,1,np.where(df['score']<100,2,3))
df.score_bucket.value_counts()

1    8391
2    5769
3    1338
Name: score_bucket, dtype: int64

In [7]:
X = sequence.pad_sequences(X, maxlen=40)
y = pandas.get_dummies(df['score_bucket'])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .2, random_state =42)

In [16]:
# set up embedding
embeddings_index = {}
f = open('/Users/dweiss89/ds/glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


In [18]:
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [29]:
embedding_layer = Embedding(len(word_index) + 1,
                            100,
                            weights=[embedding_matrix],
                            input_length=40,
                            trainable=True)

In [27]:
filepath="best_model.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

model = Sequential()
model.add(embedding_layer)
model.add(Bidirectional(LSTM(128)))
model.add(Dense(3,activation = 'softmax'))

model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics = ['acc'])
print('Train...')
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=[X_test, y_test])

Train...
Train on 12398 samples, validate on 3100 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a29648400>

In [31]:
tim = "you're American when you go into the bathroom, and you're American when you come out, but do you know what you are while you're in there? European"
ben = "What did the buffalo say to his son when he left for college? Bison"
li = [tim,ben]
tokens = sequence.pad_sequences(t.texts_to_sequences(li), maxlen=40)
tokens
model.predict(tokens)

array([[0.5742615 , 0.30727646, 0.11846203],
       [0.56965834, 0.35712183, 0.07321991]], dtype=float32)