In [2]:
import pandas as pd

# Read the dialogue table and drop every row that has a missing value,
# in a single chained expression.
lines_df = pd.read_csv('simpsons_dataset.csv').dropna()

In [2]:
import tensorflow as tf
# Switch TensorFlow to eager mode so tensors can be read back with .numpy()
# in the inspection cells further down.
# NOTE(review): this is the TF 1.x spelling — confirm the pinned TensorFlow version.
tf.enable_eager_execution()
import numpy as np
import os
import time


In [3]:
# Peek at the first rows to check the speaker and dialogue columns.
lines_df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [4]:
# Keep only the rows whose speaker is exactly 'Homer Simpson'.
homer = lines_df.loc[lines_df['raw_character_text'].eq('Homer Simpson')]

In [5]:
# (rows, columns) of the Homer-only frame.
homer.shape

(27850, 2)

In [6]:
# Work with the first 500 Homer rows only (positional slice).
homer_split = homer.iloc[:500]

In [7]:
# Concatenate the selected spoken lines into one string, separated by single spaces.
lines = " ".join(homer_split['spoken_words'].tolist())

In [8]:
# Total number of characters in the joined corpus.
len(lines)

25364

In [9]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order.
vocab = sorted(set(lines))

In [10]:
# Number of distinct characters in the corpus.
len(vocab)

60

In [11]:
# Character -> integer id, where the id is the character's position in `vocab`.
char2idx = dict(zip(vocab, range(len(vocab))))
# Integer id -> character, as an array so it can be indexed with arrays of ids.
idx2char = np.array(vocab)
# The whole corpus encoded as one integer id per character.
text_as_int = np.array([char2idx[ch] for ch in lines])

In [12]:
# Number of characters in each model input. Training chunks hold one extra
# character so the target can be the input shifted by one position.
seq_length = 100
# Every example consumes seq_length + 1 characters of the corpus, so divide by
# the chunk size (not by seq_length) to get the real number of examples.
examples_per_epoch = len(lines)//(seq_length+1)

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode the first few ids back to characters.
for i in char_dataset.take(5):
  print(idx2char[i.numpy()])

N
e
v
e
r


In [13]:
# Group the character stream into chunks of seq_length+1 ids;
# drop_remainder=True discards a final chunk that would be shorter.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [14]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so ``target[i]`` is the element that follows
    ``input[i]`` in the original chunk.

    Args:
        chunk: Any sliceable sequence.

    Returns:
        A tuple ``(chunk[:-1], chunk[1:])``.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [15]:
# Decode the first (input, target) pair back to text and show both strings.
for input_example, target_example in dataset.take(1):
  for label, ids in (('Input data: ', input_example),
                     ('Target data:', target_example)):
    print (label, repr(''.join(idx2char[ids.numpy()])))

Input data:  'Never thrown a party? What about that big bash we had with all the champagne and musicians and holy '
Target data: 'ever thrown a party? What about that big bash we had with all the champagne and musicians and holy m'


In [16]:
# Batch size
BATCH_SIZE = 16
# Number of full batches that fit in one pass over the examples.
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 5000

# NOTE(review): this rebinds `dataset`, so the cell is not idempotent —
# re-running it without first re-running the cell that builds the
# (input, target) pairs would batch the already-batched dataset again.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<DatasetV1Adapter shapes: ((16, 100), (16, 100)), types: (tf.int64, tf.int64)>

In [17]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 32

# Number of RNN units
rnn_units = 32

# Choose the recurrent layer constructor used when the model is built:
# plain GRU with a sigmoid recurrent activation on CPU, CuDNNGRU on GPU.
# NOTE(review): the sigmoid override presumably keeps the CPU layer
# compatible with CuDNNGRU — confirm.
if not tf.test.is_gpu_available():
  import functools
  rnn = functools.partial(
      tf.keras.layers.GRU, recurrent_activation='sigmoid')
else:
  rnn = tf.keras.layers.CuDNNGRU

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the embedding -> recurrent -> dense sequence model.

  The recurrent layer is created through the module-level `rnn` constructor,
  so that name must be defined before this function is called.

  Args:
    vocab_size: Size of the embedding's input vocabulary and number of units
      in the final dense layer.
    embedding_dim: Width of the embedding vectors.
    rnn_units: Number of units in the recurrent layer.
    batch_size: Fixed batch dimension of the model input; the second
      (sequence) dimension is left as None.

  Returns:
    A `tf.keras.Sequential` made of the three layers, in that order.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
  # stateful=True and return_sequences=True are passed straight to the
  # recurrent constructor, together with the glorot_uniform initializer.
  recurrent = rnn(
      rnn_units,
      return_sequences=True,
      recurrent_initializer='glorot_uniform',
      stateful=True)
  # No activation is specified on the output layer.
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

# Instantiate the model from the hyperparameters defined in the configuration
# cell. `vocab_size` is the constant already computed there from `vocab`, so it
# is reused here instead of recomputing len(vocab).
model = build_model(
  vocab_size=vocab_size,
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [None]:
# Check whether TensorFlow reports an available GPU.
tf.test.is_gpu_available()

In [3]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

# Number of sentences embedded per session.run call.
EMBED_BATCH_SIZE = 1024

# Every spoken line in the table, as an array of strings.
spoken_words = lines_df['spoken_words'].values

with tf.Graph().as_default():
  module_url = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/1"
  embed = hub.Module(module_url)
  # Feed the sentences through a placeholder instead of passing the array to
  # the module directly, so the corpus is not stored inside the graph as one
  # constant and can be processed in batches.
  text_input = tf.placeholder(dtype=tf.string, shape=[None])
  embeddings = embed(text_input)

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())

    # Embed the corpus batch by batch and keep the result, rather than only
    # printing it, so later cells can use it.
    line_embeddings = np.concatenate([
        sess.run(embeddings,
                 feed_dict={text_input: spoken_words[start:start + EMBED_BATCH_SIZE]})
        for start in range(0, len(spoken_words), EMBED_BATCH_SIZE)
    ])

# Show the size of the result and a small sample instead of the full matrix.
print(line_embeddings.shape)
print(line_embeddings[:5])

InternalError: CUDA runtime implicit initialization on GPU:0 failed. Status: out of memory

In [4]:
!wget https://raw.githubusercontent.com/simpson-says/buildweek3-simpsons-says-ds/master/simpsons_script_lines.csv

--2019-08-23 14:36:44--  https://raw.githubusercontent.com/simpson-says/buildweek3-simpsons-says-ds/master/simpsons_script_lines.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.52.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.52.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35185146 (34M) [text/plain]
Saving to: ‘simpsons_script_lines.csv’


2019-08-23 14:36:45 (50.5 MB/s) - ‘simpsons_script_lines.csv’ saved [35185146/35185146]



In [8]:
# Inspect the last raw lines of the downloaded CSV.
!tail simpsons_script_lines.csv

9539,32,199,Marge Simpson: Yes.,811000,true,1,373,Marge Simpson,SIMPSON HOUSE - BASEMENT STAIRCASE,Yes.,yes,1
9540,32,200,Lisa Simpson: Can we do it this week?,812000,true,9,373,Lisa Simpson,SIMPSON HOUSE - BASEMENT STAIRCASE,Can we do it this week?,can we do it this week,6
9541,32,201,(Springfield Elementary School: INT. ELEMENTARY - HALLWAY),814000,false,,3,,Springfield Elementary School,,,
9542,32,202,"Lisa Simpson: (REHEARSING) Mr. Bergstrom, we request the pleasure of your company... no... Mr. Bergstrom, if you're not doing anything this Friday... no... Mr. Bergstrom, do you like pork chops... oh no, of course you wouldn't...",814000,true,9,3,Lisa Simpson,Springfield Elementary School,"Mr. Bergstrom, we request the pleasure of your company... no... Mr. Bergstrom, if you're not doing anything this Friday... no... Mr. Bergstrom, do you like pork chops... oh no, of course you wouldn't...",mr bergstrom we request the pleasure of your company no mr bergstrom if youre not doing anyth

In [11]:
# Rows with the wrong number of fields are skipped (and reported) instead of
# aborting the load.
try:
    # pandas >= 1.3 spelling. 'warn' skips malformed rows and reports each one,
    # which matches error_bad_lines=False with its default warning behaviour.
    script_lines = pd.read_csv('simpsons_script_lines.csv', on_bad_lines='warn')
except TypeError:
    # Older pandas does not accept on_bad_lines; use the legacy flag there.
    script_lines = pd.read_csv('simpsons_script_lines.csv', error_bad_lines=False)
script_lines.head()

b'Skipping line 8084: expected 13 fields, saw 20\nSkipping line 52607: expected 13 fields, saw 21\nSkipping line 59910: expected 13 fields, saw 21\n'
b'Skipping line 71801: expected 13 fields, saw 20\nSkipping line 73539: expected 13 fields, saw 21\nSkipping line 77230: expected 13 fields, saw 21\nSkipping line 78953: expected 13 fields, saw 21\nSkipping line 81138: expected 13 fields, saw 20\nSkipping line 86746: expected 13 fields, saw 22\nSkipping line 101154: expected 13 fields, saw 21\nSkipping line 115438: expected 13 fields, saw 20\nSkipping line 117573: expected 13 fields, saw 22\nSkipping line 130610: expected 13 fields, saw 22\n'
b'Skipping line 152970: expected 13 fields, saw 22\nSkipping line 153017: expected 13 fields, saw 20\nSkipping line 153018: expected 13 fields, saw 30\nSkipping line 154080: expected 13 fields, saw 20\nSkipping line 154082: expected 13 fields, saw 20\nSkipping line 154084: expected 13 fields, saw 20\nSkipping line 154086: expected 13 fields, saw 20\n

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464,3.0,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31.0
1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,9,3.0,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3.0
2,9551,32,211,Miss Hoover: I don't know. Although I'd sure l...,856000,True,464,3.0,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,i dont know although id sure like to talk to h...,22.0
3,9552,32,212,Lisa Simpson: That life is worth living.,864000,True,9,3.0,Lisa Simpson,Springfield Elementary School,That life is worth living.,that life is worth living,5.0
4,9553,32,213,Edna Krabappel-Flanders: The polls will be ope...,864000,True,40,3.0,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,the polls will be open from now until the end ...,33.0
