In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('./the-office-lines - scripts.csv')

df['text_with_speaker'] = df['speaker'] + " : " +  df['line_text']
df.head()

Unnamed: 0,id,season,episode,scene,line_text,speaker,deleted,text_with_speaker
0,1,1,1,1,All right Jim. Your quarterlies look very good...,Michael,False,Michael : All right Jim. Your quarterlies look...
1,2,1,1,1,"Oh, I told you. I couldn't close it. So...",Jim,False,"Jim : Oh, I told you. I couldn't close it. So..."
2,3,1,1,1,So you've come to the master for guidance? Is ...,Michael,False,Michael : So you've come to the master for gui...
3,4,1,1,1,"Actually, you called me in here, but yeah.",Jim,False,"Jim : Actually, you called me in here, but yeah."
4,5,1,1,1,"All right. Well, let me show you how it's done.",Michael,False,"Michael : All right. Well, let me show you how..."


In [3]:
lines = []
for doc in df['text_with_speaker']:
    lines.append(doc)

completed_lines = "\n".join(lines)

In [4]:
vocab = sorted(set(completed_lines))

len(completed_lines), len(vocab)

(4145292, 90)

In [5]:

char_to_ind = {char : ind for (ind, char) in enumerate(vocab)}
ind_to_char = np.array(vocab)
encoded_text = [char_to_ind[s] for s in completed_lines]

##### Create Batches

In [6]:
stanza = "Michael : All right Jim. Your quarterlies look very good. How are things at the library?\nJim : Oh, I told you. I couldn't close it."
len(stanza)

131

In [7]:
sequence_length = 140

In [8]:
training_sequences = len(completed_lines) // sequence_length
training_sequences

29609

#### Create Training Sequences

In [9]:
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)
sequences = char_dataset.batch(sequence_length+1, drop_remainder=True)

def create_seq_targets(seq):
    input_seq = seq[:-1]
    target_seq = seq[1:]
    return input_seq, target_seq

dataset = sequences.map(create_seq_targets)

dataset

<MapDataset shapes: ((140,), (140,)), types: (tf.int32, tf.int32)>

###### Generate Training Batches


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, GRU
from tensorflow.keras.losses import sparse_categorical_crossentropy

batch_size = 128
buffer_size = 10000

dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder = True)

In [11]:
##Loss function
def sparse_cat_loss(y_true, y_pred):
    return sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)

In [12]:
vocab_size = len(vocab)
embed_dim = 128
rnn_neurons = 1024

def create_model(vocab_size, embed_dim, rnn_neurons, batch_size):
    
    model = Sequential()
    
    model.add(Embedding(vocab_size, embed_dim, batch_input_shape=[batch_size, None]))
    model.add(GRU(rnn_neurons, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'))
    
    #Final Dense layer to predict
    model.add(Dense(vocab_size))
    
    model.compile(optimizer='adam', loss=sparse_cat_loss)
    
    return model

In [13]:
model = create_model(vocab_size, embed_dim, rnn_neurons, batch_size)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 128)          11520     
_________________________________________________________________
gru (GRU)                    (128, None, 1024)         3545088   
_________________________________________________________________
dense (Dense)                (128, None, 90)           92250     
Total params: 3,648,858
Trainable params: 3,648,858
Non-trainable params: 0
_________________________________________________________________


##### Fit the model

In [14]:
epochs = 30
model.fit(dataset, epochs=epochs)

Train for 229 steps
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1952dd3b7c8>

In [22]:
model.save('OfficeSentenceModel2.h5')

In [23]:
from tensorflow.keras.models import load_model

model = create_model(vocab_size, embed_dim, rnn_neurons, batch_size=1)
model.load_weights('OfficeSentenceModel.h5')
model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 128)            11520     
_________________________________________________________________
gru_2 (GRU)                  (1, None, 1024)           3545088   
_________________________________________________________________
dense_2 (Dense)              (1, None, 90)             92250     
Total params: 3,648,858
Trainable params: 3,648,858
Non-trainable params: 0
_________________________________________________________________


In [45]:
def generate_text(model, start_seed, gen_size=100, temp=0.5):
    num_to_generate = gen_size
    input_eval = [char_to_ind[s] for s in start_seed]
    input_eval = tf.expand_dims(input_eval, 0)
    
    text_generated = []
    temperature = temp
    
    model.reset_states()
    
    for i in range(num_to_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(ind_to_char[predicted_id])
    return (start_seed + ''.join(text_generated))   

In [51]:
#print(generate_text(model,"Michael : ",gen_size=1000))

In [50]:
print(generate_text(model,"Michael : ",gen_size=750))

Michael : That's right.
Pam : Okay. [walks into kitchen] Hey! What's going on?
Michael : I want you to speak to you about a surprise.
Jim : Oh my god! Why don't you take a stupid party for you?
Pam : So the company is wrong with the word 'email.
Michael : Oh, thank you.
Pam : Then...
Dwight : [falls into golden Sign] Oh my god! What are you talking about?
Jim : I don't know what to do.  I just got a cold serious call from you go for the day.  I have made any sense. We need to tell me why you're looking for Dunder Mifflin.
Oscar : Angela, you should know that our ears tried to be a great idea on the line shop. Okay?
Jim : Well, I gotta get out of here silence. We have to see the wedding.
Dwight : [sighs] That seems for you to stop screwing with the ba


In [19]:
generated_script = generate_text(model,"Michael : ",gen_size=5500)

In [20]:
#print(generated_script)

In [21]:
# f = open("office_generated_sentence.txt", "a", encoding='utf-8')
# f.write(generated_script)
# f.close()