In [10]:
import os

# Location of the raw script CSV, relative to the notebook's working directory.
data_path = './data/Game_of_Thrones_Script.csv'
input_file = data_path
# Read the whole file into one string; later cells tokenize it.
with open(input_file) as f:
    text = f.read()

In [3]:
import numpy as np

# Half-open (start, stop) range of raw lines previewed at the end of this cell.
view_line_range = (0, 10)

print('Dataset Stats')
# Whitespace-delimited tokens only, so punctuation stays glued to words
# (hence "roughly").
print('Roughly the number of unique words: {}'.format(len(set(text.split()))))

lines = text.split('\n')
print('Number of lines: {}'.format(len(lines)))

word_count_line = [len(line.split()) for line in lines]
print('Average number of words in each line: {}'.format(np.average(word_count_line)))

print()
print('The lines {} to {}:'.format(*view_line_range))
print('\n'.join(lines[slice(*view_line_range)]))

Dataset Stats
Roughly the number of unique words: 37330
Number of lines: 23913
Average number of words in each line: 16.848492451804457

The lines 0 to 10:
Release Date,Season,Episode,Episode Title,Name,Sentence
2011-04-17,Season 1,Episode 1,Winter is Coming,waymar royce,"What do you expect? They're savages. One lot steals a goat from another lot and before you know it, they're ripping each other to pieces."
2011-04-17,Season 1,Episode 1,Winter is Coming,will,"I've never seen wildlings do a thing like this. I've never seen a thing like this, not ever in my life."
2011-04-17,Season 1,Episode 1,Winter is Coming,waymar royce,How close did you get?
2011-04-17,Season 1,Episode 1,Winter is Coming,will,Close as any man would.
2011-04-17,Season 1,Episode 1,Winter is Coming,gared,We should head back to the wall.
2011-04-17,Season 1,Episode 1,Winter is Coming,royce,Do the dead frighten you?
2011-04-17,Season 1,Episode 1,Winter is Coming,gared,Our orders were to track the wildlings. We tracked th

In [4]:
from collections import Counter

def create_lookup_tables(text):
    """
    Build the word <-> integer id mappings for a tokenized corpus.

    Ids are assigned by descending frequency: the most frequent token gets
    id 0. Tokens with equal counts keep their first-seen order.

    :param text: Iterable of tokens (e.g. a list of words)
    :return: Tuple (vocab_to_int, int_to_vocab) of dictionaries
    """

    frequencies = Counter(text)
    # Stable sort on the count, largest first.
    ranked = sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)
    vocab_to_int = {}
    int_to_vocab = {}
    for idx, (word, _count) in enumerate(ranked):
        int_to_vocab[idx] = word
        vocab_to_int[word] = idx
    return (vocab_to_int, int_to_vocab)

In [5]:
def token_lookup():
    """
    Map each punctuation symbol to a word-like placeholder token.

    :return: Dictionary whose keys are punctuation characters and whose
             values are the corresponding ``<name>`` tokens
    """

    return {
        '.': '<period>',
        ',': '<comma>',
        '"': '<quotation>',
        ';': '<semicolon>',
        '!': '<exclamation>',
        '?': '<question>',
        '(': '<left_parenthesis>',
        ')': '<right_parenthesis>',
        '-': '<dash>',
        '\n': '<new_line>',
    }

In [11]:
import pickle

# Extra vocabulary entries that never occur in the corpus itself.
SPECIAL_WORDS = {'PADDING': '<PAD>'}
token_dict = token_lookup()

# Surround every punctuation mark with spaces and swap it for its placeholder
# so that the whitespace split below treats it as a separate token.
for key, token in token_dict.items():
    text = text.replace(key, ' {} '.format(token))

# NOTE: `text` is rebound from the raw string to a list of tokens here, so this
# cell is not idempotent; re-run the loading cell before running it again.
text = text.lower()
text = text.split()

# The padding token is appended so it receives an id as well.
vocab_to_int, int_to_vocab = create_lookup_tables(text + list(SPECIAL_WORDS.values()))
int_text = [vocab_to_int[word] for word in text]

# Use a context manager so the file is flushed and closed even if pickling fails.
with open('preprocess.p', 'wb') as preprocess_file:
    pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), preprocess_file)

In [12]:
def load_preprocess():
    """
    Load the preprocessed data stored in 'preprocess.p'.

    :return: The unpickled object; the preprocessing cell stores the tuple
             (int_text, vocab_to_int, int_to_vocab, token_dict)
    """
    # SECURITY: pickle.load can execute arbitrary code; only load a
    # 'preprocess.p' that this notebook produced itself.
    with open('preprocess.p', mode='rb') as preprocess_file:
        return pickle.load(preprocess_file)

In [14]:
import torch

# Notebook-wide flag: True when CUDA is available. Later cells read it to
# decide whether the model and tensors are moved to the GPU.
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    # Only a notice; execution continues on the CPU.
    print('No GPU found. Please use a GPU to train your neural network.')

No GPU found. Please use a GPU to train your neural network.


In [15]:
from torch.utils.data import TensorDataset, DataLoader

def batch_data(words, sequence_length, batch_size):
    """
    Batch the neural network data using DataLoader.

    The token ids are first truncated to a multiple of ``batch_size``. Every
    window of ``sequence_length`` consecutive ids then becomes one feature row,
    and the id immediately after the window is its target.

    :param words: Sequence of integer word ids (list, range, array, ...)
    :param sequence_length: Number of ids in each feature window
    :param batch_size: Number of samples per batch yielded by the loader
    :return: Shuffling DataLoader over (features, targets), where features has
             shape (n_samples, sequence_length) and targets has shape
             (n_samples,), both int64
    """

    n_batches = len(words) // batch_size
    # int64 is forced because the default integer dtype is platform-dependent,
    # while embedding lookups and class-index targets expect LongTensors.
    tokens = np.asarray(words[:n_batches * batch_size], dtype=np.int64)

    # Clamp at zero so too-short input yields an empty (0, sequence_length) set.
    n_samples = max(len(tokens) - sequence_length, 0)

    # Row i of window_idx is [i, i + 1, ..., i + sequence_length - 1]; fancy
    # indexing with it builds all windows at once instead of a Python loop.
    window_idx = np.arange(n_samples)[:, None] + np.arange(sequence_length)[None, :]
    features = tokens[window_idx]
    targets = tokens[sequence_length:sequence_length + n_samples]

    data = TensorDataset(torch.from_numpy(features), torch.from_numpy(targets))
    data_loader = DataLoader(data, shuffle=True, batch_size=batch_size)
    return data_loader


In [16]:
# Smoke test: with ids 0..49 every target must equal the last feature id + 1.
test_text = range(50)
t_loader = batch_data(test_text, sequence_length=5, batch_size=10)

data_iter = iter(t_loader)
# Use the built-in next(); a `.next()` method is not part of the Python 3
# iterator protocol, so it cannot be relied on.
sample_x, sample_y = next(data_iter)

print(sample_x.shape)
print(sample_x)
print()
print(sample_y.shape)
print(sample_y)

torch.Size([10, 5])
tensor([[37, 38, 39, 40, 41],
        [ 8,  9, 10, 11, 12],
        [ 5,  6,  7,  8,  9],
        [25, 26, 27, 28, 29],
        [40, 41, 42, 43, 44],
        [27, 28, 29, 30, 31],
        [18, 19, 20, 21, 22],
        [16, 17, 18, 19, 20],
        [24, 25, 26, 27, 28],
        [33, 34, 35, 36, 37]])

torch.Size([10])
tensor([42, 13, 10, 30, 45, 32, 23, 21, 29, 38])


In [17]:
import torch.nn as nn

class RNN(nn.Module):
    """
    Next-word model: embedding -> multi-layer LSTM -> linear output layer.

    Only the prediction for the last time step of each sequence is returned.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        """
        Initialize the PyTorch RNN Module

        :param vocab_size: Number of rows in the embedding table
        :param output_size: Number of scores produced per sequence
        :param embedding_dim: Size of each embedding vector (LSTM input size)
        :param hidden_dim: Size of the LSTM hidden state
        :param n_layers: Number of stacked LSTM layers
        :param dropout: Dropout probability passed to nn.LSTM
        """

        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """
        Forward propagation of the neural network

        :param nn_input: Integer id tensor of shape (batch, sequence); the
                         LSTM is configured with batch_first=True
        :param hidden: Tuple (h, c) of LSTM states, e.g. from init_hidden
        :return: Tuple (out, hidden) where out has shape (batch, output_size)
                 and holds the scores for the last time step only
        """

        embed = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embed, hidden)
        # Only the final time step is returned, so only that step is projected.
        # This gives the same values as projecting every step and slicing
        # afterwards, without the wasted output-layer work.
        out = self.fc(lstm_out[:, -1])
        return out, hidden

    def init_hidden(self, batch_size):
        '''
        Initialize the hidden state of an LSTM/GRU

        :param batch_size: Batch dimension of the states to create
        :return: Tuple (h, c) of zero tensors, each of shape
                 (n_layers, batch_size, hidden_dim)
        '''
        # new_zeros inherits dtype and device from the model's own weights, so
        # the states always live where the model lives (CPU or GPU).
        weight = next(self.parameters()).detach()
        shape = (self.n_layers, batch_size, self.hidden_dim)
        hidden = (weight.new_zeros(shape), weight.new_zeros(shape))
        return hidden

In [18]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """
    Run one optimization step: forward pass, loss, backward pass, update.

    :param rnn: The model to train
    :param optimizer: Optimizer whose step() applies the update
    :param criterion: Loss function called as criterion(output, target)
    :param inp: Batch of inputs fed to the model
    :param target: Batch of targets for the loss
    :param hidden: Iterable of hidden-state tensors from the previous step
    :return: Tuple (loss value as a Python number, new hidden state)
    """

    # Model and batch go to the GPU together when one is in use.
    if train_on_gpu:
        rnn.cuda()
        inp, target = inp.cuda(), target.cuda()

    # Cut the hidden state off from the previous batch's graph so that
    # backpropagation stops at the start of this batch.
    detached_hidden = tuple(state.data for state in hidden)

    rnn.zero_grad()
    prediction, next_hidden = rnn(inp, detached_hidden)
    batch_loss = criterion(prediction, target)
    batch_loss.backward()

    # Clip the gradient norm at 5 before the parameter update.
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)
    optimizer.step()

    return batch_loss.item(), next_hidden

In [19]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100, loader=None):
    """
    Train the network for a number of epochs, printing the running loss.

    :param rnn: The model to train (put into train mode here)
    :param batch_size: Batch size used to size the hidden state and to work
                       out how many full batches an epoch contains
    :param optimizer: Optimizer handed to forward_back_prop
    :param criterion: Loss function handed to forward_back_prop
    :param n_epochs: Number of passes over the data
    :param show_every_n_batches: Print the average loss every this many batches
    :param loader: DataLoader to iterate; defaults to the notebook-level
                   `train_loader` when omitted
    :return: The trained model (the same object as `rnn`)
    """
    batch_losses = []
    rnn.train()
    print("Training for %d epoch(s)..." % n_epochs)

    if loader is None:
        # Backward-compatible fallback to the notebook-level loader.
        loader = train_loader

    # Loop-invariant: number of full batches per epoch. Batches past this
    # count are skipped because the hidden state is sized for a full batch.
    n_batches = len(loader.dataset) // batch_size

    for epoch_i in range(1, n_epochs + 1):
        # Fresh zero state at the start of every epoch.
        hidden = rnn.init_hidden(batch_size)
        for batch_i, (inputs, labels) in enumerate(loader, 1):
            if batch_i > n_batches:
                break
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)
            batch_losses.append(loss)
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                # Reset so each report averages only the batches since the last one.
                batch_losses = []
    return rnn

In [20]:
# Number of consecutive word ids in each training input window.
sequence_length = 20
# Number of (window, next-word) samples per batch.
batch_size = 128
# Shuffling DataLoader over the integer-encoded script.
train_loader = batch_data(int_text, sequence_length, batch_size)

In [21]:
# Training hyperparameters consumed by the training cell.
# Number of passes over the training data.
num_epochs = 5
# Step size handed to the Adam optimizer.
learning_rate = 0.001 
# The embedding table and the output layer both span the full vocabulary.
vocab_size = len(vocab_to_int)
output_size = vocab_size
# Size of each word embedding vector.
embedding_dim = 250
# Size of the LSTM hidden state.
hidden_dim = 512
# Number of stacked LSTM layers.
n_layers = 2
# Print the running average loss every this many batches.
show_every_n_batches = 1500

In [None]:
# Build the model and move it to the GPU when one is available.
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
if train_on_gpu:
    rnn.cuda()

optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

# basename + splitext reduce the path to 'trained_rnn', so the checkpoint is
# written to the working directory as 'trained_rnn.pt', the same name that
# load_model('./save/trained_rnn') resolves to.
save_filename = os.path.splitext(os.path.basename('./save/trained_rnn'))[0] + '.pt'
torch.save(trained_rnn, save_filename)
print('Model Trained and Saved')

In [None]:
def load_model(filename, map_location=None):
    """
    Load a model previously written with torch.save.

    Only the base name of `filename` is used: the directory part and any
    extension are dropped and '.pt' is appended, so './save/trained_rnn'
    resolves to 'trained_rnn.pt' in the current working directory.

    :param filename: Path or name identifying the saved model
    :param map_location: Optional device remapping forwarded to torch.load,
                         e.g. 'cpu' to load a GPU-trained model on a machine
                         without a GPU; None keeps torch.load's default
    :return: The object stored in the file
    """
    save_filename = os.path.splitext(os.path.basename(filename))[0] + '.pt'
    # SECURITY: torch.load unpickles the file; only load trusted checkpoints.
    return torch.load(save_filename, map_location=map_location)

In [22]:
import torch.nn.functional as F

def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100, seq_length=None, top_k=5):
    """
    Generate text using the neural network

    Starting from a window filled with `pad_value` and ending in `prime_id`,
    the model is repeatedly asked for the next word. Each word is sampled from
    the `top_k` most probable candidates and pushed into the window.

    :param rnn: Trained model exposing init_hidden(batch_size) and returning
                (scores, hidden) when called with (input, hidden)
    :param prime_id: Word id that starts the generated text
    :param int_to_vocab: Dictionary mapping word ids to words
    :param token_dict: Dictionary mapping punctuation to placeholder tokens
    :param pad_value: Word id used to fill the initial window
    :param predict_len: Number of words to generate after the prime word
    :param seq_length: Window length fed to the model; defaults to the
                       notebook-level `sequence_length` when omitted
    :param top_k: Number of most probable candidates to sample from
    :return: The generated text with placeholder tokens turned back into
             punctuation
    """

    if seq_length is None:
        # Backward-compatible fallback to the notebook-level setting.
        seq_length = sequence_length

    rnn.eval()

    # Run on whatever device holds the model's weights.
    first_param = next(rnn.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # The window stays a NumPy array for the whole loop; a tensor copy is made
    # per step. Rolling the tensor itself breaks when it lives on the GPU.
    window = np.full((1, seq_length), pad_value, dtype=np.int64)
    window[0, -1] = prime_id
    predicted = [int_to_vocab[prime_id]]

    with torch.no_grad():
        for _ in range(predict_len):
            batch = torch.from_numpy(window).to(device)
            hidden = rnn.init_hidden(batch.size(0))
            output, _ = rnn(batch, hidden)
            probs = F.softmax(output, dim=1).cpu()

            # Never request more candidates than there are output classes.
            k = min(top_k, probs.size(1))
            top_p, top_i = probs.topk(k)
            # Index the single batch row rather than squeeze(), which would
            # collapse a k == 1 result to a 0-d array.
            top_p = top_p.numpy()[0]
            top_i = top_i.numpy()[0]

            word_i = int(np.random.choice(top_i, p=top_p / top_p.sum()))
            predicted.append(int_to_vocab[word_i])

            # Shift the window left by one and append the new word.
            window = np.roll(window, -1, 1)
            window[0, -1] = word_i

    gen_sentences = ' '.join(predicted)
    # Turn placeholder tokens back into punctuation, dropping the space before them.
    for key, token in token_dict.items():
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')
    return gen_sentences

In [None]:
# Number of words to generate after the prime word.
gen_length = 400
# Must be a lowercase token that exists in vocab_to_int; 'will' appears as a
# speaker name in the dataset preview. Swap in any other in-vocabulary word.
prime_word = 'will'
# SPECIAL_WORDS is defined in the preprocessing cell of this notebook; there is
# no `helper` module in scope.
pad_word = SPECIAL_WORDS['PADDING']
generated_script = generate(trained_rnn, vocab_to_int[prime_word], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)

In [None]:
# Persist the generated text. The context manager closes the file even if the
# write fails.
with open("generated_script_1.txt", "w") as f:
    f.write(generated_script)