In [5]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

In [7]:
# Choose the compute device once (CUDA when available, otherwise CPU) ...
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ... and make newly created tensors default to 32-bit floats of the matching
# kind, so this is set at the top before any tensors are built.
torch.set_default_tensor_type(
    torch.cuda.FloatTensor if device.type == 'cuda' else torch.FloatTensor
)

In [20]:
# Import text data, Alice in Wonderland, from the local directory.
path = "./aiw.txt"

# Read explicitly as UTF-8 (the book contains curly quotes such as ‘ ’ “ ”)
# and use a context manager so the file handle is closed after reading.
with open(path, encoding="utf-8") as text_file:
    text = text_file.read()

# Number of characters in the corpus. (The original printed len(data), but
# `data` is never defined in this notebook and fails on a fresh kernel.)
print(len(text))

144348


In [23]:
# Peek at the first 500 characters of the corpus.
text[:500]

'CHAPTER I. Down the Rabbit-Hole\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into the\nbook her sister was reading, but it had no pictures or conversations in\nit, ‘and what is the use of a book,’ thought Alice ‘without pictures or\nconversations?’\n\nSo she was considering in her own mind (as well as she could, for the\nhot day made her feel very sleepy and stupid), whether the pleasure\nof making a daisy-chain w'

In [34]:
"""The vocabulary is all the unique symbols used in the text. This is the benefit of 
working with a character level RNN."""

# Distinct characters of the corpus in sorted order, and how many there are.
chars = list(set(text))
chars.sort()
vocab_size = len(chars)
print(vocab_size)

71


In [42]:
# Preview the character -> index mapping (position of each character in `chars`).
dict(zip(chars, range(len(chars))))

{'\n': 0,
 ' ': 1,
 '!': 2,
 '(': 3,
 ')': 4,
 '*': 5,
 ',': 6,
 '-': 7,
 '.': 8,
 ':': 9,
 ';': 10,
 '?': 11,
 'A': 12,
 'B': 13,
 'C': 14,
 'D': 15,
 'E': 16,
 'F': 17,
 'G': 18,
 'H': 19,
 'I': 20,
 'J': 21,
 'K': 22,
 'L': 23,
 'M': 24,
 'N': 25,
 'O': 26,
 'P': 27,
 'Q': 28,
 'R': 29,
 'S': 30,
 'T': 31,
 'U': 32,
 'V': 33,
 'W': 34,
 'X': 35,
 'Y': 36,
 'Z': 37,
 '[': 38,
 ']': 39,
 '_': 40,
 'a': 41,
 'b': 42,
 'c': 43,
 'd': 44,
 'e': 45,
 'f': 46,
 'g': 47,
 'h': 48,
 'i': 49,
 'j': 50,
 'k': 51,
 'l': 52,
 'm': 53,
 'n': 54,
 'o': 55,
 'p': 56,
 'q': 57,
 'r': 58,
 's': 59,
 't': 60,
 'u': 61,
 'v': 62,
 'w': 63,
 'x': 64,
 'y': 65,
 'z': 66,
 '‘': 67,
 '’': 68,
 '“': 69,
 '”': 70}

In [46]:
# Lookup tables in both directions, built from each character's position in
# `chars`: index --> character first, then its inverse character --> index.
idx_to_c = dict(enumerate(chars))
c_to_idx = {ch: pos for pos, ch in idx_to_c.items()}

In [51]:
"""Convert whole text to indicies. Want each character to be 
represented by its index in the vocabulary. This is how we will feed to RNN"""



'Convert whole text to indicies. Want each character to be \nrepresented by its index in the vocabulary. This is how we will feed to RNN'

In [61]:
# Encode the corpus: one vocabulary index per character, in reading order.
text_idx = list(map(c_to_idx.__getitem__, text))
text_len = len(text_idx)

# Show the first ten encoded characters.
text_idx[0:10]

[14, 19, 12, 27, 31, 16, 29, 1, 20, 8]

In [63]:
# Sanity check: decoding the indices must reproduce the original text.
# The same slice is printed from the raw text and from the decoded indices.

print(text[25:100])
print("--------")
print(''.join(idx_to_c[code] for code in text_idx[25:100]))

t-Hole

Alice was beginning to get very tired of sitting by her sister on t
--------
t-Hole

Alice was beginning to get very tired of sitting by her sister on t


In [66]:
# Settings for batching the data.
# seq_len: number of characters passed to the RNN at a time; this dictates the
#          length of the unrolled model (#timesteps).
# batch_size: affects the splitting of the raw data as well as the model architecture.

seq_len, batch_size = 8, 512

In [67]:
# Want a non-overlapping set of inputs and outputs. Each X window is seq_len
# characters long; the matching Y window is the same span shifted by 1, so the
# X windows stop one character short of the end of the text.

idx_in_data = [
    text_idx[start:start + seq_len]
    for start in range(0, text_len - 1 - seq_len, seq_len)
]

In [77]:
# Turn the input windows into a numpy array and report on it. Its dimensions
# are (number of sequences in the corpus, sequence length).

inp = np.asarray(idx_in_data)
print(inp.shape)
print(inp[0:3, :])

(18043, 8)
[[14 19 12 27 31 16 29  1]
 [20  8  1 15 55 63 54  1]
 [60 48 45  1 29 41 42 42]]


In [78]:
# Do the same thing for Y: identical windows, but starting one character later.

idx_out_data = [
    text_idx[start:start + seq_len]
    for start in range(1, text_len - seq_len, seq_len)
]


In [79]:
# Confirm that the target array is the input array shifted by 1: for each
# position we'll be predicting the next character in the sequence.

outp = np.asarray(idx_out_data)
print(outp.shape)
print(outp[0:3, :])

(18043, 8)
[[19 12 27 31 16 29  1 20]
 [ 8  1 15 55 63 54  1 60]
 [48 45  1 29 41 42 42 49]]


In [83]:
def train_test_split(inp_data, out_data, train_fraction, seed=None):
    '''Randomly split paired input/target arrays into training and test sets.

    Each row is assigned to the training set independently with probability
    `train_fraction`, so the realised split sizes are only approximately
    proportional to it. The same row mask is applied to both arrays, which
    keeps every input row aligned with its target row.

    Args:
        inp_data: array-like of inputs, indexed along its first axis.
        out_data: array-like of targets with the same length as `inp_data`.
        train_fraction: probability in [0, 1] of a row landing in training.
        seed: optional integer seed for a reproducible split. When None
            (the default) the draw comes from NumPy's global random state,
            so `np.random.seed(...)` still controls it.

    Returns:
        Four numpy arrays: training input, training targets, test input,
        and test targets.

    Raises:
        ValueError: if the two arrays differ in length, or if
            `train_fraction` is outside [0, 1].
    '''
    # Coerce to arrays so plain Python lists also support boolean-mask indexing.
    inp_data = np.asarray(inp_data)
    out_data = np.asarray(out_data)
    n_rows = len(inp_data)

    if len(out_data) != n_rows:
        raise ValueError(
            "inp_data and out_data must have the same length, got "
            "{} and {}".format(n_rows, len(out_data))
        )
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(
            "train_fraction must be between 0 and 1, got {!r}".format(train_fraction)
        )

    # One uniform draw in [0, 1) per row; rows whose draw falls below the
    # threshold go to training, the rest to test.
    if seed is None:
        draws = np.random.rand(n_rows)
    else:
        draws = np.random.RandomState(seed).rand(n_rows)
    trn_idx = draws < train_fraction

    inp_trn = inp_data[trn_idx]
    inp_test = inp_data[~trn_idx]

    outp_trn = out_data[trn_idx]
    outp_test = out_data[~trn_idx]
    return inp_trn, outp_trn, inp_test, outp_test
    

In [84]:
# Split the data into 90% training, 10% test. This ratio should be bigger with a larger corpus.

# The split draws from NumPy's global random generator; seed it first so that
# Restart & Run All produces the same train/validation partition every time.
np.random.seed(42)
x_trn, y_trn, x_val, y_val = train_test_split(inp, outp, 0.9)

In [None]:
'''PyTorch Dataset class for character level text generation. X and Y have widt'''