In [3]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import train_utils
%load_ext autoreload
%autoreload 2

print(torch.cuda.is_available())

True


In [4]:
# read in the extracted text file
with open('datasets/text8_train') as f:
    text = f.read()

# print out the first 100 characters
print(text[:100])
chars = tuple(set(text))

int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}
encoded = np.array([char2int[ch] for ch in text])

batches = train_utils.get_batches(encoded, 10, 50)
x, y = next(batches)

# this is what the batches look like, note that y is just x shifted back by one. 
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

 anarchism originated as a term of abuse first used against early working class radicals including t
x
 [[ 9 23  2  7 23  9  2  8  9 24]
 [24 22 22 10 21 25 25  9 25 11]
 [ 9 24 23 24 10 15  0 11 25  1]
 [16  9 24 22 24  1  9 14  7  3]
 [25  9 12  0 21 16  9 25  0  2]
 [22  9 24 18 25  2  9 15  2 18]
 [25 12  9 11 23  9 23  2 10 12]
 [ 9 25  2  1 21 12 11  1 21 25]
 [10  9  8 10  2  1  9 14 24 10]
 [11  1  3  7 10 20 21 10  9  2]]

y
 [[23  2  7 23  9  2  8  9 24  9]
 [22 22 10 21 25 25  9 25 11 19]
 [24 23 24 10 15  0 11 25  1  9]
 [ 9 24 22 24  1  9 14  7  3 18]
 [ 9 12  0 21 16  9 25  0  2 13]
 [ 9 24 18 25  2  9 15  2 18 18]
 [12  9 11 23  9 23  2 10 12  0]
 [25  2  1 21 12 11  1 21 25  9]
 [ 9  8 10  2  1  9 14 24 10 12]
 [ 1  3  7 10 20 21 10  9  2 10]]


In [5]:
class CharRNN(nn.Module):

    def __init__(self, tokens, n_steps=100, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # Creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        ## Define the LSTM
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)

        ## Define a dropout layer
        self.dropout = nn.Dropout(drop_prob)

        ## Define the final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))

        # Initialize the weights
        self.init_weights()


    def forward(self, x, hc):
        ''' Forward pass through the network.
            These inputs are x, and the hidden/cell state `hc`. '''

        ## Get x, and the new hidden state (h, c) from the lstm
        x, (h, c) = self.lstm(x, hc)

        ## Ppass x through the dropout layer
        x = self.dropout(x)

        # Stack up LSTM outputs using view
        x = x.reshape(x.size()[0]*x.size()[1], self.n_hidden)

        ## Put x through the fully-connected layer
        x = self.fc(x)

        # Return x and the hidden state (h, c)
        return x, (h, c)


    def predict(self, char, h=None, cuda=False, top_k=None):
        ''' Given a character, predict the next character.

            Returns the predicted character and the hidden state.
        '''
        if cuda:
            self.cuda()
        else:
            self.cpu()

        if h is None:
            h = self.init_hidden(1)

        x = np.array([[self.char2int[char]]])
        x = one_hot_encode(x, len(self.chars))

        inputs = torch.from_numpy(x)

        if cuda:
            inputs = inputs.cuda()

        h = tuple([each.data for each in h])
        out, h = self.forward(inputs, h)

        p = F.softmax(out, dim=1).data

        if cuda:
            p = p.cpu()

        if top_k is None:
            top_ch = np.arange(len(self.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()

        p = p.numpy().squeeze()

        char = np.random.choice(top_ch, p=p/p.sum())

        return self.int2char[char], h

    def init_weights(self):
        ''' Initialize weights for fully connected layer '''
        initrange = 0.1

        # Set bias tensor to all zeros
        self.fc.bias.data.fill_(0)
        # FC weights as random uniform
        self.fc.weight.data.uniform_(-1, 1)

    def init_hidden(self, n_seqs):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x n_seqs x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        return (weight.new(self.n_layers, n_seqs, self.n_hidden).zero_(),
                weight.new(self.n_layers, n_seqs, self.n_hidden).zero_())


In [8]:
if 'net' in locals():
    del net

net = CharRNN(chars, n_hidden=512, n_layers=1)
print(net)

n_seqs, n_steps = 128, 100
train_utils.train(net, encoded, epochs=25, n_seqs=n_seqs, n_steps=n_steps, lr=0.001, cuda=True, print_every=2000)

CharRNN(
  (lstm): LSTM(27, 512, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=27, bias=True)
)


NameError: name 'nn' is not defined

In [None]:

def letter_to_emg_sim(key, char_tuple, noise_dist= 1, typing_style='skilled'):

  keyboard = {0:'qwertyuiop',1:'asdfghjkl',2:'zxcvbnm',3:' '}

  int2char = dict(enumerate(char_tuple))
  char2int = {ch: ii for ii, ch in int2char.items()}



  # find the location of key on physical keyboard
  for row in range(4):
    argmax_key_column = keyboard[row].rfind(key)
    if argmax_key_column != -1:
      argmax_key_row = row
      break
  # create some parameters for archetypal typists
  if typing_style == 'skilled':
    accuracy = .5
    softmax_range = 1 # keys
  if typing_style == 'unskilled':
    accuracy = .25
    softmax_range = 2 # keys

  # set the peak probability at the true key "accuracy"% of the time,
  # otherwise it is uniformly randomly assigned to a key less than "softmax_range" keys away
  if np.random.rand() > accuracy:
    r_shift = np.random.choice([ i  for i in range(-softmax_range,softmax_range+1) if i != 0])
    c_shift = np.random.choice([ i  for i in range(-softmax_range,softmax_range+1) if i != 0])
    while keyboard_index_is_lowercaseletter(argmax_key_row+r_shift,argmax_key_column+c_shift) is False:
      r_shift = np.random.choice([ i  for i in range(-softmax_range,softmax_range+1) if i != 0])
      c_shift = np.random.choice([ i  for i in range(-softmax_range,softmax_range+1) if i != 0])
    argmax_key_row = argmax_key_row + r_shift
    argmax_key_column = argmax_key_column + c_shift
  max_key = keyboard[argmax_key_row][argmax_key_column]

  p = np.zeros((len(char_tuple)))
  # space key has no errors
  if key == ' ':
    # make the space key correct 80% of the time.

    for char in ['c','v','b','n','m']:
      p[char2int[char]] = np.random.random()

    p[char2int[key]] = np.random.random()+.65 # 80% correct space bar

    return p/np.sum(p)

  # add noise to softmax for keys within "softmax_range" of the peak prob key
  for i in range(-softmax_range, softmax_range+1):
    for j in range(-softmax_range, softmax_range+1):
      if not keyboard_index_is_lowercaseletter(argmax_key_row+i, argmax_key_column+j):
        continue
      # add the noise to the element in p corresponding to the key
      noise_key = keyboard[argmax_key_row+i][argmax_key_column+j]
      distance = np.max([abs(i),abs(j)])
      noise = 2*np.random.random()-1
      p[char2int[noise_key]] = ((softmax_range-distance+1) + noise) /(softmax_range+1)

  p[char2int[max_key]] = 1
  return p/np.sum(p)

def keyboard_index_is_lowercaseletter(row_index, column_index):
  # top, left, and bottom of keyboard cases

  if row_index < 0 or row_index > 2 or column_index < 0 :
    return False
  # right boundaries, manually defined for each row
  if row_index == 0 and column_index > 9:
    return False
  if row_index == 1 and column_index > 8:
    return False
  if row_index == 2 and column_index > 6:
    return False
  return True


In [None]:
with open('/content/drive/MyDrive/UCLA/Courses/NLP/CS 263 Final Project/text8_train') as f:
    text_train = f.read()
text_train_noised = ''
for char in text_train:
  if len(text_train_noised)%300000 ==0:
    print(len(text_train_noised)/80000000)
    with open('/content/drive/MyDrive/UCLA/Courses/NLP/CS 263 Final Project/text8_train_noised','w') as f:
      f.write(text_train_noised)
  text_train_noised = text_train_noised+net.chars[np.argmax(letter_to_emg_sim(char, net.chars, typing_style='skilled'))]

In [None]:

# with open('/content/drive/MyDrive/UCLA/Courses/NLP/CS 263 Final Project/text8') as f:
#     text = f.read()
# with open('/content/drive/MyDrive/UCLA/Courses/NLP/CS 263 Final Project/text8_train','w') as f:
#     f.write(text[:80000000])
with open('/content/drive/MyDrive/UCLA/Courses/NLP/CS 263 Final Project/text8_train') as f:
    text_train = f.read()
print(len(text_train))
