In [1]:
import numpy as np
import torch 
from torch import nn
import torch.nn.functional as F

In [2]:
# open text file and read in data as text
with open('data/data.txt', 'r') as f :
    text  = f.read()

In [4]:
# CHecking out first hundred characters
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

## Tokenization

In [5]:
# Convert text into numerical tokens as models only learn from numerical data
# Providing each ext to a unique index

# Encode the text and map each character to an integer and vice and versa
# chars is a set of all unique characters - using set
chars = tuple(set(text))

# Creating two dictionaries:
# int2char , which maps integers to characters 
# char2int , which maps characters to unique integers 
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode the text
encoded = np.array([char2int[ch] for ch in text])

In [6]:
encoded[:100]

array([60, 78, 42, 41, 22, 50, 13, 52, 17, 68, 68, 68, 79, 42, 41, 41, 61,
       52, 69, 42,  1, 21, 80, 21, 50, 23, 52, 42, 13, 50, 52, 42, 80, 80,
       52, 42, 80, 21, 12, 50, 53, 52, 50, 40, 50, 13, 61, 52, 59, 33, 78,
       42, 41, 41, 61, 52, 69, 42,  1, 21, 80, 61, 52, 21, 23, 52, 59, 33,
       78, 42, 41, 41, 61, 52, 21, 33, 52, 21, 22, 23, 52, 29, 34, 33, 68,
       34, 42, 61, 46, 68, 68,  0, 40, 50, 13, 61, 22, 78, 21, 33])

## Pre-Processing the data

In [10]:
def  one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [11]:
# check that the function works as expected
# one_hot is created with length 8 with 1 present at index [3,5,1]
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]
