Adding the path to the custom libraries

In [1]:
import os
import string
import sys


# Absolute path of <parent of the current working directory>/scripts/lib,
# the folder holding the custom libraries. Every component is passed to
# os.path.join separately so no path separator is hard-coded.
dirname = os.path.abspath(os.path.join(os.getcwd(), "..", "scripts", "lib"))

# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if dirname not in sys.path:
    sys.path.append(dirname)

## Importing the data

In [2]:
from utils.compile import compileFolder

In [3]:
# Load the corpus through the project helper ('tate' is presumably a folder name).
# NOTE(review): compileFolder's contract is not visible in this notebook; the
# recorded output shows a list of text strings — confirm in utils.compile.
data = compileFolder('tate')
# Display only the first five entries instead of dumping the whole corpus.
data[:5]

["It's not enough to be only exceptional in one realm.\n\nWith every single advantage you have,\n\nIt's not enough to be good at just ONE thing.\n\nYou need to be good at EVERYTHING.\n\nRich, Strong, Charismatic, Girl on Lock, Well-Connected, Wise.\n\nEVERYTHING.\n\nYou have the entire world at your fingertips,\n\nSitting in your pocket.\n\nI am a BILLIONAIRE writing to you RIGHT NOW.\n\nThis is not the past.\n\nYou have more knowledge, more resources and less bullshit to deal with than every one of your ancestors.\n\nYou cannot just be rich.\n\nYou cannot just be strong.\n\nOne is not enough anymore.\n\nAnd if you do not understand this,\n\nYou will simply lose to those who do.\n\n \n\n- Tate\n\n",
 'Inheritance is knowledge and mindset.\n\nThat\'s what a father gives his boys.\n\nMoney prevents them from experiencing the harshest lessons.\n\nVital lessons.\n\n"I wouldn\'t leave you a dollar boy, even if I had it!"\n\nMy dad died with zero assets and 12usd in the bank.\n\nI\'m trying 

In [4]:
# Work on one single string: a list of text chunks is joined with spaces,
# anything else is used as-is.
compiledText = " ".join(data) if isinstance(data, list) else data

# Feel free to comment out the next line: it only guarantees that every ASCII
# letter (A-Z, a-z) is in the vocabulary, even if it never occurs in the corpus.
compiledText += string.ascii_uppercase + string.ascii_lowercase

# Vocabulary = every distinct character plus the two special multi-character
# tokens. Sorting makes the position (and therefore the id) of each entry
# deterministic across runs.
vocabulary = sorted(set(compiledText) | {'[S]', '[E]'})
len(vocabulary)

76

In [5]:
# Show all vocabulary entries concatenated into one string, to eyeball which symbols it contains.
''.join(vocabulary)

'\n !"\',-./01234567:=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[E][S]abcdefghijklmnopqrstuvwxyz…'

Create encode and decode functions

In [6]:
# Create the mappings between vocabulary entries and integer ids.
# The id of an entry is its position in `vocabulary`.
stoi = {ch: i for i, ch in enumerate(vocabulary)}
itos = dict(enumerate(vocabulary))


def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    ``s`` is walked one character at a time, so each element looked up in
    ``stoi`` is a single character.

    Raises:
        KeyError: if a character of ``s`` is not a key of ``stoi``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string made by concatenating the entries for the ids in ``l``.

    Raises:
        KeyError: if an id in ``l`` is not a key of ``itos``.
    """
    return ''.join([itos[i] for i in l])

In [7]:
# Round-trip a sample sentence through encode/decode.
sampleText = 'Hello my friend!'
encoded = encode(sampleText)
print(encoded)
decoded = decode(encoded)
print(decoded)

# Make the check explicit instead of relying on eyeballing the printed output:
# decoding must give back exactly the text that was encoded.
assert decoded == sampleText, "encode/decode round trip changed the text"

[28, 53, 60, 60, 63, 1, 61, 73, 1, 54, 66, 57, 53, 62, 52, 2]
Hello my friend!


## Implementation of the code in the libraries

In [8]:
from utils.tokenizer import CharTokenizer

# Instantiate the library tokenizer with the loaded corpus.
# NOTE(review): presumably CharTokenizer derives its vocabulary from `data` — confirm in utils.tokenizer.
tokenizer = CharTokenizer(data)

In [9]:
# Encode a sample sentence with the library tokenizer.
# NOTE(review): the meaning of isMiddle is defined in utils.tokenizer, not here. In the
# recorded outputs this call yields a leading id (46) that decode() renders as '[E]' —
# confirm that this is the intended marker for the start of a text.
tokens = tokenizer.encode("Hello world! How is everyone doing?", isMiddle=False)
# Show only the first ten ids.
tokens[:10]

tensor([46, 28, 52, 59, 59, 62,  1, 70, 62, 65])

In [10]:
# Map the ids back to vocabulary entries and show the first ten of them.
tokenizer.decode(tokens)[:10]

['[E]', 'H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r']

In [11]:
# Decode the ids back into a single string.
# (In the recorded output the leading special token is not part of the returned text.)
tokenizer.decodeText(tokens)

'Hello world! How is everyone doing?'

## Tokenize all the dataset

In [12]:
import torch
# Encode every element of `data` separately, calling tokenizer.encode with its
# default arguments; the results are kept in the same order as `data`.
tokenizedData = [tokenizer.encode(chunk) for chunk in data]

In [13]:
# Inspect the first 200 token ids of the first encoded chunk.
tokenizedData[0][:200]

tensor([29, 67,  4, 66,  1, 61, 62, 67,  1, 52, 61, 62, 68, 54, 55,  1, 67, 62,
         1, 49, 52,  1, 62, 61, 59, 72,  1, 52, 71, 50, 52, 63, 67, 56, 62, 61,
        48, 59,  1, 56, 61,  1, 62, 61, 52,  1, 65, 52, 48, 59, 60,  7,  0,  0,
        42, 56, 67, 55,  1, 52, 69, 52, 65, 72,  1, 66, 56, 61, 54, 59, 52,  1,
        48, 51, 69, 48, 61, 67, 48, 54, 52,  1, 72, 62, 68,  1, 55, 48, 69, 52,
         5,  0,  0, 29, 67,  4, 66,  1, 61, 62, 67,  1, 52, 61, 62, 68, 54, 55,
         1, 67, 62,  1, 49, 52,  1, 54, 62, 62, 51,  1, 48, 67,  1, 57, 68, 66,
        67,  1, 34, 33, 25,  1, 67, 55, 56, 61, 54,  7,  0,  0, 44, 62, 68,  1,
        61, 52, 52, 51,  1, 67, 62,  1, 49, 52,  1, 54, 62, 62, 51,  1, 48, 67,
         1, 25, 41, 25, 37, 44, 39, 28, 29, 33, 27,  7,  0,  0, 37, 56, 50, 55,
         5,  1, 38, 67, 65, 62, 61, 54,  5,  1, 23, 55, 48, 65, 56, 66, 60, 48,
        67, 56])

# Conclusion:
The `CharTokenizer` class is a character-level tokenizer, and we will use it to tokenize the data.