Adding the path to the custom libraries

In [1]:
import sys
import os


# Absolute path of the custom library folder, resolved relative to the
# notebook's working directory (../scripts/lib).
dirname = os.path.abspath(os.path.join(os.getcwd(), "..", "scripts", "lib"))

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if dirname not in sys.path:
    sys.path.append(dirname)

## Importing libraries

In [2]:
import torch

from utils.compile import compileFolder
from utils.tokenizer import CharTokenizer, END_CHAR

## Setting up the data and the tokenizer

In [3]:
# Load the raw text data through compileFolder
raw_data = compileFolder("tate")

# Build the tokenizer from that raw data
tokenizer = CharTokenizer(raw_data)

### We need a proper `Dataset` class to draw (input, target) samples from the data

In [4]:
from torch.utils.data import Dataset

class TextChunksDataset(Dataset):
    """Dataset of fixed-length token windows drawn from a collection of text chunks.

    Every chunk of ``raw_data`` is prefixed with ``END_CHAR``, encoded with the
    tokenizer and concatenated into one flat tensor, ``self.data``.
    ``self.mappingArray`` holds, for every sample, the offset in ``self.data``
    at which its window starts. Only offsets whose input *and* target windows
    fit entirely inside a single chunk are registered, so a sample never
    straddles a chunk boundary.

    A sample is a pair ``(x, y)`` of ``context_length`` tokens each, where ``y``
    is ``x`` shifted one token to the right.
    """

    def __init__(self, raw_data, context_length, tokenizer=None) -> None:
        """Encode ``raw_data`` and index every valid window.

        Args:
            raw_data: Iterable of text chunks. Each chunk is turned into a list
                and prefixed with ``END_CHAR`` before being encoded.
            context_length: Number of tokens in each input/target window.
            tokenizer: Object exposing ``encode``; when omitted, a
                ``CharTokenizer`` is built from ``raw_data``.
        """
        super().__init__()
        if tokenizer is None:
            tokenizer = CharTokenizer(raw_data)
        self.tokenizer = tokenizer
        self.context_length = context_length

        encoded_chunks = []
        window_starts = []
        offset = 0  # position of the current chunk's first token in the flat tensor
        for chunk in raw_data:
            # encode() is assumed to return a 1-D tensor (it is measured with
            # len() and concatenated with torch.cat below).
            chunk_tensor = self.tokenizer.encode([END_CHAR] + list(chunk), False)
            encoded_chunks.append(chunk_tensor)

            # A window needs context_length input tokens plus one extra token
            # for the shifted target, hence len - context_length valid starts.
            # Chunks that are too short contribute their tokens but no window.
            n_windows = len(chunk_tensor) - self.context_length
            if n_windows > 0:
                window_starts.extend(range(offset, offset + n_windows))

            # Always advance by the real chunk length so later offsets stay exact.
            offset += len(chunk_tensor)

        self.mappingArray = torch.tensor(window_starts, dtype=torch.long)
        # torch.cat rejects an empty list, so empty input gets an empty tensor.
        if encoded_chunks:
            self.data = torch.cat(encoded_chunks)
        else:
            self.data = torch.empty(0, dtype=torch.long)

    def __repr__(self) -> str:
        """Return a short summary: number of samples and window length."""
        return (
            f"{type(self).__name__}(length: {len(self)},"
            f"context_length: {self.context_length})"
        )

    def __len__(self) -> int:
        """Return the number of available (input, target) samples."""
        return len(self.mappingArray)

    def _window(self, position):
        """Return the (input, target) pair for one non-negative sample position."""
        start = int(self.mappingArray[position])
        stop = start + self.context_length
        return self.data[start:stop], self.data[start + 1 : stop + 1]

    def __getitem__(self, index, block_size=1) -> "tuple[torch.Tensor, torch.Tensor]":
        """Return one sample, or a stacked batch of samples for a slice.

        Args:
            index: An integer (negative values count from the end) or a slice.
            block_size: Unused; kept so existing callers keep working.

        Returns:
            For an integer, a pair ``(x, y)`` of 1-D tensors of length
            ``context_length``. For a slice, a pair of stacked tensors with one
            row per selected sample (two empty ``(0, context_length)`` tensors
            when the slice selects nothing).

        Raises:
            IndexError: If an integer index is out of range.
            TypeError: If ``index`` is neither an integer nor a slice.
        """
        # Indexing a range applies the standard sequence rules: negative
        # indices, slice clamping, IndexError / TypeError on bad input.
        positions = range(len(self))

        if isinstance(index, slice):
            windows = [self._window(position) for position in positions[index]]
            if not windows:
                empty = self.data.new_empty((0, self.context_length))
                return empty, empty.clone()
            inputs, targets = zip(*windows)
            return torch.stack(inputs), torch.stack(targets)

        return self._window(positions[index])
        

In [5]:
# Instantiate the class defined above with windows of 8 tokens,
# sharing the tokenizer that was already built.
textChunksDataset = TextChunksDataset(
    raw_data,
    context_length=8,
    tokenizer=tokenizer,
)

In [6]:
# Print ten consecutive samples as "decoded input -> decoded target".
for sample_idx in range(600, 610):
    window_x, window_y = textChunksDataset[sample_idx]
    print(tokenizer.decodeText(window_x), "->", tokenizer.decodeText(window_y))

 underst -> understa
understa -> nderstan
nderstan -> derstand
derstand -> erstand 
erstand  -> rstand t
rstand t -> stand th
stand th -> tand thi
tand thi -> and this
and this -> nd this,
nd this, -> d this,



This class is implemented in `lib/utils/datasets.py`. From here on it is imported from there, which replaces the definition written above.

In [7]:
from utils.datasets import TextChunksDataset

## Setting Hyperparameters

In [8]:
# Maximum block size (a.k.a. maximum context), measured in tokens
block_size = 8

# Fraction of the total data that goes to the test/validation set
test_train_split_ratio = 0.1

In [9]:
# Build the dataset used from here on: block_size is passed as the second
# positional argument, and the tokenizer created earlier is reused.
data = TextChunksDataset(raw_data, block_size, tokenizer)

In [10]:
def split_dataset(data, ratio):
    """Split ``data`` into a train part and a test part.

    The split is positional (no shuffling): the first ``int(len * ratio)``
    elements form the test part and the remainder forms the train part.

    Args:
        data: Either a list of tensors (concatenated with ``torch.cat`` before
            splitting) or a ``TextChunksDataset``.
        ratio: Fraction of the data assigned to the test part, in ``[0, 1]``.

    Returns:
        ``(train, test)``. For a list input both are tensors. For a
        ``TextChunksDataset`` both are shallow copies of ``data`` that share
        its token tensor and tokenizer and differ only in ``mappingArray``.

    Raises:
        ValueError: If ``ratio`` is outside ``[0, 1]``.
        TypeError: If ``data`` is neither a list nor a ``TextChunksDataset``.
    """
    import copy  # function-scope import keeps this cell self-contained

    # A negative ratio would silently swap the meaning of the two slices.
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio!r}")

    if isinstance(data, list):
        tokens = torch.cat(data)
        n_test = int(len(tokens) * ratio)
        return tokens[n_test:], tokens[:n_test]

    if isinstance(data, TextChunksDataset):
        n_test = int(len(data) * ratio)
        # Shallow copies avoid building throwaway datasets (and tokenizers)
        # from empty text; only the sample index is replaced on each copy,
        # and `data` itself is left untouched.
        train_data = copy.copy(data)
        test_data = copy.copy(data)
        train_data.mappingArray = data.mappingArray[n_test:]
        test_data.mappingArray = data.mappingArray[:n_test]
        return train_data, test_data

    raise TypeError(
        "split_dataset expects a list of tensors or a TextChunksDataset, "
        f"got {type(data).__name__}"
    )
    
        

In [11]:
# Use the ratio declared in the hyperparameter cell rather than repeating the
# literal, so changing the hyperparameter actually changes the split.
train, test = split_dataset(data, test_train_split_ratio)
print(train)
print(test)

TextChunksDataset(length: 51633,context_length: 8)
TextChunksDataset(length: 5737,context_length: 8)


This function is implemented in `lib/utils/datasets.py`. From here on it is imported from there, which replaces the definition written above.

In [12]:
from utils.datasets import split_dataset

In [13]:
# Decode the input window of one sample from the train split as a sanity check.
sample_input = train[45000][0]
tokenizer.decodeText(sample_input)

' to feel'