Adding route to custom libraries

In [1]:
import sys
import os


dirname = os.path.abspath(os.path.join(os.getcwd(), "..", "scripts/lib"))
sys.path.append(dirname)

## Importing libraries

In [2]:
import torch

from utils.compile import compileFolder
from utils.tokenizer import CharTokenizer, END_CHAR

## Setting up the data and other

In [3]:
# Importing the data
raw_data = compileFolder('tate')

# Creating the tokenizer
tokenizer = CharTokenizer(raw_data)

### We will need a proper class to get items from the dataset

In [4]:
from torch.utils.data import Dataset

class TextChunksDataset(Dataset):
    def __init__(self, raw_data, context_length, tokenizer=None) -> None:
        super().__init__()
        self.data = []
        if tokenizer==None:
            tokenizer = CharTokenizer(raw_data)
        self.tokenizer = tokenizer
        self.context_length = context_length
        self.mappingArray = []
        idx = 0
        for chunk in raw_data:
            chunkTensor = self.tokenizer.encode([END_CHAR]+list(chunk), False)
            self.data.append(chunkTensor)
            for i in range(len(chunkTensor)-self.context_length-1):
                self.mappingArray.append(idx)
                idx+=1
            self.mappingArray.append(idx)
            idx += self.context_length + 1
        self.mappingArray = torch.tensor(self.mappingArray)
        self.data = torch.cat(self.data)
    
    def __len__(self) -> int:
        return len(self.mappingArray)
    
    def __getitem__(self, index, block_size=1) -> torch.Tensor:
        if type(index)==int:
            return self.data[self.mappingArray[index]:self.mappingArray[index]+self.context_length], self.data[self.mappingArray[index]+self.context_length]
        elif type(index)==slice:
            L = []
            for k in range(index.start or 0,index.stop or len(self),index.step or 1):
                L.append(self[k])
            return L
        

In [5]:
textChunksDataset = TextChunksDataset(raw_data, 8, tokenizer)

In [6]:
for i in range(600,610):
    print(tokenizer.decodeText(textChunksDataset[i][0]),"->",tokenizer.decodeText([textChunksDataset[i][1]]))

 underst -> a
understa -> n
nderstan -> d
derstand ->  
erstand  -> t
rstand t -> h
stand th -> i
tand thi -> s
and this -> ,
nd this, -> 



This is the code implemented in `lib/utils/loader.py`

In [7]:
from utils.loader import TextChunksDataset

## Setting Hyperparameters

In [8]:
# The max block size (also known as max context) [in tokens]
block_size = 8

# How much does the test/validation set represent of the total data
test_train_split_ratio = 0.1

In [9]:
def split_dataset(data,ratio):
    """
    Returns (train,test)
    """
    if type(data)==list:
        data = torch.cat(data)
    n = int(len(data)*ratio)
    return data[n:], data[:n]
    
        

In [10]:
train, test = split_dataset(data,0.1)
print(train.shape)
print(test.shape)

NameError: name 'data' is not defined