In [46]:
import os
import numpy as np


class TextVectorizer:
    """Build a word-level vocabulary from a whitespace-tokenized text file.

    Index 0 is reserved for the padding token and index 1 for the
    unknown-word token; corpus words are numbered from 2 in order of
    decreasing frequency.
    """

    def __init__(self, max_vocab_size=20000, min_freq=1, max_len=32):
        """
        max_vocab_size: maximum number of tokens in vocab (including special
            tokens); None disables the cap
        min_freq: minimum frequency for a word to be included
        max_len: fixed sequence length (for padding/truncating); only stored
            here, fit() does not use it
        """
        self.max_vocab_size = max_vocab_size
        self.min_freq = min_freq
        self.max_len = max_len

        self.PAD_TOKEN = "<pad>"
        self.UNK_TOKEN = "<unk>"

        # Populated by fit(); empty until then.
        self.vocab = {}
        self.inverse_vocab = {}

    def Normalize(self, text):
        """Return the list of lowercased, whitespace-separated tokens in text.

        The method name is kept as-is (capitalized) so existing callers
        continue to work.
        """
        return text.strip().lower().split()

    def fit(self, file_path):
        """Count the words in file_path and build the vocabulary.

        The file is read line by line as UTF-8 and tokenized with
        Normalize(). Words seen fewer than min_freq times are dropped; the
        rest are ranked by descending frequency (ties keep first-seen order)
        and added until the vocabulary holds max_vocab_size entries, special
        tokens included.

        Returns a tuple (vocab, inverse_vocab): vocab maps token -> index and
        inverse_vocab maps index -> token. Both are also stored on the
        instance as self.vocab and self.inverse_vocab.

        Raises FileNotFoundError if file_path does not exist.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"{file_path} not found")

        freq = {}
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                for tok in self.Normalize(line):
                    freq[tok] = freq.get(tok, 0) + 1

        # sorted() is stable, so equally frequent words stay in the order in
        # which they first appeared in the file.
        ranked = sorted(
            (w for w, c in freq.items() if c >= self.min_freq),
            key=freq.__getitem__,
            reverse=True,
        )

        # The special tokens always occupy the first two indices.
        vocab = {self.PAD_TOKEN: 0, self.UNK_TOKEN: 1}
        limit = self.max_vocab_size

        for token in ranked:
            # Enforce the size cap; the special tokens count towards it.
            if limit is not None and len(vocab) >= limit:
                break
            # A corpus word that spells a special token must not displace it.
            if token not in vocab:
                vocab[token] = len(vocab)

        inverse_vocab = {idx: token for token, idx in vocab.items()}

        self.vocab = vocab
        self.inverse_vocab = inverse_vocab

        return vocab, inverse_vocab
        

# Create a vectorizer with the default constructor arguments.
tex = TextVectorizer()
# Build the vocabulary from the text file below. This is the last expression
# in the notebook cell, so the returned (vocab, inverse_vocab) tuple is echoed
# as the cell output.
tex.fit('data/huggingface/train.txt')

({'<pad>': 0,
  '<unk>': 1,
  'the': 2,
  'of': 3,
  'and': 4,
  'to': 5,
  'in': 6,
  'a': 7,
  'is': 8,
  'that': 9,
  'for': 10,
  'as': 11,
  'are': 12,
  'with': 13,
  'on': 14,
  'by': 15,
  'it': 16,
  'be': 17,
  'or': 18,
  'from': 19,
  'this': 20,
  'an': 21,
  'have': 22,
  'was': 23,
  'can': 24,
  'their': 25,
  'at': 26,
  'which': 27,
  'not': 28,
  'they': 29,
  'has': 30,
  'more': 31,
  'you': 32,
  'will': 33,
  'we': 34,
  'your': 35,
  'but': 36,
  'one': 37,
  'were': 38,
  'also': 39,
  'its': 40,
  'about': 41,
  'other': 42,
  'all': 43,
  'than': 44,
  'when': 45,
  'these': 46,
  'how': 47,
  'been': 48,
  'if': 49,
  'what': 50,
  'most': 51,
  'new': 52,
  'who': 53,
  'our': 54,
  'may': 55,
  'many': 56,
  'there': 57,
  'into': 58,
  'some': 59,
  'such': 60,
  'people': 61,
  'would': 62,
  'his': 63,
  'use': 64,
  'he': 65,
  'used': 66,
  'had': 67,
  'so': 68,
  'first': 69,
  'through': 70,
  'over': 71,
  'only': 72,
  'between': 73,
  'do': 74,
