In [1]:
import pandas as pd
import plotly.express as px
import numpy as np

In [2]:
import re

## 0. Number of Free Parameters

In [3]:
# Model hyper-parameters feeding the free-parameter count below.
# NOTE(review): presumably V = vocabulary size, n = n-gram order, m = embedding
# width and h = hidden units, following the referenced paper -- confirm.
V, n = 17694, 6
m, h = 100, 60

# Closed form |V|(1 + nm + h) + h(1 + (n-1)m), evaluated as its two summands.
vocab_scaled_term = V * (1 + n*m + h)
hidden_scaled_term = h * (1 + (n-1)*m)
vocab_scaled_term + hidden_scaled_term

11725794

## 1. Downloading Data

In [4]:
import nltk
from nltk.corpus import brown,wordnet

In [5]:
# Total number of tokens returned by the Brown corpus reader.
len(brown.words())

1161192

## 2. Creating Data Set from `brown` corpus


> Total words in the Brown corpus: 1,161,192. The first 800,000 words were used for training, the following 200,000 for validation (model selection, weight decay, early stopping) and the remaining 161,192 for testing.

Training: 800,000

Validation: 200,000

Test: 161,192

In [6]:
# Token counts of the three consecutive corpus slices: train, validation, test.
trn_ln, val_ln, test_ln = 800_000, 200_000, 161_192

### 2.1. Creating Train, Validation & Test Corpus

No special preprocessing is applied; tokens are only lower-cased.

In [7]:
# Per-split frequency tables: lower-cased token -> number of occurrences.
training_corpus = {}
validation_corpus = {}
test_corpus = {}

for ix, word in enumerate(brown.words()):
    # Lower-case once and use the SAME key for both the lookup and the store.
    # Looking up the raw `word` while storing under `word.lower()` would reset
    # the running count every time a capitalised occurrence is seen.
    token = word.lower()

    # The corpus is sliced by position: the first `trn_ln` tokens are training,
    # the next `val_ln` are validation, and everything after that is test.
    if ix < trn_ln:
        split_counts = training_corpus
    elif ix < trn_ln + val_ln:
        split_counts = validation_corpus
    else:
        split_counts = test_corpus

    split_counts[token] = split_counts.get(token, 0) + 1

In [8]:
# len() of a dict is its number of keys, i.e. the distinct tokens seen in that split.
print(f"Vocabulary size in Training Split: {len(training_corpus)}")
print(f"Vocabulary size in Validation Split: {len(validation_corpus)}")
print(f"Vocabulary size in Test Split: {len(test_corpus)}")

Vocabulary size in Training Split: 41146
Vocabulary size in Validation Split: 17460
Vocabulary size in Test Split: 14437


### 2.2. Distribution of Tokens in Training Corpus

In [9]:
# Number of distinct tokens in the training split.
len(training_corpus)

41146

In [11]:
# Rank training tokens from most to least frequent. The sort is stable, so
# tokens with equal counts keep their first-seen order.
word_counts_desc = sorted(training_corpus.items(), key=lambda kv: kv[1], reverse=True)
df_word_count = pd.DataFrame(data=word_counts_desc, columns=['Word', 'Count'])

In [12]:
# Distribution of per-token occurrence counts in the training split.
px.histogram(
    df_word_count,
    x='Count',
    nbins=8000,
    title='Word Count Distribution in Training Split',
)

In [13]:
# Keep only the training tokens that occur at least 3 times; this list is the
# model vocabulary.
# NOTE(review): the cutoff is attributed to the paper -- confirm whether the
# paper keeps words with count >= 3 or only count > 3.
vocab = [token for token, freq in training_corpus.items() if freq >= 3]

### 2.3. Creating encoder

In [14]:
# Every distinct training token, in frequency-ranked order (no count threshold).
# Note that the encoder below is built from `vocab`, not from this list.
unique_words = df_word_count['Word'].unique()
vocabulary = list(unique_words)

In [15]:
# Number of distinct tokens that pass the frequency threshold.
len(set(vocab))

16428

#### 2.3.1. Understanding Presence of Special Characters

In [16]:
# Collect every character in the vocabulary that is not an ASCII letter or digit.
non_alnum = re.compile(r'[^a-zA-Z0-9]')
special_chars = {ch for token in vocab for ch in non_alnum.findall(token)}

print("Unique special characters:", special_chars)

Unique special characters: {'(', '/', '?', "'", '-', ':', '%', '&', '.', ')', '}', '`', ';', '!', '*', '$', '{', ','}


In [17]:
def special_char_vocab(vocabulary: list[str], special_char: str) -> list[str]:
    """Return the entries of ``vocabulary`` that contain ``special_char``.

    Args:
        vocabulary: Words to scan.
        special_char: Substring to look for in each word.

    Returns:
        The matching words, in their original order.
    """
    matches = []
    for entry in vocabulary:
        if special_char in entry:
            matches.append(entry)
    return matches

In [18]:
# One (special character, word) row for every vocabulary word containing that character.
req_data = [
    (char, word)
    for char in special_chars
    for word in special_char_vocab(vocab, char)
]

df_special_chars_use = pd.DataFrame(data=req_data, columns=['Special Character', 'Word'])

In [19]:
# Display the (special character, word) pairs.
df_special_chars_use

Unnamed: 0,Special Character,Word
0,(,(
1,(,381(a)
2,(,368(a)(1)
3,/,1-1/2
4,/,3-1/2
...,...,...
1072,",",200000
1073,",",500000
1074,",",800000
1075,",",250000


In [20]:
# Token -> integer id, numbered in vocabulary order.
encoder = dict(zip(vocab, range(len(vocab))))

# Append the start, end and unknown markers, each taking the next free id.
for special_token in ('<s>', '</s>', '<unk>'):
    encoder[special_token] = len(encoder)

### 2.4. Creating Decoder

In [21]:
# Inverse mapping of the encoder: integer id -> token.
decoder = dict(zip(encoder.values(), encoder.keys()))

### 2.5 Implementing Simple Tokeniser

Text is split on whitespace and on the punctuation characters `,`, `.`, `!` and `?`; the punctuation marks are kept as tokens of their own.

In [22]:
class Tokenizer:
    """Whitespace/punctuation tokenizer backed by a fixed token-to-id mapping.

    Attributes:
        str_to_int: The token -> id mapping passed to the constructor.
        int_to_str: The inverse id -> token mapping derived from it.
    """

    # The capturing group makes ``split`` return the delimiters as pieces too,
    # so ',', '.', '!' and '?' survive as tokens. Whitespace pieces are filtered
    # out in ``encode``.
    _SPLIT_PATTERN = re.compile(r'([,.!?]|\s)')

    def __init__(self, encoder: dict):
        """Store ``encoder`` and build its inverse.

        Args:
            encoder: Mapping from token string to integer id. It must contain
                the key '<unk>' for ``encode`` to work on non-empty text.
        """
        self.str_to_int = encoder
        self.int_to_str = {i: s for s, i in encoder.items()}

    def encode(self, text: str) -> list[int]:
        """Convert ``text`` into a list of token ids.

        The text is split on whitespace and on ',', '.', '!' and '?'. Each piece
        is looked up as-is first; if that fails, its lower-cased form is tried,
        because the vocabulary in this notebook is built from lower-cased
        tokens. Pieces that match neither way map to the '<unk>' id.

        Args:
            text: Raw input string.

        Returns:
            One id per non-blank piece, in order. Empty or blank text gives [].

        Raises:
            KeyError: If the text has at least one token and the encoder has no
                '<unk>' entry.
        """
        tokens = [piece for piece in self._SPLIT_PATTERN.split(text) if piece.strip()]
        if not tokens:
            return []

        # Resolve the fallback id once per call instead of once per token.
        unk_id = self.str_to_int['<unk>']

        ids = []
        for token in tokens:
            token_id = self.str_to_int.get(token)
            if token_id is None:
                # Exact form unknown: retry case-insensitively before giving up.
                token_id = self.str_to_int.get(token.lower(), unk_id)
            ids.append(token_id)
        return ids

    def decode(self, text: list[int]) -> str:
        """Convert token ids back to a space-joined string.

        Args:
            text: Sequence of integer token ids.

        Returns:
            The tokens joined by single spaces; ids missing from the mapping are
            rendered as '<unk>'.
        """
        return ' '.join([self.int_to_str.get(ix, '<unk>') for ix in text])


### 2.6. Alternative Tokeniser using the above encoder & decoder (commented out)

In [23]:
# class Tokenizer:

#     special_chars = {'-', '&', '/', '{', "'", '.', '*', '$', '%', ')', '(', '?', '!', '}', ',', ';', ':', '`'}

#     def __init__(self, encoder: dict):
#         self.str_to_int = encoder
#         self.int_to_str = {i: s for s, i in encoder.items()}

    
#     def encode(self, text: str):

#         pattern = r"\s+|[{}]+|\d+\.\d+|\d+|\w+".format(re.escape(''.join(special_chars)))

#         tokens = re.findall(pattern, text)

#         tokens_space_removed = [token for token in tokens if token.strip()]

#         tokens_space_removed = [s if s in self.str_to_int else "UNK" for s in tokens_space_removed]

#         ids = [self.str_to_int[s] for s in tokens_space_removed]

#         return ids
    

#     def decode(self, ids: list[int]):

#         decoder_text = " ".join([self.int_to_str[i] for i in ids])

#         return decoder_text

In [35]:
# Instantiate the tokenizer with the token -> id mapping built earlier.
tokenizer = Tokenizer(encoder)

In [37]:
# Round-trip check: encode a sample string to ids, then decode it back.
round_trip_ids = tokenizer.encode("lend me $1")
tokenizer.decode(round_trip_ids)

'lend me $1'

In [38]:
# Sample sentence used to inspect the splitting step on its own.
text = "Do you know what is the time?"

In [39]:
# Split on whitespace and on , . ! ? (delimiters are kept), then drop empty
# and whitespace-only pieces.
text_split = [piece for piece in re.split(r'([,.!?]|\s)', text) if piece.strip()]