In [34]:
import pandas as pd
import plotly.express as px
import numpy as np

In [53]:
import re

### Free Parameters

In [2]:
# Inputs to the free-parameter count evaluated at the end of this cell.
# NOTE(review): presumably V = vocabulary size, n = model order, m = embedding
# width and h = hidden units of the language model from the paper -- confirm.
V, n = 17694, 6
m, h = 100, 60

# Total free parameters for these settings; shown as the cell's output.
h * (1 + (n - 1) * m) + V * (1 + n * m + h)

11725794

## 1. Downloading Data

In [3]:
import nltk
import csv

from nltk.corpus import brown, wordnet

In [4]:
# Download the NLTK data packages behind the `brown` and `wordnet` corpus
# readers imported above. The return value of the last call is what the cell
# displays.
nltk.download('brown')
nltk.download('wordnet')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/niteshkumarsharma/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/niteshkumarsharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Total number of tokens in the Brown corpus.
len(brown.words())

1161192

## 2. Creating Data Set from `brown` corpus


> The Brown corpus contains 1,161,192 words in total. The first 800,000 words were used for training, the following 200,000 for validation (model selection, weight decay, early stopping) and the remaining 161,192 for testing.

Training: 800,000

Validation: 200,000

Test: 161,192

In [6]:
# Sizes, in tokens, of the consecutive training / validation / test splits.
trn_ln, val_ln, test_ln = 800_000, 200_000, 161_192

In [7]:
# Token at index `trn_ln`, i.e. the first token that falls outside the
# training range (the split loop assigns indices 0 .. trn_ln - 1 to training).
brown.words()[trn_ln]

'.'

### 2.1. Creating Train, Validation & Test Corpus

In [8]:
# Per-split word -> occurrence-count tables, filled in corpus order.
training_corpus, validation_corpus, test_corpus = {}, {}, {}

for ix, word in enumerate(brown.words()):
    # The first `trn_ln` tokens go to training, the next `val_ln` to
    # validation, and everything after that to test.
    split_counts = (
        training_corpus if ix < trn_ln
        else validation_corpus if ix < trn_ln + val_ln
        else test_corpus
    )
    split_counts[word] = split_counts.get(word, 0) + 1

In [9]:
# Number of distinct words (dictionary keys) observed in each split.
print(f"Vocabulary size in Training Split: {len(training_corpus)}")
print(f"Vocabulary size in Validation Split: {len(validation_corpus)}")
print(f"Vocabulary size in Test Split: {len(test_corpus)}")

Vocabulary size in Training Split: 46187
Vocabulary size in Validation Split: 19005
Vocabulary size in Test Split: 15566


### 2.2. Distribution of Tokens in Training Corpus

In [10]:
# Number of distinct words in the training split.
len(training_corpus)

46187

In [11]:
# Training-split word counts as a table, most frequent word first. The sort is
# stable, so words with equal counts keep their first-seen order.
df_word_count = pd.DataFrame(
    data=sorted(training_corpus.items(), key=lambda kv: kv[1], reverse=True),
    columns=['Word', 'Count'],
)

In [32]:
# Histogram of per-word occurrence counts in the training split.
px.histogram(
    df_word_count,
    x='Count',
    nbins=8000,
    title='Word Count Distribution in Training Split',
)

In [37]:
## Based on the paper: every word that occurs 3 times or fewer is relabelled
## as the single symbol 'UNK'; the Count column is left untouched.
df_word_count['Word'] = df_word_count['Word'].mask(df_word_count['Count'] <= 3, 'UNK')

### 2.3. Creating encoder

In [38]:
# Distinct entries of the Word column, in order of first appearance.
vocabulary = df_word_count['Word'].unique().tolist()

In [48]:
# Map each vocabulary word to its position in `vocabulary`.
encoder = dict(zip(vocabulary, range(len(vocabulary))))

## Start and end tokens take the next two free ids.
encoder['<s>'] = len(encoder)
encoder['</s>'] = len(encoder)

### 2.4. Creating Decoder

In [51]:
# Inverse of `encoder`: id -> word.
decoder = dict(zip(encoder.values(), encoder.keys()))

### 2.5. Implementing Tokeniser using above encoder & decoder

In [66]:
# Gather every character appearing in a vocabulary entry that is neither an
# ASCII letter nor a digit.
special_chars = set()
for word in vocabulary:
    special_chars |= set(re.findall(r'[^a-zA-Z0-9]', word))

print("Unique special characters:", special_chars)

Unique special characters: {'-', '&', '/', '{', "'", '.', '*', '$', '%', ')', '(', '?', '!', '}', ',', ';', ':', '`'}


In [88]:
class Tokenizer:
    """Word-level tokenizer backed by a fixed ``token -> id`` vocabulary.

    Text is split on whitespace and on every character in ``special_chars``;
    each special character becomes a token of its own, whitespace is dropped.
    Tokens missing from the vocabulary are mapped to the unknown token.
    """

    special_chars = {'-', '&', '/', ' ','{', "'", '.', '*', '$', '%', ')', '(', '?', '!', '}', ',', ';', ':', '`'}

    def __init__(self, encoder: dict, unk_token: str = "UNK"):
        """Store the vocabulary and precompile the patterns used later.

        Args:
            encoder: Mapping from token string to integer id.
            unk_token: Token substituted for out-of-vocabulary tokens. It is
                only required to be present in ``encoder`` when ``encode``
                actually meets an unknown token.
        """
        self.str_to_int = encoder
        self.int_to_str = {i: s for s, i in encoder.items()}
        self.unk_token = unk_token

        # Sorting makes the pattern independent of set iteration order, and
        # compiling once avoids rebuilding it on every encode() call.
        char_class = re.escape(''.join(sorted(self.special_chars)))
        # `\s` also splits on tabs/newlines; without it "a\nb" would stay one
        # token and be encoded as unknown.
        self._split_pattern = re.compile(f"([{char_class}]|\\s)")

        # Used by decode() to tidy the spaces introduced by joining tokens.
        self._space_before_closing = re.compile(r"\s+([,.;:!?)}])")
        self._space_after_opening = re.compile(r"([({])\s+")

    def encode(self, text: str) -> list[int]:
        """Convert ``text`` into a list of token ids.

        Args:
            text: Raw input string.

        Returns:
            One id per token, in order of appearance. Empty or
            whitespace-only text yields an empty list.

        Raises:
            KeyError: If a token is not in the vocabulary and the unknown
                token is not in the vocabulary either.
        """
        pieces = self._split_pattern.split(text)

        # Drop the empty strings and pure-whitespace separators from the split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]

        ids = []
        for token in tokens:
            if token not in self.str_to_int:
                # Replace unknown tokens with the configured unknown token.
                if self.unk_token not in self.str_to_int:
                    raise KeyError(
                        f"token {token!r} is not in the vocabulary and the "
                        f"unknown token {self.unk_token!r} is missing as well"
                    )
                token = self.unk_token
            ids.append(self.str_to_int[token])

        return ids

    def decode(self, ids: list[int]) -> str:
        """Convert token ids back into a string.

        Tokens are joined with single spaces; the space before closing
        punctuation (``, . ; : ! ? ) }``) and after an opening ``(`` or ``{``
        is then removed. The result is lossy: the original whitespace is not
        recoverable and unknown tokens come back as the unknown token.

        Args:
            ids: Token ids, e.g. as produced by ``encode``.

        Returns:
            The reconstructed text (empty string for an empty list).

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        text = self._space_before_closing.sub(r"\1", text)
        return self._space_after_opening.sub(r"\1", text)



In [89]:
# Build a tokenizer on top of the word -> id `encoder` mapping.
tk = Tokenizer(encoder)

In [90]:
# Encode a sample sentence into token ids as a quick check of the tokenizer.
cc = tk.encode("Hello, World! This is a test string.")

In [91]:
# Map the encoded ids back to their words through the `decoder` table.
word_recons = list(map(decoder.__getitem__, cc))

In [92]:
# Display the words recovered from the encoded ids.
word_recons

['UNK', ',', 'World', '!', 'This', 'is', 'a', 'test', 'string', '.']

In [79]:
# Look up the id assigned to the word "test".
encoder["test"]

841

In [86]:
# Display the current value of `cc` (the encoded token ids).
cc

[14113, 1, 800, 151, 14113, 2]

In [85]:
# Look up the id assigned to the word "This".
encoder["This"]

79