In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import importlib
from pathlib import Path
# from phonemes import encoder
# from phonemes import decoder
# from phonemes import dataset
import tiktoken
token_enc = tiktoken.get_encoding('gpt2')

In [2]:
# NOTE(review): `encoder` and `decoder` are only imported in the commented-out lines of the
# imports cell, so on a fresh kernel this cell stops at the reload below with
# "NameError: name 'encoder' is not defined" (the error recorded as this cell's output).
importlib.reload(encoder)
# NOTE(review): absolute, machine-specific directory holding the two checkpoint files.
path = Path(r"C:\Users\benak\Downloads")

# Load the saved state dicts for the phoneme encoder and decoder.
bert_state_dict = torch.load(path / 'phonemes_bert.pth', weights_only=True)
gru_state_dict = torch.load(path / 'phonemes_gru.pth', weights_only=True)

# Build the modules the checkpoints are loaded into.
bert = encoder.EncoderBERT(n_emb=64, exp=4, n_heads=2, n_blocks=2, dropout=0.0, pool='cls', mask_attn=True)
# input_size is written as 64 + 50; presumably encoder width plus embedding width -- TODO confirm
gru = decoder.DecoderGRU(input_size=(64+50), hidden_size=128)
# Remove every '_orig_mod.' occurrence from the checkpoint keys before loading
# (presumably a wrapper prefix left by torch.compile -- confirm).
bert_state = {k.replace('_orig_mod.', ''): v for k, v in bert_state_dict.items()}
gru_state = {k.replace('_orig_mod.', ''): v for k, v in gru_state_dict.items()}
bert.load_state_dict(bert_state)
gru.load_state_dict(gru_state)
# Switch both modules to eval mode; the resulting tuple is the value this cell displays.
bert.eval(), gru.eval()

NameError: name 'encoder' is not defined

In [None]:
# Word dataset: words plus their phonetic embeddings, reachable through the `.mappings` dict
wd = dataset.Words()


def wenc(word):
    """Encode `word` with the word dataset and reshape the result to a single row."""
    encoded = wd.enc(word)
    return encoded.reshape(1, -1)

In [None]:
bert(wenc('aardvark'))

tensor([[-0.1482, -0.4275,  0.4877, -0.1933, -0.0583, -0.1747, -0.0392,  0.2760,
          0.2529, -0.2938,  0.2242,  0.4975, -0.7377, -0.4148,  0.3168, -0.5848,
          0.0058, -0.1914,  0.0914, -0.1985, -0.1735,  0.2365, -0.0322, -0.1618,
         -0.5113, -0.0505, -0.0313, -0.5150,  0.0351,  0.7522,  0.6980,  0.1678,
         -0.3494, -0.3898, -0.9819,  0.1042,  0.3944, -0.3385,  0.5397, -0.1283,
         -0.4749,  0.3122,  0.2301,  0.0791,  0.2777,  0.0676,  0.0225,  0.3422,
         -0.2355,  0.2503]], grad_fn=<AddmmBackward0>)

In [None]:

import random
from sklearn.neighbors import NearestNeighbors

class PhoneticJumbler:
    """Replace a word with a phonetically similar one.

    Two strategies are available:
      * nearest neighbours: pick a random word among the `n_neighbors` closest
        (cosine distance) rows of ``words_dataset.embeddings``;
      * decoder: let ``decoder.predict`` turn the embedding back into tokens and
        decode them with ``words_dataset.dec``.

    Args:
        words_dataset: object exposing ``mappings`` (word -> embedding), ``embeddings``,
            ``enc(word)`` and ``dec(tokens)``.
        encoder: callable producing an embedding for words missing from ``mappings``.
        decoder: optional object with a ``predict(emb)`` method.
        n_neighbors: number of neighbours the index returns per query.
    """

    def __init__(self, words_dataset, encoder, decoder = None, n_neighbors: int = 5):
        self.wd = words_dataset
        self.encoder = encoder
        self.decoder = decoder
        # Snapshot the vocabulary once, alongside the index, instead of rebuilding the list
        # on every lookup. Assumes rows of `embeddings` follow the key order of `mappings`
        # (the same assumption the index-to-word lookup has always relied on) -- TODO confirm.
        self._words = list(self.wd.mappings.keys())
        self.nbrs = (
            NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
            .fit(self.wd.embeddings)
        )

    def find_closest_words(self, emb):
        """Return the words whose embeddings are nearest to `emb` (a single-row 2-D array)."""
        _, indices = self.nbrs.kneighbors(emb)
        return [self._words[i] for i in indices[0]]

    def jumble(self, word: str, use_nn: bool = True):
        """
        Get phonetically similar words for fine-tuning

        Args:
            word: word to replace.
            use_nn: when True (or when no decoder was supplied) sample from the nearest
                neighbours; otherwise decode the embedding with the decoder.

        Returns:
            A phonetically similar word.

        Raises:
            Re-raises any exception from the lookup/encode/decode steps after printing it.
        """
        try:
            # Prefer the stored phonetic embedding; only run the encoder for unknown words
            # (a `dict.get` default would be evaluated eagerly even on a hit).
            emb = self.wd.mappings.get(word)
            if emb is None:
                emb = self.encoder(self.wd.enc(word).reshape(1, -1))
            emb = emb.reshape(1, -1)

            if use_nn or self.decoder is None:
                # The neighbour index wants a NumPy array; stored embeddings may already be one.
                if torch.is_tensor(emb):
                    emb = emb.detach().cpu().numpy()
                return random.choice(self.find_closest_words(emb))

            # Use this instance's dataset rather than whatever `wd` happens to be global.
            return self.wd.dec(self.decoder.predict(emb).reshape(-1))
        except Exception as e:
            print(f"Error jumbling word '{word}': {e}")
            raise

In [85]:
jumbler = PhoneticJumbler(wd, bert, gru)
jumbler.jumble('eunoia', use_nn=True)

'una'

In [None]:
import nltk

# Fetch the WordNet corpus with an explicit check: an `assert` wrapping the download
# would be stripped (download included) when Python runs with -O.
if not nltk.download('wordnet'):
    raise RuntimeError("Failed to download the NLTK 'wordnet' corpus")
from nltk.corpus import wordnet as wn

# print sample definitions for the word "weights"
for i, s in enumerate(wn.synsets('weights')):
    print(f'{i}: ({s.pos()}) {s.definition()}')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\benak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0: (n) the vertical force exerted by a mass as a result of gravity
1: (n) sports equipment used in calisthenic exercises and weightlifting; it is not attached to anything and is raised and lowered by use of the hands and arms
2: (n) the relative importance granted to something
3: (n) an artifact that is heavy
4: (n) an oppressive feeling of heavy force
5: (n) a system of units used to express the weight of something
6: (n) a unit used to measure weight
7: (n) (statistics) a coefficient assigned to elements of a frequency distribution in order to represent their relative importance
8: (v) weight down with a load
9: (v) present with a bias


In [68]:
w = 'weights'

df = [(w, s.definition()) for s in wn.synsets(w)]
df

[('weights', 'the vertical force exerted by a mass as a result of gravity'),
 ('weights',
  'sports equipment used in calisthenic exercises and weightlifting; it is not attached to anything and is raised and lowered by use of the hands and arms'),
 ('weights', 'the relative importance granted to something'),
 ('weights', 'an artifact that is heavy'),
 ('weights', 'an oppressive feeling of heavy force'),
 ('weights', 'a system of units used to express the weight of something'),
 ('weights', 'a unit used to measure weight'),
 ('weights',
  '(statistics) a coefficient assigned to elements of a frequency distribution in order to represent their relative importance'),
 ('weights', 'weight down with a load'),
 ('weights', 'present with a bias')]

In [None]:
# create dataloader compatible datasets for training and validation
from torch.utils.data import Dataset, DataLoader
# Padding id is one past the tokenizer's largest token id.
# NOTE(review): any model consuming these ids presumably needs an embedding row for this
# extra id (or must mask/replace padded positions) -- confirm against the GPT vocab size.
pad_token = token_enc.max_token_value + 1  # add padding token

class DefData:
    """Tokenized WordNet (word, definition) pairs.

    Attributes:
        df: nested dict ``{word: {idx: [definition, tokenized_definition]}}``.
        max_def_len: length of the longest tokenized definition, measured before padding.
        words: NumPy array holding the word of every (word, definition) pair.
        token_defs: tensor of padded tokenized definitions, row-aligned with ``words``.
    """

    def __init__(self):
        # nested dict -- {word: {idx: [def, tokenized_def]}}
        self.df = {}
        self.max_def_len = 0
        for w in wn.all_lemma_names():
            if not w.isalpha():
                continue
            inner_dict = {}
            for i, s in enumerate(wn.synsets(w)):
                definition = s.definition()
                token_def = token_enc.encode(definition)
                inner_dict[i] = [definition, token_def]
                self.max_def_len = max(self.max_def_len, len(token_def))
            self.df[w] = inner_dict

        # one entry per (word, definition) pair -- these are the units that get split
        self.words, self.token_defs = [], []
        for w, defs in self.df.items():
            for _, token_def in defs.values():
                self.pad_def(token_def)  # pads in place, so self.df holds the padded lists too
                self.words.append(w)
                self.token_defs.append(token_def)

        # cast to indexable datastructures
        self.words = np.array(self.words)
        self.token_defs = torch.tensor(self.token_defs)

    def pad_def(self, tokenized_definition):
        """Append an eos token, then pad (in place) up to max definition length in dataset"""
        tokenized_definition.extend(
            [token_enc.max_token_value]  # eos token
            + [pad_token] * (self.max_def_len - len(tokenized_definition))
        )

    def train_test_split(self, train_pct: float = 0.8, jumble_pct: float = 0.4):
        """
        Split dataset into training and testing sets
        Splits all word-definition pairs, so the same word may be in train and test set w/ different defs

        Args:
            train_pct: fraction of pairs assigned to the training split.
            jumble_pct: probability of phonetically jumbling a word in the training split
                (the test split is never jumbled).

        Returns:
            ``(train_data, test_data)`` as two ``DefDataSplit`` datasets.
        """
        # Permute over PAIRS (rows of words/token_defs), not over unique words: the index
        # arrays below select rows of the per-pair arrays.
        n_pairs = len(self.words)
        shuffled = np.random.permutation(n_pairs)
        n_train = int(n_pairs * train_pct)

        train_data = DefDataSplit(dataset=self, indices=shuffled[:n_train], jumble_pct=jumble_pct)
        test_data = DefDataSplit(dataset=self, indices=shuffled[n_train:])

        return train_data, test_data


class DefDataSplit(Dataset):
    """One split (train or test) of a ``DefData`` dataset.

    Args:
        dataset: parent ``DefData`` providing ``words`` and ``token_defs``.
        indices: integer row indices of the pairs that belong to this split.
        jumble_pct: probability of swapping a word for a phonetically similar one in
            ``__getitem__``; ``None`` or ``0.0`` disables jumbling.
        jumbler: object with ``jumble(word, use_nn=...)``; defaults to a module-level
            ``jumbler`` if one exists.
        seq_len: length every returned token sequence is padded to.

    Raises:
        ValueError: if jumbling is requested but no jumbler is available.
    """

    def __init__(self, dataset: DefData, indices, jumble_pct: float = None, jumbler=None, seq_len: int = 256):
        super().__init__()
        self.dd = dataset
        self.jumble_pct = jumble_pct
        self.seq_len = seq_len
        # Keep working with the notebook-level `jumbler` when none is passed explicitly.
        self.jumbler = jumbler if jumbler is not None else globals().get('jumbler')
        if self.jumble_pct and self.jumbler is None:
            raise ValueError("jumble_pct > 0 requires a jumbler")

        # Materialise this split's rows so __len__ / __getitem__ index the split, not the parent.
        rows = np.asarray(indices, dtype=np.int64)
        self.words = dataset.words[rows]
        self.token_defs = dataset.token_defs[torch.as_tensor(rows)]

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx: int):
        """Return ``(word, tokens)``: the (possibly jumbled) word and the padded prompt + definition."""
        word = self.words[idx]
        # phonetically mask word at random with jumble_pct probability
        if self.jumble_pct and np.random.default_rng().random() < self.jumble_pct:
            word = self.jumbler.jumble(word, use_nn=True)

        prefix = torch.tensor(token_enc.encode(f"The definition of {word} is:"))
        token_def = torch.cat([prefix, self.token_defs[idx]])
        # seq_len defaults to 256, chosen to sit well above the longest definition
        token_def = F.pad(
            token_def, (0, self.seq_len - len(token_def)),
            'constant', pad_token
        )
        return word, token_def
    

In [179]:
dd = DefData()
train_data, test_data = dd.train_test_split(train_pct=0.8)

In [2]:
# helper function for colab -- ugh
def load_py(filename, alias: str = None):
    """Handle importing and reloading modules when in colab

    On Colab the file is first downloaded from the project's GitHub repository into
    /content/. The module is then imported, or reloaded if it is already bound in this
    namespace, and bound to `alias` when one is given, else to the module's own name.

    Args:
        filename: module file name, with or without the ".py" suffix; may contain "/".
        alias: optional global name to bind the module to.

    Returns:
        The imported (or reloaded) module.

    Raises:
        requests.HTTPError: on Colab, if the download does not succeed.
    """
    import importlib, os, sys
    if not filename.endswith(".py"): filename += ".py"

    if 'google.colab' in sys.modules:
        # Imported lazily so the helper still works off-Colab without `requests` installed.
        import requests

        if '/content/' not in sys.path:
            sys.path.insert(0, '/content/')

        url = f"https://raw.githubusercontent.com/BenAF002/dabble_bot_v2_dev/refs/heads/main/{filename}"
        response = requests.get(url, timeout=30)
        # Fail loudly instead of writing an error page to disk as if it were module source.
        response.raise_for_status()

        # write to /content for colab
        destination = f"/content/{filename}"
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        with open(destination, "w", encoding="utf-8") as f:
            f.write(response.text)
        # A source file created after interpreter start may be invisible to the import
        # system until its caches are invalidated.
        importlib.invalidate_caches()

    # handle module reloading
    module_name = filename[:-3]  # Strip the .py extension
    module_name = module_name.rpartition('/')[-1]  # substring after the last forward slash, if any
    namespace = globals()

    if alias is not None and alias in namespace:
        module = importlib.reload(namespace[alias])
    elif module_name in namespace:
        module = importlib.reload(namespace[module_name])
    else:
        module = importlib.import_module(module_name)

    # Always bind the requested name, including when the module was found under its own name.
    namespace[alias if alias is not None else module_name] = module
    return module

In [5]:
load_py('dabble_gpt')

In [3]:
import dabble_gpt
importlib.reload(dabble_gpt)

gpt = dabble_gpt.GPT().from_pretrained(model_type='gpt2', lora_rank=4)

In [4]:
seq = gpt.generate(seq="Scientists recently discovered a herd of fuzzy ", max_new_tokens=100)

In [5]:
seq

'Scientists recently discovered a herd of fuzzy vernaculars in a Siberian area. "Their hair was quite wide," says the paleontologist Dr. Anna R. Bekkova, who led the study.\n\nIn her laboratory, the team collected fossils that were too narrow for the study. They determined that these were probably the earliest prunus ever spotted in this region. The team also found traces of an amphora called the pinnacles.\n\n"An amphora isn\'t a very large animal," says Ber'

In [42]:
t = torch.tensor([1,2,3]).reshape(1, -1)
t

tensor([[1, 2, 3]])

In [47]:
t[-1, -1].reshape(1) #, -1)

tensor([3])

In [31]:
for name, module in gpt.named_modules():
    print(name, type(module))

 <class 'dabble_gpt.GPT'>
transformer <class 'torch.nn.modules.container.ModuleDict'>
transformer.wte <class 'torch.nn.modules.sparse.Embedding'>
transformer.wpe <class 'torch.nn.modules.sparse.Embedding'>
transformer.h <class 'torch.nn.modules.container.ModuleList'>
transformer.h.0 <class 'dabble_gpt.Block'>
transformer.h.0.ln_1 <class 'torch.nn.modules.normalization.LayerNorm'>
transformer.h.0.attn <class 'dabble_gpt.CausalSelfAttention'>
transformer.h.0.attn.c_attn <class 'torch.nn.modules.linear.Linear'>
transformer.h.0.attn.c_proj <class 'torch.nn.modules.linear.Linear'>
transformer.h.0.attn.dropout <class 'torch.nn.modules.dropout.Dropout'>
transformer.h.0.ln_2 <class 'torch.nn.modules.normalization.LayerNorm'>
transformer.h.0.mlp <class 'dabble_gpt.MLP'>
transformer.h.0.mlp.c_fc <class 'torch.nn.modules.linear.Linear'>
transformer.h.0.mlp.gelu <class 'torch.nn.modules.activation.GELU'>
transformer.h.0.mlp.c_proj <class 'torch.nn.modules.linear.Linear'>
transformer.h.1 <class 'da

In [29]:
65536 / 8 / 15

546.1333333333333

In [190]:
seq = torch.tensor(token_enc.encode("Scientists recently discovered a herd of fuzzy "))
out_seq = token_enc.decode(seq.detach().numpy())
out_seq

'Scientists recently discovered a herd of fuzzy '

In [65]:
wn.synsets('lichenales')[0].definition()

'category used especially in former classifications for organisms now constituting the division Lichenes'