<a href="https://colab.research.google.com/github/AbdulSubhan669/transformers-for-noobs/blob/main/nlm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [209]:
import torch
import torch.nn as nn
import re
from collections import defaultdict, Counter
from torch.nn.utils.rnn import pad_sequence
import math
import torch.nn.functional as F


In [210]:
"""!pip install datasets
from datasets import load_dataset
ds = load_dataset("princeton-nlp/datasets-for-simcse")
dataset = ds['train']"""

'!pip install datasets\nfrom datasets import load_dataset\nds = load_dataset("princeton-nlp/datasets-for-simcse")\ndataset = ds[\'train\']'

In [211]:
dataset = {'text': ['YMCA in South Australia', "South Australia (SA) \xa0has a unique position in Australia's history as, unlike the other states which were founded as colonies, South Australia began as a self governing province Many were attracted to this and Adelaide and SA developed as an independent and free thinking state.", 'The compound of philosophical radicalism, evangelical religion and self reliant ability typical of its founders had given an equalitarian flavour to South Australian thinking from the beginning.', 'It was into this social setting that in February 1850 a meeting was called primarily for the formation of an Association (apparently meaning a Y.M.C.A.)', "for apprentices and others, after their day's work, to enjoy books, lectures, discussions, readings, friendly relief and recreation for a leisure hour.", 'In September 1850 records show that this became “The Young Men\'s Christian Association of South Australia" as evidenced by a member\'s letter in London Y.M.C.A.', 'Report 1851.', 'There was no census in 1850 but the 1851 census put the total population of South Australia at 63,700 with males numbering 35,302.', 'The discovery of Gold in Ballarat caused a large migration from South Australia and by 1852 some 8000 had left for the Goldfields.', 'As a consequence the various YMCA groups that had become established failed and by 1870 none remained.']}

In [212]:
class BPETokenizer:
    def __init__(self):
        self.vocab = {}
        self.token_to_index = {}
        self.index_to_token = {}

    def initialize_vocab(self, dataset, batch_size=10000):

        vocab = defaultdict(int)


        for i in range(len(dataset['text'])):
            sentence = dataset['text'][i]
            words = sentence.strip().split()
            for word in words:
                vocab[word] += 1
                #vocab[word + '</w>'] += 1

        self.vocab = vocab
        print(f"Initialized Vocab: {dict(list(self.vocab.items())[:10])}")

    def get_stats(self):

        pairs = defaultdict(int)
        for word, freq in self.vocab.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pairs[(symbols[i], symbols[i + 1])] += freq
        return pairs

    def merge_vocab(self, pair):

        new_vocab = {}
        bigram = re.escape(' '.join(pair))
        pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')

        for word in self.vocab:
            new_word = pattern.sub(''.join(pair), word)
            new_vocab[new_word] = self.vocab[word]

        self.vocab = new_vocab

    def perform_bpe(self, num_merges):

        for i in range(num_merges):
            pairs = self.get_stats()
            if not pairs:
                break
            best_pair = max(pairs, key=pairs.get)
            self.merge_vocab(best_pair)
            print(f"Iteration {i+1}: Merged {best_pair}")
            print(f"Updated Vocab: {dict(list(self.vocab.items())[:10])}")

    def build_token_to_index(self):

        for idx, token in enumerate(self.vocab.keys()):
            self.token_to_index[token] = idx
            self.index_to_token[idx] = token

        return self.token_to_index

    def tokenize(self,text):
      tokens = []
      print(f"Token to Index Mapping: {self.token_to_index}")
      for word in text.split():
        #token = word + "</w>"
        token = word
        #print(f"Trying to match: {token}")
        #print(f"Generated token: '{token}'")
        if token in self.token_to_index:
          tokens.append(self.token_to_index[token])
        else:
           print("________")

      return tokens



bpe_tokenizer = BPETokenizer()
bpe_tokenizer.initialize_vocab(dataset)
bpe_tokenizer.perform_bpe(num_merges=500)
token_to_index = bpe_tokenizer.build_token_to_index()


Initialized Vocab: {'YMCA': 2, 'in': 6, 'South': 7, 'Australia': 5, '(SA)': 1, 'has': 1, 'a': 8, 'unique': 1, 'position': 1, "Australia's": 1}


In [213]:
class PositionalEncoding(nn.Module):
  def __init__(self,embedding_dim, max_length = 5000):
    super(PositionalEncoding, self).__init__()
    self.encoding = torch.zeros(max_length, embedding_dim)
    position = torch.arange(0, max_length).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * -(torch.log(torch.tensor(10000.0)) / embedding_dim))
    self.encoding[:, 0::2] = torch.sin(position * div_term)
    self.encoding[:, 1::2] = torch.cos(position * div_term)
    self.encoding = self.encoding.unsqueeze(0)

  def forward(self,x):
    return x + self.encoding[:, :x.size(1)]


In [214]:
class Mistral_Attention(nn.Module):
  def __init__(self,embedding_dim, num_heads):
    super(Mistral_Attention, self).__init__()
    self.num_heads = num_heads
    self.embedding_dim = embedding_dim
    self.head_dim = embedding_dim // num_heads

    assert (
        self.head_dim * num_heads == embedding_dim
    ), "Embedding dimension must be 0 modulo number of heads."

    self.q = nn.Linear(embedding_dim, embedding_dim)
    self.k = nn.Linear(embedding_dim, embedding_dim)
    self.v = nn.Linear(embedding_dim, embedding_dim)
    self.out = nn.Linear(embedding_dim, embedding_dim)

  def forward(self,x):
    batch_size, seq_length, _ = x.size()

    query = self.q(x).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1,2)
    key = self.k(x).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1,2)
    value = self.v(x).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1,2)


    attention_scores = torch.matmul(query, key.transpose(-2, -1)) / self.head_dim ** 0.5
    attention_weights = F.softmax(attention_scores, dim=-1)
    context = torch.matmul(attention_weights, value)
    context = context.transpose(1, 2).contiguous().view(batch_size, seq_length, self.embedding_dim)
    return self.out(context)










In [215]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, ff_dim):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.attention = Mistral_Attention(embedding_dim, num_heads)
        self.feed_forward_1 = nn.Linear(embedding_dim, ff_dim)  # First layer of FFN
        self.feed_forward_2 = nn.Linear(ff_dim, embedding_dim)  # Project back to embedding_dim
        self.output_layer = nn.Linear(embedding_dim, vocab_size)  # Adjust according to your needs
        self.layer_norm1 = nn.LayerNorm(embedding_dim)
        self.layer_norm2 = nn.LayerNorm(embedding_dim)  # Should match embedding_dim

    def forward(self, x):
        x = self.embedding(x)  # Shape: (batch_size, seq_length, embedding_dim)
        attention_output = self.attention(x)  # Shape: (batch_size, seq_length, embedding_dim)
        x = self.layer_norm1(attention_output + x)  # Add & Norm
        ff_output = self.feed_forward_1(x)  # Shape: (batch_size, seq_length, ff_dim)
        ff_output = self.feed_forward_2(ff_output)  # Shape: (batch_size, seq_length, embedding_dim)
        x = self.layer_norm2(ff_output + x)  # Add & Norm
        return self.output_layer(x)  # Final output

    def predict(self, input_tokens, max_length=20):
        """
        Generate new tokens based on the input tokens.

        :param input_tokens: Initial tokens for generation (tensor).
        :param max_length: Maximum length of the generated sequence.
        :return: Generated token sequence.
        """
        self.eval()  # Set the model to evaluation mode
        generated_tokens = input_tokens.tolist()  # Store input tokens for generating output

        # Start generating tokens
        for _ in range(max_length):
            input_tensor = torch.tensor(generated_tokens).unsqueeze(0)  # Shape: (1, current_length)
            with torch.no_grad():
                output = self.forward(input_tensor)  # Forward pass

            # Get the logits for the last token
            last_token_logits = output[0, -1, :]  # Shape: (vocab_size,)
            probs = torch.softmax(last_token_logits, dim=-1)  # Get probabilities

            # Sample the next token (you can change this to argmax for greedy decoding)
            next_token = torch.multinomial(probs, num_samples=1).item()  # Sample next token

            generated_tokens.append(next_token)  # Add the new token to the sequence

        return generated_tokens



In [216]:
import torch

vocab_size = len(bpe_tokenizer.token_to_index)
embedding_dim = 128
num_heads = 8
ff_dim = 512

model = Transformer(vocab_size, embedding_dim, num_heads, ff_dim)


model.eval()


Transformer(
  (embedding): Embedding(143, 128)
  (attention): Mistral_Attention(
    (q): Linear(in_features=128, out_features=128, bias=True)
    (k): Linear(in_features=128, out_features=128, bias=True)
    (v): Linear(in_features=128, out_features=128, bias=True)
    (out): Linear(in_features=128, out_features=128, bias=True)
  )
  (feed_forward_1): Linear(in_features=128, out_features=512, bias=True)
  (feed_forward_2): Linear(in_features=512, out_features=128, bias=True)
  (output_layer): Linear(in_features=128, out_features=143, bias=True)
  (layer_norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (layer_norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
)

In [217]:

seed_text = "Once upon a time in Austrailia"
seed_tokens = bpe_tokenizer.tokenize(seed_text)
print(f"Seed Tokens: {seed_tokens}")


max_length = 100
generated_tokens = model.predict(torch.tensor(seed_tokens), max_length=max_length)
print(f"Generated Tokens: {generated_tokens}")


generated_text = ' '.join(bpe_tokenizer.index_to_token[token] for token in generated_tokens)





Token to Index Mapping: {'YMCA': 0, 'in': 1, 'South': 2, 'Australia': 3, '(SA)': 4, 'has': 5, 'a': 6, 'unique': 7, 'position': 8, "Australia's": 9, 'history': 10, 'as,': 11, 'unlike': 12, 'the': 13, 'other': 14, 'states': 15, 'which': 16, 'were': 17, 'founded': 18, 'as': 19, 'colonies,': 20, 'began': 21, 'self': 22, 'governing': 23, 'province': 24, 'Many': 25, 'attracted': 26, 'to': 27, 'this': 28, 'and': 29, 'Adelaide': 30, 'SA': 31, 'developed': 32, 'an': 33, 'independent': 34, 'free': 35, 'thinking': 36, 'state.': 37, 'The': 38, 'compound': 39, 'of': 40, 'philosophical': 41, 'radicalism,': 42, 'evangelical': 43, 'religion': 44, 'reliant': 45, 'ability': 46, 'typical': 47, 'its': 48, 'founders': 49, 'had': 50, 'given': 51, 'equalitarian': 52, 'flavour': 53, 'Australian': 54, 'from': 55, 'beginning.': 56, 'It': 57, 'was': 58, 'into': 59, 'social': 60, 'setting': 61, 'that': 62, 'February': 63, '1850': 64, 'meeting': 65, 'called': 66, 'primarily': 67, 'for': 68, 'formation': 69, 'Assoc

In [218]:
print(f"Generated Text: {generated_text}")


Generated Text: a in equalitarian put meeting history (apparently Men's this Y.M.C.A. leisure their attracted YMCA London Adelaide self recreation unlike given unlike and states founders 1850 population population after Men's apprentices 35,302. governing no Goldfields. that Australia" recreation social after recreation work, day's 1852 typical unique 1851 meaning show YMCA primarily founded unique February recreation Adelaide total none founders none called founders thinking a February remained. philosophical leisure called Association total 1852 numbering Gold Report states 63,700 thinking which which independent others, given various February total Many Australia's apprentices self census Christian It 1870 for census In colonies, some evidenced this with remained.
