## Task 1

In [24]:
from transformers import AutoTokenizer
from collections import defaultdict

# Corpus declaration
corpus = [
    "there is a big house",
    "i buy a house",
    "they buy the new house",
]

In [25]:

# Declare the BPE tokenizer to divide the corpus into words
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Word frequency
word_freqs = defaultdict(int)

for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

print("Word frequencies:")
print(word_freqs)
print("================")

Word frequencies:
defaultdict(<class 'int'>, {'there': 1, 'Ġis': 1, 'Ġa': 2, 'Ġbig': 1, 'Ġhouse': 3, 'i': 1, 'Ġbuy': 2, 'they': 1, 'Ġthe': 1, 'Ġnew': 1})


In [26]:

# Compute base vocabulary
# First, extract the present letters (the alphabet)

alphabet = []

for word in word_freqs.keys():
    for letter in word:
        if letter not in alphabet:
            alphabet.append(letter)
alphabet.sort()

print("The alphabet: ")
print(alphabet)
print("================")

vocab = ["<|endoftext|>"] + alphabet.copy()

The alphabet: 
['a', 'b', 'e', 'g', 'h', 'i', 'n', 'o', 'r', 's', 't', 'u', 'w', 'y', 'Ġ']


In [27]:

# Split words into individual characters
splits = {word: [c for c in word] for word in word_freqs.keys()}

In [28]:

# The BPE algorithm is based on most frequent pair and merging
# Helper function to compute the most frequent pair

def compute_pair_freqs(splits):
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            pair_freqs[pair] += freq
    return pair_freqs

In [29]:
# Compute initial pairs
pair_freqs = compute_pair_freqs(splits)

for i, key in enumerate(pair_freqs.keys()):
    print(f"{key}: {pair_freqs[key]}")
    if i >= 5:
        break

('t', 'h'): 3
('h', 'e'): 3
('e', 'r'): 1
('r', 'e'): 1
('Ġ', 'i'): 1
('i', 's'): 1


In [30]:
# Get most frequent pair
best_pair = ""
max_freq = None

for pair, freq in pair_freqs.items():
    if max_freq is None or max_freq < freq:
        best_pair = pair
        max_freq = freq

print(best_pair, max_freq)

('t', 'h') 3


In [31]:
# Add the merge to the vocab
merges = {("t", "h"): "th"}
vocab.append("th")

In [32]:
# Function to merge the pair in the splits dictionary
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

In [33]:
splits = merge_pair("t", "h", splits)
print(splits["they"])

['th', 'e', 'y']


In [34]:
# Loop to grow the vocab size with the new merges
vocab_size = 10

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    best_pair = ""
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])

In [35]:
print(merges)

{('t', 'h'): 'th'}


In [36]:
print(vocab)

['<|endoftext|>', 'a', 'b', 'e', 'g', 'h', 'i', 'n', 'o', 'r', 's', 't', 'u', 'w', 'y', 'Ġ', 'th']


In [37]:
# To tokenize a new text, we pre-tokenize it, split it and aplpy the merge rules learned
def tokenize(text):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    splits = [[l for l in word] for word in pre_tokenized_text]
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    return sum(splits, [])

In [38]:
tokenize("they buy a big house")

['th',
 'e',
 'y',
 'Ġ',
 'b',
 'u',
 'y',
 'Ġ',
 'a',
 'Ġ',
 'b',
 'i',
 'g',
 'Ġ',
 'h',
 'o',
 'u',
 's',
 'e']

## Task 4


In [39]:
import torch
from torch.nn import functional as F
import string
from transformers import (
    BertTokenizer, BertForMaskedLM,
    AutoTokenizer, AutoModelWithLMHead,
    XLNetTokenizer, XLNetLMHeadModel,
    logging
)
logging.set_verbosity_error()

In [40]:
no_words_to_be_predicted = globals()
select_model = globals()
enter_input_text = globals()

In [41]:
def set_model_config(**kwargs):
  for key, value in kwargs.items():
    print("{0} = {1}".format(key, value))

  no_words_to_be_predicted = list(kwargs.values())[0] # integer values
  select_model = list(kwargs.values())[1] # possible values = 'bert' or 'gpt' or 'xlnet'
  enter_input_text = list(kwargs.values())[2] #only string

  return no_words_to_be_predicted, select_model, enter_input_text

In [42]:
def load_model(model_name):
    if "bert" in model_name.lower():
      bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
      bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()
      return bert_tokenizer,bert_model
    elif "gpt" in model_name.lower():
      gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")
      gpt_model = AutoModelWithLMHead.from_pretrained("gpt2")
      print(f"Tokenizer: {gpt_tokenizer}, gpt_model: {gpt_model}")
      return gpt_tokenizer,gpt_model
    else:
      xlnet_tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
      xlnet_model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased")
      return xlnet_tokenizer, xlnet_model

In [43]:
# bert encode
def encode_bert(tokenizer, text_sentence, add_special_tokens=True):
  text_sentence = text_sentence.replace('<mask>', tokenizer.mask_token)
  # if <mask> is the last token, append a "." so that models dont predict punctuation.
  if tokenizer.mask_token == text_sentence.split()[-1]:
    text_sentence += ' .'
    input_ids = torch.tensor([tokenizer.encode(text_sentence, add_special_tokens=add_special_tokens)])
    mask_idx = torch.where(input_ids == tokenizer.mask_token_id)[1].tolist()[0]
  return input_ids, mask_idx

# bert decode
def decode_bert(tokenizer, pred_idx, top_clean):
  ignore_tokens = string.punctuation + '[PAD]'
  tokens = []
  for w in pred_idx:
    token = ''.join(tokenizer.decode(w).split())
    if token not in ignore_tokens:
      tokens.append(token.replace('##', ''))
  return '\n'.join(tokens[:top_clean])

In [44]:
@torch.no_grad()
def get_all_predictions(input_text, model_name, top_clean=5):
    model_name = model_name.lower()

    if model_name == "bert":
        tok, mdl = load_model("bert")
        input_ids, mask_idx = encode_bert(tok, input_text, add_special_tokens=True)
        outputs = mdl(input_ids)
        logits = outputs.logits[0, mask_idx]
        probs = F.softmax(logits, dim=-1)
        topk = torch.topk(probs, k=top_clean)
        pred_tokens = topk.indices.tolist()
        return {"bert": decode_bert(tok, pred_tokens, top_clean)}

    elif model_name == "gpt":
        tok, mdl = load_model("gpt")
        enc = tok(input_text, return_tensors="pt")
        outputs = mdl(**enc)
        logits = outputs.logits[0, -1, :]  # next-token distribution
        probs = F.softmax(logits, dim=-1)
        topk = torch.topk(probs, k=top_clean)
        ids = topk.indices.tolist()
        # decode each candidate token by itself for cleaner strings
        tokens = [tok.decode([i]).strip() for i in ids]
        return {"gpt": "\n".join(tokens)}

    elif model_name == "xlnet":
        tok, mdl = load_model("xlnet")
        # XLNet is permutation-based but AutoModelForCausalLM head gives next-token logits
        enc = tok(input_text, return_tensors="pt")
        outputs = mdl(**enc)
        logits = outputs.logits[0, -1, :]
        probs = F.softmax(logits, dim=-1)
        topk = torch.topk(probs, k=top_clean)
        ids = topk.indices.tolist()
        tokens = [tok.decode([i]).strip() for i in ids]
        return {"xlnet": "\n".join(tokens)}

    else:
        raise ValueError("Unknown model_name")

In [45]:
def get_prediction_end_of_sentence(input_text, model_name):
  try:
    if model_name.lower() == "bert":
      input_text += ' <mask>'
      print(input_text)
      res = get_all_predictions(input_text, model_name, top_clean=int(no_words_to_be_predicted))
      return res
    elif model_name.lower() == "gpt":
      print(input_text)
      res = get_all_predictions(input_text, model_name, top_clean=int(no_words_to_be_predicted))
      return res
    else:
      print(input_text)
      res = get_all_predictions(input_text, model_name, top_clean=int(no_words_to_be_predicted))
      return res

  except Exception as error:
    pass

In [47]:
no_words_to_be_predicted, select_model, enter_input_text = set_model_config(no_words_to_be_predicted=2, select_model = "gpt", enter_input_text = "To be or not")

bert_tokenizer, bert_model = load_model(select_model)
res = get_prediction_end_of_sentence(enter_input_text, select_model)
print(f"The result is: {res[select_model].split("\n")}")

final_sentence = " ".join([i for i in res[select_model].split("\n")])
print(f"The final structured sentence is: {enter_input_text + " " + final_sentence}")


no_words_to_be_predicted = 2
select_model = gpt
enter_input_text = To be or not
Tokenizer: GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
), gpt_model: GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout