#**Space-based Tokenization**

In [10]:
import nltk
from collections import Counter

with open ('clean_shakespeare.txt', 'r') as f:
  text = f.read()

def get_words(text):

  pattern = r'''(?x)          # set flag to allow verbose regexps
          (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
          | \w+'\w+           # contractions
        | \w+(?:-\w+)*        # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
        | \.\.\.              # ellipsis
        | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
      '''

  vocab = nltk.regexp_tokenize(text, pattern)
  vocab = [word.lower() for word in vocab]
  unique_words = set(vocab)
  c = Counter(vocab)

  sorted_dict = {key: value for key, value in sorted(
      c.items(), key=lambda item: item[1], reverse=True)}

  return sorted_dict

# Byte Pair Encoding

- make Shakespeare into test 1% and train
- train the segmenter with varying k (try normalization strategies)
- compare the performance against a different set
- figure out a measure for accuracy

In [2]:
from itertools import islice

def performance(sorted_dict, vocab, k=10000):

    def take(n, iterable):
        """Return the first n items of the iterable as a list."""
        return list(islice(iterable, n))

    
    # extract top k keys
    qualifier = [key for key, _ in take(k, sorted_dict.items())]

    # clean vocab by stripping the _
    cleaned_vocab = {word.rstrip('_') for word in vocab}
    
    # calculate overlap
    overlap = set(qualifier) & cleaned_vocab
    num_overlap = len(overlap)

    # percentage
    percentage = (num_overlap / len(qualifier)) * 100
    return percentage


In [21]:
import numpy as np
from collections import defaultdict

from collections import OrderedDict
from collections import Counter


def bpe(dictionary, k=None):
  """ dictionary (dict): a dictionary that contains the tokens and their respective counts
    return: vocab_bpe (list), vocabulary of the corpus"""

  # get all unique characters in original corpus
  all_keys = "_ ".join(dictionary.keys())
  vocab_bpe = list(set(all_keys))

  # Corpus/dictionary in einzelne tokens splitten, nach jedem Wort (VOR space!) "_" einfügen
    # worte als list of characters
  dict_matrix = []
  for key, value in dictionary.items():
    new_key = list(f"{str(key)}_ ")
    dict_matrix.append([new_key, value])


  # NICHT corpus, sondern liste an Wörtern in einzelne tokens splitten,
  # >> jede occurence mit counts der Worte multiplizieren
  token_freq = defaultdict(int)


  iteration = True
  num_rounds = 0
  while iteration:
    for token_list, value in dict_matrix:
      for i in range(len(token_list)-2):
        # wollen den und den nächsten token als key
        search_key = token_list[i] + token_list[i+1]
        # zu dictionary hinzufügen falls key noch nicht existiert
        token_freq[search_key] += value
    # word_freqs: gehen jede existierende Folge aus zwei tokens in list of words von vorne bis hinten durch

    c = Counter(token_freq)

    sorted_token_freq = {key: value for key, value in sorted(
        c.items(), key=lambda item: item[1], reverse=True)}

    # Find the most frequent token not already in vocab_bpe
    for token in sorted_token_freq.keys():
      if token not in vocab_bpe:
        first_token = token
        break
      else:
        first_token = None  # Optional: fallback in case all tokens are already in vocab

    if first_token:
      vocab_bpe.append(first_token)
    else:
      print("No new token to add.")

    # höchster count wird gemerged:
    # add to vocab
    #first_token = list(sorted_token_freq.keys())[0]
    #vocab_bpe.append(first_token)
    # replace in list of words
    # start again?
    for i in range(len(dict_matrix)):
        token_list, value = dict_matrix[i]
        j = 0
        while j < len(token_list) - 1:
            search_key = token_list[j] + token_list[j + 1]
            if search_key == first_token:
                merged_token = search_key
                # Merge the tokens
                token_list = token_list[:j] + [merged_token] + token_list[j + 2:]
                # Don't increment j — might be able to merge again
            else:
                j += 1
        dict_matrix[i][0] = token_list
    if k:
       k -= 1
       iteration = (k > 0)
    
    else:
       num_rounds += 1
       accuracy = performance(dictionary, vocab_bpe, 500)
       iteration != (accuracy > 70)

       if num_rounds > 1500:
          print("exceeded, accuracy: ", accuracy)


          iteration = False

  return vocab_bpe

In [14]:
with open ('clean_shakespeare.txt', 'r') as f:
  text = f.read()

#split into test and train 10% to 90%
text_train = text[:int(len(text)*0.99)]
text_test = text[int(len(text)*0.99):]

In [22]:
dict_train = get_words(text_train)
dict_test = get_words(text_test)

vocab_train = bpe(dict_train)

exceeded, accuracy:  75.6


In [23]:
train_accuracy = performance(dict_train, vocab_train, 500)
test_accuracy = performance(dict_test, vocab_train, 500)

print("train accuracy: ", train_accuracy)
print("test accuracy: ", test_accuracy)

train accuracy:  75.6
test accuracy:  57.99999999999999


In [24]:
import tiktoken

def compare_to_gpt_encoding(text, alphabet, model_name="gpt-3.5-turbo"):
    """
    Encodes a text string using tiktoken and prints the tokens along with their IDs.

    Args:
        text: The text string to encode.

    """
    alphabet = {word.rstrip('_') for word in alphabet}
    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        print(f"Warning: Model '{model_name}' not found.  Using 'cl100k_base' instead.")
        encoding = tiktoken.get_encoding("cl100k_base")

    tokens = encoding.encode(text)
    tp = 0

    unique_tokens = set(tokens)
    for token in unique_tokens:
      decoded_token = encoding.decode([token])
      #check if the decoded token is in the dict alphabet
      if(decoded_token in alphabet):
        tp = tp+1
    total_len = len(alphabet)
    print(tp/total_len)


with open ('shakespeare.txt', 'r') as f:
  text = f.read()
model_name="gpt-3.5-turbo"
tokens_train = compare_to_gpt_encoding(text_train, vocab_train, model_name)
tokens_test = compare_to_gpt_encoding(text_test, vocab_train, model_name)

0.6382818387339865
0.15900527505651846


# Different Corpus: Friends TV Show

In [25]:
with open ('friends.txt', 'r') as f:
  friends = f.read()

friends_dict = get_words(friends)
friends_accuracy = performance(friends_dict, vocab_train, 500)
print("Accuracy on Friends: ", friends_accuracy)

tokens_friends = compare_to_gpt_encoding(friends, vocab_train, model_name)

Accuracy on Friends:  46.800000000000004
0.5553880934438583


# Task 2

## N-Grams (first uni-gram, gradual increasing to 4, have n as input, Laplace-Smoothing)
## Perplexity

## Interpolation vs Backoff

## Generator