# Exploring Subword Tokenization

## Installing

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 1.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 6.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAM

In [2]:
!pip install tabulate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from tabulate import tabulate

In [4]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

from collections import Counter, defaultdict

import matplotlib.pyplot as plt


In [5]:
# whole_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking")
whole_tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")

Downloading:   0%|          | 0.00/254 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/236k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

## Calculating

In [6]:
def get_tokens(file_path, bench=False):
  all_annos =[]
  complex_words = []

  if bench:
    with open(file_path, 'r',encoding="utf-8") as infile:
      data = infile.readlines()

  else: 
    with open(file_path, 'r',encoding="ISO-8859-1") as infile:
      data = infile.readlines()

  print("dataset of size:", len(data)) 
  for row in data:
    row = row.strip()
    info = row.split("\t")

    complex_word = info[1]
    complex_words.append(complex_word)
    annotations = info[3:]
    
    if bench:
      clean_annotations = [anno[2:] for anno in annotations]
    
    else: 
      clean_annotations = annotations

    for a in clean_annotations:
      all_annos.append(a)
  
  return all_annos, complex_words

In [7]:
def relative_subwords (abs_subwords):
  relative_dict = dict()
  
  total = sum(abs_subwords.values())
  
  for len, freq in abs_subwords.items():
    relative_dict[len] = (freq/total)*100
  
  return relative_dict


In [8]:
def count_subwordtokenization(words):

  tokenize_sizes = defaultdict(int)
  for word in words:
    tokenized_word = whole_tokenizer.tokenize(word)
    nr_of_subwords = len(tokenized_word)
    print(word)
    print(tokenized_word)
    print(nr_of_subwords)
    if nr_of_subwords in tokenize_sizes.keys():
      tokenize_sizes[nr_of_subwords]+=1
    else:
      tokenize_sizes[nr_of_subwords]=1
  
  relative = relative_subwords (tokenize_sizes)

  return dict(tokenize_sizes), relative

## Getting the numbers

In [9]:
# data_files = ["/content/BenchLS.txt", "/content/lex.mturk.txt","/content/NNSeval.txt"]
# data_files = ["/content/dutch_sents_for_annotation.txt"]
data_files = ["/content/dutch_train_sents.txt"]

for file in data_files:
  dataset = file.replace("/content/","")
  print(dataset)

  if "Bench" in file or "NNSeval" in file or "dutch" in file:
    annotations, complex_words = get_tokens(file, bench=True)

  else: 
    annotations, complex_words = get_tokens(file, bench=False)

  abs_cword, rel_cword = count_subwordtokenization(complex_words)

  abs_annos, rel_annos = count_subwordtokenization(annotations)

  print(f"""For file {file.replace("/content/","")}:\n
  Complex Words Subwords:
  {abs_cword}\n
  {rel_cword}\n
  Annotation Subwords:
  {abs_annos}\n
  {rel_annos}\n""")

  print("percentage of subword tokenized complex words")
  print(100-rel_cword[1])

  print("percentage of subword tokenized annotations")
  print(100-rel_annos[1])



dutch_train_sents.txt
dataset of size: 1026
duurzame
['duurzame']
1
ontwikkelen
['ontwikkelen']
1
ontwikkelen
['ontwikkelen']
1
Wellicht
['Well', '##icht']
2
duurzame
['duurzame']
1
diversiteit
['diversiteit']
1
stimuleert
['stimuleer', '##t']
2
concrete
['concrete']
1
duurzame
['duurzame']
1
stimulerend
['stimuleren', '##d']
2
daar
['daar']
1
input
['in', '##put']
2
duurzame
['duurzame']
1
gemotiveerd
['gemotiveerd']
1
diversiteit
['diversiteit']
1
ontplooien
['ontplooien']
1
incidenten
['incidenten']
1
stimuleren
['stimuleren']
1
specifiek
['specifiek']
1
Daar
['Daar']
1
integrale
['integrale']
1
monitor
['monitor']
1
behoeft
['behoeft']
1
Impact
['Imp', '##act']
2
impact
['impact']
1
monitors
['monitor', '##s']
2
interventies
['interventie', '##s']
2
stimuleren
['stimuleren']
1
monitors
['monitor', '##s']
2
pilot
['pilot']
1
integrale
['integrale']
1
impact
['impact']
1
specifieke
['specifieke']
1
interventies
['interventie', '##s']
2
diversiteit
['diversiteit']
1
Integrale
['In', '

KeyError: ignored