<a href="https://colab.research.google.com/github/Amsterdam-Internships/Readability-Lexical-Simplification/blob/master/WordPieceTokenization_in_the_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploring Subword Tokenization

## Installing

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install torch

In [None]:
!pip install tabulate

In [None]:
from tabulate import tabulate

In [None]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

from collections import Counter, defaultdict

import matplotlib.pyplot as plt


In [None]:
# Alternative checkpoint, kept so the tokenizer can be switched by swapping the two lines:
# whole_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking")
# Module-level tokenizer read by count_subwordtokenization further down;
# loaded from the "GroNLP/bert-base-dutch-cased" checkpoint.
whole_tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")

## Calculating

In [None]:
def _strip_rank_prefix(annotation):
  """Remove a leading "<digits>:" prefix from an annotation string."""
  rank, sep, candidate = annotation.partition(":")
  if sep and rank.isdigit():
    # Handles ranks of any width ("1:word" as well as "10:word").
    return candidate
  # No recognisable numeric prefix: keep the historical behaviour of
  # dropping the first two characters.
  # NOTE(review): confirm whether un-prefixed annotations can occur in bench files.
  return annotation[2:]


def get_tokens(file_path, bench=False):
  """Read a tab-separated dataset and collect its complex words and annotations.

  Every usable row is split on tabs; column 1 is taken as the complex word and
  all columns from index 3 onwards are taken as annotations. Rows with fewer
  than two columns (e.g. blank lines) are skipped instead of raising
  IndexError.

  Args:
    file_path: Path of the text file to read.
    bench: If True, the file is decoded as UTF-8 and the leading "<rank>:"
      prefix of each annotation is removed; otherwise the file is decoded as
      ISO-8859-1 and annotations are kept verbatim.

  Returns:
    A tuple ``(all_annos, complex_words)``: the flat list of annotations over
    all rows and the list of complex words (one per usable row).
  """
  encoding = "utf-8" if bench else "ISO-8859-1"
  with open(file_path, 'r', encoding=encoding) as infile:
    data = infile.readlines()

  # Reports the raw number of lines read, including any that are skipped below.
  print("dataset of size:", len(data))

  all_annos = []
  complex_words = []

  for row in data:
    info = row.strip().split("\t")

    if len(info) < 2:
      # Blank or malformed line: there is no complex-word column to read.
      continue

    complex_words.append(info[1])
    annotations = info[3:]

    if bench:
      annotations = [_strip_rank_prefix(anno) for anno in annotations]

    all_annos.extend(annotations)

  return all_annos, complex_words

In [None]:
def relative_subwords(abs_subwords):
  """Turn absolute counts into percentages of the overall total.

  Args:
    abs_subwords: Mapping from a key (here: number of subword pieces) to an
      absolute frequency.

  Returns:
    A new dict with the same keys, in the same order, whose values are the
    frequencies expressed as a percentage of the sum of all frequencies.
  """
  grand_total = sum(abs_subwords.values())

  return {
    size: (count/grand_total)*100
    for size, count in abs_subwords.items()
  }


In [None]:
def count_subwordtokenization(words, tokenizer=None, verbose=True):
  """Count into how many subword pieces each word is tokenized.

  Args:
    words: Iterable of strings to tokenize one by one.
    tokenizer: Object exposing ``tokenize(str) -> list``. Defaults to the
      notebook-level ``whole_tokenizer`` when omitted.
    verbose: If True (the default, matching the previous behaviour), print
      each word, its pieces and the piece count. Pass False to keep the cell
      output small on large datasets.

  Returns:
    A tuple ``(absolute, relative)``: ``absolute`` maps a piece count to the
    number of words split into that many pieces; ``relative`` holds the same
    keys with values as percentages, as computed by ``relative_subwords``.
  """
  if tokenizer is None:
    tokenizer = whole_tokenizer

  tokenize_sizes = defaultdict(int)
  for word in words:
    tokenized_word = tokenizer.tokenize(word)
    nr_of_subwords = len(tokenized_word)
    if verbose:
      print(word)
      print(tokenized_word)
      print(nr_of_subwords)
    # defaultdict(int) starts missing keys at 0, so no membership test is needed.
    tokenize_sizes[nr_of_subwords] += 1

  relative = relative_subwords(tokenize_sizes)

  return dict(tokenize_sizes), relative

## Getting the numbers

In [None]:
# data_files = ["/content/BenchLS.txt", "/content/lex.mturk.txt","/content/NNSeval.txt"]
data_files = ["/content/dutch_sents_for_annotation.txt"]

# Files whose path contains one of these markers are read with bench=True.
bench_markers = ("Bench", "NNSeval", "dutch")

for file in data_files:
  dataset = file.replace("/content/","")
  print(dataset)

  is_bench = any(marker in file for marker in bench_markers)
  annotations, complex_words = get_tokens(file, bench=is_bench)

  abs_cword, rel_cword = count_subwordtokenization(complex_words)

  abs_annos, rel_annos = count_subwordtokenization(annotations)

  print(f"""For file {dataset}:\n
  Complex Words Subwords:
  {abs_cword}\n
  {rel_cword}\n
  Annotation Subwords:
  {abs_annos}\n
  {rel_annos}\n""")

  # Share of items split into more than one piece = 100 - share with exactly one piece.
  for label, rel in (("complex words", rel_cword), ("annotations", rel_annos)):
    print(f"percentage of subword tokenized {label}")
    if rel:
      # .get avoids a KeyError when no item was tokenized into exactly one piece.
      print(100 - rel.get(1, 0))
    else:
      # Nothing was counted (e.g. a file without annotation columns), so
      # there is no percentage to report.
      print("n/a (no items found)")

