In [1]:
from transformers import (
    AutoTokenizer,
)
from datasets import load_dataset
import pandas as pd
import preprocessor as p

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Constants / configuration for this notebook.
# NOTE(review): max_length, label_col and preprocess_type are not referenced by
# any cell visible in this notebook.
max_length = 128
text_col = 'content'          # name of the raw text column in the CSV
label_col = 'quadruplet'
preprocess_type = 'p00'
TOKENIZER_PATH = '../tokenizer'                 # local tokenizer directory (plain string; no f-string needed)
PRETRAINED_MODEL = "Wikidepia/IndoT5-base"      # checkpoint whose tokenizer is the baseline
DATA_PATH = '../Data/quadruplet/quadruplet_annottated_sample_dataset_clean.csv'

In [3]:
# Baseline tokenizer: the one published with the PRETRAINED_MODEL checkpoint.
old_tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
# Tokenizer loaded from the local TOKENIZER_PATH directory
# (presumably retrained on this project's corpus — TODO confirm).
new_tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

In [4]:
# Load the annotated dataset and add a cleaned copy of the text column.
# The column name comes from `text_col` (constants cell) rather than a repeated
# string literal, so the config cell stays the single source of truth.
df = pd.read_csv(DATA_PATH)
# `p.clean` is the cleaning function of the imported `preprocessor` package;
# what exactly it strips depends on that package's options — confirm if it matters.
df['clean_tweet'] = df[text_col].apply(p.clean)

# Using the old tokenizer (pretrained `Wikidepia/IndoT5-base`)

In [5]:
def count_token_stats(tokenizer, texts):
    """Tally token categories produced by ``tokenizer`` over ``texts``.

    Args:
        tokenizer: any object exposing ``tokenize(text)`` that returns a list
            of string tokens.
        texts: iterable of strings to tokenize.

    Returns:
        A 5-tuple ``(total, word_start, one_char, two_char, three_plus)``:
        ``total`` is the number of tokens overall, ``word_start`` the number of
        tokens beginning with '▁', and the remaining three count the tokens
        that do NOT begin with '▁', bucketed by length 1, length 2, and
        everything else.
    """
    total = word_start = one_char = two_char = three_plus = 0
    for text in texts:
        tokens = tokenizer.tokenize(text)
        total += len(tokens)
        # Single pass per text instead of building two filtered lists.
        for tok in tokens:
            if tok.startswith('▁'):
                word_start += 1
            elif len(tok) == 1:
                one_char += 1
            elif len(tok) == 2:
                two_char += 1
            else:
                three_plus += 1
    return total, word_start, one_char, two_char, three_plus


# NOTE(review): a '▁'-prefixed token appears to mark the first piece of a word,
# which may still be followed by continuation pieces. "not splitted" is therefore
# closer to "number of words" than "words kept whole" — confirm the intended metric.
(
    all_tokens_count,
    not_splitted_tokens_count,
    tokens_one_word_count,
    tokens_two_word_count,
    tokens_three_word_count,
) = count_token_stats(old_tokenizer, df['clean_tweet'])

In [6]:
# Raw counts for the old tokenizer: (all tokens, '▁'-prefixed tokens,
# 1-char pieces, 2-char pieces, remaining non-'▁' pieces).
all_tokens_count, not_splitted_tokens_count, tokens_one_word_count, tokens_two_word_count, tokens_three_word_count

(36779, 20326, 5692, 4459, 6302)

In [7]:
# Share of each token category relative to all tokens, as a percentage
# rounded to three decimal places.
(
    not_splitted_tokens_count_perc,
    tokens_one_word_count_perc,
    tokens_two_word_count_perc,
    tokens_three_word_count_perc,
) = [
    round(category_count / all_tokens_count * 100, 3)
    for category_count in (
        not_splitted_tokens_count,
        tokens_one_word_count,
        tokens_two_word_count,
        tokens_three_word_count,
    )
]

In [8]:
# Human-readable summary for the old tokenizer: each category's absolute count
# and its percentage of the total token count.
print(f'all tokens from tokenizer {all_tokens_count}')
print(f'all tokens not splitted from tokenizer {not_splitted_tokens_count} or {not_splitted_tokens_count_perc} percent from total tokens')
print(f'all tokens from tokenizer that splitted into only one character {tokens_one_word_count} or {tokens_one_word_count_perc} percent from total tokens')
print(f'all tokens from tokenizer that splitted into two characters {tokens_two_word_count} or {tokens_two_word_count_perc} percent from total tokens')
print(f'all tokens from tokenizer that splitted into three characters or more {tokens_three_word_count} or {tokens_three_word_count_perc} percent from total tokens')

all tokens from tokenizer 36779
all tokens not splitted from tokenizer 20326 or 55.265 percent from total tokens
all tokens from tokenizer that splitted into only one character 5692 or 15.476 percent from total tokens
all tokens from tokenizer that splitted into two characters 4459 or 12.124 percent from total tokens
all tokens from tokenizer that splitted into three characters or more 6302 or 17.135 percent from total tokens


# Using the new tokenizer (loaded from `../tokenizer`)

In [9]:
# Recompute the token statistics, this time with the new tokenizer.
# Counters are reset first because they reuse the names from the earlier section.
all_tokens_count = not_splitted_tokens_count = 0
tokens_one_word_count = tokens_two_word_count = tokens_three_word_count = 0

for clean_tweet in df['clean_tweet']:
    tokens = new_tokenizer.tokenize(clean_tweet)
    all_tokens_count += len(tokens)
    for tok in tokens:
        if tok.startswith('▁'):
            # Token begins with the '▁' marker: counted as "not splitted".
            not_splitted_tokens_count += 1
            continue
        # Remaining tokens are bucketed by their length.
        piece_length = len(tok)
        if piece_length == 1:
            tokens_one_word_count += 1
        elif piece_length == 2:
            tokens_two_word_count += 1
        else:
            tokens_three_word_count += 1

In [10]:
# Raw counts for the new tokenizer: (all tokens, '▁'-prefixed tokens,
# 1-char pieces, 2-char pieces, remaining non-'▁' pieces).
all_tokens_count, not_splitted_tokens_count, tokens_one_word_count, tokens_two_word_count, tokens_three_word_count

(23846, 20326, 1572, 500, 1448)

In [11]:
# Convert the new-tokenizer counts into percentages of the total token count
# (three decimal places).
(
    not_splitted_tokens_count_perc,
    tokens_one_word_count_perc,
    tokens_two_word_count_perc,
    tokens_three_word_count_perc,
) = [
    round(category_count / all_tokens_count * 100, 3)
    for category_count in (
        not_splitted_tokens_count,
        tokens_one_word_count,
        tokens_two_word_count,
        tokens_three_word_count,
    )
]

In [12]:
# Human-readable summary for the new tokenizer: each category's absolute count
# and its percentage of the total token count.
print(f'all tokens from tokenizer {all_tokens_count}')
print(f'all tokens not splitted from tokenizer {not_splitted_tokens_count} or {not_splitted_tokens_count_perc} percent from total tokens')
print(f'all tokens from tokenizer that splitted into only one character {tokens_one_word_count} or {tokens_one_word_count_perc} percent from total tokens')
print(f'all tokens from tokenizer that splitted into two characters {tokens_two_word_count} or {tokens_two_word_count_perc} percent from total tokens')
print(f'all tokens from tokenizer that splitted into three characters or more {tokens_three_word_count} or {tokens_three_word_count_perc} percent from total tokens')

all tokens from tokenizer 23846
all tokens not splitted from tokenizer 20326 or 85.239 percent from total tokens
all tokens from tokenizer that splitted into only one character 1572 or 6.592 percent from total tokens
all tokens from tokenizer that splitted into two characters 500 or 2.097 percent from total tokens
all tokens from tokenizer that splitted into three characters or more 1448 or 6.072 percent from total tokens


In [28]:
# NOTE(review): `tokenizer` is not assigned in any visible cell (leftover kernel
# state), so this cell raised NameError after Restart & Run All. It is bound to
# `new_tokenizer` on the assumption that the new tokenizer is the one being
# probed here — TODO confirm against the recorded output.
tokenizer = new_tokenizer
tokenizer.encode('saya sukaa')

[1535, 2394, 7, 1]

In [26]:
# Spot check on a hand-written sentence: tokenize it and keep only the tokens
# that do not start with '▁'.
# NOTE(review): relies on `tokenizer` already being bound; it is not assigned in
# this cell — confirm which tokenizer is intended.
tokens = tokenizer.tokenize('saya sukaaaaa makan anjingmakan')
splitted_word = [tok for tok in tokens if not tok.startswith('▁')]

In [27]:
# Display the non-'▁' tokens collected from the sample sentence.
splitted_word

['a', 'a', 'a', 'a', 'ma', 'kan']