In [1]:
!pip install transformers

import pandas as pd
import numpy as np
import os
import gc
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Set the seed for every random number generator imported above (Python's
# `random`, NumPy and PyTorch) so that shuffling/sampling is reproducible,
# not only the torch operations.
SEED = 555
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

import transformers
from transformers import AdamW

from tqdm import tqdm
import warnings


[0m

In [2]:
!pip install tokenizers

[0m

In [3]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

In [4]:
# Build the tokenizer pipeline around a fresh (untrained) Unigram model.
unigram_model = models.Unigram()
tokenizer = Tokenizer(unigram_model)


In [5]:
from tokenizers import Regex

# Text normalization applied before pre-tokenization, in this order:
#   1. rewrite LaTeX-style `` and '' quotes as a plain double quote
#   2. NFKD-decompose, then strip the combining accent marks
#   3. NFC-recompose: NFKD also splits precomposed Hangul syllables into
#      conjoining jamo, which StripAccents does not remove. The training corpus
#      used below is Korean, so without this step the model would be trained on
#      jamo sequences instead of syllables.
#   4. collapse runs of two or more spaces into one
tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.Replace("``", '"'),
        normalizers.Replace("''", '"'),
        normalizers.NFKD(),
        normalizers.StripAccents(),
        normalizers.NFC(),
        normalizers.Replace(Regex(" {2,}"), " "),
    ]
)

In [6]:
# Metaspace pre-tokenization: split on whitespace and mark each piece with "▁".
metaspace_splitter = pre_tokenizers.Metaspace()
tokenizer.pre_tokenizer = metaspace_splitter

In [7]:
# Sanity check: show the (piece, (start, end)) pairs the pre-tokenizer produces.
pre_tokenizer_sample = "Let's test the pre-tokenizer!"
tokenizer.pre_tokenizer.pre_tokenize_str(pre_tokenizer_sample)

[("▁Let's", (0, 5)),
 ('▁test', (5, 10)),
 ('▁the', (10, 14)),
 ('▁pre-tokenizer!', (14, 29))]

In [8]:
# Special tokens reserved in the vocabulary; "<cls>" and "<sep>" are listed first.
special_tokens = [
    "<cls>",
    "<sep>",
    "<unk>",
    "<pad>",
    "<mask>",
    "<s>",
    "</s>",
]

# Unigram trainer configuration: target vocabulary size and unknown-token symbol.
trainer = trainers.UnigramTrainer(
    vocab_size=25000,
    special_tokens=special_tokens,
    unk_token="<unk>",
)

In [10]:
# Text file(s) the Unigram vocabulary is learned from.
corpus_files = ["/root/team26/korean-hate-speech/unlabeled/unlabeled_comments.txt"]

# Swap in a fresh Unigram model before training, so re-running this cell
# does not continue from a previously trained model.
tokenizer.model = models.Unigram()
tokenizer.train(corpus_files, trainer=trainer)




In [None]:
# Encode one English sentence and inspect the resulting sub-word pieces.
sample_text = "Let's test this tokenizer."
encoding = tokenizer.encode(sample_text)
print(encoding.tokens)

['▁L', 'et', "'", 's', '▁', 'te', 'st', '▁th', 'is', '▁to', 'k', 'en', 'i', 'z', 'er', '.']


In [None]:
# Look up the vocabulary ids of the two special tokens used by the post-processor.
cls_token_id, sep_token_id = (
    tokenizer.token_to_id(token) for token in ("<cls>", "<sep>")
)
print(cls_token_id, sep_token_id)

0 1


In [None]:
# Templates describing where special tokens are inserted for a single sentence
# and for a sentence pair; the number after ":" is the type id given to that piece.
single_template = "$A:0 <sep>:0 <cls>:2"
pair_template = "$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2"

# (token, id) pairs for every special token referenced by the templates.
template_special_tokens = [
    ("<sep>", sep_token_id),
    ("<cls>", cls_token_id),
]

tokenizer.post_processor = processors.TemplateProcessing(
    single=single_template,
    pair=pair_template,
    special_tokens=template_special_tokens,
)

In [None]:
# Encode a sentence pair to check the inserted special tokens and the type ids.
first_sentence = "Let's test this tokenizer..."
second_sentence = "on a pair of sentences!"
encoding = tokenizer.encode(first_sentence, second_sentence)
print(encoding.tokens, encoding.type_ids, sep="\n")

['▁L', 'et', "'", 's', '▁', 'te', 'st', '▁th', 'is', '▁to', 'k', 'en', 'i', 'z', 'er', '...', '<sep>', '▁', 'on', '▁a', '▁', 'pa', 'ir', '▁o', 'f', '▁s', 'ent', 'en', 'ce', 's', '!', '<sep>', '<cls>']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]


In [None]:
# Use the Metaspace decoder, the counterpart of the Metaspace pre-tokenizer set earlier.
metaspace_decoder = decoders.Metaspace()
tokenizer.decoder = metaspace_decoder

In [None]:
import transformers
from transformers import XLNetTokenizerFast
from transformers import XLMRobertaTokenizerFast

# Wrap the trained `tokenizers` object so it can be used through the transformers API.
# XLMRobertaTokenizerFast defaults to cls_token="<s>" and sep_token="</s>", but the
# post-processor configured above emits "<cls>" and "<sep>". Declare the special
# tokens explicitly so the wrapper's cls/sep tokens (and their ids) match what the
# tokenizer actually inserts. All of these tokens exist in the trained vocabulary.
# NOTE(review): confirm this token layout is what the downstream model expects.
wrapped_tokenizer2 = XLMRobertaTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<s>",
    eos_token="</s>",
    cls_token="<cls>",
    sep_token="<sep>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)
wrapped_tokenizer2.save_pretrained("my_xlmr")



('my_tokenizer/tokenizer_config.json',
 'my_tokenizer/special_tokens_map.json',
 'my_tokenizer/tokenizer.json')