In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import urllib.request
from sklearn import preprocessing
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
# 훈련 데이터 다운로드
#urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/multinli.train.ko.tsv", filename="multinli.train.ko.tsv")
#urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/snli_1.0_train.ko.tsv", filename="snli_1.0_train.ko.tsv")

In [3]:
# 검증 데이터 다운로드
#urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/xnli.dev.ko.tsv", filename="xnli.dev.ko.tsv")

In [4]:
#테스트 데이터 다운로드
#urllib.request.urlretrieve("https://raw.githubusercontent.com/kakaobrain/KorNLUDatasets/master/KorNLI/xnli.test.ko.tsv", filename="xnli.test.ko.tsv")

In [5]:
train_snli = pd.read_csv("snli_1.0_train.ko.tsv", sep='\t', quoting=3)
train_xnli = pd.read_csv("multinli.train.ko.tsv", sep='\t', quoting=3)
val_data = pd.read_csv("xnli.dev.ko.tsv", sep='\t', quoting=3)
test_data = pd.read_csv("xnli.test.ko.tsv", sep='\t', quoting=3)

In [6]:
train_snli.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,한 사람이 경쟁을 위해 말을 훈련시키고 있다.,neutral
1,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,한 사람이 식당에서 오믈렛을 주문하고 있다.,contradiction
2,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,사람은 야외에서 말을 타고 있다.,entailment
3,카메라에 웃고 손을 흔드는 아이들,그들은 부모님을 보고 웃고 있다,neutral
4,카메라에 웃고 손을 흔드는 아이들,아이들이 있다,entailment


In [7]:
train_xnli.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,개념적으로 크림 스키밍은 제품과 지리라는 두 가지 기본 차원을 가지고 있다.,제품과 지리학은 크림 스키밍을 작동시키는 것이다.,neutral
1,시즌 중에 알고 있는 거 알아? 네 레벨에서 다음 레벨로 잃어버리는 거야 브레이브스...,사람들이 기억하면 다음 수준으로 물건을 잃는다.,entailment
2,우리 번호 중 하나가 당신의 지시를 세밀하게 수행할 것이다.,우리 팀의 일원이 당신의 명령을 엄청나게 정확하게 실행할 것이다.,entailment
3,어떻게 아세요? 이 모든 것이 다시 그들의 정보다.,이 정보는 그들의 것이다.,entailment
4,"그래, 만약 네가 테니스화 몇 개를 사러 간다면, 나는 왜 그들이 100달러대에서 ...",테니스화의 가격은 다양하다.,neutral


In [8]:
# 결합 후 섞기
train_data = train_snli.append(train_xnli)
train_data = train_data.sample(frac=1) #셔플링 100%

  train_data = train_snli.append(train_xnli)


In [9]:
train_data.head()

Unnamed: 0,sentence1,sentence2,gold_label
321592,"방은 매우 시크하고, 완전한 오디오 시스템을 갖추고 있다.",방에는 시대에 뒤떨어진 유행에 뒤떨어진 감각이 있다.,contradiction
168761,마드모아젤 신시아를 보호하고 있었어요.”,그는 신시아 마드모아젤을 보호하는데 실패했다.,neutral
328484,검은 정장을 입은 남자가 흰 정장과 검은 드레스를 입은 사람들 앞에 서 있다.,검은 옷을 입은 남자가 쇼를 하고 있다.,neutral
524919,지팡이를 든 남자가 걷고 있다.,남자는 소파에 앉아 차를 마셨다.,contradiction
351090,한 남자가 서서 동료들과 토론하고 있다,남자가 동료들 근처에 조용히 서 있다.,contradiction


In [10]:
val_data.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,"그리고 그가 말했다, ""엄마, 저 왔어요.""",그는 학교 버스가 그를 내려주자마자 엄마에게 전화를 걸었다.,neutral
1,"그리고 그가 말했다, ""엄마, 저 왔어요.""",그는 한마디도 하지 않았다.,contradiction
2,"그리고 그가 말했다, ""엄마, 저 왔어요.""",그는 엄마에게 집에 갔다고 말했다.,entailment
3,내가 무엇을 위해 가고 있는지 또는 어떤 것을 위해 있는지 몰랐기 때문에 워싱턴의 ...,나는 워싱턴에 가본 적이 없어서 거기 배정을 받았을 때 그 장소를 찾으려다가 길을 ...,neutral
4,내가 무엇을 위해 가고 있는지 또는 어떤 것을 위해 있는지 몰랐기 때문에 워싱턴의 ...,워싱턴으로 진군하면서 해야 할 일이 무엇인지 정확히 알고 있었다.,contradiction


In [11]:
test_data.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,"글쎄, 나는 그것에 관해 생각조차 하지 않았지만, 나는 너무 좌절했고, 결국 그에게...",나는 그와 다시 이야기하지 않았다.,contradiction
1,"글쎄, 나는 그것에 관해 생각조차 하지 않았지만, 나는 너무 좌절했고, 결국 그에게...",나는 다시 그와 이야기를 하기 시작했다는 것에 너무 화가 났다.,entailment
2,"글쎄, 나는 그것에 관해 생각조차 하지 않았지만, 나는 너무 좌절했고, 결국 그에게...",우리는 좋은 대화를 나눴다.,neutral
3,"그리고 저는 그것이 특권이라고 생각했습니다, 그리고 여전히, 여전히, 당시 저는 A...",그날 현장에 나만 있었던 게 아니라는 걸 몰랐던 것이다.,neutral
4,"그리고 저는 그것이 특권이라고 생각했습니다, 그리고 여전히, 여전히, 당시 저는 A...",나는 AFFC 공군 경력 분야에서 그 번호를 가진 유일한 사람이라는 인상을 가지고 ...,entailment


In [12]:
def drop_na_and_duplicates(df):
    df = df.dropna()
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)
    return df

In [13]:
train_data = drop_na_and_duplicates(train_data)
val_data = drop_na_and_duplicates(val_data)
test_data = drop_na_and_duplicates(test_data)

In [14]:
# 서브워드 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")

In [15]:
#최대 길이
max_seq_len=128

In [16]:
# 두개의 문장을 입력 받아야 한다.
def convert_example_to_feature(sent_list1, sent_list2, max_seq_len, tokenizer):
    input_ids, attention_masks, token_type_ids = [], [], []
    #1. wordpiece 임베딩 : 실질적인 입력
    #2. 실제 단어 어텐션 연산을 위한 어텐션 임베딩
    #3. 세그먼트 임베딩 = token_type_ids (문장 구분)
    #4. 포지셔널 임베딩 : 단어의 위치 정보를 표시  //128크기
    for sent1, sent2 in tqdm(zip(sent_list1, sent_list2), total = len(sent_list1)):
        encoding_result = tokenizer.encode_plus(sent1, sent2, max_length=max_seq_len, pad_to_max_length=True)
        #서브워드 토크나이저의 encode_plus 함수 동작
        # 1. 문장을 토크나이징
        # 2. 문장의 각 위치에 특수 토큰 추가
        # 3. 각 토큰을 인덱스(정수)로 변환
        # 4. max_length에 따라 문장의 길이를 맞추는 작업을 진행
        # 5. pad_to_max_length 기능을 통해 최대 문장 길이 까지 패딩
        # 6. 어텐션 마스크를 리턴
        # 7. 세그먼트 임베딩 벡터를 리턴
        
        input_ids.append(encoding_result['input_ids']) #
        attention_masks.append(encoding_result['attention_mask'])
        token_type_ids.append(encoding_result['token_type_ids'])

    input_ids = np.array(input_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)

    return (input_ids, attention_masks, token_type_ids)

In [17]:
X_train = convert_example_to_feature(train_data['sentence1'], train_data['sentence2'], max_seq_len = max_seq_len, tokenizer=tokenizer)
# sent1과 sent2를 인자로 3개의 임베딩 벡터 생성

  0%|                                                                               | 0/941814 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  0%|                                                                   | 280/941814 [00:00<05:39, 2772.35it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always b

  1%|▌                                                                 | 7978/941814 [00:02<04:44, 3287.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  1%|▌                                                                 | 8319/941814 [00:02<04:41, 3313.80it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  1%|▌                                                                 | 8651/941814 [00:02<04:43, 3

  2%|█▏                                                               | 17690/941814 [00:05<04:37, 3327.73it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  2%|█▏                                                               | 18023/941814 [00:05<04:42, 3269.96it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  2%|█▎                                                               | 19382/941814 [00:05<04:34, 3

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  3%|██▏                                                              | 31769/941814 [00:09<04:49, 3139.60it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  3%|██▏                                                              | 32099/941814 [00:09<04:45, 3186.13it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

  5%|███▍                                                             | 49674/941814 [00:15<04:23, 3391.82it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  5%|███▍                                                             | 50706/941814 [00:15<04:22, 3390.35it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  5%|███▌                                                             | 51726/941814 [00:15<04:25, 3357.51it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

  7%|████▎                                                            | 62627/941814 [00:19<04:35, 3192.91it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  7%|████▎                                                            | 62947/941814 [00:19<04:35, 3194.78it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  7%|████▎                                                            | 63267/941814 [00:19<04:35, 3187.10it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

  8%|█████                                                            | 73759/941814 [00:22<04:33, 3171.54it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  8%|█████▏                                                           | 74748/941814 [00:23<04:27, 3240.64it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  8%|█████▏                                                           | 75073/941814 [00:23<04:29, 3214.76it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

  9%|█████▉                                                           | 85988/941814 [00:26<04:21, 3267.47it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  9%|█████▉                                                           | 86315/941814 [00:26<04:22, 3258.51it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  9%|█████▉                                                           | 86646/941814 [00:26<04:21, 3273.59it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 11%|██████▉                                                          | 99933/941814 [00:30<04:11, 3345.65it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 11%|██████▊                                                         | 100268/941814 [00:31<04:17, 3269.22it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 12%|███████▋                                                        | 112458/941814 [00:34<04:08, 3336.48it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 12%|███████▋                                                        | 113798/941814 [00:35<04:09, 3314.19it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 12%|███████▊                                                        | 114476/941814 [00:35<04:07, 3343.86it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 13%|████████▌                                                       | 126030/941814 [00:39<04:14, 3207.36it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 13%|████████▌                                                       | 126694/941814 [00:39<04:10, 3248.79it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 14%|████████▋                                                       | 127342/941814 [00:39<04:13, 3213.43it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 15%|█████████▎                                                      | 136898/941814 [00:42<04:08, 3245.10it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 15%|█████████▎                                                      | 137226/941814 [00:42<04:07, 3255.46it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 15%|█████████▎                                                      | 137553/941814 [00:42<04:07, 3250.12it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 16%|██████████▏                                                     | 150799/941814 [00:46<03:59, 3297.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 16%|██████████▎                                                     | 151465/941814 [00:47<03:59, 3299.76it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 17%|██████████▉                                                     | 161717/941814 [00:50<03:56, 3295.39it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 17%|███████████                                                     | 162381/941814 [00:50<03:57, 3279.36it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 17%|███████████                                                     | 163385/941814 [00:50<03:55, 3

 19%|████████████▏                                                   | 179043/941814 [00:55<03:55, 3232.65it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 19%|████████████▏                                                   | 179705/941814 [00:55<03:54, 3251.59it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 19%|████████████▎                                                   | 180682/941814 [00:55<03:56, 3218.79it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 20%|████████████▉                                                   | 190091/941814 [00:59<03:55, 3192.54it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 20%|████████████▉                                                   | 190417/941814 [00:59<03:53, 3212.60it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 22%|█████████████▉                                                  | 204340/941814 [01:03<03:50, 3201.47it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 22%|█████████████▉                                                  | 205345/941814 [01:03<03:43, 3300.39it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 23%|██████████████▉                                                 | 220597/941814 [01:08<03:49, 3144.65it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 23%|███████████████                                                 | 220941/941814 [01:08<03:43, 3222.29it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 23%|███████████████                                                 | 221272/941814 [01:08<03:41, 3

 25%|███████████████▊                                                | 232120/941814 [01:12<03:38, 3246.42it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 25%|███████████████▊                                                | 232445/941814 [01:12<03:38, 3247.45it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 26%|████████████████▋                                               | 244849/941814 [01:16<03:31, 3299.82it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 26%|████████████████▋                                               | 245192/941814 [01:16<03:28, 3338.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 26%|████████████████▋                                               | 245872/941814 [01:16<03:27, 3357.63it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 27%|█████████████████▌                                              | 258252/941814 [01:20<03:23, 3364.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 27%|█████████████████▌                                              | 258931/941814 [01:20<03:23, 3358.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 28%|█████████████████▋                                              | 259631/941814 [01:20<03:19, 3423.39it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 28%|██████████████████▏                                             | 267959/941814 [01:23<03:26, 3265.39it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 29%|██████████████████▎                                             | 269294/941814 [01:24<03:24, 3295.89it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 29%|██████████████████▌                                             | 272683/941814 [01:25<03:23, 3

 31%|███████████████████▋                                            | 288805/941814 [01:29<03:15, 3344.87it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 31%|███████████████████▋                                            | 289140/941814 [01:30<03:17, 3307.10it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 31%|███████████████████▋                                            | 289471/941814 [01:30<03:20, 3259.51it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 32%|████████████████████▎                                           | 298638/941814 [01:32<03:09, 3399.16it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 32%|████████████████████▎                                           | 299318/941814 [01:33<03:11, 3360.44it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 32%|████████████████████▎                                           | 299655/941814 [01:33<03:13, 3

 33%|█████████████████████▎                                          | 313118/941814 [01:37<03:13, 3248.37it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 33%|█████████████████████▎                                          | 314105/941814 [01:37<03:14, 3223.14it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 33%|█████████████████████▍                                          | 314758/941814 [01:37<03:13, 3239.05it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 35%|██████████████████████▍                                         | 331101/941814 [01:42<03:03, 3331.69it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 35%|██████████████████████▌                                         | 331765/941814 [01:42<03:06, 3277.15it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 35%|██████████████████████▌                                         | 332093/941814 [01:43<03:09, 3

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 36%|███████████████████████▎                                        | 342838/941814 [01:46<03:05, 3223.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 36%|███████████████████████▎                                        | 343161/941814 [01:46<03:08, 3169.95it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 38%|████████████████████████                                        | 354537/941814 [01:49<02:56, 3319.92it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 38%|████████████████████████▏                                       | 355213/941814 [01:50<02:55, 3333.97it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 38%|████████████████████████▏                                       | 355554/941814 [01:50<02:55, 3346.89it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 39%|████████████████████████▉                                       | 366924/941814 [01:54<02:55, 3271.99it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 39%|████████████████████████▉                                       | 367582/941814 [01:54<02:56, 3252.07it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 39%|█████████████████████████                                       | 368246/941814 [01:54<02:56, 3

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 40%|█████████████████████████▊                                      | 380350/941814 [01:58<02:48, 3333.82it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 40%|█████████████████████████▉                                      | 381358/941814 [01:58<02:51, 3271.41it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 41%|█████████████████████████▉                                      | 382037/941814 [01:58<02:48, 3

 42%|██████████████████████████▋                                     | 392338/941814 [02:01<02:47, 3275.02it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 42%|██████████████████████████▋                                     | 392666/941814 [02:02<02:51, 3209.58it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 42%|██████████████████████████▋                                     | 393002/941814 [02:02<02:49, 3

 43%|███████████████████████████▌                                    | 406424/941814 [02:06<02:43, 3274.63it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 43%|███████████████████████████▋                                    | 406752/941814 [02:06<02:43, 3266.25it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 43%|███████████████████████████▋                                    | 407419/941814 [02:06<02:42, 3291.10it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 45%|████████████████████████████▌                                   | 420562/941814 [02:10<02:34, 3376.66it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 45%|████████████████████████████▋                                   | 421572/941814 [02:10<02:36, 3329.05it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 46%|█████████████████████████████▌                                  | 434658/941814 [02:14<02:28, 3424.00it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 46%|█████████████████████████████▌                                  | 435343/941814 [02:14<02:30, 3376.29it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 46%|█████████████████████████████▌                                  | 435681/941814 [02:15<02:32, 3327.88it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 48%|██████████████████████████████▌                                 | 449750/941814 [02:19<02:27, 3331.45it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 48%|██████████████████████████████▋                                 | 451459/941814 [02:19<02:24, 3388.89it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 49%|███████████████████████████████▌                                | 464059/941814 [02:23<02:23, 3324.12it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 49%|███████████████████████████████▌                                | 465064/941814 [02:23<02:23, 3329.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 49%|███████████████████████████████▋                                | 465411/941814 [02:23<02:21, 3371.42it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 51%|████████████████████████████████▍                               | 476660/941814 [02:27<02:16, 3412.63it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 51%|████████████████████████████████▍                               | 478045/941814 [02:27<02:14, 3442.85it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 51%|████████████████████████████████▌                               | 478390/941814 [02:27<02:15, 3

 52%|█████████████████████████████████▎                              | 489375/941814 [02:31<02:15, 3341.63it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 52%|█████████████████████████████████▎                              | 490047/941814 [02:31<02:15, 3328.06it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 52%|█████████████████████████████████▎                              | 491053/941814 [02:32<02:15, 3322.45it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 54%|██████████████████████████████████▎                             | 504364/941814 [02:36<02:10, 3345.51it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 54%|██████████████████████████████████▎                             | 505744/941814 [02:36<02:07, 3431.11it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 54%|██████████████████████████████████▍                             | 506088/941814 [02:36<02:07, 3413.53it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 55%|███████████████████████████████████▎                            | 520124/941814 [02:40<02:04, 3394.93it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 55%|███████████████████████████████████▍                            | 521138/941814 [02:41<02:05, 3358.36it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 55%|███████████████████████████████████▍                            | 521820/941814 [02:41<02:05, 3

 57%|████████████████████████████████████▎                           | 534857/941814 [02:45<01:59, 3391.33it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 57%|████████████████████████████████████▍                           | 535899/941814 [02:45<01:58, 3438.86it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 57%|████████████████████████████████████▌                           | 537279/941814 [02:45<01:58, 3413.44it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 58%|█████████████████████████████████████▏                          | 547901/941814 [02:49<01:57, 3366.62it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 58%|█████████████████████████████████████▎                          | 548253/941814 [02:49<01:55, 3407.51it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 58%|█████████████████████████████████████▎                          | 548951/941814 [02:49<01:54, 3437.32it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 59%|█████████████████████████████████████▉                          | 558546/941814 [02:52<01:56, 3297.52it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 59%|██████████████████████████████████████                          | 559234/941814 [02:52<01:53, 3376.87it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 61%|███████████████████████████████████████▎                        | 577786/941814 [02:57<01:49, 3337.22it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 62%|███████████████████████████████████████▍                        | 581256/941814 [02:58<01:43, 3482.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 62%|███████████████████████████████████████▋                        | 583353/941814 [02:59<01:45, 3

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 63%|████████████████████████████████████████▍                       | 594842/941814 [03:03<01:45, 3298.69it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 63%|████████████████████████████████████████▍                       | 595882/941814 [03:03<01:41, 3391.97it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 63%|████████████████████████████████████████▌                       | 596562/941814 [03:03<01:43, 3

 65%|█████████████████████████████████████████▎                      | 608841/941814 [03:07<01:40, 3316.81it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 65%|█████████████████████████████████████████▍                      | 609514/941814 [03:07<01:39, 3331.22it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 65%|█████████████████████████████████████████▍                      | 609856/941814 [03:07<01:38, 3357.56it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 66%|██████████████████████████████████████████▍                     | 624885/941814 [03:11<01:34, 3365.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 66%|██████████████████████████████████████████▌                     | 625926/941814 [03:12<01:31, 3436.03it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 66%|██████████████████████████████████████████▌                     | 626271/941814 [03:12<01:31, 3430.00it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 67%|███████████████████████████████████████████▏                    | 635525/941814 [03:16<03:56, 1297.42it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 68%|███████████████████████████████████████████▏                    | 635866/941814 [03:16<03:11, 1594.69it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 68%|███████████████████████████████████████████▎                    | 636890/941814 [03:16<02:04, 2

 69%|████████████████████████████████████████████                    | 647506/941814 [03:19<01:26, 3415.45it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 69%|████████████████████████████████████████████                    | 647854/941814 [03:19<01:25, 3424.46it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 70%|█████████████████████████████████████████████                   | 662310/941814 [03:24<01:22, 3399.31it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 70%|█████████████████████████████████████████████                   | 663010/941814 [03:24<01:21, 3438.11it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 70%|█████████████████████████████████████████████                   | 663355/941814 [03:24<01:21, 3

 71%|█████████████████████████████████████████████▊                  | 673353/941814 [03:27<01:19, 3386.04it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 72%|█████████████████████████████████████████████▊                  | 673692/941814 [03:27<01:19, 3367.60it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 73%|██████████████████████████████████████████████▍                 | 683985/941814 [03:30<01:16, 3369.22it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 73%|██████████████████████████████████████████████▌                 | 684332/941814 [03:30<01:15, 3399.11it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 73%|██████████████████████████████████████████████▌                 | 684673/941814 [03:30<01:15, 3392.27it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 74%|███████████████████████████████████████████████                 | 692621/941814 [03:33<01:15, 3299.57it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 74%|███████████████████████████████████████████████                 | 692954/941814 [03:33<01:15, 3308.44it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 75%|███████████████████████████████████████████████▉                | 704948/941814 [03:36<01:09, 3408.08it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 75%|███████████████████████████████████████████████▉                | 705291/941814 [03:36<01:09, 3404.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 75%|███████████████████████████████████████████████▉                | 705633/941814 [03:36<01:09, 3409.10it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 76%|████████████████████████████████████████████████▊               | 718999/941814 [03:40<01:06, 3356.13it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 76%|████████████████████████████████████████████████▉               | 720026/941814 [03:41<01:05, 3379.78it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 78%|█████████████████████████████████████████████████▊              | 732330/941814 [03:44<01:02, 3378.48it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 78%|█████████████████████████████████████████████████▊              | 733364/941814 [03:45<01:01, 3406.85it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 78%|█████████████████████████████████████████████████▉              | 734386/941814 [03:45<01:01, 3

 79%|██████████████████████████████████████████████████▊             | 747776/941814 [03:49<00:56, 3424.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 79%|██████████████████████████████████████████████████▊             | 748119/941814 [03:49<00:57, 3365.84it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 81%|███████████████████████████████████████████████████▊            | 763250/941814 [03:53<00:52, 3393.12it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 81%|███████████████████████████████████████████████████▉            | 763590/941814 [03:54<00:52, 3385.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 81%|███████████████████████████████████████████████████▉            | 764290/941814 [03:54<00:51, 3424.46it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 82%|████████████████████████████████████████████████████▊           | 776977/941814 [03:58<00:48, 3381.24it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 83%|████████████████████████████████████████████████████▊           | 777316/941814 [03:58<00:49, 3344.06it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 83%|████████████████████████████████████████████████████▊           | 777654/941814 [03:58<00:49, 3

 84%|█████████████████████████████████████████████████████▌          | 788184/941814 [04:01<00:46, 3327.17it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 84%|█████████████████████████████████████████████████████▌          | 788852/941814 [04:01<00:46, 3317.72it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 84%|█████████████████████████████████████████████████████▋          | 789862/941814 [04:01<00:45, 3

 85%|██████████████████████████████████████████████████████▍         | 800684/941814 [04:05<00:42, 3314.01it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 85%|██████████████████████████████████████████████████████▍         | 801016/941814 [04:05<00:42, 3296.30it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 85%|██████████████████████████████████████████████████████▌         | 802403/941814 [04:05<00:40, 3

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 87%|███████████████████████████████████████████████████████▍        | 815545/941814 [04:09<00:37, 3393.75it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have

 88%|████████████████████████████████████████████████████████▏       | 826425/941814 [04:14<01:22, 1391.88it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 88%|████████████████████████████████████████████████████████▏       | 826757/941814 [04:14<01:08, 1680.25it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 88%|████████████████████████████████████████████████████████▏       | 827103/941814 [04:14<00:57, 1990.06it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

 89%|████████████████████████████████████████████████████████▉       | 837731/941814 [04:17<00:30, 3428.38it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 89%|█████████████████████████████████████████████████████████       | 840475/941814 [04:18<00:30, 3367.66it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 89%|█████████████████████████████████████████████████████████▏      | 841162/941814 [04:18<00:29, 3388.66it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have bee

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 91%|██████████████████████████████████████████████████████████▎     | 858316/941814 [04:23<00:24, 3369.46it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 91%|██████████████████████████████████████████████████████████▎     | 858658/941814 [04:23<00:24, 3374.46it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 93%|███████████████████████████████████████████████████████████▍    | 875136/941814 [04:28<00:19, 3351.70it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 93%|███████████████████████████████████████████████████████████▍    | 875472/941814 [04:28<00:19, 3344.01it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

 94%|████████████████████████████████████████████████████████████    | 884332/941814 [04:31<00:16, 3392.09it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 94%|████████████████████████████████████████████████████████████    | 884675/941814 [04:31<00:16, 3393.27it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 94%|████████████████████████████████████████████████████████████▏   | 885350/941814 [04:31<00:16, 3

 96%|█████████████████████████████████████████████████████████████▎  | 901511/941814 [04:36<00:12, 3327.40it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 96%|█████████████████████████████████████████████████████████████▎  | 901845/941814 [04:36<00:12, 3302.03it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 96%|█████████████████████████████████████████████████████████████▎  | 902176/941814 [04:36<00:12, 3

 97%|█████████████████████████████████████████████████████████████▉  | 911256/941814 [04:39<00:09, 3340.18it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 97%|█████████████████████████████████████████████████████████████▉  | 911600/941814 [04:39<00:08, 3359.90it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 97%|█████████████████████████████████████████████████████████████▉  | 911937/941814 [04:39<00:08, 3

 98%|██████████████████████████████████████████████████████████████▌ | 919946/941814 [04:42<00:06, 3249.13it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 98%|██████████████████████████████████████████████████████████████▋ | 922620/941814 [04:42<00:05, 3296.30it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs w

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
 99%|███████████████████████████████████████████████████████████████▋| 936436/941814 [04:46<00:01, 3352.70it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|███████████████████████████████████████████████████████████████▋| 937121/941814 [04:47<00:01, 3358.24it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
100%|███████████████████████████████████████████████████████████████▋| 937457/941814 [04:47<00:01, 3

In [18]:
#첫번째 문장의 인코딩 결과
input_id = X_train[0][0]
attention_mask = X_train[1][0]
token_type_id = X_train[2][0]

print('단어에 대한 정수 인코딩 : ', input_id)
print('어텐션 마스크 : ', attention_mask)
print('세그먼트 인코딩 : ', token_type_id)
print('각 인코딩 길이 : ', len(input_id))
print('정수 인코딩 복원 : ', tokenizer.decode(input_id))


단어에 대한 정수 인코딩 :  [    2  1129  2073  4230 11819 19521    16  5124  2470 14584  4119  2069
  5330  2088  1513  2062    18     3  1129  2170  2259  3891  2170 28788
  4847  6882  2170 28788  4847  5700  2052  1513  2062    18     3     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
어텐션 마스크 :  [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [19]:
X_val = convert_example_to_feature(val_data['sentence1'], val_data['sentence2'], max_seq_len=max_seq_len, tokenizer=tokenizer )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2490/2490 [00:00<00:00, 3003.63it/s]


In [50]:
# 최대 길이: 128
input_id = X_val[0][0]
attention_mask = X_val[1][0]
token_type_id = X_val[2][0]

print('단어에 대한 정수 인코딩 :',input_id)
print('어텐션 마스크 :',attention_mask)
print('세그먼트 인코딩 :',token_type_id)
print('각 인코딩의 길이 :', len(input_id))
print('정수 인코딩 복원 :',tokenizer.decode(input_id))

단어에 대한 정수 인코딩 : [    2  3673   636  2116  1041  2371  2062    16     6  4122    16  1535
  1458 10283    18     6     3   636  2259  3741  4942  2116   636  2138
  4105  2223  2155  6000  4122  2170  2318  4117  2138   572  2359  2062
    18     3     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
어텐션 마스크 : [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [20]:
X_test = convert_example_to_feature(test_data['sentence1'], test_data['sentence2'], max_seq_len=max_seq_len, tokenizer=tokenizer )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5010/5010 [00:01<00:00, 2966.25it/s]


In [48]:
test_data.head()

Unnamed: 0,sentence1,sentence2,gold_label
0,"글쎄, 나는 그것에 관해 생각조차 하지 않았지만, 나는 너무 좌절했고, 결국 그에게...",나는 그와 다시 이야기하지 않았다.,contradiction
1,"글쎄, 나는 그것에 관해 생각조차 하지 않았지만, 나는 너무 좌절했고, 결국 그에게...",나는 다시 그와 이야기를 하기 시작했다는 것에 너무 화가 났다.,entailment
2,"글쎄, 나는 그것에 관해 생각조차 하지 않았지만, 나는 너무 좌절했고, 결국 그에게...",우리는 좋은 대화를 나눴다.,neutral
3,"그리고 저는 그것이 특권이라고 생각했습니다, 그리고 여전히, 여전히, 당시 저는 A...",그날 현장에 나만 있었던 게 아니라는 걸 몰랐던 것이다.,neutral
4,"그리고 저는 그것이 특권이라고 생각했습니다, 그리고 여전히, 여전히, 당시 저는 A...",나는 AFFC 공군 경력 분야에서 그 번호를 가진 유일한 사람이라는 인상을 가지고 ...,entailment


In [47]:
X_test[0][0]

array([   2, 8911,   16,  717, 2259, 3724, 2170, 9149, 3628, 2446, 2232,
       1889, 2118, 1380, 2886, 3683,   16,  717, 2259, 3760, 8642, 2371,
       2088,   16, 3983,  636, 2170, 2318, 3690, 3758, 2205, 2318,  859,
       2359, 2062,   18,    3,  717, 2259,  636, 2522, 3690, 3758, 2205,
       2118, 1380, 2886, 2062,   18,    3,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])

In [49]:
X_test[0][1]

array([   2, 8911,   16,  717, 2259, 3724, 2170, 9149, 3628, 2446, 2232,
       1889, 2118, 1380, 2886, 3683,   16,  717, 2259, 3760, 8642, 2371,
       2088,   16, 3983,  636, 2170, 2318, 3690, 3758, 2205, 2318,  859,
       2359, 2062,   18,    3,  717, 2259, 3690,  636, 2522, 3758, 2138,
       1889, 2015, 3670, 2371, 4000,  575, 2170, 3760, 9562,  726, 2062,
         18,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])

In [21]:
np.unique(train_data.gold_label)

array(['contradiction', 'entailment', 'neutral'], dtype=object)

In [22]:
train_label = train_data['gold_label'].tolist()
val_label = val_data['gold_label'].tolist()
test_label = test_data['gold_label'].tolist()

In [23]:
#레이블을 정수 인덱스로 변환
idx_encode = preprocessing.LabelEncoder()
#레이블 객체 레이블 학습
idx_encode.fit(train_label)

y_train = idx_encode.transform(train_label)
y_val = idx_encode.transform(val_label)
y_test = idx_encode.transform(test_label)

In [24]:
idx_encode.classes_

array(['contradiction', 'entailment', 'neutral'], dtype='<U13')

In [25]:
label_idx = dict(zip(list(idx_encode.classes_), idx_encode.transform(list(idx_encode.classes_))))
print(label_idx)

idx_label = {value : key for key, value in label_idx.items()}
idx_label

{'contradiction': 0, 'entailment': 1, 'neutral': 2}


{0: 'contradiction', 1: 'entailment', 2: 'neutral'}

In [26]:
# 분류를 위한 트랜스포머 bert모델 로드
from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping

In [27]:
model = TFBertForSequenceClassification.from_pretrained("klue/bert-base", num_labels=3, from_pt=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.hf_compute_loss, metrics = ['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
early_stopping = EarlyStopping(
    monitor="val_accuracy", 
    min_delta=0.001,
    patience=2)

model.fit(
    X_train, y_train, epochs=2, batch_size=16, validation_data = (X_val, y_val),
    callbacks = [early_stopping]
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x26c191b8d00>

In [31]:
print("\n 테스트 정확도: %.4f" % (model.evaluate(X_test, y_test, batch_size=512)[1]))


 테스트 정확도: 0.7553


In [65]:
print(len(X_test[0])) #test_dataset의 워드피스임베딩벡터
print(len(X_test[1])) #test_dataset의 attention_masks
len(X_test[2]) #test_dataset의 token_type_ids 

5010
5010


5010

In [66]:
len(test_data)

5010

In [57]:
X_test[0][:2]

array([[   2, 8911,   16,  717, 2259, 3724, 2170, 9149, 3628, 2446, 2232,
        1889, 2118, 1380, 2886, 3683,   16,  717, 2259, 3760, 8642, 2371,
        2088,   16, 3983,  636, 2170, 2318, 3690, 3758, 2205, 2318,  859,
        2359, 2062,   18,    3,  717, 2259,  636, 2522, 3690, 3758, 2205,
        2118, 1380, 2886, 2062,   18,    3,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0],
       [   2, 8911,   16,  717, 2259, 3724, 2170, 9149, 3628, 2446, 2232,
        1889, 2118, 1380, 2886, 3683,   16,  717, 2259, 3760,

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 4870.30it/s]


In [73]:

sampleData1 = np.array(['나는 생각한다.', '나는 생각한다.', '나는 생각한다.'])
tmp1 = pd.Series(sampleData1)

sampleData2 = np.array(['고로 존재한다.', '고로 죽을을 맞이한다.', '그것은 플라스틱 물체이다.'])
tmp2 = pd.Series(sampleData2)

In [75]:
ex = convert_example_to_feature(tmp1, tmp2, max_seq_len, tokenizer)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 2999.50it/s]


In [89]:
result = model.predict(ex)
for i in range(len(ex[0])):
    print(idx_label[np.argmax(result[0][i])])

neutral
neutral
neutral
