# 사전 준비

In [None]:
!pip install transformers
!pip install datasets
!pip install sacremoses

**GLUE의 SST-2 데이터 불러오기**

In [2]:
from datasets import load_dataset

# Download (or reuse from the local cache) the SST-2 configuration of the GLUE benchmark.
datasets = load_dataset(path="glue", name="sst2")

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading and preparing dataset glue/sst2 (download: 7.09 MiB, generated: 4.81 MiB, post-processed: Unknown size, total: 11.90 MiB) to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Display the available splits together with their columns and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [4]:
# label 0: negative(부정) / 1: positive(긍정) / -1: test data (비공개)
print(datasets["train"][0])
print(datasets["validation"][0])
print(datasets["test"][0])

{'sentence': 'hide new secretions from the parental units ', 'label': 0, 'idx': 0}
{'sentence': "it 's a charming and often affecting journey . ", 'label': 1, 'idx': 0}
{'sentence': 'uneasy mishmash of styles and genres .', 'label': -1, 'idx': 0}


**XLM 모델과 토크나이저 불러오기**

In [5]:
from transformers import XLMTokenizer, XLMForSequenceClassification

# The tokenizer and the classifier are both loaded from the same pretrained checkpoint.
XLM_CHECKPOINT = "xlm-mlm-en-2048"

tokenizer = XLMTokenizer.from_pretrained(XLM_CHECKPOINT)
# The classification head is not part of the checkpoint and is freshly initialised,
# so the model has to be fine-tuned before its predictions are meaningful.
model = XLMForSequenceClassification.from_pretrained(XLM_CHECKPOINT)

Downloading:   0%|          | 0.00/646k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/487k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/840 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-mlm-en-2048 were not used when initializing XLMForSequenceClassification: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing XLMForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMForSequenceClassification were not initialized from the model checkpoint at xlm-mlm-en-2048 and are newly initialized: ['sequence_summary.summary.bias', 'transformer.position_ids', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infer

In [6]:
# Display the configuration that was loaded together with the checkpoint.
model.config

XLMConfig {
  "_name_or_path": "xlm-mlm-en-2048",
  "architectures": [
    "XLMWithLMHeadModel"
  ],
  "asm": false,
  "attention_dropout": 0.1,
  "bos_index": 0,
  "bos_token_id": 0,
  "causal": false,
  "dropout": 0.1,
  "emb_dim": 2048,
  "embed_init_std": 0.02209708691207961,
  "end_n_top": 5,
  "eos_index": 1,
  "gelu_activation": true,
  "init_std": 0.02,
  "is_encoder": true,
  "lang_id": 0,
  "layer_norm_eps": 1e-12,
  "mask_index": 5,
  "mask_token_id": 0,
  "max_position_embeddings": 512,
  "model_type": "xlm",
  "n_heads": 16,
  "n_langs": 1,
  "n_layers": 12,
  "pad_index": 2,
  "pad_token_id": 2,
  "sinusoidal_embeddings": false,
  "start_n_top": 5,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.22.2",
  "unk_index": 3,
  "use_lang_emb": true,
  "vocab_size": 30145
}

# 데이터 구축

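데이터 준비: validation split의 크기(872개)를 기준 단위로 하여 train : validation : test = 7 : 1 : 2 비율로 나눈다.
train과 test는 모두 원본 train split에서 잘라 사용한다 (SST-2의 test label은 비공개이기 때문).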

In [7]:
from tqdm.auto import tqdm as tqdm_auto

In [8]:
# Number of rows in the public validation split; the cells below use it as the
# unit of the 7:1:2 train/validation/test split.
# NOTE: despite its name, `ids` is a row count, not a collection of ids.
ids = len(datasets['validation'])

In [9]:
# Training portion: the first 7 * ids rows of the original train split.
# A single slice of the Dataset returns a dict mapping each column name to a list.
# The previous per-index lookup `datasets['train'][column][idx]` fetched the whole
# column again on every iteration, which can make the loop needlessly slow.
train_rows = datasets['train'][:ids * 7]
train_sentence = train_rows['sentence']
train_label = train_rows['label']

# Slicing truncates silently, so verify that the split really has the expected size.
assert len(train_sentence) == len(train_label) == ids * 7

  0%|          | 0/6104 [00:00<?, ?it/s]

  0%|          | 0/6104 [00:00<?, ?it/s]

In [10]:

# Validation portion: the first `ids` rows of the public validation split, fetched
# with a single slice instead of one full-column lookup per row.
val_rows = datasets['validation'][:ids]
val_sentence = val_rows['sentence']
val_label = val_rows['label']

# Slicing truncates silently, so verify that the split really has the expected size.
assert len(val_sentence) == len(val_label) == ids

  0%|          | 0/872 [00:00<?, ?it/s]

  0%|          | 0/872 [00:00<?, ?it/s]

In [11]:
# The SST-2 test labels are not public, so a held-out test set is carved out of the
# train split instead: the 2 * ids rows that directly follow the training portion.
# A single slice replaces the previous per-index lookup, which fetched the whole
# column again on every iteration.
test_rows = datasets['train'][ids * 7:ids * 9]
test_sentence = test_rows['sentence']
test_label = test_rows['label']

# Slicing truncates silently, so verify that the split really has the expected size.
assert len(test_sentence) == len(test_label) == ids * 2

  0%|          | 0/1744 [00:00<?, ?it/s]

  0%|          | 0/1744 [00:00<?, ?it/s]

In [12]:
# Sanity-check the split boundary: show the last training sentence next to the
# first test sentence to confirm that the two portions do not overlap.
print("last train data:", train_sentence[-1])
# The value printed here is the FIRST test item, so the label says "first".
print("first test data:", test_sentence[0])

last train data: a moral 
last test data: that gives movies about ordinary folk a bad name 


**토크나이징**

In [13]:
# Tokenize every split in one batch each. Sequences are truncated to at most 64 tokens,
# padded to the longest sequence of their own batch, and returned as PyTorch tensors.
tokenize_options = dict(padding=True, truncation=True, max_length=64, return_tensors="pt")

train_input = tokenizer(train_sentence, **tokenize_options)
val_input = tokenizer(val_sentence, **tokenize_options)
test_input = tokenizer(test_sentence, **tokenize_options)