<a href="https://colab.research.google.com/github/ElaYJ/Study_Deep_Learning/blob/main/Lecture/55_GPT_HF_CW_Justice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT Hugging Face Courcework : Justice Binary Classification

- [Justice datasets](https://people.eecs.berkeley.edu/~hendrycks/ethics.tar)

- 해당 폴더에서 Justice 데이터셋을 이용하여 정의로움을 측정한다.

    <img src="https://github.com/ElaYJ/supplement/assets/153154981/5a7c9811-5ebe-4c0c-a063-db461e6bf9c6" width="57%">

- 정의로우면 0 으로 레이블링, 아니면 1로 레이블링 한다.

- BERT나 GPT와 같은 모델을 fine tunning 한다.

<br></br>

## 과제 프로세스

1. 여러분이 배운 huggingface 라이브러리를 이용하여 auto tokenizer를 사용하여 전처리 합니다.

2. 정의로우면 0, 아니면 1로 레이블링을 한 후 지도학습 기반의 binary 분류 모델로 학습을 합니다.

3. 모델은 huggingface hub에서 가져와서 진행합니다. 어떠한 모델을 사용해도 상관 없습니다.

4. 최종 결과는 정확도로 측정하세요.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!tar -xvf "/content/drive/MyDrive/zero-base DS/ethics.tar"

ethics/
ethics/deontology/
ethics/utilitarianism/
ethics/README.txt
ethics/justice/
ethics/commonsense/
ethics/virtue/
ethics/virtue/virtue_test_hard.csv
ethics/virtue/virtue_test.csv
ethics/virtue/virtue_train.csv
ethics/commonsense/cm_test_hard.csv
ethics/commonsense/cm_train.csv
ethics/commonsense/cm_test.csv
ethics/commonsense/cm_ambig.csv
ethics/justice/justice_test.csv
ethics/justice/justice_train.csv
ethics/justice/justice_test_hard.csv
ethics/utilitarianism/util_train.csv
ethics/utilitarianism/util_test_hard.csv
ethics/utilitarianism/util_test.csv
ethics/deontology/deontology_test.csv
ethics/deontology/deontology_train.csv
ethics/deontology/deontology_test_hard.csv


In [3]:
# !pip install datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datase

In [17]:
import numpy as np
import pandas as pd
import os
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW, get_scheduler
)
from tqdm.auto import tqdm
from warnings import filterwarnings
filterwarnings("ignore", category=FutureWarning)


In [18]:
# dataset 전처리
def preprocess(tokenizer, dataset, batch_sz):
    tokenized_dataset = dataset.map(lambda x: tokenizer(x['sentences'], truncation=True), batched=True)
    tokenized_dataset = tokenized_dataset.remove_columns(['sentences'])
    tokenized_dataset.set_format("torch")

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    train_dataloader = DataLoader(
        tokenized_dataset['train'], shuffle=True, batch_size=batch_sz, collate_fn=data_collator
    )
    test_dataloader = DataLoader(
        tokenized_dataset['test'], batch_size=batch_sz, collate_fn=data_collator
    )
    test_hard_dataloader = DataLoader(
        tokenized_dataset['test_hard'], batch_size=batch_sz, collate_fn=data_collator
    )
    return train_dataloader, test_dataloader, test_hard_dataloader

In [19]:
# 모델 Accuracy 평가
def evaluate(model, metric, test_dataloader):
    model.eval()
    for batch in test_dataloader:
        data = {k: v.cuda() for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**data)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch['labels'])

    acc = metric.compute()
    print(acc)
    return acc['accuracy']

In [25]:
# 학습 및 결과 출력
def train_process(model, epochs, train_dataloader, test_dataloader, test_hard_dataloader):
    optimizer = AdamW(model.parameters(), lr=5e-5, no_deprecation_warning=True)
    lr_scheduler = get_scheduler(
        "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=(epochs * len(train_dataloader))
    )

    metric = load_metric("accuracy")
    test_accs = []; test_hard_accs = [];

    model.cuda()
    model.train()
    for epoch in range(epochs):
        for step, batch in enumerate(train_dataloader):
            data = {k: v.cuda() for k, v in batch.items()}
            outputs = model(**data)

            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            if step % 200 == 0 and step > 0:
                p = 100.*step/len(train_dataloader)
                print('Train Epoch: {} [{:.0f}%]\t Loss: {:.6f}'.format(epoch+1, p, loss))

        print('\ntest set:')
        test_accs.append(evaluate(model, metric, test_dataloader))
        print('test hard set:')
        test_hard_accs.append(evaluate(model, metric, test_hard_dataloader))
        print('-'*37, end='\n\n')

    return np.max(test_accs), np.max(test_hard_accs)

In [21]:
# Justice 분류 실행 함수
def Justice_classification(datasets, tokenizer, model, batch_size, epochs):
    train_dataloader, test_dataloader, test_hard_dataloader = preprocess(
        tokenizer, datasets, batch_size
    )
    test_best_acc, test_hard_best_acc = train_process(
        model, epochs, train_dataloader, test_dataloader, test_hard_dataloader
    )
    print("Test Accuracy:", round(test_best_acc, 5))
    print("Test_hard Accuracy:", round(test_hard_best_acc, 5))

-----

<br></br>

## Justice Dataset

In [9]:
data_dir = os.path.abspath("./ethics/justice")
data_files = {"train": "justice_train.csv", "test": "justice_test.csv", "test_hard": "justice_test_hard.csv"}
justice_dataset = load_dataset("csv", data_dir=data_dir, data_files=data_files)
justice_dataset = justice_dataset.rename_columns({"label": "labels", "scenario":"sentences"})
justice_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating test_hard split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'sentences'],
        num_rows: 21791
    })
    test: Dataset({
        features: ['labels', 'sentences'],
        num_rows: 2704
    })
    test_hard: Dataset({
        features: ['labels', 'sentences'],
        num_rows: 2052
    })
})

In [10]:
# 데이터셋 확인
path = os.path.join(data_dir, data_files["train"])
df = pd.read_csv(path)
df

Unnamed: 0,label,scenario
0,1,I deserve to be paid by my boss because I keep...
1,0,I deserve to be given a brand new car by my bo...
2,0,I usually check in with Joe at security at wor...
3,1,I usually check in with Joe at security at wor...
4,1,I usually check in with Joe at security at wor...
...,...,...
21786,1,I used to help my sister study for her geometr...
21787,0,I typically play soccer with my friend at rece...
21788,0,I typically play soccer with my friend at rece...
21789,1,I typically play soccer with my friend at rece...




<br></br>

## 모델1. BERT ⭐

- "bert-base-uncased"

- 모델 중 가장 Accuracy가 높음.

    - __Test Accuracy__ : $~\large\boxed{~0.77293~}$
    
    - __Test_hard Accuracy__ : $~\large\boxed{~0.60283~}$

In [23]:
checkpoint = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [26]:
batch_size = 16
epochs = 3

Justice_classification(justice_dataset, tokenizer, model, batch_size, epochs)

Map:   0%|          | 0/21791 [00:00<?, ? examples/s]

Map:   0%|          | 0/2704 [00:00<?, ? examples/s]

Map:   0%|          | 0/2052 [00:00<?, ? examples/s]

Train Epoch: 1 [15%]	 Loss: 0.528660
Train Epoch: 1 [29%]	 Loss: 0.670842
Train Epoch: 1 [44%]	 Loss: 0.494601
Train Epoch: 1 [59%]	 Loss: 0.412264
Train Epoch: 1 [73%]	 Loss: 0.605456
Train Epoch: 1 [88%]	 Loss: 0.396272

test set:
{'accuracy': 0.742603550295858}
test hard set:
{'accuracy': 0.5799220272904484}
---------------------------

Train Epoch: 2 [15%]	 Loss: 0.237260
Train Epoch: 2 [29%]	 Loss: 0.170025
Train Epoch: 2 [44%]	 Loss: 0.309016
Train Epoch: 2 [59%]	 Loss: 0.122579
Train Epoch: 2 [73%]	 Loss: 0.171250
Train Epoch: 2 [88%]	 Loss: 0.197002

test set:
{'accuracy': 0.7662721893491125}
test hard set:
{'accuracy': 0.5960038986354775}
---------------------------

Train Epoch: 3 [15%]	 Loss: 0.025216
Train Epoch: 3 [29%]	 Loss: 0.051010
Train Epoch: 3 [44%]	 Loss: 0.065943
Train Epoch: 3 [59%]	 Loss: 0.047645
Train Epoch: 3 [73%]	 Loss: 0.187787
Train Epoch: 3 [88%]	 Loss: 0.056365

test set:
{'accuracy': 0.7729289940828402}
test hard set:
{'accuracy': 0.6028265107212476}
-

<br></br>

## 모델2. GPT3

- __sentry_summary_20_balanced_to_GPT3_financial_summary_v1_512__

- This model is a fine-tuned version of distilbert-base-uncased on an unknown dataset.

- https://huggingface.co/web2boomer/sentry_summary_20_balanced_to_GPT3_financial_summary_v1_512

</br>

- Accuracy Result

    - __Test Accuracy__: $~\large\boxed{~0.75555~}$
    
    - __Test_hard Accuracy__: $~\large\boxed{~0.57261~}$

In [29]:
checkpoint = "web2boomer/sentry_summary_20_balanced_to_GPT3_financial_summary_v1_512"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [30]:
batch_size = 16
epochs = 3

Justice_classification(justice_dataset, tokenizer, model, batch_size, epochs)

Map:   0%|          | 0/21791 [00:00<?, ? examples/s]

Map:   0%|          | 0/2704 [00:00<?, ? examples/s]

Map:   0%|          | 0/2052 [00:00<?, ? examples/s]

Train Epoch: 1 [15%]	 Loss: 0.555666
Train Epoch: 1 [29%]	 Loss: 0.487098
Train Epoch: 1 [44%]	 Loss: 0.452588
Train Epoch: 1 [59%]	 Loss: 0.472644
Train Epoch: 1 [73%]	 Loss: 0.480753
Train Epoch: 1 [88%]	 Loss: 0.499544

test set:
{'accuracy': 0.7385355029585798}
test hard set:
{'accuracy': 0.554093567251462}
---------------------------

Train Epoch: 2 [15%]	 Loss: 0.369619
Train Epoch: 2 [29%]	 Loss: 0.103962
Train Epoch: 2 [44%]	 Loss: 0.451978
Train Epoch: 2 [59%]	 Loss: 0.279675
Train Epoch: 2 [73%]	 Loss: 0.599945
Train Epoch: 2 [88%]	 Loss: 0.335851

test set:
{'accuracy': 0.7555473372781065}
test hard set:
{'accuracy': 0.5623781676413255}
---------------------------

Train Epoch: 3 [15%]	 Loss: 0.025071
Train Epoch: 3 [29%]	 Loss: 0.041803
Train Epoch: 3 [44%]	 Loss: 0.418419
Train Epoch: 3 [59%]	 Loss: 0.065830
Train Epoch: 3 [73%]	 Loss: 0.019045
Train Epoch: 3 [88%]	 Loss: 0.225217

test set:
{'accuracy': 0.7514792899408284}
test hard set:
{'accuracy': 0.5726120857699805}
-

<br></br>

## 모델3. DistilBERT

- __DistilBERT base uncased finetuned SST-2__

- This model is a fine-tune checkpoint of DistilBERT-base-uncased, fine-tuned on SST-2. This model reaches an accuracy of 91.3 on the dev set (for comparison, Bert bert-base-uncased version reaches an accuracy of 92.7).

- https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english

</br>

- Accuracy Result:

    - __Test Accuracy__ : $~\large\boxed{~0.74297~}$
    
    - __Test_hard Accuracy__ : $~\large\boxed{~0.58577~}$

In [35]:
checkpoint = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [37]:
batch_size = 16
epochs = 5

Justice_classification(justice_dataset, tokenizer, model, batch_size, epochs)

Train Epoch: 1 [15%]	 Loss: 0.467557
Train Epoch: 1 [29%]	 Loss: 0.398367
Train Epoch: 1 [44%]	 Loss: 0.801932
Train Epoch: 1 [59%]	 Loss: 0.509796
Train Epoch: 1 [73%]	 Loss: 0.334474
Train Epoch: 1 [88%]	 Loss: 0.369304

test set:
{'accuracy': 0.727810650887574}
test hard set:
{'accuracy': 0.5531189083820662}
---------------------------

Train Epoch: 2 [15%]	 Loss: 0.133793
Train Epoch: 2 [29%]	 Loss: 0.206307
Train Epoch: 2 [44%]	 Loss: 0.407460
Train Epoch: 2 [59%]	 Loss: 0.255288
Train Epoch: 2 [73%]	 Loss: 0.272562
Train Epoch: 2 [88%]	 Loss: 0.427241

test set:
{'accuracy': 0.7311390532544378}
test hard set:
{'accuracy': 0.5735867446393762}
---------------------------

Train Epoch: 3 [15%]	 Loss: 0.002483
Train Epoch: 3 [29%]	 Loss: 0.016457
Train Epoch: 3 [44%]	 Loss: 0.010637
Train Epoch: 3 [59%]	 Loss: 0.056275
Train Epoch: 3 [73%]	 Loss: 0.058264
Train Epoch: 3 [88%]	 Loss: 0.151810

test set:
{'accuracy': 0.7307692307692307}
test hard set:
{'accuracy': 0.577485380116959}
--

<br></br>

-----

## The End~!