# 1. Load & Execute CV model

In [9]:
from transformers import DetrImageProcessor, DetrForObjectDetection
import torch
from PIL import Image
import requests

path = "/content/cafe.jpg"
image = Image.open(path)


In [10]:
#Detection Model 1. detr-resnet-50

# you can specify the revision tag if you don't want the timm dependency
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50", revision="no_timm")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50", revision="no_timm")

inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)

# convert outputs (bounding boxes and class logits) to COCO API
# let's only keep detections with score > 0.9
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]

for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
            f"Detected {model.config.id2label[label.item()]} with confidence "
            f"{round(score.item(), 3)} at location {box}"
    )


Detected person with confidence 0.979 at location [728.59, 0.54, 1078.59, 352.72]
Detected cup with confidence 0.914 at location [8.97, 934.48, 598.39, 1429.66]
Detected dining table with confidence 0.977 at location [0.29, 140.4, 1079.24, 1423.58]
Detected cup with confidence 0.995 at location [642.42, 282.14, 1079.71, 835.08]
Detected cup with confidence 0.984 at location [598.34, 743.1, 1078.88, 1257.38]


In [11]:
#Detection model 2. yolos-tiny


from transformers import YolosImageProcessor, YolosForObjectDetection
model = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')
image_processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")

inputs = image_processor(images=image, return_tensors="pt")
outputs = model(**inputs)

# model predicts bounding boxes and corresponding COCO classes
logits = outputs.logits
bboxes = outputs.pred_boxes


# print results
target_sizes = torch.tensor([image.size[::-1]])
results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    box = [round(i, 2) for i in box.tolist()]
    print(
        f"Detected {model.config.id2label[label.item()]} with confidence "
        f"{round(score.item(), 3)} at location {box}"
    )


Detected person with confidence 0.928 at location [814.62, 0.73, 1079.63, 362.99]
Detected cup with confidence 0.976 at location [621.95, 753.01, 1077.01, 1269.86]
Detected dining table with confidence 0.954 at location [-0.29, 152.1, 1079.71, 1423.26]
Detected cup with confidence 0.99 at location [642.34, 272.5, 1078.27, 838.53]


카페에서 찍은 사진으로 detection model 2개를 돌려보았을 때, 컵을 3개 찾아냈다는 점에서 detr-resnet-50의 성능이 더 좋다고 생각한다.


## 2. NLP model fine-tuning

[해당 블로그](https://m.blog.naver.com/horajjan/221739630055)를 참고하였습니다.

In [1]:
!pip install transformers



In [2]:
import tensorflow as tf
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

In [3]:
# 네이버 영화리뷰 감정분석 데이터 다운로드
!git clone https://github.com/e9t/nsmc.git

Cloning into 'nsmc'...
remote: Enumerating objects: 14763, done.[K
remote: Counting objects: 100% (14762/14762), done.[K
remote: Compressing objects: 100% (13012/13012), done.[K
remote: Total 14763 (delta 1748), reused 14762 (delta 1748), pack-reused 1[K
Receiving objects: 100% (14763/14763), 56.19 MiB | 6.15 MiB/s, done.
Resolving deltas: 100% (1748/1748), done.
Updating files: 100% (14737/14737), done.


In [4]:
# download dataset
# 성능을 보여주는 과제가 아니므로 train과 test dataset을 합쳐서 사용하겠습니다.
train = pd.read_csv("nsmc/ratings_train.txt", sep='\t')
test = pd.read_csv("nsmc/ratings_test.txt", sep='\t')

train = pd.concat([train, test])
train = train.sample(frac=1, random_state=42).reset_index(drop=True)
train.head(5)

Unnamed: 0,id,document,label
0,9490954,"유행하는 로맨틱 코미디 주제지만 스토리는 연애 못해본 작가가 쓴 것 같고, 네러티브...",0
1,8398586,향수를 불러일으키는 영화,1
2,5915113,완전 재밌어여!!,1
3,77456,99s년 이후로 두번째 다시 봤는데... 넘 감동적인 영화입니다..,1
4,6252159,거의 팬픽수준,0


In [5]:
# preprocessing
sentences = train['document']
labels = train['label'].values

# [CLS], [SEP] 토큰 붙여줌
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]

# 형식 확인
sentences[:10], labels

(['[CLS] 유행하는 로맨틱 코미디 주제지만 스토리는 연애 못해본 작가가 쓴 것 같고, 네러티브는 몰입을 뚝뚝 떨군다. [SEP]',
  '[CLS] 향수를 불러일으키는 영화 [SEP]',
  '[CLS] 완전 재밌어여!! [SEP]',
  '[CLS] 99s년 이후로 두번째 다시 봤는데... 넘 감동적인 영화입니다.. [SEP]',
  '[CLS] 거의 팬픽수준 [SEP]',
  '[CLS] 모든 걸 담으려다 넘쳐 흐름. 과유불급이라. [SEP]',
  '[CLS] 원작이 일본만화라던데 ㅋㅋㅋ 때타올하고 온돌이 나오네? 작가가 자이니치 인가? ㅋㅋㅋ [SEP]',
  '[CLS] 사건속에 우연, 설득력이 없는 영화. 반말하는 자식은 별로. [SEP]',
  '[CLS] 10점 줄께... 난 누드에 약한 남자. ㅋㅋ 그래픽만으로는 10점 받아도 될 것 같네... 내용을 따지려면 애니를 보지말고 책을 봐라... [SEP]',
  '[CLS] 원작 배경 의상 배우를 넘어서는 어설픈 연출력... [SEP]'],
 array([0, 1, 1, ..., 0, 1, 1]))

In [6]:
# tokenize and convert token to index
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]


input_idx = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_idx = pad_sequences(input_idx, maxlen=128, dtype="long", truncating="post", padding="post")

input_idx[:2]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

array([[   101,   9625,  25549,  12178,   9202, 118916, 119375,   9812,
         22458,  48446,   9689,  17730,  28578,   9477,  26444,  26344,
          9568, 119121,   9290,  14523,  40419,   9652,  11287,  11287,
          9512,   8870,   8855,  11664,    117,   9011,  30873,  45725,
         52015,  11018,   9287,  58303,  10622,   9147, 118842,   9141,
         17360,  11903,    119,    102,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0, 

In [7]:
# Attention Mask
attention_masks = []

# padding이면 0으로 설정해서 어텐션을 수행하지 않도록 함
for seq in input_idx:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)


In [8]:
# 훈련셋과 검증셋으로 분리
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_idx, labels, random_state=2024, test_size=0.3)

# 어텐션 마스크를 훈련셋과 검증셋으로 분리
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_idx, random_state=2024, test_size=0.3)

# 데이터를 파이토치의 텐서로 변환
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

print(train_inputs[0])
print(train_labels[0])
print(train_masks[0])
print(validation_inputs[0])
print(validation_labels[0])
print(validation_masks[0])

tensor([   101,   9670,  89523,    100,  42428,  14496,   9703, 118626,  14153,
          9364, 119081,  48345,    102,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0, 

In [20]:
# batch size 설정, train/validation dataloader 생성
# 시간을 고려하여 batch size와 epoch를 낮췄습니다
batch_size = 64

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [21]:
# 디바이스 설정
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [22]:
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [23]:
# 옵티마이저 설정
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # 학습률
                  eps = 1e-8 # 0으로 나누는 것을 방지하기 위한 epsilon 값
                )

# 에폭수
epochs = 2

# 총 훈련 스텝 : 배치반복 횟수 * 에폭
total_steps = len(train_dataloader) * epochs

# 처음에 학습률을 조금씩 변화시키는 스케줄러 생성
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)



In [24]:
# 정확도 계산 함수
def flat_accuracy(preds, labels):

    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [25]:
# 시간 표시 함수
def format_time(elapsed):

    # 반올림
    elapsed_rounded = int(round((elapsed)))

    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [19]:
# torch.cuda.empty_cache()

In [26]:
# # 재현을 위해 랜덤시드 고정
# seed_val = 42
# random.seed(seed_val)
# np.random.seed(seed_val)
# torch.manual_seed(seed_val)
# torch.cuda.manual_seed_all(seed_val)

# 그래디언트 초기화
model.zero_grad()

# 에폭만큼 반복
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # 시작 시간 설정
    t0 = time.time()

    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(train_dataloader):
        # 경과 정보 표시
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)

        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch

        # Forward 수행
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # 로스 구함
        loss = outputs[0]

        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        # 그래디언트 초기화
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    #시작 시간 설정
    t0 = time.time()

    # 평가모드로 변경
    model.eval()

    # 변수 초기화
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in validation_dataloader:
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)

        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch

        # 그래디언트 계산 안함
        with torch.no_grad():
            # Forward 수행
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # 출력 로짓 구함
        logits = outputs[0]

        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch   500  of  2,188.    Elapsed: 0:10:32.
  Batch 1,000  of  2,188.    Elapsed: 0:21:02.
  Batch 1,500  of  2,188.    Elapsed: 0:31:32.
  Batch 2,000  of  2,188.    Elapsed: 0:42:02.

  Average training loss: 0.38
  Training epcoh took: 0:45:58

Running Validation...
  Accuracy: 0.86
  Validation took: 0:07:26

Training...
  Batch   500  of  2,188.    Elapsed: 0:10:30.
  Batch 1,000  of  2,188.    Elapsed: 0:21:01.
  Batch 1,500  of  2,188.    Elapsed: 0:31:32.
  Batch 2,000  of  2,188.    Elapsed: 0:42:01.

  Average training loss: 0.28
  Training epcoh took: 0:45:58

Running Validation...
  Accuracy: 0.86
  Validation took: 0:07:23

Training complete!


In [30]:
# 모델 저장 경로 설정
model_save_path = "model.pth"

# 모델 저장
torch.save(model.state_dict(), model_save_path)


In [31]:
# 입력 데이터 변환
def convert_input_data(sentences):

    # BERT의 토크나이저로 문장을 토큰으로 분리
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    # 입력 토큰의 최대 시퀀스 길이
    MAX_LEN = 128

    # 토큰을 숫자 인덱스로 변환
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

    # 문장을 MAX_LEN 길이에 맞게 자르고, 모자란 부분을 패딩 0으로 채움
    input_ids = pad_sequences(input_ids, maxlen=128, dtype="long", truncating="post", padding="post")

    # 어텐션 마스크 초기화
    attention_masks = []

    # 어텐션 마스크를 패딩이 아니면 1, 패딩이면 0으로 설정
    # 패딩 부분은 BERT 모델에서 어텐션을 수행하지 않아 속도 향상
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    # 데이터를 파이토치의 텐서로 변환
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    return inputs, masks

In [32]:
# 문장 테스트
def test_sentences(sentences):

    # 평가모드로 변경
    model.eval()

    # 문장을 입력 데이터로 변환
    inputs, masks = convert_input_data(sentences)

    # 데이터를 GPU에 넣음
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)

    # 그래디언트 계산 안함
    with torch.no_grad():
        # Forward 수행
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    # 출력 로짓 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()

    return logits

In [33]:
logits = test_sentences(['실제 역사를 거의 그대로 다루다보니 보면서 화가 나긴 하지만 영화적으로 흥미롭게 구성을 잘해서 몰입도가 높아요. 몇 번을 봐도 볼 때마다 새롭게 보이는 면이 있구요. 의미와 재미를 다 잡은 작품이에요!'])

print(logits)
print(np.argmax(logits))

[[-2.7376888  2.8388615]]
1
