In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
clf = pipeline("sentiment-analysis")
result = clf("what a beautiful day!")[0]
print(f"감성분석 결과: {result['label']}, 감성스코어: {result['score']:0.4f}")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading model.safetensors: 100%|██████████| 268M/268M [00:30<00:00, 8.86MB/s]
2023-08-10 20:14:22.333796: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-10 20:14:22.372716: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDi

감성분석 결과: POSITIVE, 감성스코어: 0.9999


In [None]:
text_generator = pipeline("text-generation")
result = text_generator("Alice was beginning to get very tired of sitting by her sister on the bank,")
print(result[0]['generated_text'])

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 1.42MB/s]
Downloading model.safetensors: 100%|██████████| 548M/548M [01:18<00:00, 7.00MB/s]
All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 1.44MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 840kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 5.16MB/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Alice was beginning to get very tired of sitting by her sister on the bank, but I knew that I could find her anywhere on the outside. In the shadows—afterwards—it was difficult finding her."

A few days later, when


## 14.6 자동 클래스를 이용한 토크나이저와 모형의 사용

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [3]:
# Auto Classes를 이용해 사전학습된 내용에 맞는 토크나이저와 모형을 자동으로 설정
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased-finetuned-mrpc"
)

# 의미적으로 유사한 두 문장을 선언
input_sentence = "She angered me with her inappropriate comments, rumor-spreading, and disrespectfulness at the formal dinner table"
target_sequence = "she made me angry when she was rude at dinner"
# 토큰화
tokens = tokenizer(input_sentence, target_sequence, return_tensors="pt")

# 모형으로 결과를 예측
logits = model(**tokens).logits

# 소프트맥스를 이용해 결과값을 클래스에 대한 확률로 변환
results = torch.softmax(logits, dim=1).tolist()[0]

for i, label in enumerate(['no', 'yes']):
    print(f"{label}: {int(round(results[i] * 100))}%")

Downloading pytorch_model.bin: 100%|██████████| 433M/433M [01:01<00:00, 7.00MB/s] 


no: 43%
yes: 57%


In [4]:
target_sequence = "The boy quickly ran across the finish line, seizing yet another victory"
tokens = tokenizer(input_sentence, target_sequence, return_tensors="pt")
logits = model(**tokens).logits
results = torch.softmax(logits, dim=1).tolist()[0]

for i, label in enumerate(['no', 'yes']):
    print(f"{label}: {int(round(results[i] * 100))}%")

no: 95%
yes: 5%


In [5]:
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
import numpy as np

In [6]:
# movei review data에서 file id를 가져옴
fileids = movie_reviews.fileids()

# file id를 이용해 raw text file을 가져옴
reviews = [movie_reviews.raw(fileid) for fileid in fileids]
categories = [movie_reviews.categories(fileid)[0] for fileid in fileids]

# label을 0, 1의 값으로 변환
label_dict={'pos':1, 'neg':0}
y = np.array([label_dict[c] for c in categories])

X_train, X_test, y_train, y_test = train_test_split(reviews, y, test_size=0.2, random_state=7)

print("train set count: ", len(X_train))
print('Test set count: ', len(X_test))

train set count:  1600
Test set count:  400


In [29]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch 
import torch.nn.functional as F

In [30]:
# gpu 사용

device = torch.device("mps")
print(f"Using {device} device")

Using mps device


In [35]:
# Auto Classes를 이용해 사전학습된 내용에 맞는 토크나이저와 모형을 자동으로 설정
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)

# 모델을 gpu로 옮겨서 연산을 준비
model = model.to(device)

batch_size = 10 # 모형으로 한번에 예측할 데이터의 수
y_pred = [] # 전체 예측결과를 저장

num_batch = len(y_test)//batch_size

for i in range(num_batch):
    inputs = tokenizer(
        X_test[i*batch_size:(i+1)*batch_size],
        truncation=True,
        padding=True,
        return_tensors="pt"
    )

    # 토큰화 결과를 GPU로 이동
    inputs = inputs.to(device)

    # 모형으로 결과를 예측
    logits = model(**inputs).logits

    # 결과값을 클래스에 대한 확률로 변환
    pred = F.softmax(logits, dim=-1)

    # 예측결과를 CPU로 가져와서 넘파이로 변환한 후,
    # argmax로 확률이 가장 큰 클래스를 선택함
    results = pred.cpu().detach().numpy().argmax(axis=1)

    # 전체 예측결과에 추가
    y_pred.extend(results.tolist())

# gpu 메모리를 비움
torch.mps.empty_cahce()

score = sum(y_test = np.array(y_pred))/len(y_test)
print(f"NLTK 영화리뷰 감성분석 정확도:{score}")