In [3]:
pip install torch

Collecting torch
  Downloading torch-2.0.1-cp311-cp311-win_amd64.whl (172.3 MB)
     ---------------------------------------- 0.0/172.3 MB ? eta -:--:--
     --------------------------------------- 0.5/172.3 MB 16.8 MB/s eta 0:00:11
     --------------------------------------- 1.0/172.3 MB 10.8 MB/s eta 0:00:16
     --------------------------------------- 1.5/172.3 MB 12.2 MB/s eta 0:00:14
     --------------------------------------- 2.1/172.3 MB 11.9 MB/s eta 0:00:15
      -------------------------------------- 2.5/172.3 MB 11.4 MB/s eta 0:00:15
      -------------------------------------- 3.0/172.3 MB 11.4 MB/s eta 0:00:15
      -------------------------------------- 3.6/172.3 MB 11.4 MB/s eta 0:00:15
      -------------------------------------- 4.0/172.3 MB 11.1 MB/s eta 0:00:16
      -------------------------------------- 4.4/172.3 MB 10.7 MB/s eta 0:00:16
     - ------------------------------------- 4.9/172.3 MB 10.8 MB/s eta 0:00:16
     - ------------------------------------- 5.

퍼셉트론 : 가장 간단한 신경망

In [67]:
import torch
import torch.nn as nn

class Perceptron(nn.Module) :
    # 퍼셉트론은 하나의 선형 층
    def __init__(self, input_dim) :  # input_dim : 입력 특성 크기
        super(Perceptron, self).__init__()
        self.fc1 = nn.Linear(input_dim, 1)
    
    def forward(self, x_in) :
        '''
        퍼셉트론의 정방향 계산
        
        x_in(torch.Tensor) : 입력 데이터 텐서
        x_in.shape : (batch, num_features)  # input_dim = num_features
                       데이터, 변수
        반환값 : 결과 텐서, tensor.shape : (batch, )
        '''
        return torch.sigmoid(self.fc1(x_in)).squeeze() # 차원이 1인 차원 제거

In [68]:
a = Perceptron(4)
x_in = torch.Tensor(3, 4)
x_in.shape
a.forward(x_in) # (3, 1) # __init__ fc1이 input_dim, 2이면 (3, 2)반환

tensor([0.5799, 0.5799, 0.5799], grad_fn=<SqueezeBackward0>)

In [64]:
x_in

tensor([[1.0286e-38, 1.0653e-38, 1.0194e-38, 8.4490e-39],
        [1.0469e-38, 9.3674e-39, 9.9184e-39, 8.7245e-39],
        [9.2755e-39, 8.9082e-39, 9.9184e-39, 8.4490e-39]])

활성화 함수 ; 비선형 함수

손실함수

In [69]:
# 평균 제곱 오차 손실(MSE)
import torch
import torch.nn as nn

mse_loss = nn.MSELoss()
outputs = torch.randn(3, 5, requires_grad=True) # 예측
targets = torch.randn(3, 5) # 타깃값
loss = mse_loss(outputs, targets)
print(loss)

tensor(4.6011, grad_fn=<MseLossBackward0>)


In [70]:
# 범주형 크로스 엔트로피 손실
ce_loss = nn.CrossEntropyLoss()
outputs = torch.randn(3, 5, requires_grad=True)
targets = torch.tensor([1, 0, 3], dtype = torch.int64) # 정수로 하는 이유 -> 범주형을 인덱스
loss = ce_loss(outputs, targets)
print(loss)

tensor(1.9061, grad_fn=<NllLossBackward0>)


In [71]:
# 이진 크로스 엔트로피 손실
bce_loss = nn.BCELoss()
sigmoid = nn.Sigmoid()
probabilities = sigmoid(torch.randn(4, 1, requires_grad=True))
targets = torch.tensor([1, 0, 1, 0], dtype=torch.float32).view(4, 1)
loss = bce_loss(probabilities, targets)
print(probabilities)
print(loss)

tensor([[0.2631],
        [0.4016],
        [0.3129],
        [0.5826]], grad_fn=<SigmoidBackward0>)
tensor(0.9711, grad_fn=<BinaryCrossEntropyBackward0>)


지도학습 훈련

In [72]:
LEFT_CENTER = (3, 3)
RIGHT_CENTER = (3, -2)

import numpy as np
# 데이터 준비함수
# 데이터 포인트가 두 분포 중 어디에 속하는지 구별
def get_toy_data(batch_size, left_center=LEFT_CENTER, right_center=RIGHT_CENTER):
    x_data = []
    y_targets = np.zeros(batch_size)
    for batch_i in range(batch_size):
        if np.random.random() > 0.5:
            x_data.append(np.random.normal(loc=left_center))
        else:
            x_data.append(np.random.normal(loc=right_center))
            y_targets[batch_i] = 1
    return torch.tensor(x_data, dtype=torch.float32), torch.tensor(y_targets, dtype=torch.float32)

In [76]:
# Adam 옵티마이저
import torch.nn as nn
import torch.optim as optim

input_dim = 2
lr = 0.001
perceptron = Perceptron(input_dim = input_dim)
bce_loss = nn.BCELoss()
optimizer = optim.Adam(params = perceptron.parameters(), lr=lr)

In [78]:
batch_size = 1000
n_epochs = 12
n_batches = 5

# 각 에포크는 전체 훈련 데이터 사용
for i in range(n_epochs) :
    # 내부 반복은 데이터셋에 있는 배치에 대해 수행
    for j in range(n_batches) :
        
        # 데이터 가져오기
        x_data, y_target = get_toy_data(batch_size)
        
        # 그레이디언트 초기화
        optimizer.zero_grad()
        
        # 모델의 정방향 계산 수행
        y_pred = perceptron(x_data).squeeze()
        
        # 최적화하려는 손실 계산
        loss = bce_loss(y_pred, y_target)
        
        # 손실 신호 거꾸로 전파
        loss.backward()
        
        # 옵티마이저로 파라미터에 그레이디언트 업데이트
        optimizer.step()

In [79]:
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

예제 : 레스토랑 리뷰 감성 분류
1. 옐프 리뷰 데이터셋

In [1]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [4]:
args = Namespace(
    raw_train_dataset_csv=r"C:\Users\knuyh\Desktop\민지\스터디\파이토치로 배우는 자연어처리\raw_train.csv",
    raw_test_dataset_csv=r"C:\Users\knuyh\Desktop\민지\스터디\파이토치로 배우는 자연어처리\raw_test.csv",
    train_proportion=0.7,
    val_proportion=0.3,
    output_munged_csv=r"C:\Users\knuyh\Desktop\민지\스터디\파이토치로 배우는 자연어처리\reviews_with_splits_full.csv",
    seed=1337
)

In [5]:
# 원본 데이터를 읽습니다
train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'])
train_reviews = train_reviews[~pd.isnull(train_reviews.review)]
test_reviews = pd.read_csv(args.raw_test_dataset_csv, header=None, names=['rating', 'review'])
test_reviews = test_reviews[~pd.isnull(test_reviews.review)]

In [6]:
# 고유 클래스
set(train_reviews.rating) # 1, 2 점만

{1, 2}

In [7]:
# 훈련, 검증, 테스트를 만들기 위해 별점을 기준으로 나눕니다
by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows():
    by_rating[row.rating].append(row.to_dict())
    
'''
[row]

rating                                                    1
review    Unfortunately, the frustration of being Dr. Go...
Name: 0, dtype: object
rating                                                    2
review    Been going to Dr. Goldberg for over 10 years. ...
Name: 1, dtype: object
'''

In [79]:
by_rating[1][:3]

[{'rating': 1,
  'review': 'Just recently went there and ordered \\"Super Nachos\\" Didn\'t get anything close to super nachos. Side order of guacomole was super runny; order of nachos was way too small for the price paid for it and no refried beans were on the nachos; barely any meat or cheese also. Went back for a refund, some lady who claimed to be the manager  named \\"Virginia\\" denied my request for another menu item comparable to the price I paid for these lousy super nachos or a refund. As a returning customer, I\'m appalled at such service. I liked going there, because the location was convenient from my house and the food and service was always good up until now. I will not go there again !!! Rude attitude and lousy customer service! F+!',
  'split': 'train'},
 {'rating': 1,
  'review': 'Slow wait staff, simple food yet overpriced. I work downtown and refuse to go here after 2 bad experiences.',
  'split': 'train'},
 {'rating': 1,
  'review': 'Not worth the money! My husband

In [36]:
# 분할 데이터를 만듭니다.
final_list = []
np.random.seed(args.seed)

for _, item_list in sorted(by_rating.items()):  # _ : 별점 (1, 2, 0, 3, 4, 100), item_list :

    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    #print(n_total)  # 0 280000 280000 0 0 0
    n_train = int(args.train_proportion * n_total) # 0.7 *
    n_val = int(args.val_proportion * n_total) # 0.3 *
    
    # 데이터 포인터에 분할 속성을 추가합니다
    for item in item_list[:n_train]:
        item['split'] = 'train'
    
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'

    # 최종 리스트에 추가합니다
    final_list.extend(item_list)

In [42]:
final_list[:3]

[{'rating': 1,
  'review': 'Just recently went there and ordered \\"Super Nachos\\" Didn\'t get anything close to super nachos. Side order of guacomole was super runny; order of nachos was way too small for the price paid for it and no refried beans were on the nachos; barely any meat or cheese also. Went back for a refund, some lady who claimed to be the manager  named \\"Virginia\\" denied my request for another menu item comparable to the price I paid for these lousy super nachos or a refund. As a returning customer, I\'m appalled at such service. I liked going there, because the location was convenient from my house and the food and service was always good up until now. I will not go there again !!! Rude attitude and lousy customer service! F+!',
  'split': 'train'},
 {'rating': 1,
  'review': 'Slow wait staff, simple food yet overpriced. I work downtown and refuse to go here after 2 bad experiences.',
  'split': 'train'},
 {'rating': 1,
  'review': 'Not worth the money! My husband

In [43]:
for _, row in test_reviews.iterrows():
    row_dict = row.to_dict()
    row_dict['split'] = 'test'
    final_list.append(row_dict)

In [44]:
# 분할 데이터를 데이터 프레임으로 만듭니다
final_reviews = pd.DataFrame(final_list)
final_reviews

Unnamed: 0,rating,review,split
0,1,"Just recently went there and ordered \""Super N...",train
1,1,"Slow wait staff, simple food yet overpriced. I...",train
2,1,Not worth the money! My husband took his paren...,train
3,1,Front desk service was awful. The two people b...,train
4,1,Went after it been open a couple weeks. They m...,train
...,...,...,...
597995,1,After spending 80 bucks per person ($ 20 for e...,test
597996,2,Stellar! One of my favorite places to eat in a...,test
597997,1,We stopped by here for a dessert after Fuddruc...,test
597998,1,Wait staff was attentive but the food was very...,test


In [45]:
final_reviews.split.value_counts()

split
train    392000
val      168000
test      38000
Name: count, dtype: int64

In [46]:
final_reviews[pd.isnull(final_reviews.review)]

Unnamed: 0,rating,review,split


In [47]:
# 리뷰를 전처리합니다
def preprocess_text(text):
    if type(text) == float:
        print(text)
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
    
final_reviews.review = final_reviews.review.apply(preprocess_text)

In [48]:
final_reviews['rating'] = final_reviews.rating.apply({1: 'negative', 2: 'positive'}.get)

In [49]:
final_reviews.head()

Unnamed: 0,rating,review,split
0,negative,just recently went there and ordered super nac...,train
1,negative,"slow wait staff , simple food yet overpriced ....",train
2,negative,not worth the money ! my husband took his pare...,train
3,negative,front desk service was awful . the two people ...,train
4,negative,went after it been open a couple weeks . they ...,train


In [50]:
final_reviews.to_csv(args.output_munged_csv, index=False)

2. 파이토치 데이터셋 이해하기

In [87]:
# ReviewDataset 클래스는 데이터셋이 최소한으로 정제되고 3개로 나뉘었다고 가정
from torch.utils.data import Dataset

class ReviewDataset(Dataset) :
    def __init__(self, review_df, vectorizer) :
        '''
        매개변수
        review_df : DF, 데이터셋
        vectorizer : ReviewVectorizer 객체
        '''
        self.review_df = review_df
        self.vectorizer = vectorizer
        
        self.train_df = self.review_df[self.review_df.split == 'train']
        self.train_size = len(self.train_df)        
        self.val_df = self.review_df[self.review_df.split == 'val']
        self.val_size = len(self.val_df)
        self.test_df = self.review_df[self.review_df.split == 'test']
        self.test_size = len(self.test_df)
        
        self._lookup_dict = {'train' : (self.train_df, self.train_size),
                            'val' : (self.val_df, self.val_size),
                            'test' : (self.test_df, self.test_size)}
        self.set_split('train')
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv) :
        # 데이터셋 로드하고 새로운 ReviewVectorizer 객체 생성
        review_df = pd.read_csv(review_csv)
        train_review_df = review_df[review_df.split=='train']
        return cls(review_df, ReviewVectorizer.from_dataframe(train_review_df))  # ReviewDataset 인스턴스 반환
        # ReviewVectorizer : 리뷰 텍스트를 수치 벡터로 변환 /데이터프레임 기반
    
    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """ 파일에서 ReviewVectorizer 객체를 로드하는 정적 메서드
        
        매개변수:
            vectorizer_filepath (str): 직렬화된 ReviewVectorizer 객체의 위치
        반환값:
            ReviewVectorizer의 인스턴스
        """
        with open(vectorizer_filepath) as fp:
            return ReviewVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        """ ReviewVectorizer 객체를 json 형태로 디스크에 저장합니다
        
        매개변수:
            vectorizer_filepath (str): ReviewVectorizer 객체의 저장 위치
        """
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)
    
    def get_vectorizer(self) :
        return self._vectorizer # ReviewVectorizer 객체 반환
    
    def set_split(self, split='train') :
        # 데이터프레임에 있는 열 사용해 분할 세트 선택
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
    
    # 파이토치 데이터셋 주요 진입 메서드
    def __len__(self) : # 데이터셋 크기 리턴
        return self._target_size
    def __getitem__(self, index) : # i번째 샘플 찾기
        row = self._target_df.iloc[index]
        
        review_vector = self.vectorizer.vectorize(row.review)
        rating_index = self._vectorizer.rating_vocab.lookup_token(row.rating)
        
        return {'x_data' : review_vector, 
               'y_target' : rating_index}
    
    
    def get_num_batches(self, batch_size) :
        # 배치 크기 주어지면, 배치 개수 반환
        return len(self) // batch_size

3. Vocabulary, Vetorizer, DataLoader 클래스  
텍스트를 벡터의 미니배치로 변환

* Vocabulary  
첫번째 단계, 토큰을 정수로 매핑

In [88]:
# 토큰과 정수 매핑 관리
class Vocabulary(object) :
    
    def __init__(self, token_to_idx = None, add_unk = True, unk_token = '<UNK>') :
        '''
        token_to_idx (dict) : 기존 토큰-인덱스 매핑 dict
        add_unk (bool) : UNK 토큰을 추가할지 지정하는 플래그
        unk_token (str) : 추가할 UNK 토큰
        '''
        if token_to_idx is None :
            token_to_idx = {}
            
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx : token for token, idx in self._token_to_idx.items()}
        
        self._add_unk = add_unk
        self._unk_token = unk_token
        
        self.unk_index = -1
        if add_unk :
            self.unk_index = self.add_token(unk_token)
    
    # 직렬화할 수 있는 dict 반환
    def to_serializable(self) :
        return {'token_to_idx' : self._token_to_idx,
               'add_unk' : self._add_unk,
               'unk_token' : self._unk_token}
    
    # 직렬화된 dict에서 Vocabulary 객체 생성
    @classmethod
    def from_serializable(cls, contents) :
        return cls(**contents)
    
    # 토큰 기반 매핑 dict 업데이트
    def add_token(self, token) : # 새로운 토큰 추가
        if token in self._token_to_idx :   #token : Vocabulary 추가할 토큰
            index = self._token_to_idx[token]
        else :
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
    
    # 토큰 리스트 Vocabulary에 추가
    def add_many(self, tokens):
        '''
        매개변수:
            tokens (list): 문자열 토큰 리스트
        반환값:
            indices (list): 토큰 리스트에 상응되는 인덱스 리스트
        '''
        return [self.add_token(token) for token in tokens]
    
    # 토큰에 대응하는 인덱스 추출, 없으면 UNK 반환
    def lookup_token(self, token) : 
        if self.add_unk :
            return self._token_to_idx.get(token, self.unk_index) # token : 찾을 토큰
        else : # UNK일 때
            return self._token_to_idx[token]
        
    # 인덱스에 해당하는 토큰 반환
    def lookup_index(self, index) :
        if index not in self._idx_to_token :
            raise KeyError('Vocabulary에 인덱스(%d)가 없다.' % index)
        return self._idx_to_token[index]
    
    def __str__(self) :
        return '<Vocabulary(size=%d)>' % len(self)
    
    def __len__(self) :
        return len(self._token_to_idx)

* Vectorizer  
두 번째 단계, 입력 데이터 포인트의 토큰을 순회하며 각 토큰을 정수로 바꾸기

In [89]:
from collections import Counter

In [90]:
# 텍스트를 수치 벡터로 변환
class ReviewVectorizer(object) :
    def __init__ (self, review_vocab, rating_vocab) :
        self.review_vocab = review_vocab # 단어를 정수에 매핑하는 Vocabulary
        self.rating_vocab = rating_vocab # 클래스 레이블을 정수에 매핑하는 Vocabulary
        
    # 리뷰에 대한 one-hot vector 생성
    def vectorize(self, review) :
        one_hot = np.zeros(len(self.review_vocab), dtype=np.float32)
        
        for token in review.split(' ') :
            if token not in string.punctuation :
                one_hot[self.review_vocab.lookup_token(token)] = 1
        return one_hot
    
    @classmethod
    # 데이터셋 데이터프레임에서 Vectorizer 객체 생성
    def from_dataframe(cls, review_df, cutoff=25) : # 빈도 기반 필터링 설정값
        review_vocab = Vocabulary(add_unk=True)
        rating_voacb = Vocabulary(add_unk=False)
        
        # 점수 추가
        for rating in sorted(set(review_df.rating)) :
            rating_vocab.add_token(rating)
        
        # count > cutoff인 단어 추가
        word_counts = Counter()
        for review in review_df.review :
            for word in review.split(' ') :
                if word not in string.punctuation :
                    word_counts[word] += 1
                    
        for word, count in word_counts.items() :
            if count > cutoff :
                review_vocab.add_token(word)
                
        return cls(review_vocab, rating_vocab) # ReviewVectorizer 객체 반환
    
    @classmethod
    # 직렬화된 dict에서 ReviewVectorizer 객체 생성
    def from_serializable(cls, contents) :
        review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
        rating_vocab = Vocabulary.from_serializable(contents['rating_vocab'])
        
        return cls(review_vocab=review_vocab, rating_vocab=rating_vocab) # ReveiwVectorizer 클래스 객체 반환
    
    # 캐싱
    def to_serializable(self) :
        return {'review_vocab' : self.review_vocab.to_serializable(),
               'rating_vocab' : self.rating_vocab.to_serializable()}

* DataLoader  
마지막 단계, 벡터로 변환한 데이터 포인트 모으기

In [91]:
# 데이터셋에서 미니배치 생성하기
from torch.utils.data import DataLoader

def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device='cpu') :
    dataloader = DataLoader(dataset = dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    
    for data_dict in dataloader :
        out_data_dict = {}
        for name, tensor in data_dict.items() :
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

4. 퍼셉트론 분류기

In [92]:
import torch.nn as nn

class ReviewClassifier(nn.Module) :
    def __init__(self, num_features) :
        super(ReviewClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features = num_features, out_features=1)
        
    def forward(self, x_in, apply_sigmoid = False) :
        # 크로스 엔트로피 손실 사용하려면 False로 지정
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid :
            y_out = torch.sigmoid(y_out)
        return y_out

5. 모델 훈련

In [93]:
from argparse import Namespace

args = Namespace(
    # 날짜와 경로 정보
    frequency_cutoff=25,
    model_state_file='model.pth',
    review_csv=r"C:\Users\knuyh\Desktop\민지\스터디\파이토치로 배우는 자연어처리\reviews_with_splits_lite.csv",
    save_dir=r"C:\Users\knuyh\Desktop\민지\스터디\파이토치로 배우는 자연어처리",
    vectorizer_file='vectorizer.json',
    # 모델 하이퍼파라미터 없음
    # 훈련 하이퍼파라미터
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # 실행 옵션
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)

In [94]:
# 훈련
import torch.optim as optim

def make_train_state(args) :
    return {'epoch_index' : 0,
           'train_loss' : [],
           'train_acc' : [],
           'val_loss' : [],
           'val_acc' : [],
           'test_loss' : -1,
           'test_acc' : -1}
train_state = make_train_state(args)

In [None]:
if not torch.cuda.is_available() : 
    args.cuda = False
args.device = torch.device('cuda' if args.cuda else 'cpu')

In [98]:
# 데이터셋과 Vectorizer
dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)                                                            
    
vectorizer = dataset.get_vectorizer()

NameError: name 'rating_vocab' is not defined

In [None]:
# 모델
classifier = ReviewClassifier(num_features = len(vectorizer.review_vocab))
# classifier = classifier.to(args.device)

In [None]:
# 손실함수와 옵티마이저
loss_func = nn.BCEWithLogitsLoss()
optimizer . optim.Adam(classifier.parameters(), lr=args.learning_rate)

In [None]:
# 훈련 반복
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index

    # 훈련 세트에 대한 순회

    # 훈련 세트와 배치 제너레이터 준비, 손실과 정확도를 0으로 설정
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, 
                                       batch_size=args.batch_size, 
                                       device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
        # 훈련 과정은 5단계로 이루어집니다

        # --------------------------------------
        # 단계 1. 그레이디언트를 0으로 초기화합니다
        optimizer.zero_grad()

        # 단계 2. 출력을 계산합니다
        y_pred = classifier(x_in=batch_dict['x_data'].float())

        # 단계 3. 손실을 계산합니다
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_t = loss.item()
        running_loss += (loss_t - running_loss) / (batch_index + 1)

        # 단계 4. 손실을 사용해 그레이디언트를 계산합니다
        loss.backward()

        # 단계 5. 옵티마이저로 가중치를 업데이트합니다
        optimizer.step()
        # -----------------------------------------

        # 정확도를 계산합니다
        acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_t - running_acc) / (batch_index + 1)
        
    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)
    
    # 검증 세트에 대한 순회

        # 검증 세트와 배치 제너레이터 준비, 손실과 정확도를 0으로 설정
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval() # 모델의 파라미터 수정하지 못하게 하고, 드롭아웃 비활성화

        for batch_index, batch_dict in enumerate(batch_generator):

            # 단계 1. 출력을 계산합니다
            y_pred = classifier(x_in=batch_dict['x_data'].float())

            # 단계 2. 손실을 계산합니다
            loss = loss_func(y_pred, batch_dict['y_target'].float())
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # 단계 3. 정확도를 계산합니다
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            
        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

6. 평가, 추론, 분석

In [None]:
dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # 출력을 계산합니다
    y_pred = classifier(x_in=batch_dict['x_data'].float())

    # 손실을 계산합니다
    loss = loss_func(y_pred, batch_dict['y_target'].float())
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # 정확도를 계산합니다
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [None]:
print("테스트 손실: {:.3f}".format(train_state['test_loss']))
print("테스트 정확도: {:.2f}".format(train_state['test_acc']))

In [None]:
# 새로운 데이터 포인트 추론하여 분류

In [None]:
def predict_rating(review, classifier, vectorizer, decision_threshold=0.5):
                        # 훈련된 모델               클래스 나눌 결정 경계
    review = preprocess_text(review)
    
    vectorized_review = torch.tensor(vectorizer.vectorize(review))
    result = classifier(vectorized_review.view(1, -1)) 
    
    probability_value = torch.sigmoid(result).item()
    index = 1
    if probability_value < decision_threshold:
        index = 0

    return vectorizer.rating_vocab.lookup_index(index)

In [None]:
test_review = "this is a pretty awesome book"

classifier = classifier.cpu()
prediction = predict_rating(test_review, classifier, vectorizer, decision_threshold=0.5)
print("{} -> {}".format(test_review, prediction))

In [None]:
# 가중치 분석
# 퍼셉트론의 가중치는 어휘 사전의 한 단어와 정확하게 대응하므로 쉽게 확인 가능


In [None]:
# 가중치 정렬
fc1_weights = classifier.fc1.weight.detach()[0]
_, indices = torch.sort(fc1_weights, dim=0, descending=True)
indices = indices.numpy().tolist()

# 긍정적인 상위 20개 단어
print("긍정 리뷰에 영향을 미치는 단어:")
print("--------------------------------------")
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))
    
print("====\n\n\n")

# 부정적인 상위 20개 단어
print("부정 리뷰에 영향을 미치는 단어:")
print("--------------------------------------")
indices.reverse()
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))