In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's random number generator so that results (e.g. the randomly
# initialized model parameters printed further down) are repeatable across runs.
torch.manual_seed(1)

<torch._C.Generator at 0x7ffafc0e9630>

# 1. Logistic Regression Bag-of-Words classifier

### 1. Bag-of-Words를 위한 word2index 딕셔너리 준비

In [3]:
# Training set: (tokenized sentence, language label) pairs.
data = [
    ("me gusta comer en la cafeteria".split(), "SPANISH"),
    ("Give it to me".split(), "ENGLISH"),
    ("No creo que sea una buena idea".split(), "SPANISH"),
    ("No it is not a good idea to get lost at sea".split(), "ENGLISH"),
]

# Held-out sentences used to inspect predictions before and after training.
test_data = [
    ("Yo creo que si".split(), "SPANISH"),
    ("it is lost on me".split(), "ENGLISH"),
]

# word_to_ix assigns every distinct word (from both the training and the test
# sentences) a unique integer: its position in the bag-of-words vector.
# Indices are handed out in order of first appearance.
word_to_ix = {}
for sent, _ in data + test_data:
    for word in sent:
        # setdefault only inserts when the word is new, so existing indices
        # are never overwritten.
        word_to_ix.setdefault(word, len(word_to_ix))

# Reverse lookup: index -> word.
ix_to_word = dict((ix, word) for word, ix in word_to_ix.items())

print(word_to_ix)
print(ix_to_word)

VOCAB_SIZE = len(word_to_ix)  # length of every bag-of-words vector
NUM_LABELS = 2                # SPANISH / ENGLISH

{'it': 7, 'to': 8, 'una': 13, 'Give': 6, 'good': 19, 'cafeteria': 5, 'comer': 2, 'not': 17, 'si': 24, 'on': 25, 'lost': 21, 'me': 0, 'creo': 10, 'en': 3, 'sea': 12, 'get': 20, 'No': 9, 'is': 16, 'que': 11, 'la': 4, 'idea': 15, 'at': 22, 'gusta': 1, 'Yo': 23, 'a': 18, 'buena': 14}
{0: 'me', 1: 'gusta', 2: 'comer', 3: 'en', 4: 'la', 5: 'cafeteria', 6: 'Give', 7: 'it', 8: 'to', 9: 'No', 10: 'creo', 11: 'que', 12: 'sea', 13: 'una', 14: 'buena', 15: 'idea', 16: 'is', 17: 'not', 18: 'a', 19: 'good', 20: 'get', 21: 'lost', 22: 'at', 23: 'Yo', 24: 'si', 25: 'on'}


### 2. 모델 선언  

In [4]:
class BoWClassifier(nn.Module):
    """Logistic-regression classifier over bag-of-words count vectors.

    The model is a single affine layer followed by log-softmax, so it maps a
    vector of word counts to log-probabilities over the labels.

    Args:
        num_labels: number of output classes.
        vocab_size: length of the bag-of-words input vector.
    """

    def __init__(self, num_labels, vocab_size):
        # A model class derived from nn.Module must call the parent
        # constructor before it registers any sub-modules.
        super(BoWClassifier, self).__init__()

        # Affine map from a vocab_size-dimensional count vector to one score
        # per label.
        self.linear = nn.Linear(vocab_size, num_labels)

        # NOTE: the log-softmax non-linearity has no parameters, so nothing
        # needs to be created for it here.

    def forward(self, bow_vec):
        """Return log-probabilities over the labels for ``bow_vec``.

        ``forward`` is the method that runs when the module instance is called
        like a function (``model(x)``).

        Args:
            bow_vec: float tensor whose last dimension has size ``vocab_size``.

        Returns:
            Tensor shaped like ``bow_vec`` except that the last dimension has
            size ``num_labels``; entries are log-probabilities that
            exponentiate and sum to 1 along that last dimension.
        """
        # Normalize explicitly over the label axis (the last one). Omitting
        # `dim` is deprecated: it emits a warning and the implicit choice
        # picks the wrong axis for 3-D inputs.
        return F.log_softmax(self.linear(bow_vec), dim=-1)

### 3. 전처리 함수 선언 (문장 -> 벡터 / 레이블)

텐서는 파이썬 리스트로부터 바로 만들 수 있다: `torch.Tensor(list)`. 기본 타입은 `FloatTensor`이며, <br>
정수 타입이 필요하면 `torch.LongTensor`를 사용해야 한다.

`Tensor.view`는 텐서의 모양을 바꾸는(reshape) 함수이다.

In [6]:
def make_bow_vector(sentence, word_to_ix):
    """Turn a tokenized sentence into a bag-of-words count vector.

    Args:
        sentence: iterable of word tokens; every token must be a key of
            ``word_to_ix`` (an unknown token raises ``KeyError``).
        word_to_ix: mapping from word to its column index.

    Returns:
        Float tensor of shape ``(1, len(word_to_ix))`` where column ``i``
        holds the number of times the word with index ``i`` occurs.
    """
    counts = torch.zeros(len(word_to_ix))
    for column in (word_to_ix[token] for token in sentence):
        counts[column] += 1
    # view() reshapes the flat count vector into a single-row matrix.
    return counts.view(1, -1)

def make_target(label, label_to_ix):
    """Encode a label as a one-element integer tensor.

    Args:
        label: key to look up in ``label_to_ix``.
        label_to_ix: mapping from label to integer class index.

    Returns:
        ``torch.LongTensor`` of shape ``(1,)`` holding the class index.
    """
    class_index = label_to_ix[label]
    # Integer-valued tensors are built with LongTensor.
    return torch.LongTensor([class_index])

In [8]:
model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)

# The linear layer computes Ax + b, so iterating over the model's parameters
# prints two tensors: the 2x26 weight matrix A held by nn.Linear, followed by
# the bias vector b.
for param in model.parameters():
    print(param)
    

Parameter containing:

Columns 0 to 9 
-0.1808 -0.0890 -0.1295 -0.1729  0.1483  0.0669 -0.1575  0.0365 -0.0309  0.0673
 0.1917  0.0630  0.0973 -0.0790 -0.0861 -0.0211  0.1135 -0.1090 -0.1556 -0.1673

Columns 10 to 19 
 0.1796 -0.0346  0.0130 -0.1186  0.0753 -0.0825 -0.0724 -0.1404  0.0732  0.1111
-0.0204 -0.0121  0.1603 -0.1584 -0.0810  0.1582 -0.0832 -0.1492 -0.1451  0.0097

Columns 20 to 25 
 0.1313 -0.0343 -0.1889 -0.1827  0.0981  0.0486
-0.1885 -0.1633  0.0701  0.1635 -0.1131  0.1610
[torch.FloatTensor of size 2x26]

Parameter containing:
1.00000e-02 *
 -9.1960
 -7.8866
[torch.FloatTensor of size 2]



PyTorch 0.3 이하에서는 모델에 넘겨주는 모든 텐서를 `autograd.Variable()`로 감싸(wrapping) 주어야 한다. (PyTorch 0.4부터는 `Variable`이 `Tensor`에 통합되어 텐서를 그대로 넘겨도 된다.)

In [10]:
# Running the model: build the BoW vector for the first training sentence and
# wrap it in an autograd.Variable before handing it to the model.
sample = data[0]
sample_tokens = sample[0]
bow_vector = make_bow_vector(sample_tokens, word_to_ix)
wrapped_input = autograd.Variable(bow_vector)
log_probs = model(wrapped_input)  # calling the module invokes its forward()
print(log_probs)

Variable containing:
-0.9966 -0.4607
[torch.FloatTensor of size 1x2]



In [11]:
# Class index assigned to each language label.
label_to_ix = {
    "SPANISH": 0,
    "ENGLISH": 1,
}

In [19]:
# Reverse lookup: class index -> language label.
ix_to_label = dict((ix, name) for name, ix in label_to_ix.items())

### 4. 트레이닝!

트레이닝 전 예측값과 파라미터 확인 (학습 전후를 비교하기 위해)

In [12]:
# Baseline before training: print the model's log-probabilities for each test
# sentence so they can be compared with the post-training predictions.
for instance, label in test_data:
    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
    print(model(bow_vec))

# The first parameter is the weight matrix; print its column for "creo".
weight_matrix = next(model.parameters())
creo_column = word_to_ix["creo"]
print(weight_matrix[:, creo_column])

Variable containing:
-0.6785 -0.7080
[torch.FloatTensor of size 1x2]

Variable containing:
-0.8051 -0.5925
[torch.FloatTensor of size 1x2]

Variable containing:
 0.1796
-0.0204
[torch.FloatTensor of size 2]



In [13]:
loss_function = nn.NLLLoss()  # negative log-likelihood loss
optimizer = optim.SGD(model.parameters(), lr=0.1)  # plain SGD optimizer

# Usually you want to pass over the training data several times.
# 100 is much bigger than on a real data set, but real datasets have more than
# two instances.  Usually, somewhere between 5 and 30 epochs is reasonable.
for epoch in range(100):
    for instance, label in data:
        # Step 1. PyTorch accumulates gradients, so clear them before every
        # training instance.
        model.zero_grad()

        # Step 2. Turn the sentence into a BoW vector and the label into a
        # target index, each wrapped in an autograd.Variable.
        bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
        target = autograd.Variable(make_target(label, label_to_ix))

        # Step 3. Forward pass, then compute the loss on its output.
        loss = loss_function(model(bow_vec), target)

        # Step 4. Back-propagate from the loss and update the parameters.
        loss.backward()
        optimizer.step()

In [33]:
# After training: print the predicted label next to the true label for every
# test sentence.
for instance, label in test_data:
    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
    log_probs = model(bow_vec)
    # torch.max over dim 1 returns (max values, argmax indices); the argmax
    # is the predicted class index.
    values, indices = torch.max(log_probs, 1)
    # Flatten before indexing so this works whether the reduced dimension is
    # kept (shape 1x1, very old PyTorch) or dropped (shape 1, the current
    # default). Double indexing the numpy array fails in the latter case.
    predicted_ix = int(indices.data.view(-1)[0])
    print('pred : ' ,ix_to_label[predicted_ix],'&& label : ', label)

pred :  SPANISH && label :  SPANISH
pred :  ENGLISH && label :  ENGLISH
