In [1]:
import torch 
import torch.autograd as autograd
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import numpy as np
import pickle
import random
from seqeval.metrics import f1_score

torch.manual_seed(1)

<torch._C.Generator at 0x206cc9711b0>

In [5]:
path_train = './eng_data/eng.train'
path_dev = './eng_data/eng.testa' 
path_test = './eng_data/eng.testb'  

In [6]:
def prepare_dataset(path_train):
    with open(path_train, 'r') as f:
        train = f.readlines()
    
    sentences = []
    sentence = []
    for i in train:
        word = i.split()
        if len(word) > 1:
            l = (word[0], word[-1].replace('B-','I-'))
            sentence.append(l)
        if len(word) == 0 and len(sentence) > 0:
            sentences.append(sentence)
            sentence = []

    dataset = []
    for i in sentences:
        sent = [t[0] for t in i]
        tag = [t[-1] for t in i]
        dataset.append((sent, tag))
    
    return dataset

In [20]:
with open(path_test,'r') as f:
    a = f.readlines()
a

['SOCCER\tNN\tI-NP\tO\n',
 '-\t:\tO\tO\n',
 'JAPAN\tNNP\tI-NP\tI-LOC\n',
 'GET\tVB\tI-VP\tO\n',
 'LUCKY\tNNP\tI-NP\tO\n',
 'WIN\tNNP\tI-NP\tO\n',
 ',\t,\tO\tO\n',
 'CHINA\tNNP\tI-NP\tI-PER\n',
 'IN\tIN\tI-PP\tO\n',
 'SURPRISE\tDT\tI-NP\tO\n',
 'DEFEAT\tNN\tI-NP\tO\n',
 '.\t.\tO\tO\n',
 '\n',
 'Nadim\tNNP\tI-NP\tI-PER\n',
 'Ladki\tNNP\tI-NP\tI-PER\n',
 '\n',
 'AL-AIN\tNNP\tI-NP\tI-LOC\n',
 ',\t,\tO\tO\n',
 'United\tNNP\tI-NP\tI-LOC\n',
 'Arab\tNNP\tI-NP\tI-LOC\n',
 'Emirates\tNNPS\tI-NP\tI-LOC\n',
 '0000-00-00\tCD\tI-NP\tO\n',
 '\n',
 'Japan\tNNP\tI-NP\tI-LOC\n',
 'began\tVBD\tI-VP\tO\n',
 'the\tDT\tI-NP\tO\n',
 'defence\tNN\tI-NP\tO\n',
 'of\tIN\tI-PP\tO\n',
 'their\tPRP$\tI-NP\tO\n',
 'Asian\tJJ\tI-NP\tI-MISC\n',
 'Cup\tNNP\tI-NP\tI-MISC\n',
 'title\tNN\tI-NP\tO\n',
 'with\tIN\tI-PP\tO\n',
 'a\tDT\tI-NP\tO\n',
 'lucky\tJJ\tI-NP\tO\n',
 '0-0\tCD\tI-NP\tO\n',
 'win\tVBP\tI-VP\tO\n',
 'against\tIN\tI-PP\tO\n',
 'Syria\tNNP\tI-NP\tI-LOC\n',
 'in\tIN\tI-PP\tO\n',
 'a\tDT\tI-NP\tO\n',
 'Gro

In [21]:
train_set = prepare_dataset(path_train)[:5]
dev_set = prepare_dataset(path_dev)[:5]
test_set = prepare_dataset(path_test)[:5]

In [24]:
test_set[0]

(['SOCCER',
  '-',
  'JAPAN',
  'GET',
  'LUCKY',
  'WIN',
  ',',
  'CHINA',
  'IN',
  'SURPRISE',
  'DEFEAT',
  '.'],
 ['O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'I-PER', 'O', 'O', 'O', 'O'])

In [28]:
def prepare_sequence(seq, to_ix):
    idxs = []
    for w in seq:
        if w.lower() in to_ix:
            idxs.append(to_ix[w.lower()])
        else:
            idxs.append(to_ix['<unk>'])
    return torch.tensor(idxs, dtype=torch.long)

def prepare_tag_sequence(seq, tag_to_ix):
    idxs = [tag_to_ix[i] for i in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [37]:
word_to_ix = {}
tag_to_ix = {}
for sent, tags in train_set:
    for word in sent:
        if word.lower() not in word_to_ix:
            word_to_ix[word.lower()] = len(word_to_ix)

# Add <unk> token
word_to_ix['<unk>'] = len(word_to_ix)
tag_to_ix = {'O': 0, 'I-LOC': 1, 'I-PER': 2, 'I-ORG': 3, 'I-MISC': 4}
ix_to_tag = {0: 'O', 1: 'I-LOC', 2: 'I-PER', 3: 'I-ORG', 4: 'I-MISC'}

EMBEDDING_DIM = 8
HIDDEN_DIM = 5

In [None]:
300 từ khác nhau > vocabsize = 300, chọn số chiều mỗi từ = 15
eu = [0.2 06,.....0.15]

In [None]:
5 từ từ số chiều = 15 x 5 >>>>LSTM >>>> [...hidden=128..] >>>>...Giảm chiều >>> [..5 chiều..]

In [55]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
#         print('embeds is ',embeds)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
#         print('lstm_out ', lstm_out)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [56]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

LSTMTagger(
  (word_embeddings): Embedding(57, 8)
  (lstm): LSTM(8, 5)
  (hidden2tag): Linear(in_features=5, out_features=5, bias=True)
)

In [57]:
def check_f1_valid(dev_set):
    lts = []
    lts_pred = []
    for sent, label in dev_set:
        with torch.no_grad():
            lts.append(label)
            sent_in = prepare_sequence(sent, word_to_ix).to(device)
            softmax_prob = model(sent_in).to(device)
            lts_pred.append([ix_to_tag[i] for i in torch.argmax(softmax_prob, dim=1).tolist()])
         
    return f1_score(lts, lts_pred)

In [58]:
train_set

[(['BRUSSELS', '1996-08-22'], ['I-LOC', 'O']),
 (['The',
   'European',
   'Commission',
   'said',
   'on',
   'Thursday',
   'it',
   'disagreed',
   'with',
   'German',
   'advice',
   'to',
   'consumers',
   'to',
   'shun',
   'British',
   'lamb',
   'until',
   'scientists',
   'determine',
   'whether',
   'mad',
   'cow',
   'disease',
   'can',
   'be',
   'transmitted',
   'to',
   'sheep',
   '.'],
  ['O',
   'I-ORG',
   'I-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'I-MISC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'I-MISC',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']),
 (['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
  ['I-ORG', 'O', 'I-MISC', 'O', 'O', 'O', 'I-MISC', 'O', 'O']),
 (['Germany',
   "'s",
   'representative',
   'to',
   'the',
   'European',
   'Union',
   "'s",
   'veterinary',
   'committee',
   'Werner',
   'Zwingmann',
   'said',
   'on',

In [60]:
EPOCH = 5
train_loss = 0.
f1_dev = 0

loss_each_epoch = []
f1_dev_each_epoch = []

for epoch in tqdm(range(EPOCH)):
    N_train = len(train_set)
    sum_train_loss = 0
    np.random.shuffle(train_set)
 
    for sentence, tags in train_set:
#         print('sentence is ',sentence)
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix).to(device)
        targets = prepare_tag_sequence(tags, tag_to_ix).to(device)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        sum_train_loss += loss.item()

    f1_dev_per_epoch = check_f1_valid(dev_set)
    
    if epoch == 0:
        train_loss = sum_train_loss/N_train
        f1_dev = f1_dev_per_epoch
        
    print('Epoch {}: train_loss is {} \t f1_dev is {}'.format(epoch, sum_train_loss/N_train, f1_dev_per_epoch))
    
    if f1_dev_per_epoch > f1_dev:
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'f1_dev': f1_dev_per_epoch
        }, './best_model_lstm_softmax.tar')
        f1_dev = f1_dev_per_epoch
        
    if epoch == EPOCH - 1:
        print('Done of training!')
        
    loss_each_epoch.append(sum_train_loss/N_train)
    f1_dev_each_epoch.append(f1_dev_per_epoch)

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Epoch 0: train_loss is 1.4479418992996216 	 f1_dev is 0


 20%|████████████████▊                                                                   | 1/5 [00:00<00:00,  8.89it/s]

Epoch 1: train_loss is 1.4402748823165894 	 f1_dev is 0


 40%|█████████████████████████████████▌                                                  | 2/5 [00:00<00:00,  7.84it/s]

Epoch 2: train_loss is 1.4327916383743287 	 f1_dev is 0


 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:00<00:00,  7.84it/s]

Epoch 3: train_loss is 1.425456666946411 	 f1_dev is 0


 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [00:00<00:00,  7.64it/s]

Epoch 4: train_loss is 1.4183082580566406 	 f1_dev is 0
Done of training!


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  7.26it/s]


In [17]:
lts = []
lts_pred = []

for sent, label in test_set:
    lts.append(label)
    sent_in = prepare_sequence(sent, word_to_ix).to(device)
    softmax_prob = model(sent_in)
    lts_pred.append([ix_to_tag[i] for i in torch.argmax(softmax_prob, dim=1).tolist()])

f1_score(lts, lts_pred)

0

In [18]:
# Load best model
state = torch.load('./best_model_lstm_softmax.tar')
model_new = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))

model_new.load_state_dict(state['model_state_dict'])
model_new.to(device)

lts = []
lts_pred = []

with torch.no_grad():
    for sent, label in test_set:
        lts.append(label)
        sent_in = prepare_sequence(sent, word_to_ix).to(device)
        softmax_prob = model_new(sent_in)
        lts_pred.append([ix_to_tag[i] for i in torch.argmax(softmax_prob, dim=1).tolist()])
        
f1_score(lts, lts_pred)

FileNotFoundError: [Errno 2] No such file or directory: './best_model_lstm_softmax.tar'