In [1]:
data_path = 'data/en.pos.train'
sentences = open(data_path, 'r').read().strip().split('\n\n')

In [2]:
from collections import defaultdict

word_count, tags = defaultdict(int), set()
for sentence in sentences:
    lines = sentence.strip().split('\n')
    for line in lines:
        word, tag = line.strip().split('\t')
        word_count[word] += 1
        tags.add(tag)
tags = list(tags)

In [3]:
words = [word for word in word_count.keys() if word_count[word]>1]

In [4]:
words = ['<UNK>', '<s>', '</s>'] + words
feat_tags = ['<s>'] + tags
output_tags = tags

In [5]:
word_dict = {word: i for i, word in enumerate(words)}
feat_tags_dict = {tag: i for i, tag in enumerate(feat_tags)}
output_tag_dict = {tag: i for i, tag in enumerate(output_tags)}

In [6]:
def tagid2tag_str(id):
    return output_tags[id]

def tag2id(tag):
    return output_tag_dict[tag]

def feat_tag2id(tag):
    return feat_tags_dict[tag]

def word2id(word):
    return word_dict[word] if word in word_dict else word_dict['<UNK>']

def num_words():
    return len(words)

def num_tag_feats():
    return len(feat_tags)

def num_tags():
    return len(output_tags)

In [7]:
sens = open(data_path, 'r').read().strip().split('\n\n')
writer = open(data_path+'.data', 'w')

for sen in sens:
    lines = sen.strip().split('\n')
    ws, ts = ['<s>', '<s>'], ['<s>', '<s>']
    for line in lines:
        word, tag = line.strip().split()
        ws.append(word)
        ts.append(tag)
    ws += ['</s>', '</s>']

    for i in range(len(lines)):
        feats = [ws[i], ws[i + 1], ws[i + 2], ws[i + 3], ws[i + 4], ts[i], ts[i + 1]]
        label = ts[i + 2]
        writer.write('\t'.join(feats) + '\t' + label + '\n')
writer.close()


In [8]:
import torch
import torch.nn as nn
%matplotlib nbagg
import random
import matplotlib.pyplot as plt
import numpy as np

In [9]:
cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda else "cpu")
seed = 1008
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed_all(seed)

In [10]:
word_embed_dim, pos_embed_dim = 200,200
input_dim=5*word_embed_dim+2*pos_embed_dim
hidden_dim,output_dim=200,len(feat_tags)


In [11]:
class POS_tagging(nn.Module):
    def __init__(self):
        super(POS_tagging, self).__init__()
        self.word_embeddings=nn.Embedding(len(words),word_embed_dim)
        self.tag_embeddings=nn.Embedding(len(feat_tags),pos_embed_dim)
        self.network=torch.nn.Sequential(torch.nn.Linear(input_dim, hidden_dim),nn.ReLU(),nn.Linear(hidden_dim,output_dim))
       
    def forward(self, features):
        word_ids = torch.tensor([word2id(word_feat) for word_feat in features[0:5]], dtype=torch.long)
        tag_ids = torch.tensor([feat_tag2id(tag_feat) for tag_feat in features[5:]],dtype=torch.long)
        word_embeds = self.word_embeddings(word_ids).view((1, -1))
        tag_embeds = self.tag_embeddings(tag_ids).view((1,-1))
        embedding_layer=torch.cat((word_embeds,tag_embeds),1)
        out=self.network(embedding_layer)
        output=nn.functional.log_softmax(out, dim=1)
        
        return output


In [12]:
train_data = open(data_path+'.data', 'r').read().strip().split('\n')
minibatch_size=1000

In [13]:

def train(model,epochs,train_data):
    model.train()
    total_loss=torch.tensor([0.0])
    random.shuffle(train_data)
    loss_function=nn.NLLLoss()
    optimizer = torch.optim.SGD(model.parameters(),lr=0.1)
    
    for epochs in range(epochs):
         print('epoch:',epochs)
        
         for j,line in enumerate(train_data):
            fields = line.strip().split('\t')
            features, label, gold_label = fields[:-1], fields[-1], tag2id(fields[-1])
            result = model(features)
            loss = loss_function(result, torch.tensor([gold_label], dtype=torch.long))
            total_loss+=loss
            if j % minibatch_size == 0:
                minibatch_loss = total_loss / minibatch_size
                optimizer.zero_grad()
                minibatch_loss.backward()
                optimizer.step()
                total_loss=torch.tensor([0.0])
                


        
            
    return result.detach()

        

In [14]:
model= POS_tagging().to(device)

In [15]:
def load(filename):
    model.populate(filename)

def save(filename):
    model.save(filename)

In [16]:
train(model,5,train_data)
print('finished training!') 

epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
finished training!


In [17]:
def decode(model,ws):
   # first putting two start symbols
    ws = ['<s>', '<s>'] + ws + ['</s>', '</s>']
    ts = ['<s>', '<s>']
    with torch.no_grad():
        for i in range(2, len(ws) - 2):
            features = ws[i - 2:i + 3] + ts[i - 2:i]

       # running forward
            output = model(features)

       # getting list value of the output

       # getting best tag
            best_tag_id = np.argmax(output)

       # assigning the best tag
            ts.append(tagid2tag_str(best_tag_id.item()))

       # refresh dynet memory (computation graph)

    return ts[2:]

In [18]:
test_file = 'data/en.pos.dev.raw'
writer = open(test_file+'.output.pyex.dim200', 'w')
for sentence in open(test_file, 'r'):
    words = sentence.strip().split()
    tags = decode(model, words)
    output = [word + '\t' + tag for word, tag in zip(words, tags)]
    writer.write('\n'.join(output) + '\n\n')
writer.close()

In [19]:
def evaluate_test(w_test_file,data_file):
    true=0
    compare=open(data_file,'r')
    l=[]
    k=[]
    for sentence1 in compare:
        words1=sentence1.strip().split()
        if len(words1)==2:
            l.append(words1[1])
    for sentence2 in open(w_test_file,'r'):
        words2=sentence2.strip().split()
        if len(words2)==2:
            k.append(words2[1])
    for i in range(len(l)):
        if l[i]==k[i]:
            true+=1
    accuracy=true/len(l)
    return accuracy


In [20]:
print(evaluate_test('data/en.pos.dev.raw.output.pyex.dim200','data/en.pos.dev'))

0.7750903785807352
