In [1]:
import numpy as np
from collections import defaultdict
%matplotlib nbagg
import random
import matplotlib.pyplot as plt
import dynet
import torch
import torch.nn.functional as F


In [2]:
# Location of the training corpus. Sentences are separated by a blank line.
data_path = 'data/en.pos.train'
# Read through a context manager so the file handle is closed deterministically.
with open(data_path, 'r') as train_file:
    sentences = train_file.read().strip().split('\n\n')

In [3]:
# Count how often every word occurs and collect the set of tags seen in the
# training sentences. Each line of a sentence is "<word>\t<tag>".
word_count, tags = defaultdict(int), set()
for sentence in sentences:
    lines = sentence.strip().split('\n')
    for line in lines:
        word, tag = line.strip().split('\t')
        word_count[word] += 1
        tags.add(tag)
# Sort instead of list(set): the iteration order of a set of strings can differ
# between interpreter runs, which would silently change every tag id (and with
# it the meaning of the output layer rows) from one run to the next.
tags = sorted(tags)

In [4]:
# Keep only the words seen more than once; the rest is mapped to '<UNK>' by word2id.
words = [token for token, count in word_count.items() if count > 1]

In [5]:
# Reserve the first three word ids for the special tokens. Tokens that are
# already present are filtered out first, so re-running this cell does not
# prepend a second copy of the special tokens and shift every word id.
special_tokens = ['<UNK>', '<s>', '</s>']
words = special_tokens + [word for word in words if word not in special_tokens]
# Tag features additionally need the sentence-start marker; output tags do not.
feat_tags = ['<s>'] + tags
output_tags = tags

In [6]:
# String -> integer id lookups; an item's id is its position in the source list.
word_dict = dict(zip(words, range(len(words))))
feat_tags_dict = dict(zip(feat_tags, range(len(feat_tags))))
output_tag_dict = dict(zip(output_tags, range(len(output_tags))))

In [7]:
def tagid2tag_str(id):
    """Return the tag string stored at position ``id`` of ``output_tags``."""
    tag_str = output_tags[id]
    return tag_str

def tag2id(tag):
    """Return the output id of ``tag``; raises KeyError for a tag not in ``output_tag_dict``."""
    tag_index = output_tag_dict[tag]
    return tag_index

def feat_tag2id(tag):
    """Return the feature id of ``tag``; raises KeyError for a tag not in ``feat_tags_dict``."""
    feature_index = feat_tags_dict[tag]
    return feature_index

def word2id(word):
    """Return the id of ``word``, falling back to the id of '<UNK>' for unknown words."""
    if word not in word_dict:
        word = '<UNK>'
    return word_dict[word]

def num_words():
    """Return the current length of the module-level ``words`` list."""
    vocabulary_size = len(words)
    return vocabulary_size

def num_tag_feats():
    """Return the current length of the module-level ``feat_tags`` list."""
    feature_tag_count = len(feat_tags)
    return feature_tag_count

def num_tags():
    """Return the current length of the module-level ``output_tags`` list."""
    output_tag_count = len(output_tags)
    return output_tag_count

In [8]:
# Turn the training corpus into one training instance per token and write the
# instances to data_path + '.data'. Both files are handled by context managers
# so they are closed (and the output flushed) even if a malformed line raises.
with open(data_path, 'r') as reader:
    sens = reader.read().strip().split('\n\n')

with open(data_path+'.data', 'w') as writer:
    for sen in sens:
        lines = sen.strip().split('\n')
        # Two start markers on the left of both sequences, two end markers on
        # the right of the words, so every token has a full five-word window.
        ws, ts = ['<s>', '<s>'], ['<s>', '<s>']
        for line in lines:
            # Split on the tab only, exactly as the vocabulary-building cell does.
            word, tag = line.strip().split('\t')
            ws.append(word)
            ts.append(tag)
        ws += ['</s>', '</s>']

        for i in range(len(lines)):
            # Five words centred on the token, then the two preceding tags.
            feats = ws[i:i + 5] + ts[i:i + 2]
            label = ts[i + 2]
            writer.write('\t'.join(feats) + '\t' + label + '\n')


In [9]:
# Initialise the word and tag embeddings in hyperbolic space: every row is a
# Gaussian direction rescaled so that its Euclidean norm equals a radius drawn
# with torch.rand.

word_embed_dim, pos_embed_dim = 5, 5

# Directions for both tables are drawn before either set of radii.
word_embedding = torch.normal(torch.zeros((len(words), word_embed_dim)))
tag_embedding = torch.normal(torch.zeros((len(feat_tags), pos_embed_dim)))
radii_we = torch.rand(len(words))
radii_te = torch.rand(len(feat_tags))

# Squared Euclidean norm of every row.
we_norm = word_embedding.pow(2).sum(dim=1)
te_norm = tag_embedding.pow(2).sum(dim=1)

# Per-row factor that turns the current norm into the drawn radius.
correction_we = radii_we / we_norm.sqrt()
correction_te = radii_te / te_norm.sqrt()

# Tile the factors to the embedding shape, rescale, and make a trainable leaf.
corr_tile_we = correction_we.repeat(word_embed_dim, 1)
pre_word_embedding = corr_tile_we.t() * word_embedding
hyp_word_embedding = pre_word_embedding.detach().clone().requires_grad_()

corr_tile_te = correction_te.repeat(pos_embed_dim, 1)
pre_tag_embedding = corr_tile_te.t() * tag_embedding
hyp_tag_embedding = pre_tag_embedding.detach().clone().requires_grad_()


In [10]:
input_dim = 5 * word_embed_dim + 2 * pos_embed_dim

hidden_dim, minibatch_size = 200, 1000
"""
hidden_layer_we=torch.normal(torch.zeros(hidden_dim,input_dim))
radii_hwe=torch.rand(hidden_dim)
hwe_norm=torch.sum(hidden_layer_we**2,dim=1)
correction_hwe=radii_hwe/torch.sqrt(hwe_norm)
corr_tile_hwe=correction_hwe.repeat(input_dim,1)
hidden_we=corr_tile_hwe.transpose(0,1)*hidden_layer_we
hidden_layer=hidden_we.clone().detach().requires_grad_(True)

"""
hidden_layers_we, hidden_layers_te = {}, {}

# One hidden weight matrix per word position, initialised like the embeddings:
# a Gaussian direction per row, rescaled to a radius drawn with torch.rand.
for i in range(5):
    hidden_layer_we = torch.normal(torch.zeros((hidden_dim, word_embed_dim)))
    radii_hwe = torch.rand(hidden_dim)
    hwe_norm = hidden_layer_we.pow(2).sum(dim=1)
    correction_hwe = radii_hwe / hwe_norm.sqrt()
    corr_tile_hwe = correction_hwe.repeat(word_embed_dim, 1)
    hidden_we = corr_tile_hwe.t() * hidden_layer_we
    hidden_layers_we[i] = hidden_we.detach().clone().requires_grad_()

# Same construction for the two tag positions.
for i in range(2):
    hidden_layer_te = torch.normal(torch.zeros((hidden_dim, pos_embed_dim)))
    radii_hte = torch.rand(hidden_dim)
    hte_norm = hidden_layer_te.pow(2).sum(dim=1)
    correction_hte = radii_hte / hte_norm.sqrt()
    corr_tile_hte = correction_hte.repeat(pos_embed_dim, 1)
    hidden = corr_tile_hte.t() * hidden_layer_te
    hidden_layers_te[i] = hidden.detach().clone().requires_grad_()

# Hidden bias: one row per feature position (5 words + 2 tags), constant 0.2.
hidden_layer_biases = torch.ones((7, hidden_dim)) * 0.2
hidden_layer_bias = hidden_layer_biases.detach().clone().requires_grad_()

# Output weights: standard-normal, one row per output tag.
output_layers = torch.normal(torch.zeros((num_tags(), hidden_dim))).detach().clone().requires_grad_()

# Output bias: zeros, one entry per output tag.
output_bias = torch.zeros(num_tags()).requires_grad_()

In [11]:
def arccosh(x):
    """Element-wise inverse hyperbolic cosine of the tensor ``x``.

    Evaluated as ``log(x) + log1p(sqrt(x*x - 1) / x)``.
    """
    ratio = torch.sqrt(x * x - 1) / x
    return torch.log(x) + torch.log1p(ratio)

def hyp_dist(u,v):
    """Row-wise distance ``arccosh(1 + 2*|u-v|^2 / ((1-|u|^2) * (1-|v|^2)))``.

    ``u`` and ``v`` are 2-D tensors; all sums run over ``dim=1``, so the
    result holds one distance per row.
    """
    sq_gap = torch.sum((u - v) ** 2, dim=1)
    u_room = 1 - torch.sum(u ** 2, dim=1)
    v_room = 1 - torch.sum(v ** 2, dim=1)
    return arccosh(1 + 2 * sq_gap / (u_room * v_room))

In [12]:
cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda else "cpu")
seed = 1008
# Seed Python's own RNG as well: train_iter shuffles the training data with
# random.shuffle, which torch.manual_seed does not influence.
random.seed(seed)
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed_all(seed)
# NOTE(review): the embedding and hidden-layer initialisation cells come before
# this cell, so on a top-to-bottom run their random draws happen before the
# seed is set. Consider moving this cell above them.

In [13]:
def forward(features):
    """Score every output tag for one token window.

    Parameters
    ----------
    features : sequence of str
        The first five entries are word features, the remaining entries are
        tag features (two in the instances this notebook writes).

    Returns
    -------
    torch.Tensor
        A 1-D tensor with one unnormalised score per output tag, computed as
        ``output_layers @ relu(hidden) + output_bias``.
    """
    # Map the string features to embedding rows.
    word_ids = [word2id(word_feat) for word_feat in features[0:5]]
    tag_ids = [feat_tag2id(tag_feat) for tag_feat in features[5:]]
    hidden_out = torch.zeros(hidden_dim)

    # Each position has its own hidden matrix. A hidden unit contributes
    # relu(1 - distance + bias), so it is active only when the embedding is
    # close to that unit's row. The embedding is kept as a single (1, dim) row
    # and broadcast against the (hidden_dim, dim) matrix inside hyp_dist, so no
    # hidden_dim-fold copy of it has to be materialised.
    # Open question: do we want to fire if all the points are relatively close
    # to the hidden layer?
    for i, wid in enumerate(word_ids):
        c = hyp_word_embedding[wid].unsqueeze(0)
        hidden_out = hidden_out + F.relu(1 - hyp_dist(hidden_layers_we[i], c) + hidden_layer_bias[i])

    # Tag positions use bias rows 5 and onwards.
    for j, tag in enumerate(tag_ids):
        d = hyp_tag_embedding[tag].unsqueeze(0)
        hidden_out = hidden_out + F.relu(1 - hyp_dist(hidden_layers_te[j], d) + hidden_layer_bias[j + 5])

    output = output_layers @ F.relu(hidden_out) + output_bias

    # A single tensor of tag scores.
    return output

In [14]:
def decode(ws):
    """Greedily tag one sentence from left to right.

    Parameters
    ----------
    ws : list of str
        The words of the sentence, without boundary markers.

    Returns
    -------
    list of str
        One predicted tag per input word.
    """
    # Pad with two start and two end symbols so every word has a full window.
    ws = ['<s>', '<s>'] + ws + ['</s>', '</s>']
    ts = ['<s>', '<s>']

    # Pure inference: no gradients are needed, so do not record autograd graphs.
    with torch.no_grad():
        for i in range(2, len(ws) - 2):
            # Five-word window plus the two most recently predicted tags.
            features = ws[i - 2:i + 3] + ts[i - 2:i]
            output = forward(features)

            # Convert the 0-d argmax tensor to a plain int before indexing.
            best_tag_id = int(torch.argmax(output))
            ts.append(tagid2tag_str(best_tag_id))

    return ts[2:]

In [15]:
# Load the training instances written earlier; the context manager closes the file.
with open(data_path+'.data', 'r') as data_file:
    train_data = data_file.read().strip().split('\n')
# Plain SGD handles only the output layer and the hidden bias. The embeddings
# and the hidden weight matrices are updated by hand inside train_iter.
optimizer=torch.optim.SGD([output_bias,output_layers,hidden_layer_bias],lr=1e-2)


In [16]:
def _riemannian_sgd_step(param, lr=1e-2, eps=1e-5):
    """Apply one rescaled gradient step to ``param`` and keep its rows in the unit ball.

    Every row's gradient is scaled by ``(1 - |row|^2)^2 / 4`` before the step.
    Rows whose norm ends up above ``1 - eps`` are rescaled to norm ``1 - eps``;
    all other rows are left untouched. The gradient is zeroed afterwards.
    """
    if param.grad is None:
        return
    sq_norm = torch.sum(param.data ** 2, dim=1, keepdim=True)
    param.data = param.data - lr * ((1 - sq_norm) ** 2 / 4) * param.grad.data
    # keepdim=True keeps a (rows, 1) shape so the factor broadcasts per row.
    norm = torch.sqrt(torch.sum(param.data ** 2, dim=1, keepdim=True))
    # (1 - eps) / norm is below 1 only for rows that left the ball; capping the
    # factor at 1 leaves every other row unchanged.
    scale = torch.clamp((1 - eps) / norm.clamp(min=eps), max=1.0)
    param.data = param.data * scale
    param.grad.data.zero_()


def train_iter(train_data,hidden_layers_we,hidden_layers_te,hyp_word_embedding,hyp_tag_embedding):
    """Run one epoch of minibatch training.

    Parameters
    ----------
    train_data : list of str
        Tab-separated training instances; the last field is the gold tag and
        the preceding fields are the features. The list is shuffled in place.
    hidden_layers_we, hidden_layers_te : indexable by 0..len-1
        Hidden weight tensors for the word and tag positions.
    hyp_word_embedding, hyp_tag_embedding : torch.Tensor
        The embedding tables.

    The tensors passed in are updated by ``_riemannian_sgd_step``; the
    parameters registered with the module-level ``optimizer`` are updated by
    ``optimizer.step()``. A final minibatch smaller than ``minibatch_size`` is
    still used for an update instead of being dropped.
    """
    random.shuffle(train_data)  # shuffle the training data.

    manifold_params = (
        [hidden_layers_we[i] for i in range(len(hidden_layers_we))]
        + [hidden_layers_te[i] for i in range(len(hidden_layers_te))]
        + [hyp_word_embedding, hyp_tag_embedding]
    )
    log_softmax = torch.nn.LogSoftmax(dim=0)

    def apply_update(losses):
        # Mean loss over the minibatch, then one backward pass. Every sample
        # has its own graph, so nothing has to be retained after backward.
        minibatch_loss_value = sum(losses) / len(losses)
        minibatch_loss_value.backward()
        optimizer.step()
        # Each tensor has its own gradient zeroed inside the step, so no
        # gradient leaks into the next minibatch.
        for param in manifold_params:
            _riemannian_sgd_step(param)
        optimizer.zero_grad()

    losses = []  # minibatch loss vector
    for line in train_data:
        fields = line.strip().split('\t')
        features, gold_label = fields[:-1], tag2id(fields[-1])
        result = forward(features)

        # Negative log-softmax score of the gold tag.
        losses.append(-log_softmax(result)[gold_label])

        if len(losses) >= minibatch_size:
            apply_update(losses)
            losses = []

    # Use the leftover samples of the epoch as well.
    if losses:
        apply_update(losses)
            


In [17]:
def _model_tensors():
    """Return every trainable tensor of the tagger under a stable name."""
    tensors = {
        'hyp_word_embedding': hyp_word_embedding,
        'hyp_tag_embedding': hyp_tag_embedding,
        'hidden_layer_bias': hidden_layer_bias,
        'output_layers': output_layers,
        'output_bias': output_bias,
    }
    for i in range(len(hidden_layers_we)):
        tensors['hidden_layers_we.%d' % i] = hidden_layers_we[i]
    for i in range(len(hidden_layers_te)):
        tensors['hidden_layers_te.%d' % i] = hidden_layers_te[i]
    return tensors


def load(filename):
    """Restore the tagger's tensors from a checkpoint written by ``save``.

    Values are copied into the existing tensors in place, so objects that
    already reference them (such as the optimizer) keep working. Only tensor
    values are stored: the word and tag id mappings must be rebuilt exactly as
    they were when the checkpoint was saved.
    """
    saved = torch.load(filename)
    with torch.no_grad():
        for name, tensor in _model_tensors().items():
            tensor.copy_(saved[name])

def save(filename):
    """Write a detached copy of every trainable tensor of the tagger to ``filename``."""
    snapshot = {name: tensor.detach().clone() for name, tensor in _model_tensors().items()}
    torch.save(snapshot, filename)


In [None]:
# Five passes over the training data; each pass reshuffles and updates in place.
for epoch in range(5):
    print('epoch', epoch + 1)
    train_iter(
        train_data=train_data,
        hidden_layers_we=hidden_layers_we,
        hidden_layers_te=hidden_layers_te,
        hyp_word_embedding=hyp_word_embedding,
        hyp_tag_embedding=hyp_tag_embedding,
    )

print('finished training!')

epoch 1
epoch 2


In [None]:
# Tag every sentence of the raw dev file (one whitespace-tokenised sentence per
# line) and write "<word>\t<tag>" lines, with a blank line after each sentence.
test_file = 'data/en.pos.dev.raw'
with open(test_file, 'r') as reader, \
        open(test_file+'.output.hyperbballbias.dim5', 'w') as writer:#change dimension here
    for sentence in reader:
        # Use dedicated names here: rebinding the module-level `words` / `tags`
        # would make num_words() report the length of the last sentence and
        # break any later re-run of the vocabulary cells.
        sentence_words = sentence.strip().split()
        sentence_tags = decode(sentence_words)
        output = [word + '\t' + tag for word, tag in zip(sentence_words, sentence_tags)]
        writer.write('\n'.join(output) + '\n\n')

In [None]:
def evaluate_test(w_test_file,data_file):
    """Return the token-level tagging accuracy of a prediction file.

    Parameters
    ----------
    w_test_file : str
        Path of the file holding the predicted tags.
    data_file : str
        Path of the file holding the gold tags.

    Both files are read line by line; only lines that split into exactly two
    whitespace-separated fields are used, and the second field is the tag.
    Tags are compared position by position.

    Returns
    -------
    float
        Number of matching positions divided by the number of gold tags.
        Gold tags without a prediction count as wrong, and an empty gold file
        gives ``0.0``.
    """
    def read_tags(path):
        # The context manager closes the file as soon as the tags are collected.
        with open(path, 'r') as handle:
            rows = (line.strip().split() for line in handle)
            return [fields[1] for fields in rows if len(fields) == 2]

    gold_tags = read_tags(data_file)
    predicted_tags = read_tags(w_test_file)
    if not gold_tags:
        return 0.0
    # zip stops at the shorter list, so a short prediction file cannot raise.
    correct = sum(1 for gold, predicted in zip(gold_tags, predicted_tags) if gold == predicted)
    return correct / len(gold_tags)


In [None]:
# Print the accuracy of a prediction file against the gold dev file.
# NOTE(review): this reads '...output.hyperbball.dim20', while the decoding cell
# of this notebook writes test_file + '.output.hyperbballbias.dim5'. Confirm
# that evaluating a file from a different run is intended.
print(evaluate_test('data/en.pos.dev.raw.output.hyperbball.dim20','data/en.pos.dev'))
