In [45]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
import nltk
flatten = lambda l: [item for sublist in l for item in sublist]

* https://github.com/rguthrie3/DeepDependencyParsingProblemSet/tree/master/data

In [2]:
USE_CUDA = torch.cuda.is_available()

In [3]:
def getBatch(batch_size,train_data):
    random.shuffle(train_data)
    sindex=0
    eindex=batch_size
    while eindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex+batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [148]:
def make_word_vector(sents, word2index):
    idxs = list(map(lambda w: word2index[w] if w in word2index.keys() else word2index["<unk>"], sents))
    tensor = Variable(torch.LongTensor(idxs)).cuda() if USE_CUDA else  Variable(torch.LongTensor(idxs))
    return tensor

def make_tag_vector(tag,tag2index):
    tensor = Variable(torch.LongTensor([tag2index[tag]]))
    if USE_CUDA:
        tensor = tensor.cuda()
    return tensor

In [146]:
class TrainsitionState(object):
    
    def __init__(self,tagged_sent):
        self.root = ('ROOT','<root>',-1)
        self.stack=[self.root]
        self.buffer=[(s[0],s[1],i) for i,s in enumerate(tagged_sent)]
        self.address = [s[0] for s in tagged_sent] + [self.root[0]]
        self.arcs=[]
        self.terminal=False
        
    def __str__(self):
        return 'stack : %s \nbuffer : %s' % (str([s[0] for s in self.stack]), str([b[0] for b in self.buffer]))
    
    def shift(self):
        
        if len(self.buffer)>=1:
            self.stack.append(self.buffer.pop(0))
        else:
            print("Empty buffer")
            
    def left_arc(self,relation=None):
        
        if len(self.stack)>=2:
            arc={}
            s2 = self.stack[-2]
            s1 = self.stack[-1]
            arc['graph_id'] = len(self.arcs)
            arc['form'] = s1[0]
            arc['head']=s2[2]
            arc['pos'] = s1[1]
            if relation:
                arc['relation']=relation
            self.arcs.append(arc)
            self.stack.pop(-2)
            
        elif self.stack==[self.root]:
            print("Element Lacking")
    
    def right_arc(self,relation=None):
        
        if len(self.stack)>=2:
            arc={}
            s2 = self.stack[-2]
            s1 = self.stack[-1]
            arc['graph_id'] = len(self.arcs)
            arc['form'] = s2[0]
            arc['head']=s1[2]
            arc['pos'] = s2[1]
            if relation:
                arc['relation']=relation
            self.arcs.append(arc)
            self.stack.pop(-1)
            
        elif self.stack==[self.root]:
            print("Element Lacking")
            
    def is_done(self):
        return len(self.buffer)==0 and self.stack==[self.root]

In [147]:
temp = TrainsitionState(nltk.pos_tag("He has good control .".split()))
print(temp)
temp.shift()
temp.shift()
print(temp)
temp.left_arc()
print(temp)
print(temp.arcs)
temp.shift()
temp.shift()
print(temp)
temp.left_arc()
print(temp)
temp.right_arc()
print(temp)
temp.shift()
temp.right_arc()
print(temp)
temp.right_arc()
print(temp)
print(temp.arcs)
temp.is_done()

stack : ['ROOT'] 
buffer : ['He', 'has', 'good', 'control', '.']
stack : ['ROOT', 'He', 'has'] 
buffer : ['good', 'control', '.']
stack : ['ROOT', 'has'] 
buffer : ['good', 'control', '.']
[{'graph_id': 0, 'pos': 'VBZ', 'form': 'has', 'head': 0}]
stack : ['ROOT', 'has', 'good', 'control'] 
buffer : ['.']
stack : ['ROOT', 'has', 'control'] 
buffer : ['.']
stack : ['ROOT', 'has'] 
buffer : ['.']
stack : ['ROOT', 'has'] 
buffer : []
stack : ['ROOT'] 
buffer : []
[{'graph_id': 0, 'pos': 'VBZ', 'form': 'has', 'head': 0}, {'graph_id': 1, 'pos': 'NN', 'form': 'control', 'head': 2}, {'graph_id': 2, 'pos': 'VBZ', 'form': 'has', 'head': 3}, {'graph_id': 3, 'pos': 'VBZ', 'form': 'has', 'head': 4}, {'graph_id': 4, 'pos': '<root>', 'form': 'ROOT', 'head': 1}]


True

In [124]:
import pydot

In [279]:
def plot_tree(state,image_name):
    graph = pydot.Dot(graph_type='graph')
    for arc in state.arcs:
        edge = pydot.Edge(arc['form'],state.address[arc['head']])
        graph.add_edge(edge)
    
    graph.write_png('t_graph.png')

# Data load & Preprocessing 

In [253]:
data = open('../DeepDependencyParsingProblemSet/data/train.txt','r').readlines()
vocab = open('../DeepDependencyParsingProblemSet/data/vocab.txt','r').readlines()

In [254]:
train_data = [[nltk.pos_tag(d.split('|||')[0].split()), d.split('|||')[1][:-1].split()] for d in data]

In [255]:
train_x,train_y = list(zip(*train_data))
train_x_f = flatten(train_x)
sents,pos_tags = list(zip(*train_x_f))

In [256]:
tag2index = {v:i for i,v in enumerate(set(pos_tags))}
tag2index['<root>']=len(tag2index)
tag2index['<NULL>']=len(tag2index)

In [257]:
vocab = [v.split('\t')[0] for v in vocab]
word2index = {v:i for i,v in enumerate(vocab)}
word2index['ROOT'] = len(word2index)
word2index['<NULL>'] = len(word2index)

In [258]:
actions = ['SHIFT','REDUCE_L','REDUCE_R']
action2index = {v:i for i,v in enumerate(actions)}

In [259]:
state = TrainsitionState(train_x[0])

In [260]:
transition = train_y[0]+['REDUCE_R']

In [261]:
while len(transition)!=0:
    feat = get_feat(state,word2index,tag2index)
    action = transition.pop(0)
    train_X.append([feat,action2index[action]])
    if action=='SHIFT':
        state.shift()
    elif action=='REDUCE_R':
        state.right_arc()
    elif action=='REDUCE_L':
        state.left_arc()

In [280]:
plot_tree(state,'testz')

['dot', '-Tpng', '/tmp/tmpgohpzinq'] return code: 1

stdout, stderr:
 b''
b"Error: /tmp/tmpgohpzinq: syntax error in line 2 near ','\n"



AssertionError: 1

In [212]:
def get_feat(transition_state,word2index,tag2index,label2index=None):
    feats=[]
    
    feats.append(transition_state.stack[-1][0]) if len(transition_state.stack)>=1 and \
    transition_state.stack[-1][0] in word2index.keys() else feats.append('<NULL>') # s1
    feats.append(transition_state.stack[-2][0]) if len(transition_state.stack)>=2 and \
    transition_state.stack[-2][0] in word2index.keys() else feats.append('<NULL>') # s2
    feats.append(transition_state.stack[-3][0]) if len(transition_state.stack)>=3 and \
    transition_state.stack[-3][0] in word2index.keys() else feats.append('<NULL>') # s3
    
    feats.append(transition_state.buffer[0][0]) if len(transition_state.buffer)>=1 and \
    transition_state.buffer[0][0] in word2index.keys() else feats.append('<NULL>') # b1
    feats.append(transition_state.buffer[1][0]) if len(transition_state.buffer)>=2 and \
    transition_state.buffer[1][0] in word2index.keys() else feats.append('<NULL>') # b2
    feats.append(transition_state.buffer[2][0]) if len(transition_state.buffer)>=3 and \
    transition_state.buffer[2][0] in word2index.keys() else feats.append('<NULL>') # b3

    return make_word_vector(feats,word2index)

In [None]:
class NeuralDependencyParser(nn.Module):
    
    def __init__(self,w_size,w_embed_dim,t_size,t_embed_dim,l_size,l_embed_dim,hidden_size,target_size):
        super(NeuralDependencyParser, self).__init__()
        
        self.w_embed =  nn.Embedding(w_size,w_embed_dim)
        self.t_embed = nn.Embedding(t_size,t_embed_dim)
        self.l_embed = nn.Embedding(l_size,l_embed_dim)
        self.hidden_size = hidden_size
        self.target_size = target_size
        self.linear = nn.Linear(w_embed_dim+t_embed_dim+l_embed_dim,self.target_size)
        
    def foward(self,configure)
    