In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from nltk.grammar import DependencyGrammar
from nltk.parse import (  DependencyGraph,   ProjectiveDependencyParser,    NonprojectiveDependencyParser)
from nltk.parse.transitionparser import TransitionParser, Configuration, Transition
from nltk.parse import ParserI, DependencyGraph, DependencyEvaluator
import tempfile
import os
from numpy import array
from scipy import sparse
from sklearn.datasets import load_svmlight_file
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from os import remove
import tempfile
import pickle
from collections import defaultdict
from itertools import chain
from pprint import pformat
import subprocess
import warnings
from six import string_types

from nltk.tree import Tree
from nltk.compat import python_2_unicode_compatible

In [2]:
from DependencyGraph import myDependencyGraph    # the member functions of the dependency graph have been modified to include the 10th column in the nodes of the dependency graph

In [3]:
# Replacement for Configuration.extract_features: besides the word/lemma/POS/dependency
# features it can also emit the morphological features ('feats') and the extra 10th-column
# features ('misc') of a token as training features.
def _token_features(conf, token, prefix, use_morph, use_misc):
    """Return FORM/LEMMA/POS (and optionally FEATS/MISC) features of one token, tagged with ``prefix``."""
    found = []
    # The word form is checked with flag=True, exactly as in the original code.
    if conf._check_informative(token['word'], True):
        found.append(prefix + '_FORM_' + token['word'])
    if 'lemma' in token and conf._check_informative(token['lemma']):
        found.append(prefix + '_LEMMA_' + token['lemma'])
    if conf._check_informative(token['tag']):
        found.append(prefix + '_POS_' + token['tag'])
    # 'feats' and 'misc' hold several values joined by "|"; each value becomes its own feature.
    if use_morph and 'feats' in token and conf._check_informative(token['feats']):
        found.extend(prefix + '_FEATS_' + feat for feat in token['feats'].split("|"))
    if use_misc and 'misc' in token and conf._check_informative(token['misc']):
        found.extend(prefix + '_MISC_' + misc for misc in token['misc'].split("|"))
    return found


def _pos_feature(conf, token, prefix):
    """Return the single POS feature of ``token`` (as a list), or [] when the tag is uninformative."""
    tag = token['tag']
    return [prefix + '_POS_' + tag] if conf._check_informative(tag) else []


def _outermost_dep_features(conf, head, prefix):
    """Return LDEP/RDEP features for the leftmost and rightmost dependents of ``head`` in ``conf.arcs``."""
    left_pos = right_pos = None
    left_rel = right_rel = ''
    for (wi, rel, wj) in conf.arcs:
        if wi != head:
            continue
        if wj > wi and (right_pos is None or wj > right_pos):
            right_pos, right_rel = wj, rel
        if wj < wi and (left_pos is None or wj < left_pos):
            left_pos, left_rel = wj, rel
    found = []
    if conf._check_informative(left_rel):
        found.append(prefix + '_LDEP_' + left_rel)
    if conf._check_informative(right_rel):
        found.append(prefix + '_RDEP_' + right_rel)
    return found


def extractFeatures(self):
    '''
    Build the list of string features describing the current parser configuration.

    Features are emitted in this order:
      * top of the stack: FORM, LEMMA, POS, optional FEATS / MISC values
      * second stack item: POS
      * leftmost / rightmost dependency label of the stack top
      * first buffer item: FORM, LEMMA, POS, optional FEATS / MISC values
      * second buffer item: FORM, POS; third and fourth buffer items: POS
      * leftmost / rightmost dependency label of the first buffer item

    NOTE: ``morph_status`` and ``misc_status`` are NOT parameters. They are read
    as module-level (notebook) globals each time this function runs with a
    non-empty stack or buffer, so they must be defined before that happens:
      morph_status: whether to include the morphological features ('feats')
      misc_status:  whether to include the additional features ('misc')

    :return: list of feature strings
    '''
    result = []
    # Todo : can come up with more complicated features set for better
    # performance.
    if len(self.stack) > 0:
        # Stack 0 (top of the stack)
        stack_idx0 = self.stack[-1]
        result.extend(_token_features(self, self._tokens[stack_idx0], 'STK_0',
                                      morph_status, misc_status))
        # Stack 1
        if len(self.stack) > 1:
            result.extend(_pos_feature(self, self._tokens[self.stack[-2]], 'STK_1'))
        # Left most, right most dependency of stack[0]
        result.extend(_outermost_dep_features(self, stack_idx0, 'STK_0'))

    if len(self.buffer) > 0:
        # Buffer 0
        buffer_idx0 = self.buffer[0]
        result.extend(_token_features(self, self._tokens[buffer_idx0], 'BUF_0',
                                      morph_status, misc_status))
        # Buffer 1: word form and POS
        if len(self.buffer) > 1:
            token = self._tokens[self.buffer[1]]
            if self._check_informative(token['word'], True):
                result.append('BUF_1_FORM_' + token['word'])
            result.extend(_pos_feature(self, token, 'BUF_1'))
        # Buffer 2 and 3: POS only
        for offset in (2, 3):
            if len(self.buffer) > offset:
                result.extend(_pos_feature(self, self._tokens[self.buffer[offset]],
                                           'BUF_%d' % offset))
        # Left most, right most dependency of buffer[0]
        result.extend(_outermost_dep_features(self, buffer_idx0, 'BUF_0'))

    return result

# Monkey-patch the imported Configuration class so that calls to
# Configuration.extract_features() run the extractFeatures function defined above.
# presumably this is what TransitionParser invokes while training/parsing — confirm against NLTK.
Configuration.extract_features = extractFeatures

In [4]:
def train_model(parser, model, depgraphs, modelfile, verbose=True):
    """Train ``model`` on the training examples ``parser`` creates from ``depgraphs``.

    The parser writes its training examples to a temporary file, the file is
    read back with ``load_svmlight_file``, ``model.fit`` is called on the
    result, and the fitted model is pickled to ``modelfile``. The temporary
    file is always removed, even when a step fails.

    :param parser: object exposing ``_algorithm``, ``ARC_STANDARD`` and the
        ``_create_training_examples_arc_std`` / ``_create_training_examples_arc_eager``
        methods (the notebook passes an NLTK ``TransitionParser``)
    :param model: estimator with a ``fit(X, y)`` method; must be picklable
    :param depgraphs: dependency graphs, handed to the parser unchanged
    :param modelfile: path the pickled model is written to
    :param verbose: accepted for signature compatibility; currently unused
    :return: ``modelfile``, so callers can keep the path of the saved model
    """
    # Create the file before entering `try`: if creation itself fails there is
    # nothing to clean up and the real error is not masked by the `finally`.
    input_file = tempfile.NamedTemporaryFile(
        prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False)
    try:
        try:
            if parser._algorithm == parser.ARC_STANDARD:
                parser._create_training_examples_arc_std(depgraphs, input_file)
            else:
                parser._create_training_examples_arc_eager(depgraphs, input_file)
        finally:
            # Close (and flush) before the file is re-read or removed.
            input_file.close()

        x_train, y_train = load_svmlight_file(input_file.name)
        model.fit(x_train, y_train)
        with open(modelfile, 'wb') as model_pickle:
            pickle.dump(model, model_pickle)
    finally:
        remove(input_file.name)
    return modelfile


In [5]:
# Read the train and test files
def load_dependency_graphs(path):
    """Read the file at ``path`` and return one ``myDependencyGraph`` per sentence.

    The file content is split on blank lines (``'\\n\\n'``); every chunk that
    contains something other than whitespace is passed to ``myDependencyGraph``
    with ``top_relation_label='root'``. Whitespace-only chunks (e.g. caused by
    extra blank lines at the end of the file) are skipped.

    :param path: path of the treebank file to read
    :return: list of ``myDependencyGraph`` objects, in file order
    """
    # NOTE(review): the file is opened with the platform default encoding; the
    # treebank is presumably UTF-8 — consider an explicit encoding once the
    # target Python version is confirmed.
    with open(path, 'r') as f:
        raw_text = f.read()
    return [myDependencyGraph(entry, top_relation_label='root')
            for entry in raw_text.split('\n\n') if entry.strip()]


graphs = load_dependency_graphs('UD_Hindi/hi-ud-train.conllu')
graph_test = load_dependency_graphs('UD_Hindi/hi-ud-test.conllu')

In [6]:
# create the parsers and machine learning models
# One shared seed for every estimator, so the pseudo-random steps inside the
# estimators do not differ between reruns of the notebook.
RANDOM_STATE = 1

# Algorithm names handed to TransitionParser: index 0 = arc-standard, index 1 = arc-eager.
parser_str_list = ['arc-standard', 'arc-eager']

model_list = [
    LogisticRegression(random_state=RANDOM_STATE),
    # probability=True is kept from the original configuration;
    # presumably the parser needs class probabilities from the model — confirm against NLTK.
    svm.SVC(kernel='poly', degree=2, coef0=0, gamma=0.2, C=0.5, verbose=True,
            probability=True, random_state=RANDOM_STATE),
    MLPClassifier(solver='lbfgs', hidden_layer_sizes=75, random_state=RANDOM_STATE, verbose=True),
]

# (morph_status, misc_status) combinations to evaluate: no extra features,
# morphological features only, morphological + 10th-column features.
status_list = [(False, False), (True, False), (True, True)]


In [7]:
# Train and evaluate the arc-standard parser for every (model, feature set) pair.
# NOTE: the loop variables must be named exactly `morph_status` and `misc_status`:
# the patched Configuration.extract_features reads these two names as notebook
# globals, so the loop is what switches the feature sets on and off.
ARC_STD_MODEL_PATH = 'temp.arcstd.model'  # pickled model, overwritten on every iteration

print("PERFORMANCE EVALUATION FOR ARC-STANDARD PARSING")
print("===============================================")
for model in model_list:
    for morph_status, misc_status in status_list:
        parser = TransitionParser(parser_str_list[0])
        train_model(parser, model, graphs, ARC_STD_MODEL_PATH, True)
        print('')
        result = parser.parse(graph_test, ARC_STD_MODEL_PATH)
        evaluator = DependencyEvaluator(result, graph_test)
        print("Morphological Features: ", 'Taken' if morph_status else 'Not Taken',
              '. Additional Features in 10th column: ', 'Taken' if misc_status else 'Not Taken')
        print('Performance for Arc-Standard parser and Model ', type(model).__name__, ' :')
        # eval() prints as a pair of scores; presumably attachment scores —
        # confirm their order against the NLTK DependencyEvaluator docs.
        print(evaluator.eval())
        print('-----------------------------------------')


PERFORMANCE EVALUATION FOR ARC-STANDARD PARSING
 Number of training examples : 500
 Number of valid (projective) examples : 476

Morphological Features:  Not Taken . Additional Features in 10th column:  Not Taken
Peformance for Arc-Standard parser and Model  LogisticRegression  :
(0.800453514739229, 0.6923658352229781)
-----------------------------------------
 Number of training examples : 500
 Number of valid (projective) examples : 476

Morphological Features:  Taken . Additional Features in 10th column:  Not Taken
Peformance for Arc-Standard parser and Model  LogisticRegression  :
(0.8034769463340892, 0.6893424036281179)
-----------------------------------------
 Number of training examples : 500
 Number of valid (projective) examples : 476

Morphological Features:  Taken . Additional Features in 10th column:  Taken
Peformance for Arc-Standard parser and Model  LogisticRegression  :
(0.873015873015873, 0.7755102040816326)
-----------------------------------------
 Number of trainin

In [8]:
# Train and evaluate the arc-eager parser for every (model, feature set) pair.
# NOTE: the loop variables must be named exactly `morph_status` and `misc_status`:
# the patched Configuration.extract_features reads these two names as notebook
# globals, so the loop is what switches the feature sets on and off.
ARC_EAGER_MODEL_PATH = 'temp.arceager.model'  # pickled model, overwritten on every iteration

print("PERFORMANCE EVALUATION FOR ARC-EAGER PARSING")
print("===============================================")
for model in model_list:
    for morph_status, misc_status in status_list:
        parser = TransitionParser(parser_str_list[1])
        train_model(parser, model, graphs, ARC_EAGER_MODEL_PATH, True)
        print('')
        result = parser.parse(graph_test, ARC_EAGER_MODEL_PATH)
        evaluator = DependencyEvaluator(result, graph_test)
        print("Morphological Features: ", 'Taken' if morph_status else 'Not Taken',
              '. Additional Features in 10th column: ', 'Taken' if misc_status else 'Not Taken')
        print('Performance for Arc-Eager parser and Model ', type(model).__name__, ' :')
        # eval() prints as a pair of scores; presumably attachment scores —
        # confirm their order against the NLTK DependencyEvaluator docs.
        print(evaluator.eval())
        print('-----------------------------------------')


PERFORMANCE EVALUATION FOR ARC-EAGER PARSING
 Number of training examples : 500
 Number of valid (projective) examples : 476

Morphological Features:  Not Taken  .Additional Features in 10th column:  Not Taken
Peformance for Arc-Eager parser and Model  LogisticRegression  :
(0.8518518518518519, 0.7377173091458806)
-----------------------------------------
 Number of training examples : 500
 Number of valid (projective) examples : 476

Morphological Features:  Taken  .Additional Features in 10th column:  Not Taken
Peformance for Arc-Eager parser and Model  LogisticRegression  :
(0.8624338624338624, 0.7520786092214664)
-----------------------------------------
 Number of training examples : 500
 Number of valid (projective) examples : 476

Morphological Features:  Taken  .Additional Features in 10th column:  Taken
Peformance for Arc-Eager parser and Model  LogisticRegression  :
(0.8979591836734694, 0.7981859410430839)
-----------------------------------------
 Number of training examples