<h1>Transition-based Dependency Parsing</h1>

In [1]:
from sklearn.preprocessing import OneHotEncoder
from collections import defaultdict
import numpy as np
import pandas as pd
import itertools
import pandas as pd
from tqdm.notebook import tqdm
from collections import OrderedDict
from sklearn.linear_model import LogisticRegression
from copy import deepcopy

In [2]:
# Locations of the UD Hindi HDTB splits, relative to the notebook's working
# directory. For each split, "*_sent" is the .txt file and "*_stat" is the
# .conllu file.
train_file_sent, train_file_stat = (
    "UD_Hindi-HDTB/hi_hdtb-ud-train.txt",
    "UD_Hindi-HDTB/hi_hdtb-ud-train.conllu",
)
test_file_sent, test_file_stat = (
    "UD_Hindi-HDTB/hi_hdtb-ud-test.txt",
    "UD_Hindi-HDTB/hi_hdtb-ud-test.conllu",
)
val_file_sent, val_file_stat = (
    "UD_Hindi-HDTB/hi_hdtb-ud-dev.txt",
    "UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu",
)

In [3]:
def file_lines(name):
    """Read a UTF-8 text file and split each line into tab-separated fields.

    Args:
        name: Path of the file to read.

    Returns:
        A list with one entry per line of the file. Each entry is the list
        obtained by stripping surrounding whitespace from the line and
        splitting it on tab characters, so a blank line becomes ``['']``.
    """
    with open(name, 'r', encoding='utf-8') as handle:
        return [raw.strip().split('\t') for raw in handle.readlines()]

In [4]:
def extract_stats(line):
    """Pick the fields this notebook uses out of one split token line.

    Args:
        line: List of column strings for one token; columns 0-4, 6 and 7
            are read.

    Returns:
        ``(token_id, stats)`` where ``token_id`` is column 0 as an int and
        ``stats`` maps 'form', 'lemma', 'upos', 'xpos', 'head' (int) and
        'deprel' to columns 1, 2, 3, 4, 6 and 7 respectively.

    Raises:
        ValueError: if column 0 or column 6 is not an integer literal.
    """
    stats = {
        'form': line[1],
        'lemma': line[2],
        'upos': line[3],
        'xpos': line[4],
        'head': int(line[6]),
        'deprel': line[7],
    }
    return int(line[0]), stats

Checking for projectivity and filtering

In [5]:
def path_exists(head,i,graph):
    """Tell whether `head` is reached by following head links upward from `i`.

    Args:
        head: Node id to look for (0 denotes the artificial root).
        i: Node id to start from.
        graph: Mapping from a node id to the id of its head.

    Returns:
        True if the walk from `i` reaches `head` (including `i == head`),
        False if it reaches the root 0 first or runs into a cycle that does
        not contain `head`.

    Raises:
        KeyError: if a visited node has no entry in `graph`.
    """
    visited = set()
    while i != head and i != 0:
        if i in visited:
            # Malformed input: the head links loop without passing through
            # `head`. Without this guard the walk would never terminate.
            return False
        visited.add(i)
        i = graph[i]
    return i == head
def projectivity(sentence):
    """Check that every word lying strictly between a dependent and its head
    is a descendant of that head.

    Args:
        sentence: Mapping from word id to a dict that has a 'head' entry.

    Returns:
        True when the check holds for every (dependent, head) pair, False as
        soon as one in-between word is found from which `path_exists` cannot
        reach the head.
    """
    heads = {token: sentence[token]['head'] for token in sentence}
    for child, parent in heads.items():
        low, high = min(parent, child), max(parent, child)
        inside = range(low + 1, high)
        if not all(path_exists(parent, between, heads) for between in inside):
            return False
    return True

In [6]:
def filter_non_projective(sentences):
    """Return a new list holding only the sentences for which `projectivity`
    is truthy, i.e. the non-projective ones are dropped."""
    return list(filter(projectivity, sentences))

In [7]:
def get_sentences(file):
    """Group the token lines of a file into sentences.

    Lines are read with `file_lines`. A line with at least 10 tab-separated
    fields is treated as a token and parsed with `extract_stats`; any shorter
    line closes the sentence collected so far.

    Args:
        file: Path of the file to read.

    Returns:
        A list of sentences, each a dict mapping the token id (int) to the
        stats dict returned by `extract_stats`.
    """
    sentences = []
    current = {}
    for fields in file_lines(file):
        if len(fields) < 10:
            # Separator (or any other short line): flush the open sentence.
            if current:
                sentences.append(current)
                current = {}
        else:
            token_id, stats = extract_stats(fields)
            current[token_id] = stats
    # Flush the final sentence too; without this it was lost whenever the
    # file did not end with a short/blank line.
    if current:
        sentences.append(current)
    return sentences

<b>The arc-standard parsing algorithm (not used in the project)</b>

In [8]:
def parse_standard(sentence):
    """Replay a gold dependency tree as arc-standard transitions.

    Args:
        sentence: Either a dict mapping word id to a dict with 'head' and
            'deprel' (the shape produced elsewhere in this notebook), or an
            iterable of dicts that each carry 'id', 'head' and 'deprel'.
            Previously only the second shape worked; the first raised
            TypeError.

    Returns:
        ``(states, transitions)``. ``states[k]`` is a snapshot
        ``(buffer, stack, arcs)`` taken just before ``transitions[k]`` is
        applied; ``transitions[k]`` is ``(action, label)`` with action one of
        'shift', 'left_arc', 'right_arc' and label '' for shifts. The buffer
        is stored reversed (next word last), the stack starts as ``[0]`` and
        arcs are ``(head, dependent, label)`` tuples.

    Raises:
        IndexError: if a shift is required while the buffer is empty.
            NOTE(review): presumably this only happens for non-projective
            input - confirm.
    """
    if isinstance(sentence, dict):
        gold = [(word_id, sentence[word_id]['head'], sentence[word_id]['deprel'])
                for word_id in sentence]
    else:
        gold = [(word['id'], word['head'], word['deprel']) for word in sentence]

    buffer = []
    stack = [0]
    arcs = []
    states = []
    transitions = []
    # head id -> {dependent id: relation label}
    dependency_graph = {}
    for word_id, head, deprel in reversed(gold):
        # Stored in reverse so that the top (end) of the buffer is the first word.
        buffer.append(word_id)
        dependency_graph.setdefault(head, {})[word_id] = deprel

    def perform_action(action, label=''):
        # Record the configuration before it is changed by the action.
        states.append((list(buffer), list(stack), list(arcs)))
        transitions.append((action, label))
        if action == 'shift':
            stack.append(buffer.pop())
        elif action == 'left_arc':
            arcs.append((stack[-1], stack[-2], label))
            stack.pop(-2)
        elif action == 'right_arc':
            arcs.append((stack[-2], stack[-1], label))
            stack.pop()

    head_found = {}
    while not (len(stack) == 1 and stack[0] == 0 and len(buffer) == 0):
        if len(stack) < 2:
            perform_action(action='shift')
            continue
        top, below = stack[-1], stack[-2]
        top_children = dependency_graph.get(top)
        below_children = dependency_graph.get(below)
        if top_children is not None and top_children.get(below) is not None:
            head_found[below] = True
            perform_action(action='left_arc', label=top_children[below])
        elif below_children is not None and below_children.get(top) is not None:
            # Attach `top` only after all of its own dependents are attached.
            if top_children is not None and any(
                    dependent not in head_found for dependent in top_children):
                perform_action(action='shift')
            else:
                head_found[top] = True
                perform_action(action='right_arc', label=below_children[top])
        else:
            perform_action(action='shift')
    return states, transitions

<b>The arc-eager parsing approach</b>

In [9]:
def parse_eager(sentence):
    """Replay a gold dependency tree as a sequence of arc-eager transitions.

    Args:
        sentence: Mapping from word id to a dict with 'head' and 'deprel'.

    Returns:
        ``(states, transitions)``. ``states[k]`` is a snapshot
        ``(buffer, stack, arcs)`` taken just before ``transitions[k]`` is
        applied. The buffer is stored reversed (next word last), the stack
        starts as ``[0]`` and ``arcs`` maps a head id to a list of
        ``(dependent id, label)`` tuples. ``transitions[k]`` is
        ``(action, label)`` with action in 'shift', 'reduce', 'left_arc',
        'right_arc'; label is '' for shift and reduce. Replay stops as soon
        as the buffer is empty.
    """
    stack = [0]
    arcs = {}
    states = []
    transitions = []
    buffer = []
    # head id -> {dependent id: relation label}, taken from the gold tree
    gold_children = {}
    for word_id in reversed(sentence):
        # Reverse order: the end of `buffer` is the first word of the sentence.
        buffer.append(word_id)
        gold = sentence[word_id]
        gold_children.setdefault(gold['head'], {})[word_id] = gold['deprel']

    def snapshot(action, label=''):
        # Deep copy so later arc insertions do not leak into earlier states.
        states.append((list(buffer), list(stack), deepcopy(arcs)))
        transitions.append((action, label))

    attached = set()  # words that already received their head
    while buffer:
        front = buffer[-1]
        if not stack:
            snapshot('shift')
            stack.append(buffer.pop())
            continue
        top = stack[-1]
        left_label = gold_children.get(front, {}).get(top)
        right_label = gold_children.get(top, {}).get(front)
        if top != 0 and top not in attached and left_label is not None:
            # Next input word is the gold head of the stack top.
            attached.add(top)
            snapshot('left_arc', left_label)
            arcs.setdefault(front, []).append((stack.pop(), left_label))
        elif front not in attached and right_label is not None:
            # Stack top is the gold head of the next input word.
            attached.add(front)
            snapshot('right_arc', right_label)
            arcs.setdefault(top, []).append((front, right_label))
            stack.append(buffer.pop())
        elif top in attached and all(
                child in attached for child in gold_children.get(top, {})):
            # Stack top has its head and every gold dependent: it is finished.
            snapshot('reduce')
            stack.pop()
        else:
            snapshot('shift')
            stack.append(buffer.pop())
    return states, transitions

<b>Feature extraction: baseline and additional features</b>

In [10]:
def extract_features_oneword(state,row,sentence):
    """Add the single-token features for S0, N0, N1 and N2 to `row`.

    S0 is the top of the stack (used only when the stack holds more than one
    element); N0, N1, N2 are the last, second-to-last and third-to-last
    buffer entries. For each position P the keys ``P_w`` (token id as a
    string), ``P_p`` (the token's 'xpos') and ``P_w_p`` (both joined by '_')
    are written; a missing position yields the string 'None' for all three.

    NOTE(review): the ``_w`` values are token ids, not surface forms -
    confirm this is intended.

    Args:
        state: ``(buffer, stack, arcs)``; arcs is not used here.
        row: Dict that receives the features (modified in place).
        sentence: Mapping from token id to a dict with an 'xpos' entry.

    Returns:
        The same `row` object.
    """
    buffer, stack, _ = state
    positions = (
        ('S0', stack[-1] if len(stack) > 1 else None),
        ('N0', buffer[-1] if len(buffer) >= 1 else None),
        ('N1', buffer[-2] if len(buffer) >= 2 else None),
        ('N2', buffer[-3] if len(buffer) >= 3 else None),
    )
    for name, token in positions:
        if token is None:
            word = tag = word_tag = 'None'
        else:
            word = str(token)
            tag = sentence[token]['xpos']
            word_tag = word + '_' + tag
        row[name + '_w'] = word
        row[name + '_p'] = tag
        row[name + '_w_p'] = word_tag
    return row

In [11]:
def extract_features_twoword(state,row,sentence):
    """Add the pair features over S0/N0 and N0/N1 to `row`.

    When the stack has more than one element and the buffer is non-empty,
    seven combinations of the S0 and N0 token id (``w``) and 'xpos' (``p``)
    are written, joined by '_'; otherwise those seven keys get 'None'.
    ``N0_p_N1_p`` holds the 'xpos' of the last two buffer entries, or 'None'
    when the buffer has fewer than two entries.

    Args:
        state: ``(buffer, stack, arcs)``; arcs is not used here.
        row: Dict that receives the features (modified in place).
        sentence: Mapping from token id to a dict with an 'xpos' entry.

    Returns:
        The same `row` object.
    """
    buffer, stack, _ = state
    pair_keys = ('S0_w_p_N0_w_p', 'S0_w_p_N0_w', 'S0_w_N0_w_p', 'S0_w_p_N0_p',
                 'S0_p_N0_w_p', 'S0_w_N0_w', 'S0_p_N0_p')
    if len(stack) > 1 and len(buffer) >= 1:
        s0_w = str(stack[-1])
        s0_p = sentence[stack[-1]]['xpos']
        n0_w = str(buffer[-1])
        n0_p = sentence[buffer[-1]]['xpos']
        pair_values = (
            '_'.join((s0_w, s0_p, n0_w, n0_p)),
            '_'.join((s0_w, s0_p, n0_w)),
            '_'.join((s0_w, n0_w, n0_p)),
            '_'.join((s0_w, s0_p, n0_p)),
            '_'.join((s0_p, n0_w, n0_p)),
            '_'.join((s0_w, n0_w)),
            '_'.join((s0_p, n0_p)),
        )
    else:
        pair_values = ('None',) * len(pair_keys)
    for key, value in zip(pair_keys, pair_values):
        row[key] = value

    if len(buffer) >= 2:
        row['N0_p_N1_p'] = '_'.join((sentence[buffer[-1]]['xpos'],
                                     sentence[buffer[-2]]['xpos']))
    else:
        row['N0_p_N1_p'] = 'None'
    return row

In [12]:
def extract_features_threeword(state,row,sentence):
    """Add the three-token 'xpos' features to `row`.

    Keys written, each an '_'-joined triple of 'xpos' values or 'None':
      * ``N0_p_N1_p_N2_p`` - last three buffer entries.
      * ``S0_p_N0_p_N1_p`` - stack top plus last two buffer entries.
      * ``S0h_p_S0_p_N0_p`` - head of S0 found in `arcs` ('None' if absent
        or if the head is 0).
      * ``S0_p_S0l_p_N0_p`` / ``S0_p_S0r_p_N0_p`` - dependent of S0 with the
        smallest / largest id in `arcs`.
      * ``S0_p_N0_p_N0l_p`` - dependent of N0 with the smallest id.
    The last four need both S0 (stack longer than one) and N0.

    Args:
        state: ``(buffer, stack, arcs)`` with arcs mapping a head id to a
            list of ``(dependent id, label)`` tuples.
        row: Dict that receives the features (modified in place).
        sentence: Mapping from token id to a dict with an 'xpos' entry.

    Returns:
        The same `row` object.
    """
    buffer, stack, arcs = state
    has_s0 = len(stack) > 1
    has_n0 = len(buffer) >= 1

    def tags(*tokens):
        return '_'.join(sentence[token]['xpos'] for token in tokens)

    if len(buffer) >= 3:
        row['N0_p_N1_p_N2_p'] = tags(buffer[-1], buffer[-2], buffer[-3])
    else:
        row['N0_p_N1_p_N2_p'] = 'None'
    if len(buffer) >= 2 and has_s0:
        row['S0_p_N0_p_N1_p'] = tags(stack[-1], buffer[-1], buffer[-2])
    else:
        row['S0_p_N0_p_N1_p'] = 'None'

    if not (has_s0 and has_n0):
        for key in ('S0h_p_S0_p_N0_p', 'S0_p_S0l_p_N0_p',
                    'S0_p_S0r_p_N0_p', 'S0_p_N0_p_N0l_p'):
            row[key] = 'None'
        return row

    s0, n0 = stack[-1], buffer[-1]
    # First head in arcs that lists S0 among its dependents; -1 when none.
    s0_head = next((node for node, deps in arcs.items()
                    for dep in deps if dep[0] == s0), -1)
    s0_children = [dep[0] for dep in (arcs.get(s0) or ())]
    s0_left = min(s0_children, default=-1)
    s0_right = max(s0_children, default=-1)
    n0_left = min((dep[0] for dep in (arcs.get(n0) or ())), default=-1)

    row['S0h_p_S0_p_N0_p'] = tags(s0_head, s0, n0) if s0_head not in (-1, 0) else 'None'
    row['S0_p_S0l_p_N0_p'] = tags(s0, s0_left, n0) if s0_left not in (-1, 0) else 'None'
    row['S0_p_S0r_p_N0_p'] = tags(s0, s0_right, n0) if s0_right not in (-1, 0) else 'None'
    row['S0_p_N0_p_N0l_p'] = tags(s0, n0, n0_left) if n0_left != -1 else 'None'
    return row

In [13]:
def extract_features_distance(state,row,sentence):
    """Add features that pair S0/N0 with the distance between them.

    The distance is ``abs(S0 id - N0 id)`` as a string, or the string 'None'
    when either the stack has a single element or the buffer is empty. A
    position that exists is still written with that 'None' distance suffix
    (e.g. ``'3_None'``); a missing position yields plain 'None'.

    Args:
        state: ``(buffer, stack, arcs)``; arcs is not used here.
        row: Dict that receives the features (modified in place).
        sentence: Mapping from token id to a dict with an 'xpos' entry.

    Returns:
        The same `row` object.
    """
    buffer, stack, _ = state
    has_s0 = len(stack) > 1
    has_n0 = len(buffer) >= 1
    gap = str(abs(stack[-1] - buffer[-1])) if has_s0 and has_n0 else 'None'

    if has_s0:
        row['S0_w_d'] = '_'.join((str(stack[-1]), gap))
        row['S0_p_d'] = '_'.join((sentence[stack[-1]]['xpos'], gap))
    else:
        row['S0_w_d'] = row['S0_p_d'] = 'None'

    if has_n0:
        row['N0_w_d'] = '_'.join((str(buffer[-1]), gap))
        row['N0_p_d'] = '_'.join((sentence[buffer[-1]]['xpos'], gap))
    else:
        row['N0_w_d'] = row['N0_p_d'] = 'None'

    if has_s0 and has_n0:
        row['S0_w_N0_w_d'] = '_'.join((str(stack[-1]), str(buffer[-1]), gap))
        row['S0_p_N0_p_d'] = '_'.join((sentence[stack[-1]]['xpos'],
                                       sentence[buffer[-1]]['xpos'], gap))
    else:
        row['S0_w_N0_w_d'] = row['S0_p_N0_p_d'] = 'None'
    return row

In [14]:
def extract_features_valency(state,row,sentence):
    """Add left/right dependent counts for S0 and N0 to `row`.

    For each of S0 (stack top, when the stack has more than one element) and
    N0 (last buffer entry), the dependents recorded in `arcs` with a smaller
    id are counted as ``vl`` and those with a larger id as ``vr``. The counts
    are appended to the token id (``_w_``) and to its 'xpos' (``_p_``). A
    missing position yields 'None' for its four keys.

    Args:
        state: ``(buffer, stack, arcs)`` with arcs mapping a head id to a
            list of ``(dependent id, label)`` tuples.
        row: Dict that receives the features (modified in place).
        sentence: Mapping from token id to a dict with an 'xpos' entry.

    Returns:
        The same `row` object.
    """
    buffer, stack, arcs = state
    positions = (
        ('S0', stack[-1] if len(stack) > 1 else None),
        ('N0', buffer[-1] if len(buffer) >= 1 else None),
    )
    for name, token in positions:
        if token is None:
            for suffix in ('_w_vr', '_p_vr', '_w_vl', '_p_vl'):
                row[name + suffix] = 'None'
            continue
        children = [dep[0] for dep in (arcs.get(token) or ())]
        left = str(sum(1 for child in children if child < token))
        right = str(sum(1 for child in children if child > token))
        tag = sentence[token]['xpos']
        row[name + '_w_vr'] = str(token) + '_' + right
        row[name + '_p_vr'] = tag + '_' + right
        row[name + '_w_vl'] = str(token) + '_' + left
        row[name + '_p_vl'] = tag + '_' + left
    return row

In [15]:
def extract_features_unigram(state,row,sentence):
    """Add features for the head of S0 and the outermost dependents of S0/N0.

    Keys written (value 'None' when the item is unavailable):
      * ``S0h_w``, ``S0h_p`` - id and 'xpos' of the head of S0 found in
        `arcs` ('None' when the head is 0).
      * ``S0_l`` - label of the arc that attaches S0 to its head.
      * ``S0l_w/_p/_l`` and ``S0r_w/_p/_l`` - id, 'xpos' and arc label of the
        S0 dependent with the smallest / largest id.
      * ``N0l_w/_p/_l`` - the same for the N0 dependent with the smallest id.
    S0 is the stack top and is considered only when the stack has more than
    one element; N0 is the last buffer entry.

    Args:
        state: ``(buffer, stack, arcs)`` with arcs mapping a head id to a
            list of ``(dependent id, label)`` tuples.
        row: Dict that receives the features (modified in place).
        sentence: Mapping from token id to a dict with an 'xpos' entry.

    Returns:
        The same `row` object.
    """
    buffer, stack, arcs = state

    def by_id(dep):
        return dep[0]

    s0_head, s0_label = -1, -1
    s0_left = s0_right = n0_left = None  # (dependent id, label) or None
    if len(stack) > 1:
        s0 = stack[-1]
        s0_head, s0_label = next(
            ((node, dep[1]) for node, deps in arcs.items()
             for dep in deps if dep[0] == s0),
            (-1, -1))
        s0_deps = arcs.get(s0) or ()
        if s0_deps:
            s0_left = min(s0_deps, key=by_id)
            s0_right = max(s0_deps, key=by_id)
    if len(buffer) >= 1:
        n0_deps = arcs.get(buffer[-1]) or ()
        if n0_deps:
            n0_left = min(n0_deps, key=by_id)

    if s0_head not in (-1, 0):
        row['S0h_w'] = str(s0_head)
        row['S0h_p'] = sentence[s0_head]['xpos']
    else:
        row['S0h_w'] = 'None'
        row['S0h_p'] = 'None'
    row['S0_l'] = s0_label if s0_label != -1 else 'None'

    for name, dep in (('S0l', s0_left), ('S0r', s0_right), ('N0l', n0_left)):
        if dep is not None and dep[0] not in (-1, 0):
            row[name + '_w'] = str(dep[0])
            row[name + '_p'] = sentence[dep[0]]['xpos']
            row[name + '_l'] = dep[1]
        else:
            row[name + '_w'] = 'None'
            row[name + '_p'] = 'None'
            row[name + '_l'] = 'None'
    return row

In [16]:

def extract_features(states,transitions,sentence,data,counter,label,fs):
    """Turn the (state, transition) pairs of one sentence into training rows.

    Args:
        states: Parser configurations ``(buffer, stack, arcs)``.
        transitions: ``(action, label)`` pairs aligned with `states`.
        sentence: Mapping from token id to that token's stats dict.
        data: Dict filled in place; ``data[k]`` is the feature row of example k.
        counter: Index to use for the first example of this call.
        label: Dict filled in place; ``label[k]`` is the action name, or
            ``'action,label'`` when the transition label is not empty.
        fs: Collection naming the feature groups to compute, out of 'one',
            'two', 'three', 'distance', 'valency', 'unigram'. Groups are
            always applied in that order.

    Returns:
        The counter value following the last example written.
    """
    extractors = (
        ('one', extract_features_oneword),
        ('two', extract_features_twoword),
        ('three', extract_features_threeword),
        ('distance', extract_features_distance),
        ('valency', extract_features_valency),
        ('unigram', extract_features_unigram),
    )
    for state, transition in zip(states, transitions):
        row = {}
        for group, extractor in extractors:
            if group in fs:
                row = extractor(state, row, sentence)
        action, deprel = transition[0], transition[1]
        label[counter] = action if deprel == '' else action + ',' + deprel
        data[counter] = row
        counter += 1
    return counter
            

<b>Modified arc-eager parsing for test data</b>

In [26]:
def parse_eager_test_unshift(sentence,model,enc,fs):
    """Parse one sentence greedily with arc-eager transitions plus 'unshift'.

    At every step the current configuration is converted to a feature row,
    encoded with `enc` and scored with `model`; the transitions are tried in
    decreasing probability order and the first one whose precondition holds
    is applied. Once the buffer has been emptied for the first time, shifts
    are disabled and a stack top that still has no head is moved back onto
    the buffer ('unshift') so it can still receive one.

    Args:
        sentence: Mapping from word id to that word's stats dict.
        model: Classifier exposing ``predict_proba`` and ``classes_``; each
            class is a string ``'action'`` or ``'action,label'``.
        enc: Fitted encoder exposing ``transform`` for the feature frame.
        fs: Collection naming the feature groups to compute.

    Returns:
        Dict mapping a word id to ``(head id, label)`` for every word that
        was attached.
    """
    buffer=list()
    stack=[0]
    arcs={}
    def perform_action(action,label=''):
        # Apply one transition to the shared buffer/stack/arcs.
        nonlocal buffer,stack,arcs
        if action=='shift':
            stack.append(buffer.pop())
        elif action=='reduce':
            stack.pop()
        elif action=='left_arc':
            # Next input word becomes the head of the (popped) stack top.
            if arcs.get(buffer[-1])==None:
                arcs[buffer[-1]]=[(stack.pop(),label)]
            else:
                arcs[buffer[-1]].append((stack.pop(),label))
        elif action=='right_arc':
            # Stack top becomes the head of the next input word, which is then pushed.
            if arcs.get(stack[-1])==None:
                arcs[stack[-1]]=[(buffer[-1],label)]
            else:
                arcs[stack[-1]].append((buffer[-1],label))
            stack.append(buffer.pop())
        elif action=='unshift':
            # Move the stack top back to the front of the input.
            buffer.append(stack.pop())
    for word in reversed(sentence): #store in reverse in buffer, so top of buffer is first word
        buffer.append(word)
    head_found=dict()
    # Becomes True the first time the buffer is found empty; never reset.
    end_seen=False
    # Stop once the input has been exhausted at least once and only one
    # element is left on the stack.
    while not (len(stack)==1 and len(buffer)==0 and end_seen):
        if len(buffer)==0 and not end_seen:
            end_seen=True
            # Re-test the loop condition before doing any more work.
            continue
        # The live containers are passed on, not copies.
        state=(buffer,stack,arcs)
        row={}
        if 'one' in fs:
            row=extract_features_oneword(state,row,sentence)
        if 'two' in fs:
            row=extract_features_twoword(state,row,sentence)
        if 'three' in fs:
            row=extract_features_threeword(state,row,sentence)
        if 'distance' in fs:
            # Return value not reassigned: relies on `row` being filled in place.
            extract_features_distance(state,row,sentence)
        if 'valency' in fs:
            row=extract_features_valency(state,row,sentence)
        if 'unigram' in fs:
            row=extract_features_unigram(state,row,sentence)
        data={0:row}
        df = pd.DataFrame.from_dict(data,'index')
        x=enc.transform(df)
        # Class indices ordered from most to least probable.
        decision_probs=np.argsort(-1 *model.predict_proba(x)[0],kind='quicksort')
        for decision in decision_probs:
            temp=model.classes_[decision].split(',')
            transition=temp[0]
            label=temp[1] if len(temp)>1 else ''
            # Independent of the predicted class: with the input exhausted, a
            # headless stack top is always unshifted.
            if end_seen and head_found.get(stack[-1])==None and  len(buffer)==0:
                perform_action(action='unshift')
                break
            if transition=='left_arc':
                # Needs a non-root stack top without a head and a next input word.
                if stack[-1]!=0 and len(buffer)>=1 and head_found.get(stack[-1])==None and len(stack)>=1:
                    head_found[stack[-1]]=(buffer[-1],label)
                    perform_action(action='left_arc',label=label)
                    break
            if transition=='right_arc':
                # Needs a next input word that has no head yet.
                if len(stack)>=1 and len(buffer)>=1 and head_found.get(buffer[-1])==None:
                    head_found[buffer[-1]]=(stack[-1],label)
                    perform_action(action='right_arc',label=label) 
                    break
            if transition=='reduce':
                # Only a stack top that already has a head may be popped.
                if head_found.get(stack[-1])!=None and len(stack)>1:
                    perform_action(action='reduce')
                    break
            # Shifting is disabled once the end of the input has been seen.
            if transition=='shift' and not end_seen:
                perform_action(action='shift')
                break
    return head_found

<b>Arc-eager parsing for test data</b>

In [18]:
def parse_eager_test(sentence,model,enc,fs):
    """Parse one sentence greedily with the plain arc-eager transitions.

    At every step the configuration is converted to a feature row, encoded
    with `enc` and scored with `model`. The predicted classes are tried from
    most to least probable and the first applicable transition is executed.
    Parsing stops as soon as the buffer is empty, so words still waiting on
    the stack without a head stay unattached.

    Args:
        sentence: Mapping from word id to that word's stats dict.
        model: Classifier exposing ``predict_proba`` and ``classes_``; each
            class is a string ``'action'`` or ``'action,label'``.
        enc: Fitted encoder exposing ``transform`` for the feature frame.
        fs: Collection naming the feature groups to compute.

    Returns:
        Dict mapping a word id to ``(head id, label)`` for every word that
        was attached.
    """
    extractors = (
        ('one', extract_features_oneword),
        ('two', extract_features_twoword),
        ('three', extract_features_threeword),
        ('distance', extract_features_distance),
        ('valency', extract_features_valency),
        ('unigram', extract_features_unigram),
    )
    stack = [0]
    arcs = {}
    head_found = {}
    # Reverse order: the end of the buffer is the first word of the sentence.
    buffer = [word for word in reversed(sentence)]

    while len(buffer) != 0:
        state = (buffer, stack, arcs)
        row = {}
        for group, extractor in extractors:
            if group in fs:
                row = extractor(state, row, sentence)
        frame = pd.DataFrame.from_dict({0: row}, 'index')
        encoded = enc.transform(frame)
        # Class indices ordered from most to least probable.
        ranking = np.argsort(-1 * model.predict_proba(encoded)[0], kind='quicksort')

        top, front = stack[-1], buffer[-1]
        # Preconditions are fixed before the candidates are tried.
        can_left_arc = top != 0 and top not in head_found
        can_right_arc = front not in head_found
        can_reduce = top in head_found

        for class_index in ranking:
            parts = model.classes_[class_index].split(',')
            action = parts[0]
            label = parts[1] if len(parts) > 1 else ''
            if action == 'left_arc' and can_left_arc:
                head_found[top] = (front, label)
                arcs.setdefault(front, []).append((stack.pop(), label))
                break
            if action == 'right_arc' and can_right_arc:
                head_found[front] = (top, label)
                arcs.setdefault(top, []).append((front, label))
                stack.append(buffer.pop())
                break
            if action == 'reduce' and can_reduce:
                stack.pop()
                break
            if action == 'shift':
                stack.append(buffer.pop())
                break
    return head_found

In [19]:
def get_model(fs):
    """Train the transition classifier on the projective training sentences.

    The sentences of `train_file_stat` are loaded, non-projective ones are
    dropped, and each remaining sentence is replayed with `parse_eager` to
    obtain (state, transition) examples. The feature rows are one-hot encoded
    and a logistic regression model is fitted on them.

    Args:
        fs: Collection naming the feature groups to compute (passed on to
            `extract_features`).

    Returns:
        ``(model, enc)``: the fitted LogisticRegression and the fitted
        OneHotEncoder that must be used to encode rows at prediction time.
    """
    sentences = get_sentences(train_file_stat)
    filt_sentences = filter_non_projective(sentences)
    data = {}
    label = {}
    counter = 0
    for i in tqdm(range(len(filt_sentences))):
        states, transitions = parse_eager(filt_sentences[i])
        counter = extract_features(states, transitions, filt_sentences[i], data, counter, label, fs)
    df = pd.DataFrame.from_dict(data, 'index')
    targets = np.array(list(label.values()))
    # Sparse output is the encoder's default. The explicit `sparse=True`
    # keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4, so omitting it keeps this working on old and new versions.
    enc = OneHotEncoder(handle_unknown='ignore', dtype=int)
    x = enc.fit_transform(df)
    model = LogisticRegression(verbose=1, n_jobs=6)
    model.fit(x, targets)
    return model, enc

In [20]:
def _f_score(correct, predicted, gold):
    """Harmonic mean of correct/predicted and correct/gold; 0.0 when undefined."""
    if correct == 0 or predicted == 0 or gold == 0:
        return 0.0
    p = correct / predicted
    r = correct / gold
    return 2 * p * r / (p + r)


def get_scores(fs,unshift=0):
    """Train a model, parse the projective test sentences and score the arcs.

    Args:
        fs: Collection naming the feature groups to use.
        unshift: 1 to parse with `parse_eager_test_unshift`, anything else to
            parse with `parse_eager_test`.

    Returns:
        Dict with the keys 'gs_nodes' (number of gold words), 'sp_nodes'
        (number of predicted arcs), 'correct_labelled', 'correct_unlabelled',
        'UAS_score', 'LAS_score', 'features' and 'unshift'. Both scores are
        the harmonic mean of correct/predicted and correct/gold, and are 0.0
        when there is no predicted or no correct arc (this used to raise
        ZeroDivisionError). Relation labels are compared on the part before
        the first ':'.

    Side effects:
        Appends ``str(info)`` followed by a newline to 'Scores.txt'; a
        failure to write is reported on stdout and otherwise ignored.
    """
    model, enc = get_model(fs)
    sentences = get_sentences(test_file_stat)
    filt_sentences = filter_non_projective(sentences)
    parse = parse_eager_test_unshift if unshift == 1 else parse_eager_test
    total = 0
    labelled = 0
    unlabelled = 0
    total_nodes = 0
    for i in tqdm(range(len(filt_sentences))):
        sentence = filt_sentences[i]
        total_nodes += len(sentence)
        heads = parse(sentence, model, enc, fs)
        for node in heads:
            head, label = heads[node]
            label = label.split(':')[0]
            total += 1
            if sentence[node]['head'] == head:
                unlabelled += 1
                if sentence[node]['deprel'].split(':')[0] == label:
                    labelled += 1
    info = {}
    info['gs_nodes'] = total_nodes
    info['sp_nodes'] = total
    info['correct_labelled'] = labelled
    info['correct_unlabelled'] = unlabelled
    info['UAS_score'] = _f_score(unlabelled, total, total_nodes)
    info['LAS_score'] = _f_score(labelled, total, total_nodes)
    info['features'] = fs
    info['unshift'] = unshift
    try:
        # `with` closes the handle even if the write fails; the newline keeps
        # successive runs on separate lines.
        with open('Scores.txt', 'a') as f:
            f.write(str(info) + '\n')
    except OSError:
        print("Unable to append to file")
    return info

<h3>Code block for testing</h3>

Add features to the feature array to be used for training and testing

Features that can be added are: 'one', 'two', 'three', 'valency', 'distance', 'unigram'.

Set the unshift option to 1 to use the modified arc-eager parser, or to 0 to use the normal arc-eager parser.

All results are saved to a file called Scores.txt

In [None]:
# Feature groups to train and test with, then one scoring run with the
# modified (unshift) parser.
feature_array = ['one', 'two', 'three', 'valency', 'distance', 'unigram']
results = get_scores(feature_array, unshift=1)
# Report the metrics in a fixed order, one "name value" pair per line.
for metric in ('gs_nodes', 'sp_nodes', 'correct_labelled',
               'correct_unlabelled', 'UAS_score', 'LAS_score'):
    print(metric, results[metric])

HBox(children=(FloatProgress(value=0.0, max=11467.0), HTML(value='')))




[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 out of   1 | elapsed:  5.4min finished


HBox(children=(FloatProgress(value=0.0, max=1473.0), HTML(value='')))