In [2]:
train_file_sent="UD_Hindi-HDTB/hi_hdtb-ud-train.txt"
train_file_stat="UD_Hindi-HDTB/hi_hdtb-ud-train.conllu" 
test_file_sent="UD_Hindi-HDTB/hi_hdtb-ud-test.txt"
test_file_stat="UD_Hindi-HDTB/hi_hdtb-ud-test.conllu" 
val_file_sent="UD_Hindi-HDTB/hi_hdtb-ud-dev.txt"
val_file_stat="UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu" 

In [3]:
def file_lines(name):
    lines=[]
    with open(name,'r',encoding = 'utf-8') as f:
        lines=f.readlines()
    for i in range(0,len(lines)):
        lines[i]=lines[i].strip().split('\t')
    return lines

In [4]:
def extract_stats(line):
    stats=dict()
    stats['id']=int(line[0])
    stats['form']=line[1]
    stats['lemma']=line[2]
    stats['upos']=line[3]
    stats['xpos']=line[4]
    stats['head']=int(line[6])
    stats['deprel']=line[7]
    return stats

In [5]:
def path_exists(head,i,graph):
    while i!=head and i!=0:
        i=graph[i]
    return i==head
def projectivity(sentence):
    graph=dict()
    for word in sentence:
        word_id=word['id']
        head_id=word['head']
        graph[word_id]=head_id
    for dependent in graph:
        head=graph[dependent]
        l= min(head,dependent)+1
        r=max(head,dependent)
        for i in range(l,r,1):
            if not path_exists(head,i,graph):
                return False            
    return True

In [6]:
def filter_non_projective(sentences):
    filt_sentences=[sentence for sentence in sentences if projectivity(sentence)]
    return filt_sentences

In [7]:
def get_sentences(file):
    lines=file_lines(file)
    sentences=[]
    sentence=[]
    for line in lines:
        if len(line)<10:
            if len(sentence)>0:
                #do something with sentence
                sentences.append(sentence)
                sentence=[]
        else:
            sentence.append(extract_stats(line))
    return sentences

In [8]:
from collections import defaultdict
def parse(sentence):
    buffer=list()
    stack=[0]
    arcs=list()
    states=[]
    transitions=[]
    def perform_action(action,label=''):
        nonlocal buffer,stack,arcs,states,transitions
        states.append((list(buffer),list(stack),list(arcs)))
        transitions.append((action,label))
        if action=='shift':
            stack.append(buffer.pop())
        elif action=='left_arc':
            arcs.append((stack[-1],stack[-2],label))
            stack.pop(-2)
        elif action=='right_arc':
            arcs.append((stack[-2],stack[-1],label))
            stack.pop()
    dependency_graph = defaultdict(lambda: defaultdict())
    for word in reversed(sentence): #store in reverse in buffer, so top of buffer is first word
        buffer.append(word['id'])
        dependency_graph[word['head']][word['id']]=word['deprel']
    states=[]
    transitions=[]
    head_found=dict()
    while not (len(stack)==1 and stack[0]==0 and len(buffer)==0):
        if len(stack)>=2:
            if dependency_graph.get(stack[-1]) is not None and dependency_graph[stack[-1]].get(stack[-2]) is not None:
                head_found[stack[-2]]=True
                perform_action(action='left_arc',label=dependency_graph[stack[-1]][stack[-2]])
            elif dependency_graph.get(stack[-2]) is not None and dependency_graph[stack[-2]].get(stack[-1]) is not None:
                if dependency_graph.get(stack[-1]) is not None and any([dependent not in head_found for dependent in dependency_graph[stack[-1]].keys()]):
                    perform_action(action='shift')
                else:
                    head_found[stack[-1]]=True
                    perform_action(action='right_arc',label=dependency_graph[stack[-2]][stack[-1]])
            else:
                perform_action(action='shift')
        else:
            perform_action(action='shift')
    return states,transitions

In [21]:
def parse_eager(sentence):
    buffer=list()
    stack=[0]
    arcs={}
    states=[]
    transitions=[]
    def perform_action(action,label=''):
        nonlocal buffer,stack,arcs,states,transitions
        states.append((list(buffer),list(stack),arcs.copy()))
        transitions.append((action,label))
        if action=='shift':
            stack.append(buffer.pop())
        elif action=='reduce':
            stack.pop()
        elif action=='left_arc':
            if arcs.get(buffer[-1])==None:
                arcs[buffer[-1]]=[(stack.pop(),label)]
            else:
                arcs[buffer[-1]].append((stack.pop(),label))
        elif action=='right_arc':
            if arcs.get(stack[-1])==None:
                arcs[stack[-1]]=[(buffer[-1],label)]
            else:
                arcs[stack[-1]].append((buffer[-1],label))
            stack.append(buffer.pop())
    dependency_graph = defaultdict(lambda: defaultdict())
    for word in reversed(sentence): #store in reverse in buffer, so top of buffer is first word
        buffer.append(word['id'])
        dependency_graph[word['head']][word['id']]=word['deprel']
    head_found=dict()
    while not len(buffer)==0:
        if len(stack)>=1:
            if head_found.get(stack[-1])==None and dependency_graph.get(buffer[-1]) is not None and dependency_graph[buffer[-1]].get(stack[-1]) is not None:
                head_found[stack[-1]]=True
                perform_action(action='left_arc',label=dependency_graph[buffer[-1]][stack[-1]])
            elif dependency_graph.get(stack[-1]) is not None and dependency_graph[stack[-1]].get(buffer[-1]) is not None:
                head_found[buffer[-1]]=True
                perform_action(action='right_arc',label=dependency_graph[stack[-1]][buffer[-1]])
            elif head_found.get(stack[-1])!=None:
                perform_action(action='reduce')
            else:
                perform_action(action='shift')
        else:
            perform_action(action='shift')
    return states,transitions

In [40]:
def get_feature_names(feature_array):
    feature_names=[]
    if 'single_words' in feature_array:
        feature_names.extend(['S0_w_p','S0_w','S0_p','N0_w_p','N0_w','N0_p','N1_w_p','N1_w','N1_p','N2_w_p','N2_w','N2_p'])
    return feature_names
    

In [57]:
import numpy as np
import pandas as pd
def extract_features(states,transitions,feature_names,sentence):
    df=pd.DataFrame(columns=feature_names)
    for state in states:
        buffer,stack,arcs=state
        row={}
        for feature_name in feature_names:
            feature_tokens=feature_name.split('_')
            cur_word=0
            val=''
            for token in feature_tokens:
                
                #determining word
                if token[0]=='S':
                    if int(token[1])<len(stack):
                        cur_word=stack[-1-int(token[1])]
                        if  cur_word==0:
                            row[feature_name]=np.nan
                            break
                    else:
                        row[feature_name]=np.nan
                        break
                elif token[0]=='N':
                    if int(token[1])<len(buffer):
                        cur_word=buffer[-1-int(token[1])]
                        if  cur_word==0:
                            row[feature_name]=np.nan
                            break
                    else:
                        row[feature_name]=np.nan
                        break
                
                #determining feature of the word
                else:
                    for word in sentence:
                        if word['id']==cur_word:
                            if token=='w':
                                if len(val)==0:
                                    val=word['form']
                                    break
                                else:
                                    val=val+'_'+word['form']
                                    break
                            elif token=='p':
                                if len(val)==0:
                                    val=word['xpos']
                                    break
                                else:
                                    val=val+'_'+word['xpos']
                                    break
            row[feature_name]=val
        df=df.append(row,ignore_index = True)
    return df
                            

In [11]:
sentences=get_sentences(train_file_stat)

In [12]:
filt_sentences=filter_non_projective(sentences)

In [13]:
len(filt_sentences)

11467

In [41]:
feature_names=get_feature_names(['single_words'])

In [None]:
import pandas as pd
df=pd.dataframe(columns=)
for sentence in filt_sentences:

In [42]:
states,transitions=parse_eager(sentences[0])


In [58]:
df=extract_features(states,transitions,feature_names,sentences[0])

In [59]:
df

Unnamed: 0,S0_w_p,S0_w,S0_p,N0_w_p,N0_w,N0_p,N1_w_p,N1_w,N1_p,N2_w_p,N2_w,N2_p
0,,,,यह_DEM,यह,DEM,एशिया_NNP,एशिया,NNP,की_PSP,की,PSP
1,यह_DEM,यह,DEM,एशिया_NNP,एशिया,NNP,की_PSP,की,PSP,सबसे_INTF,सबसे,INTF
2,,,,एशिया_NNP,एशिया,NNP,की_PSP,की,PSP,सबसे_INTF,सबसे,INTF
3,एशिया_NNP,एशिया,NNP,की_PSP,की,PSP,सबसे_INTF,सबसे,INTF,बड़ी_JJ,बड़ी,JJ
4,की_PSP,की,PSP,सबसे_INTF,सबसे,INTF,बड़ी_JJ,बड़ी,JJ,मस्जिदों_NN,मस्जिदों,NN
5,एशिया_NNP,एशिया,NNP,सबसे_INTF,सबसे,INTF,बड़ी_JJ,बड़ी,JJ,मस्जिदों_NN,मस्जिदों,NN
6,सबसे_INTF,सबसे,INTF,बड़ी_JJ,बड़ी,JJ,मस्जिदों_NN,मस्जिदों,NN,में_PSP,में,PSP
7,एशिया_NNP,एशिया,NNP,बड़ी_JJ,बड़ी,JJ,मस्जिदों_NN,मस्जिदों,NN,में_PSP,में,PSP
8,बड़ी_JJ,बड़ी,JJ,मस्जिदों_NN,मस्जिदों,NN,में_PSP,में,PSP,से_PSP,से,PSP
9,एशिया_NNP,एशिया,NNP,मस्जिदों_NN,मस्जिदों,NN,में_PSP,में,PSP,से_PSP,से,PSP


In [17]:
buffer,stack,arcs=states[0]

In [52]:
for i in range(len(states)):
    print('buffer:\t',states[i][0])
    print('stack:\t',states[i][1])
    print('arcs:\t',states[i][2])
    print('transition:\t',transitions[i])

buffer:	 [11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
stack:	 [0]
arcs:	 {}
transition:	 ('shift', '')
buffer:	 [11, 10, 9, 8, 7, 6, 5, 4, 3, 2]
stack:	 [0, 1]
arcs:	 {}
transition:	 ('left_arc', 'det')
buffer:	 [11, 10, 9, 8, 7, 6, 5, 4, 3, 2]
stack:	 [0]
arcs:	 {2: [(1, 'det'), (3, 'case')]}
transition:	 ('shift', '')
buffer:	 [11, 10, 9, 8, 7, 6, 5, 4, 3]
stack:	 [0, 2]
arcs:	 {2: [(1, 'det'), (3, 'case')]}
transition:	 ('right_arc', 'case')
buffer:	 [11, 10, 9, 8, 7, 6, 5, 4]
stack:	 [0, 2, 3]
arcs:	 {2: [(1, 'det'), (3, 'case')]}
transition:	 ('reduce', '')
buffer:	 [11, 10, 9, 8, 7, 6, 5, 4]
stack:	 [0, 2]
arcs:	 {2: [(1, 'det'), (3, 'case')]}
transition:	 ('shift', '')
buffer:	 [11, 10, 9, 8, 7, 6, 5]
stack:	 [0, 2, 4]
arcs:	 {2: [(1, 'det'), (3, 'case')]}
transition:	 ('left_arc', 'advmod')
buffer:	 [11, 10, 9, 8, 7, 6, 5]
stack:	 [0, 2]
arcs:	 {2: [(1, 'det'), (3, 'case')], 5: [(4, 'advmod')]}
transition:	 ('shift', '')
buffer:	 [11, 10, 9, 8, 7, 6]
stack:	 [0, 2, 5]
arcs:	 {2: [(1, 'd

In [56]:
sentences[0]

[{'id': 1,
  'form': 'यह',
  'lemma': 'यह',
  'upos': 'DET',
  'xpos': 'DEM',
  'head': 2,
  'deprel': 'det'},
 {'id': 2,
  'form': 'एशिया',
  'lemma': 'एशिया',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'head': 6,
  'deprel': 'nmod'},
 {'id': 3,
  'form': 'की',
  'lemma': 'का',
  'upos': 'ADP',
  'xpos': 'PSP',
  'head': 2,
  'deprel': 'case'},
 {'id': 4,
  'form': 'सबसे',
  'lemma': 'सबसे',
  'upos': 'ADV',
  'xpos': 'INTF',
  'head': 5,
  'deprel': 'advmod'},
 {'id': 5,
  'form': 'बड़ी',
  'lemma': 'बड़ा',
  'upos': 'ADJ',
  'xpos': 'JJ',
  'head': 6,
  'deprel': 'amod'},
 {'id': 6,
  'form': 'मस्जिदों',
  'lemma': 'मस्जिद',
  'upos': 'NOUN',
  'xpos': 'NN',
  'head': 9,
  'deprel': 'nmod'},
 {'id': 7,
  'form': 'में',
  'lemma': 'में',
  'upos': 'ADP',
  'xpos': 'PSP',
  'head': 6,
  'deprel': 'case'},
 {'id': 8,
  'form': 'से',
  'lemma': 'से',
  'upos': 'ADP',
  'xpos': 'PSP',
  'head': 6,
  'deprel': 'case'},
 {'id': 9,
  'form': 'एक',
  'lemma': 'एक',
  'upos': 'NUM',
  'xpos': 'QC'