In [2]:
train_file_sent="UD_Hindi-HDTB/hi_hdtb-ud-train.txt"
train_file_stat="UD_Hindi-HDTB/hi_hdtb-ud-train.conllu" 
test_file_sent="UD_Hindi-HDTB/hi_hdtb-ud-test.txt"
test_file_stat="UD_Hindi-HDTB/hi_hdtb-ud-test.conllu" 
val_file_sent="UD_Hindi-HDTB/hi_hdtb-ud-dev.txt"
val_file_stat="UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu" 

In [3]:
def file_lines(name):
    lines=[]
    with open(name,'r',encoding = 'utf-8') as f:
        lines=f.readlines()
    for i in range(0,len(lines)):
        lines[i]=lines[i].strip().split('\t')
    return lines

In [4]:
def extract_stats(line):
    stats=dict()
    stats['form']=line[1]
    stats['lemma']=line[2]
    stats['upos']=line[3]
    stats['xpos']=line[4]
    stats['head']=int(line[6])
    stats['deprel']=line[7]
    return int(line[0]),stats

In [5]:
def path_exists(head,i,graph):
    while i!=head and i!=0:
        i=graph[i]
    return i==head
def projectivity(sentence):
    graph=dict()
    for word in sentence:
        word_id=word
        head_id=sentence[word_id]['head']
        graph[word_id]=head_id
    for dependent in graph:
        head=graph[dependent]
        l= min(head,dependent)+1
        r=max(head,dependent)
        for i in range(l,r,1):
            if not path_exists(head,i,graph):
                return False            
    return True

In [6]:
def filter_non_projective(sentences):
    filt_sentences=[sentence for sentence in sentences if projectivity(sentence)]
    return filt_sentences

In [7]:
def get_sentences(file):
    lines=file_lines(file)
    sentences=[]
    sentence={}
    for line in lines:
        if len(line)<10:
            if len(sentence)>0:
                #do something with sentence
                sentences.append(sentence)
                sentence={}
        else:
            id,stats=extract_stats(line)
            sentence[id]=stats
    return sentences

In [8]:
from collections import defaultdict
def parse(sentence):
    buffer=list()
    stack=[0]
    arcs=list()
    states=[]
    transitions=[]
    def perform_action(action,label=''):
        nonlocal buffer,stack,arcs,states,transitions
        states.append((list(buffer),list(stack),list(arcs)))
        transitions.append((action,label))
        if action=='shift':
            stack.append(buffer.pop())
        elif action=='left_arc':
            arcs.append((stack[-1],stack[-2],label))
            stack.pop(-2)
        elif action=='right_arc':
            arcs.append((stack[-2],stack[-1],label))
            stack.pop()
    dependency_graph = defaultdict(lambda: defaultdict())
    for word in reversed(sentence): #store in reverse in buffer, so top of buffer is first word
        buffer.append(word['id'])
        dependency_graph[word['head']][word['id']]=word['deprel']
    states=[]
    transitions=[]
    head_found=dict()
    while not (len(stack)==1 and stack[0]==0 and len(buffer)==0):
        if len(stack)>=2:
            if dependency_graph.get(stack[-1]) is not None and dependency_graph[stack[-1]].get(stack[-2]) is not None:
                head_found[stack[-2]]=True
                perform_action(action='left_arc',label=dependency_graph[stack[-1]][stack[-2]])
            elif dependency_graph.get(stack[-2]) is not None and dependency_graph[stack[-2]].get(stack[-1]) is not None:
                if dependency_graph.get(stack[-1]) is not None and any([dependent not in head_found for dependent in dependency_graph[stack[-1]].keys()]):
                    perform_action(action='shift')
                else:
                    head_found[stack[-1]]=True
                    perform_action(action='right_arc',label=dependency_graph[stack[-2]][stack[-1]])
            else:
                perform_action(action='shift')
        else:
            perform_action(action='shift')
    return states,transitions

In [9]:
from collections import defaultdict
def parse_eager(sentence):
    buffer=list()
    stack=[0]
    arcs={}
    states=[]
    transitions=[]
    def perform_action(action,label=''):
        nonlocal buffer,stack,arcs,states,transitions
        states.append((list(buffer),list(stack),arcs.copy()))
        transitions.append((action,label))
        if action=='shift':
            stack.append(buffer.pop())
        elif action=='reduce':
            stack.pop()
        elif action=='left_arc':
            if arcs.get(buffer[-1])==None:
                arcs[buffer[-1]]=[(stack.pop(),label)]
            else:
                arcs[buffer[-1]].append((stack.pop(),label))
        elif action=='right_arc':
            if arcs.get(stack[-1])==None:
                arcs[stack[-1]]=[(buffer[-1],label)]
            else:
                arcs[stack[-1]].append((buffer[-1],label))
            stack.append(buffer.pop())
    dependency_graph = defaultdict(lambda: defaultdict())
    for word in reversed(sentence): #store in reverse in buffer, so top of buffer is first word
        buffer.append(word)
        dependency_graph[sentence[word]['head']][word]=sentence[word]['deprel']
    head_found=dict()
    while not len(buffer)==0:
        if len(stack)>=1:
            if head_found.get(stack[-1])==None and dependency_graph.get(buffer[-1]) is not None and dependency_graph[buffer[-1]].get(stack[-1]) is not None:
                head_found[stack[-1]]=True
                perform_action(action='left_arc',label=dependency_graph[buffer[-1]][stack[-1]])
            elif dependency_graph.get(stack[-1]) is not None and dependency_graph[stack[-1]].get(buffer[-1]) is not None:
                head_found[buffer[-1]]=True
                perform_action(action='right_arc',label=dependency_graph[stack[-1]][buffer[-1]])
            elif head_found.get(stack[-1])!=None:
                perform_action(action='reduce')
            else:
                perform_action(action='shift')
        else:
            perform_action(action='shift')
    return states,transitions

In [10]:
def get_feature_names(feature_array):
    feature_names=[]
    if 'single_words' in feature_array:
        feature_names.extend(['S0_w_p','S0_w','S0_p','N0_w_p','N0_w','N0_p','N1_w_p','N1_w','N1_p','N2_w_p','N2_w','N2_p'])
    return feature_names
    

In [11]:
import numpy as np
import pandas as pd
import itertools
def extract_features_oneword(states,transitions,sentence,data,counter,label=-1):
    for state,transition in zip(states,transitions):
        buffer,stack,arcs=state
        row={}
        if len(stack)>1:
            row['S0_w']=sentence[stack[-1]]['form']
            row['S0_p']=sentence[stack[-1]]['xpos']
            row['S0_w_p']=sentence[stack[-1]]['form']+'_'+sentence[stack[-1]]['xpos']
        else:
            row['S0_w']='None'
            row['S0_p']='None'
            row['S0_w_p']='None'
        if len(buffer)>=1:
            row['N0_w']=sentence[buffer[-1]]['form']
            row['N0_p']=sentence[buffer[-1]]['xpos']
            row['N0_w_p']=sentence[buffer[-1]]['form']+'_'+sentence[buffer[-1]]['xpos']
        else:
            row['N0_w']='None'
            row['N0_p']='None'
            row['N0_w_p']='None'
        if len(buffer)>=2:
            row['N1_w']=sentence[buffer[-2]]['form']
            row['N1_p']=sentence[buffer[-2]]['xpos']
            row['N1_w_p']=sentence[buffer[-2]]['form']+'_'+sentence[buffer[-2]]['xpos']
        else:
            row['N1_w']='None'
            row['N1_p']='None'
            row['N1_w_p']='None'
        if len(buffer)>=3:
            row['N2_w']=sentence[buffer[-3]]['form']
            row['N2_p']=sentence[buffer[-3]]['xpos']
            row['N2_w_p']=sentence[buffer[-3]]['form']+'_'+sentence[buffer[-3]]['xpos']
        else:
            row['N2_w']='None'
            row['N2_p']='None'
            row['N2_w_p']='None'
        if label!=-1:
            if transition[1]=='':
                label[counter]=transition[0]
            else:
                label[counter]=transition[0]+','+transition[1]
        data[counter]=row
        counter+=1
    return counter
            

In [12]:
import numpy as np
import pandas as pd
import itertools
def extract_features(states,transitions,feature_names,sentence,data,counter,label=-1):
    for state,transition in zip(states,transitions):
        buffer,stack,arcs=state
        row={}
        for feature_name in feature_names[:-1]:
            feature_tokens=feature_name.split('_')
            cur_word=0
            val=''
            for token in feature_tokens:
                
                #determining word
                if token[0]=='S':
                    if int(token[1])<len(stack):
                        cur_word=stack[-1-int(token[1])]
                        if  cur_word==0:
                            row[feature_name]='None'
                            break
                    else:
                        row[feature_name]='None'
                        break
                elif token[0]=='N':
                    if int(token[1])<len(buffer):
                        cur_word=buffer[-1-int(token[1])]
                        if  cur_word==0:
                            row[feature_name]='None'
                            break
                    else:
                        row[feature_name]='None'
                        break
                
                #determining feature of the word
                else:
                    if token=='w':
                        if len(val)==0:
                            val=sentence[cur_word]['form']
                            break
                        else:
                            val=val+'_'+sentence[cur_word]['form']
                            break
                    elif token=='p':
                        if len(val)==0:
                            val=sentence[cur_word]['xpos']
                            break
                        else:
                            val=val+'_'+sentence[cur_word]['xpos']
                            break
                           
            row[feature_name]=val
        if transition[1]=='':
            row['transition']=transition[0]
        else:
            row['transition']=transition[0]+','+transition[1]
            
        data[counter]=row
        counter+=1
    return counter
                            

In [13]:
sentences=get_sentences(train_file_stat)

In [14]:
sentences[0]

{1: {'form': 'यह',
  'lemma': 'यह',
  'upos': 'DET',
  'xpos': 'DEM',
  'head': 2,
  'deprel': 'det'},
 2: {'form': 'एशिया',
  'lemma': 'एशिया',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'head': 6,
  'deprel': 'nmod'},
 3: {'form': 'की',
  'lemma': 'का',
  'upos': 'ADP',
  'xpos': 'PSP',
  'head': 2,
  'deprel': 'case'},
 4: {'form': 'सबसे',
  'lemma': 'सबसे',
  'upos': 'ADV',
  'xpos': 'INTF',
  'head': 5,
  'deprel': 'advmod'},
 5: {'form': 'बड़ी',
  'lemma': 'बड़ा',
  'upos': 'ADJ',
  'xpos': 'JJ',
  'head': 6,
  'deprel': 'amod'},
 6: {'form': 'मस्जिदों',
  'lemma': 'मस्जिद',
  'upos': 'NOUN',
  'xpos': 'NN',
  'head': 9,
  'deprel': 'nmod'},
 7: {'form': 'में',
  'lemma': 'में',
  'upos': 'ADP',
  'xpos': 'PSP',
  'head': 6,
  'deprel': 'case'},
 8: {'form': 'से',
  'lemma': 'से',
  'upos': 'ADP',
  'xpos': 'PSP',
  'head': 6,
  'deprel': 'case'},
 9: {'form': 'एक',
  'lemma': 'एक',
  'upos': 'NUM',
  'xpos': 'QC',
  'head': 0,
  'deprel': 'root'},
 10: {'form': 'है',
  'lemma': 'है',

In [15]:
filt_sentences=filter_non_projective(sentences)

In [16]:
len(filt_sentences)

11467

In [17]:
feature_names=get_feature_names(['single_words'])
feature_names.append('transition')

In [18]:
import pandas as pd
from tqdm.notebook import tqdm
data={}
label={}
counter=0
for i in tqdm(range(len(filt_sentences))):
    states,transitions=parse_eager(filt_sentences[i])
    counter=extract_features_oneword(states,transitions,filt_sentences[i],data,counter,label)
    #counter=extract_features(states,transitions,feature_names,filt_sentences[i],data,counter)

HBox(children=(FloatProgress(value=0.0, max=11467.0), HTML(value='')))




In [19]:
df = pd.DataFrame.from_dict(data,'index')

In [20]:
df

Unnamed: 0,S0_w,S0_p,S0_w_p,N0_w,N0_p,N0_w_p,N1_w,N1_p,N1_w_p,N2_w,N2_p,N2_w_p
0,,,,यह,DEM,यह_DEM,एशिया,NNP,एशिया_NNP,की,PSP,की_PSP
1,यह,DEM,यह_DEM,एशिया,NNP,एशिया_NNP,की,PSP,की_PSP,सबसे,INTF,सबसे_INTF
2,,,,एशिया,NNP,एशिया_NNP,की,PSP,की_PSP,सबसे,INTF,सबसे_INTF
3,एशिया,NNP,एशिया_NNP,की,PSP,की_PSP,सबसे,INTF,सबसे_INTF,बड़ी,JJ,बड़ी_JJ
4,की,PSP,की_PSP,सबसे,INTF,सबसे_INTF,बड़ी,JJ,बड़ी_JJ,मस्जिदों,NN,मस्जिदों_NN
...,...,...,...,...,...,...,...,...,...,...,...,...
437546,दूतावास,NNP,दूतावास_NNP,किया,VM,किया_VM,।,SYM,।_SYM,,,
437547,यहां,PRP,यहां_PRP,किया,VM,किया_VM,।,SYM,।_SYM,,,
437548,उन्होंने,PRP,उन्होंने_PRP,किया,VM,किया_VM,।,SYM,।_SYM,,,
437549,,,,किया,VM,किया_VM,।,SYM,।_SYM,,,


In [21]:
label=np.array(list(label.values()))

In [22]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore',sparse=True,dtype=int)
x=enc.fit_transform(df)

In [23]:
x

<437551x138980 sparse matrix of type '<class 'numpy.int64'>'
	with 5250612 stored elements in Compressed Sparse Row format>

In [24]:
from sklearn.naive_bayes import GaussianNB,BernoulliNB
model=model = BernoulliNB()
model.fit(x,label)

BernoulliNB()

In [27]:
states,transitions=parse_eager(sentences[0])


In [None]:
df=extract_features(states,transitions,feature_names,sentences[0])

In [None]:
df

In [None]:
df=extract_features_oneword(states,transitions,sentences[0])
df

In [None]:
buffer,stack,arcs=states[0]

In [28]:
for i in range(len(states)):
    print('buffer:\t',states[i][0])
    print('stack:\t',states[i][1])
    print('arcs:\t',states[i][2])
    print('transition:\t',transitions[i])

buffer:	 [11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
stack:	 [0]
arcs:	 {}
transition:	 ('shift', '')
buffer:	 [11, 10, 9, 8, 7, 6, 5, 4, 3, 2]
stack:	 [0, 1]
arcs:	 {}
transition:	 ('left_arc', 'det')
buffer:	 [11, 10, 9, 8, 7, 6, 5, 4, 3, 2]
stack:	 [0]
arcs:	 {2: [(1, 'det'), (3, 'case')]}
transition:	 ('shift', '')
buffer:	 [11, 10, 9, 8, 7, 6, 5, 4, 3]
stack:	 [0, 2]
arcs:	 {2: [(1, 'det'), (3, 'case')]}
transition:	 ('right_arc', 'case')
buffer:	 [11, 10, 9, 8, 7, 6, 5, 4]
stack:	 [0, 2, 3]
arcs:	 {2: [(1, 'det'), (3, 'case')]}
transition:	 ('reduce', '')
buffer:	 [11, 10, 9, 8, 7, 6, 5, 4]
stack:	 [0, 2]
arcs:	 {2: [(1, 'det'), (3, 'case')]}
transition:	 ('shift', '')
buffer:	 [11, 10, 9, 8, 7, 6, 5]
stack:	 [0, 2, 4]
arcs:	 {2: [(1, 'det'), (3, 'case')]}
transition:	 ('left_arc', 'advmod')
buffer:	 [11, 10, 9, 8, 7, 6, 5]
stack:	 [0, 2]
arcs:	 {2: [(1, 'det'), (3, 'case')], 5: [(4, 'advmod')]}
transition:	 ('shift', '')
buffer:	 [11, 10, 9, 8, 7, 6]
stack:	 [0, 2, 5]
arcs:	 {2: [(1, 'd

In [None]:
sentences[0]

In [None]:
import itertools
a=['1','23','e']
b=['d','232','wewe']
for x,y in zip(a,b):
    print(x)
    print(y)

In [None]:
def do_something(a,counter):
    a[4]='dsds'
    counter=5

In [None]:
a={1:'dsa',2:'asd',3:'d'}
counter=1
do_something(a,counter)
print(a)
print(counter)

In [49]:
enc.inverse_transform(x[3])

array([['एशिया', 'NNP', 'एशिया_NNP', 'की', 'PSP', 'की_PSP', 'सबसे',
        'INTF', 'सबसे_INTF', 'बड़ी', 'JJ', 'बड़ी_JJ']], dtype=object)

In [50]:
label[3]

'right_arc,case'

In [51]:
model.predict(x[3])

array(['right_arc,case'], dtype='<U20')

In [52]:
model.predict_proba(x[3])

array([[2.34669102e-049, 2.71559648e-238, 1.05817858e-037,
        1.92568446e-040, 2.22801870e-025, 0.00000000e+000,
        0.00000000e+000, 2.81912833e-093, 1.26308310e-029,
        8.02775019e-018, 0.00000000e+000, 3.77546272e-113,
        1.49841387e-029, 0.00000000e+000, 4.05127231e-046,
        5.09656200e-033, 1.59648976e-016, 5.18373361e-020,
        5.25226836e-168, 2.27849073e-034, 3.40375266e-022,
        1.98450334e-018, 1.36217559e-033, 0.00000000e+000,
        1.27423911e-137, 5.24379398e-009, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 1.53296109e-204,
        2.73213203e-257, 9.21533023e-034, 5.90737155e-034,
        9.99999995e-001, 0.00000000e+000, 0.00000000e+000,
        2.61096691e-023, 1.41447833e-044, 7.15302082e-036,
        0.00000000e+000, 4.08491434e-296, 1.84577671e-017,
        1.24831054e-175, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 2.17161161e-030,
        1.58250475e-035, 0.00000000e+000, 0.00000000e+00

In [41]:
model.classes_

array(['left_arc,acl', 'left_arc,acl:relcl', 'left_arc,advcl',
       'left_arc,advmod', 'left_arc,amod', 'left_arc,aux',
       'left_arc,aux:pass', 'left_arc,case', 'left_arc,cc',
       'left_arc,compound', 'left_arc,cop', 'left_arc,dep',
       'left_arc,det', 'left_arc,dislocated', 'left_arc,iobj',
       'left_arc,mark', 'left_arc,nmod', 'left_arc,nsubj',
       'left_arc,nsubj:pass', 'left_arc,nummod', 'left_arc,obj',
       'left_arc,obl', 'left_arc,punct', 'left_arc,vocative',
       'left_arc,xcomp', 'reduce', 'right_arc,acl', 'right_arc,acl:relcl',
       'right_arc,advcl', 'right_arc,advmod', 'right_arc,amod',
       'right_arc,aux', 'right_arc,aux:pass', 'right_arc,case',
       'right_arc,cc', 'right_arc,compound', 'right_arc,conj',
       'right_arc,cop', 'right_arc,dep', 'right_arc,det',
       'right_arc,dislocated', 'right_arc,mark', 'right_arc,nmod',
       'right_arc,nsubj', 'right_arc,nummod', 'right_arc,obj',
       'right_arc,obl', 'right_arc,punct', 'right_arc,r