In [2]:
from sklearn.preprocessing import OneHotEncoder
from collections import defaultdict
import numpy as np
import pandas as pd
import itertools
import pandas as pd
from tqdm.notebook import tqdm
from collections import OrderedDict
from sklearn.naive_bayes import GaussianNB,BernoulliNB

In [3]:
train_file_sent="UD_Hindi-HDTB/hi_hdtb-ud-train.txt"
train_file_stat="UD_Hindi-HDTB/hi_hdtb-ud-train.conllu" 
test_file_sent="UD_Hindi-HDTB/hi_hdtb-ud-test.txt"
test_file_stat="UD_Hindi-HDTB/hi_hdtb-ud-test.conllu" 
val_file_sent="UD_Hindi-HDTB/hi_hdtb-ud-dev.txt"
val_file_stat="UD_Hindi-HDTB/hi_hdtb-ud-dev.conllu" 

In [4]:
def file_lines(name):
    lines=[]
    with open(name,'r',encoding = 'utf-8') as f:
        lines=f.readlines()
    for i in range(0,len(lines)):
        lines[i]=lines[i].strip().split('\t')
    return lines

In [5]:
def extract_stats(line):
    stats=dict()
    stats['form']=line[1]
    stats['lemma']=line[2]
    stats['upos']=line[3]
    stats['xpos']=line[4]
    stats['head']=int(line[6])
    stats['deprel']=line[7]
    return int(line[0]),stats

In [6]:
def path_exists(head,i,graph):
    while i!=head and i!=0:
        i=graph[i]
    return i==head
def projectivity(sentence):
    graph=dict()
    for word in sentence:
        word_id=word
        head_id=sentence[word_id]['head']
        graph[word_id]=head_id
    for dependent in graph:
        head=graph[dependent]
        l= min(head,dependent)+1
        r=max(head,dependent)
        for i in range(l,r,1):
            if not path_exists(head,i,graph):
                return False            
    return True

In [7]:
def filter_non_projective(sentences):
    filt_sentences=[sentence for sentence in sentences if projectivity(sentence)]
    return filt_sentences

In [8]:
def get_sentences(file):
    lines=file_lines(file)
    sentences=[]
    sentence={}
    for line in lines:
        if len(line)<10:
            if len(sentence)>0:
                #do something with sentence
                sentences.append(sentence)
                sentence={}
        else:
            id,stats=extract_stats(line)
            sentence[id]=stats
    return sentences

In [None]:
def parse(sentence):
    buffer=list()
    stack=[0]
    arcs=list()
    states=[]
    transitions=[]
    def perform_action(action,label=''):
        nonlocal buffer,stack,arcs,states,transitions
        states.append((list(buffer),list(stack),list(arcs)))
        transitions.append((action,label))
        if action=='shift':
            stack.append(buffer.pop())
        elif action=='left_arc':
            arcs.append((stack[-1],stack[-2],label))
            stack.pop(-2)
        elif action=='right_arc':
            arcs.append((stack[-2],stack[-1],label))
            stack.pop()
    dependency_graph = defaultdict(lambda: defaultdict())
    for word in reversed(sentence): #store in reverse in buffer, so top of buffer is first word
        buffer.append(word['id'])
        dependency_graph[word['head']][word['id']]=word['deprel']
    states=[]
    transitions=[]
    head_found=dict()
    while not (len(stack)==1 and stack[0]==0 and len(buffer)==0):
        if len(stack)>=2:
            if dependency_graph.get(stack[-1]) is not None and dependency_graph[stack[-1]].get(stack[-2]) is not None:
                head_found[stack[-2]]=True
                perform_action(action='left_arc',label=dependency_graph[stack[-1]][stack[-2]])
            elif dependency_graph.get(stack[-2]) is not None and dependency_graph[stack[-2]].get(stack[-1]) is not None:
                if dependency_graph.get(stack[-1]) is not None and any([dependent not in head_found for dependent in dependency_graph[stack[-1]].keys()]):
                    perform_action(action='shift')
                else:
                    head_found[stack[-1]]=True
                    perform_action(action='right_arc',label=dependency_graph[stack[-2]][stack[-1]])
            else:
                perform_action(action='shift')
        else:
            perform_action(action='shift')
    return states,transitions

In [9]:
def parse_eager(sentence):
    buffer=list()
    stack=[0]
    arcs={}
    states=[]
    transitions=[]
    def perform_action(action,label=''):
        nonlocal buffer,stack,arcs,states,transitions
        states.append((list(buffer),list(stack),arcs.copy()))
        transitions.append((action,label))
        if action=='shift':
            stack.append(buffer.pop())
        elif action=='reduce':
            stack.pop()
        elif action=='left_arc':
            if arcs.get(buffer[-1])==None:
                arcs[buffer[-1]]=[(stack.pop(),label)]
            else:
                arcs[buffer[-1]].append((stack.pop(),label))
        elif action=='right_arc':
            if arcs.get(stack[-1])==None:
                arcs[stack[-1]]=[(buffer[-1],label)]
            else:
                arcs[stack[-1]].append((buffer[-1],label))
            stack.append(buffer.pop())
    dependency_graph = defaultdict(lambda: defaultdict())
    for word in reversed(sentence): #store in reverse in buffer, so top of buffer is first word
        buffer.append(word)
        dependency_graph[sentence[word]['head']][word]=sentence[word]['deprel']
    head_found=dict()
    while not len(buffer)==0:
        if len(stack)>=1:
            if stack[-1]!=0 and head_found.get(stack[-1])==None and dependency_graph.get(buffer[-1]) is not None and dependency_graph[buffer[-1]].get(stack[-1]) is not None:
                head_found[stack[-1]]=True
                perform_action(action='left_arc',label=dependency_graph[buffer[-1]][stack[-1]])
            elif head_found.get(buffer[-1])==None and dependency_graph.get(stack[-1]) is not None and dependency_graph[stack[-1]].get(buffer[-1]) is not None:
                head_found[buffer[-1]]=True
                perform_action(action='right_arc',label=dependency_graph[stack[-1]][buffer[-1]])
            elif head_found.get(stack[-1])!=None:
                perform_action(action='reduce')
            else:
                perform_action(action='shift')
        else:
            perform_action(action='shift')
    return states,transitions

In [10]:
def get_feature_names(feature_array):
    feature_names=[]
    if 'single_words' in feature_array:
        feature_names.extend(['S0_w_p','S0_w','S0_p','N0_w_p','N0_w','N0_p','N1_w_p','N1_w','N1_p','N2_w_p','N2_w','N2_p'])
    return feature_names
    

In [11]:
def extract_features_oneword(state,row,sentence):
    buffer,stack,arcs=state
    if len(stack)>1:
        row['S0_w']=sentence[stack[-1]]['form']
        row['S0_p']=sentence[stack[-1]]['xpos']
        row['S0_w_p']=sentence[stack[-1]]['form']+'_'+sentence[stack[-1]]['xpos']
    else:
        row['S0_w']='None'
        row['S0_p']='None'
        row['S0_w_p']='None'
    if len(buffer)>=1:
        row['N0_w']=sentence[buffer[-1]]['form']
        row['N0_p']=sentence[buffer[-1]]['xpos']
        row['N0_w_p']=sentence[buffer[-1]]['form']+'_'+sentence[buffer[-1]]['xpos']
    else:
        row['N0_w']='None'
        row['N0_p']='None'
        row['N0_w_p']='None'
    if len(buffer)>=2:
        row['N1_w']=sentence[buffer[-2]]['form']
        row['N1_p']=sentence[buffer[-2]]['xpos']
        row['N1_w_p']=sentence[buffer[-2]]['form']+'_'+sentence[buffer[-2]]['xpos']
    else:
        row['N1_w']='None'
        row['N1_p']='None'
        row['N1_w_p']='None'
    if len(buffer)>=3:
        row['N2_w']=sentence[buffer[-3]]['form']
        row['N2_p']=sentence[buffer[-3]]['xpos']
        row['N2_w_p']=sentence[buffer[-3]]['form']+'_'+sentence[buffer[-3]]['xpos']
    else:
        row['N2_w']='None'
        row['N2_p']='None'
        row['N2_w_p']='None'
    return row

In [12]:

def extract_features(states,transitions,sentence,data,counter,label):
    for state,transition in zip(states,transitions):
        row={}
        row=extract_features_oneword(state,row,sentence)
        if transition[1]=='':
            label[counter]=transition[0]
        else:
            label[counter]=transition[0]+','+transition[1]
        data[counter]=row
        counter+=1
    return counter
            

In [13]:

def extract_features_old(states,transitions,feature_names,sentence,data,counter,label=-1):
    for state,transition in zip(states,transitions):
        buffer,stack,arcs=state
        row={}
        for feature_name in feature_names[:-1]:
            feature_tokens=feature_name.split('_')
            cur_word=0
            val=''
            for token in feature_tokens:
                
                #determining word
                if token[0]=='S':
                    if int(token[1])<len(stack):
                        cur_word=stack[-1-int(token[1])]
                        if  cur_word==0:
                            row[feature_name]='None'
                            break
                    else:
                        row[feature_name]='None'
                        break
                elif token[0]=='N':
                    if int(token[1])<len(buffer):
                        cur_word=buffer[-1-int(token[1])]
                        if  cur_word==0:
                            row[feature_name]='None'
                            break
                    else:
                        row[feature_name]='None'
                        break
                
                #determining feature of the word
                else:
                    if token=='w':
                        if len(val)==0:
                            val=sentence[cur_word]['form']
                            break
                        else:
                            val=val+'_'+sentence[cur_word]['form']
                            break
                    elif token=='p':
                        if len(val)==0:
                            val=sentence[cur_word]['xpos']
                            break
                        else:
                            val=val+'_'+sentence[cur_word]['xpos']
                            break
                           
            row[feature_name]=val
        if transition[1]=='':
            row['transition']=transition[0]
        else:
            row['transition']=transition[0]+','+transition[1]
            
        data[counter]=row
        counter+=1
    return counter
                            

In [14]:
def parse_eager_test(sentence,model,enc):
    buffer=list()
    stack=[0]
    arcs={}
    states=[]
    transitions=[]
    def perform_action(action,label=''):
        nonlocal buffer,stack,arcs,states,transitions
        states.append((list(buffer),list(stack),arcs.copy()))
        transitions.append((action,label))
        if action=='shift':
            stack.append(buffer.pop())
        elif action=='reduce':
            stack.pop()
        elif action=='left_arc':
            if arcs.get(buffer[-1])==None:
                arcs[buffer[-1]]=[(stack.pop(),label)]
            else:
                arcs[buffer[-1]].append((stack.pop(),label))
        elif action=='right_arc':
            if arcs.get(stack[-1])==None:
                arcs[stack[-1]]=[(buffer[-1],label)]
            else:
                arcs[stack[-1]].append((buffer[-1],label))
            stack.append(buffer.pop())
    for word in reversed(sentence): #store in reverse in buffer, so top of buffer is first word
        buffer.append(word)
    head_found=dict()
    while not len(buffer)==0:
        state=(list(buffer),list(stack),arcs.copy())
        row={}
        extract_features_oneword(state,row,sentence)
        data={0:row}
        df = pd.DataFrame.from_dict(data,'index')
        x=enc.transform(df)
        decision_probs=np.argsort(-1 *model.predict_proba(x)[0])
        print(state)
        for decision in decision_probs:
            temp=model.classes_[decision].split(',')
            print(temp)
            transition=temp[0]
            label=temp[1] if len(temp)>1 else ''
            if transition=='left_arc':
                if stack[-1]!=0 and head_found.get(stack[-1])==None and len(stack)>=1:
                    head_found[stack[-1]]=buffer[-1]
                    perform_action(action='left_arc',label=label)
                    break
            if transition=='right_arc':
                if len(stack)>=1 and head_found.get(buffer[-1])==None:
                    head_found[buffer[-1]]=stack[-1]
                    perform_action(action='right_arc',label=label) 
                    break
            if transition=='reduce':
                if head_found.get(stack[-1])!=None and len(stack)>=1:
                    perform_action(action='reduce')
                    break
            if (transition=='shift' and stack[-1] in arcs) or decision==decision_probs[-1]:
                perform_action(action='shift')
                break
    return head_found,states,transitions

In [15]:
def get_model():
    sentences=get_sentences(train_file_stat)
    filt_sentences=filter_non_projective(sentences)
    data={}
    label={}
    counter=0
    for i in tqdm(range(len(filt_sentences))):
        states,transitions=parse_eager(filt_sentences[i])
        counter=extract_features(states,transitions,filt_sentences[i],data,counter,label)
    df = pd.DataFrame.from_dict(data,'index')
    label=np.array(list(label.values()))
    enc = OneHotEncoder(handle_unknown='ignore',sparse=True,dtype=int)
    x=enc.fit_transform(df)
    model= BernoulliNB()
    model.fit(x,label)
    return model,enc

In [16]:
model,enc=get_model()

HBox(children=(FloatProgress(value=0.0, max=11467.0), HTML(value='')))




In [17]:
sentences=get_sentences(test_file_stat)

In [18]:
filt_sentences=filter_non_projective(sentences)

In [19]:
heads,states,transitions=parse_eager_test(filt_sentences[0],model,enc)

([14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1], [0], {})
['shift']
['right_arc', 'root']
([14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2], [0, 1], {0: [(1, 'root')]})
['right_arc', 'case']
([14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3], [0, 1, 2], {0: [(1, 'root')], 1: [(2, 'case')]})
['reduce']
([14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3], [0, 1], {0: [(1, 'root')], 1: [(2, 'case')]})
['shift']
([14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4], [0, 1, 3], {0: [(1, 'root')], 1: [(2, 'case')]})
['left_arc', 'compound']
([14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4], [0, 1], {0: [(1, 'root')], 1: [(2, 'case')], 4: [(3, 'compound')]})
['shift']
([14, 13, 12, 11, 10, 9, 8, 7, 6, 5], [0, 1, 4], {0: [(1, 'root')], 1: [(2, 'case')], 4: [(3, 'compound')]})
['shift']
([14, 13, 12, 11, 10, 9, 8, 7, 6], [0, 1, 4, 5], {0: [(1, 'root')], 1: [(2, 'case')], 4: [(3, 'compound')]})
['shift']
['reduce']
['right_arc', 'case']
([14, 13, 12, 11, 10, 9, 8, 7], [0, 1, 4, 5, 6], {0: [(1, 'root')], 1: [(2, 'case')], 4: [(3, 'compound

In [20]:
heads

{1: 0, 2: 1, 3: 4, 6: 5, 8: 7, 7: 9, 10: 11, 11: 12, 9: 12, 12: 13, 14: 13}

In [43]:
states

[([14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1], [0], {}),
 ([14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2], [0, 1], {}),
 ([14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3], [0, 1, 2], {1: [(2, 'case')]}),
 ([14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3], [0, 1], {1: [(2, 'case')]}),
 ([14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4], [0, 1, 3], {1: [(2, 'case')]}),
 ([14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4],
  [0, 1],
  {1: [(2, 'case')], 4: [(3, 'compound')]}),
 ([14, 13, 12, 11, 10, 9, 8, 7, 6, 5],
  [0, 1, 4],
  {1: [(2, 'case')], 4: [(3, 'compound')]}),
 ([14, 13, 12, 11, 10, 9, 8, 7, 6],
  [0, 1, 4, 5],
  {1: [(2, 'case')], 4: [(3, 'compound')]}),
 ([14, 13, 12, 11, 10, 9, 8, 7],
  [0, 1, 4, 5, 6],
  {1: [(2, 'case')], 4: [(3, 'compound')], 5: [(6, 'case')]}),
 ([14, 13, 12, 11, 10, 9, 8, 7],
  [0, 1, 4, 5],
  {1: [(2, 'case')], 4: [(3, 'compound')], 5: [(6, 'case')]}),
 ([14, 13, 12, 11, 10, 9, 8],
  [0, 1, 4, 5, 7],
  {1: [(2, 'case')], 4: [(3, 'compound')], 5: [(6, 'case')]}),
 ([14, 13, 12, 11, 10, 

In [21]:
filt_sentences[0]

{1: {'form': 'इसके',
  'lemma': 'यह',
  'upos': 'PRON',
  'xpos': 'PRP',
  'head': 12,
  'deprel': 'nmod'},
 2: {'form': 'अतिरिक्त',
  'lemma': 'अतिरिक्त',
  'upos': 'ADP',
  'xpos': 'PSP',
  'head': 1,
  'deprel': 'case'},
 3: {'form': 'गुग्गुल',
  'lemma': 'गुग्गुल',
  'upos': 'PROPN',
  'xpos': 'NNPC',
  'head': 4,
  'deprel': 'compound'},
 4: {'form': 'कुंड',
  'lemma': 'कुंड',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'head': 12,
  'deprel': 'nsubj'},
 5: {'form': ',',
  'lemma': 'COMMA',
  'upos': 'PUNCT',
  'xpos': 'SYM',
  'head': 7,
  'deprel': 'punct'},
 6: {'form': 'भीम',
  'lemma': 'भीम',
  'upos': 'PROPN',
  'xpos': 'NNPC',
  'head': 7,
  'deprel': 'compound'},
 7: {'form': 'गुफा',
  'lemma': 'गुफा',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'head': 4,
  'deprel': 'conj'},
 8: {'form': 'तथा',
  'lemma': 'तथा',
  'upos': 'CCONJ',
  'xpos': 'CC',
  'head': 9,
  'deprel': 'cc'},
 9: {'form': 'भीमशिला',
  'lemma': 'भीमशिला',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'head': 4,
  'deprel':

In [None]:
filt_sentences=filter_non_projective(sentences)

In [None]:
len(filt_sentences)

In [None]:
feature_names=get_feature_names(['single_words'])
feature_names.append('transition')

In [None]:

data={}
label={}
counter=0
for i in tqdm(range(len(filt_sentences))):
    states,transitions=parse_eager(filt_sentences[i])
    counter=extract_features_oneword(states,transitions,filt_sentences[i],data,counter,label)
    #counter=extract_features(states,transitions,feature_names,filt_sentences[i],data,counter)

In [None]:
df = pd.DataFrame.from_dict(data,'index')

In [None]:
df

In [None]:
label=np.array(list(label.values()))

In [None]:

enc = OneHotEncoder(handle_unknown='ignore',sparse=True,dtype=int)
x=enc.fit_transform(df)

In [None]:
x

In [None]:

model=model = BernoulliNB()
model.fit(x,label)

In [None]:
states,transitions=parse_eager(sentences[0])


In [None]:
df

In [None]:
df=extract_features_oneword(states,transitions,sentences[0])
df

In [None]:
buffer,stack,arcs=states[0]

In [None]:
for i in range(len(states)):
    print('buffer:\t',states[i][0])
    print('stack:\t',states[i][1])
    print('arcs:\t',states[i][2])
    print('transition:\t',transitions[i])

In [None]:
sentences[0]

In [None]:
import itertools
a=['1','23','e']
b=['d','232','wewe']
for x,y in zip(a,b):
    print(x)
    print(y)

In [None]:
def do_something(a,counter):
    a[4]='dsds'
    counter=5

In [None]:
a={1:'dsa',2:'asd',3:'d'}
counter=1
do_something(a,counter)
print(a)
print(counter)

In [14]:
sentences=get_sentences(train_file_stat)
filt_sentences=filter_non_projective(sentences)
data={}
label={}
counter=0


In [15]:
states,transitions=parse_eager(filt_sentences[1])

In [16]:
states

[([7, 6, 5, 4, 3, 2, 1], [0], {}),
 ([7, 6, 5, 4, 3, 2], [0, 1], {}),
 ([7, 6, 5, 4, 3], [0, 1, 2], {}),
 ([7, 6, 5, 4, 3], [0, 1], {3: [(2, 'compound'), (4, 'case')]}),
 ([7, 6, 5, 4], [0, 1, 3], {3: [(2, 'compound'), (4, 'case')]}),
 ([7, 6, 5], [0, 1, 3, 4], {3: [(2, 'compound'), (4, 'case')]}),
 ([7, 6, 5], [0, 1, 3], {3: [(2, 'compound'), (4, 'case')]}),
 ([7, 6, 5],
  [0, 1],
  {3: [(2, 'compound'), (4, 'case')],
   5: [(3, 'nsubj'), (1, 'obj'), (6, 'aux'), (7, 'punct')]}),
 ([7, 6, 5],
  [0],
  {3: [(2, 'compound'), (4, 'case')],
   5: [(3, 'nsubj'), (1, 'obj'), (6, 'aux'), (7, 'punct')]}),
 ([7, 6],
  [0, 5],
  {3: [(2, 'compound'), (4, 'case')],
   5: [(3, 'nsubj'), (1, 'obj'), (6, 'aux'), (7, 'punct')]}),
 ([7],
  [0, 5, 6],
  {3: [(2, 'compound'), (4, 'case')],
   5: [(3, 'nsubj'), (1, 'obj'), (6, 'aux'), (7, 'punct')]}),
 ([7],
  [0, 5],
  {3: [(2, 'compound'), (4, 'case')],
   5: [(3, 'nsubj'), (1, 'obj'), (6, 'aux'), (7, 'punct')]})]

In [17]:
filt_sentences[1]

{1: {'form': 'इसे',
  'lemma': 'यह',
  'upos': 'PRON',
  'xpos': 'PRP',
  'head': 5,
  'deprel': 'obj'},
 2: {'form': 'नवाब',
  'lemma': 'नवाब',
  'upos': 'NOUN',
  'xpos': 'NNC',
  'head': 3,
  'deprel': 'compound'},
 3: {'form': 'शाहजेहन',
  'lemma': 'शाहजेहन',
  'upos': 'PROPN',
  'xpos': 'NNP',
  'head': 5,
  'deprel': 'nsubj'},
 4: {'form': 'ने',
  'lemma': 'ने',
  'upos': 'ADP',
  'xpos': 'PSP',
  'head': 3,
  'deprel': 'case'},
 5: {'form': 'बनवाया',
  'lemma': 'बनवा',
  'upos': 'VERB',
  'xpos': 'VM',
  'head': 0,
  'deprel': 'root'},
 6: {'form': 'था',
  'lemma': 'था',
  'upos': 'AUX',
  'xpos': 'VAUX',
  'head': 5,
  'deprel': 'aux'},
 7: {'form': '।',
  'lemma': '।',
  'upos': 'PUNCT',
  'xpos': 'SYM',
  'head': 5,
  'deprel': 'punct'}}