In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd
import numpy as np

import dp
from dp import models

## Data Read and Write

In [3]:
# Project helpers: `dl` reads/writes CoNLL files, `ds` converts between the
# loaded splits and sentence objects.
dl = dp.data_loader.DataLoader()
ds = dp.dataset.Dataset()

# Read the training file configured as dp.config.ENG_TRAIN_1K.
df_splits = dl.read_conll(Path(dp.config.ENG_TRAIN_1K))

# Build sentence objects from the splits.
# NOTE(review): test_sentence=False presumably means gold head/relation columns
# are present -- confirm against Dataset.sentences_from_splits.
sentences = ds.sentences_from_splits(df_splits, test_sentence = False)

In [4]:
# Show the token list of the sentence at index 1.
sentences[1].tokens

[token_id:1 | form:Ms. | lemma:ms. | pos:NNP | xpos:_ | morph:_ | head:2 | relation:NMOD,
 token_id:2 | form:Haag | lemma:haag | pos:NNP | xpos:_ | morph:_ | head:3 | relation:SBJ,
 token_id:3 | form:plays | lemma:play | pos:VBZ | xpos:_ | morph:_ | head:0 | relation:ROOT,
 token_id:4 | form:Elianti | lemma:elianti | pos:NNP | xpos:_ | morph:_ | head:3 | relation:OBJ,
 token_id:5 | form:. | lemma:. | pos:. | xpos:_ | morph:_ | head:3 | relation:P]

In [5]:
# Show the same sentence as a tree; each arc is displayed as "head -> dependent".
sentences[1].to_tree()

[2 -> 1, 3 -> 2, 0 -> 3, 3 -> 4, 3 -> 5]

In [7]:
# len() of a sentence gives its number of tokens.
len(sentences[1])

5

In [8]:
# Sentences support integer indexing: this is the first token of the first sentence.
sentences[0][0]

token_id:1 | form:In | lemma:in | pos:IN | xpos:_ | morph:_ | head:43 | relation:ADV

In [9]:
# The same token reached through the .tokens list.
sentences[0].tokens[0]

token_id:1 | form:In | lemma:in | pos:IN | xpos:_ | morph:_ | head:43 | relation:ADV

In [10]:
# df_splits = ds.splits_from_sentences(sentences)

# dummy_splits = []
# for split in df_splits:
#     split["dummy1"] = "_"
#     split["dummy2"] = "_"
#     dummy_splits.append(split)

# dl.to_conll(dp.config.OUTPUT / Path("dummy.conll06"), dummy_splits)

# Eisner

In [11]:
# Toy 4x4 arc-score matrix for exercising the Eisner decoder.
# Rows appear to index heads and columns dependents: the arcs 0 -> 2, 2 -> 1 and
# 2 -> 3 (10 + 30 + 30) add up to the 70.0 that fit() reports for this matrix.
ml_score = np.array([
    [-9999, 9, 10, 9],
    [np.inf, -9999, 20, 3],
    [np.inf, 30, -9999, 30],
    [np.inf, 11, 0, -9999],
])
# Take the node count from the matrix itself so the two cannot drift apart
# when the example is edited (it was previously a hard-coded 4).
no_tokens = ml_score.shape[0]

eis = models.eisner.Eisner()
# Last expression of the cell: the value returned by fit() is displayed.
eis.fit(no_tokens, ml_score)

70.0

In [12]:
# Display the score matrix; the np.inf entries make the whole array floating point.
ml_score

array([[-9.999e+03,  9.000e+00,  1.000e+01,  9.000e+00],
       [       inf, -9.999e+03,  2.000e+01,  3.000e+00],
       [       inf,  3.000e+01, -9.999e+03,  3.000e+01],
       [       inf,  1.100e+01,  0.000e+00, -9.999e+03]])

In [13]:
# Backtrack over the full span, from node 0 to the last node (no_tokens - 1).
# The call displays nothing; its result is presumably what
# eis.backtrack_execution_result exposes afterwards -- TODO confirm.
eis.execute_backtrack(0, no_tokens-1)

In [14]:
# Dump the backtrack table: a defaultdict with the keys 'o_r', 'o_l', 'c_r' and
# 'c_l', each mapping a (start, end) span to an integer index.
print(eis.backtrack)

defaultdict(<class 'dict'>, {'o_r': {(0, 1): 0, (1, 2): 1, (2, 3): 2, (0, 2): 0, (1, 3): 2, (0, 3): 0}, 'o_l': {(0, 1): 0, (1, 2): 1, (2, 3): 2, (0, 2): 0, (1, 3): 2, (0, 3): 2}, 'c_r': {(0, 1): 0, (1, 2): 1, (2, 3): 2, (0, 2): 0, (1, 3): 1, (0, 3): 0}, 'c_l': {(0, 1): 1, (1, 2): 2, (2, 3): 3, (0, 2): 2, (1, 3): 2, (0, 3): 2}})


In [15]:
# Build a Tree from the backtrack result.
# NOTE(review): sentences[1] has 5 tokens while the toy score matrix covers only
# 4 nodes; the sentence is presumably just a carrier object here -- confirm.
dp.core.Tree.from_eisner(sentences[1],eis.backtrack_execution_result)

[2 -> 3, 0 -> 2, 2 -> 1]

# Transition Based

In [17]:
# Initial arc-standard configuration for a three-token toy example:
# node 0 on the stack, tokens 1..3 in the buffer, no arcs built yet.
stack = [0]
buffer = [1, 2, 3]
arcs = set()
# Gold arcs written as (head, dependent) pairs.
correct_arcs = {(2, 1), (2, 3), (0, 2)}
# Larger alternative example, kept for reference:
# correct_arcs = set(((2,1), (2,3), (5,4),(2,5), (6,2), (8, 7), (6,8), (6,9), (0,6)))
start_config = models.arc_standard.Configuration(stack, buffer, arcs)

In [18]:
# Oracle over the toy configuration, given the gold arcs it should reproduce.
o = models.arc_standard.Oracle(start_config, correct_arcs)

In [19]:
# execute() returns two parallel lists: transition names and Configuration objects.
correct_sequence, configs_sequence = o.execute()

In [20]:
# Display both lists; Configuration objects show only their default repr.
correct_sequence, configs_sequence

(['shift', 'left', 'shift', 'right', 'right', 'shift'],
 [<dp.models.arc_standard.Configuration at 0x7fd15026a950>,
  <dp.models.arc_standard.Configuration at 0x7fd15026a990>,
  <dp.models.arc_standard.Configuration at 0x7fd15026aa10>,
  <dp.models.arc_standard.Configuration at 0x7fd15026aa50>,
  <dp.models.arc_standard.Configuration at 0x7fd15026aa90>,
  <dp.models.arc_standard.Configuration at 0x7fd15026ab50>])

In [21]:
# Arcs held by the last configuration; these should equal correct_arcs.
configs_sequence[-1].arcs

{(0, 2), (2, 1), (2, 3)}

In [22]:
# Six hand-written score triples, each ordered [left, right, shift].
# They are handed to ArcStandard in a later cell.
model_scores = [
    [0.3, 0.1, 0.6],
    [0.9, 0.05, 0.05],
    [0.2, 0.35, 0.45],
    [0.1, 0.2, 0.7],
    [0.1, 0.7, 0.2],
    [0.7, 0.1, 0.2],
]

In [24]:
# Arc-standard parser driven by the hand-written scores, starting from start_config.
arcstand = models.arc_standard.ArcStandard(start_config, model_scores)

In [25]:
# Run the parser; the returned configuration carries the arcs it built.
final_config = arcstand.execute()

In [27]:
# Arcs produced by the score-driven run, for comparison with the oracle's arcs.
final_config.arcs

{(0, 2), (2, 1), (2, 3)}

# Feature Extraction

In [4]:
# Fresh loader/dataset helpers for this section.
dl = dp.data_loader.DataLoader()
ds = dp.dataset.Dataset()

# Read the training file configured as dp.config.ENG_TRAIN_FULL.
# This rebinds df_splits and sentences from the earlier 1K load.
df_splits = dl.read_conll(Path(dp.config.ENG_TRAIN_FULL))

sentences = ds.sentences_from_splits(df_splits, test_sentence = False)
# One tree per sentence, in the same order as `sentences`.
trees = [s.to_tree() for s in sentences]

In [4]:
# potential_trees = dp.core.make_potential_trees( sentences)

In [32]:
# Work with the sentence at index 1 and its matching tree.
sentence = sentences[1]
tree = trees[1]

In [33]:
# Initial configuration for the selected sentence: node 0 on the stack and the
# ids 1..len(tree.arcs) in the buffer.
# NOTE(review): sizing the buffer by len(tree.arcs) assumes one arc per token -- confirm.
stack = [0]
buffer = list(range(1, len(tree.arcs)+1))
arcs = set()
# Gold arcs as (head, dependent) pairs taken from the tree.
correct_arcs = {(arc.head, arc.dep) for arc in tree.arcs}

# Run the oracle to get the gold transition names and their configurations.
start_config = models.arc_standard.Configuration(stack, buffer, arcs)
o = models.arc_standard.Oracle(start_config, correct_arcs)
correct_sequence, configs_sequence = o.execute()

# Disabled: score-driven ArcStandard run from the same start configuration.
# model_scores = [[0.3, 0.1, 0.6], [0.9, 0.05, 0.05],[0.2, 0.35, 0.45], [0.1, 0.2, 0.7], [0.1, 0.7, 0.2],[0.7, 0.1, 0.2]]  #[[left, right, shift]]
# arcstand = models.arc_standard.ArcStandard(start_config, model_scores ,correct_sequence)
# final_config = arcstand.execute()

In [34]:
# for sentence, tree in zip(sentences, trees):
#     stack = [0]
#     buffer = list(range(1, len(sentence)+1))
#     arcs = set()
#     correct_arcs = {(arc.head, arc.dep) for arc in tree.arcs}
    
#     start_config = models.arc_standard.Configuration(stack, buffer, arcs)
#     o = models.arc_standard.Oracle(start_config, correct_arcs)
#     correct_sequence, configs_sequence = o.execute()
# #     import pdb; pdb.set_trace()
#     model_scores = []
#     for i in correct_sequence:
#         index = {"left": 0, "right": 1, "shift": 2}.get(i)
#         dummy = [0, 0, 0]
#         dummy[index] = 1
#         model_scores.append(dummy)
# #     import pdb; pdb.set_trace()
# #     model_scores = [[0.3, 0.1, 0.6], [0.9, 0.05, 0.05],[0.2, 0.35, 0.45], [0.1, 0.2, 0.7], [0.1, 0.7, 0.2],[0.7, 0.1, 0.2]]  #[[left, right, shift]]
#     arcstand = models.arc_standard.ArcStandard(start_config, model_scores ,correct_sequence)
#     final_config = arcstand.execute()
#     assert final_config.arcs == correct_arcs
    

In [5]:
# Feature-template registry; templates are grouped by algorithm and then by name.
temps = dp.template.Templates()

# Eisner templates over the head (h*) and dependent (d*) of an arc.
temps.add_template(
    "eisner",
    "unigram",
    [
        "hform",
        "hpos",
        "hform+hpos",
        "dform",
        "dpos",
        "dform+dpos",
    ],
)

temps.add_template(
    "eisner",
    "bigram",
    [
        "hform+hpos+dform+dpos",
        "hpos+dform+dpos",
        "hform+dform+dpos",
        "hform+hpos+dform",
        "hform+hpos+dpos",
        "hform+dform",
        "hpos+dpos",
    ],
)

# Arc-standard templates over stack (S[i]) and buffer (B[i]) positions:
# 12 single features followed by 4 combined ones.
temps.add_template(
    "arc_standard",
    "nivre",
    [
        "S[0]-form",
        "S[0]-pos",
        "S[0]-lemma",
        "B[0]-form",
        "B[0]-lemma",
        "B[0]-pos",
        "B[1]-pos",
        "S[1]-pos",
        "ld(S[0])",
        "rd(S[0])",
        "ld(B[0])",
        "rd(B[0])",
    ],
)

temps.add_template(
    "arc_standard",
    "nivre_bigram",
    [
        "S[0]-form+S[0]-pos+B[0]-form+B[0]-pos",
        "S[0]-form+S[0]-pos+B[0]-form",
        "S[0]-form+B[0]-form+B[0]-pos",
        "S[0]-form+S[0]-pos+B[0]-pos",
    ],
)

In [6]:
# All template lists registered under "eisner", one inner list per template name.
temps.get_algo_templates("eisner")

[['hform', 'hpos', 'hform+hpos', 'dform', 'dpos', 'dform+dpos'],
 ['hform+hpos+dform+dpos',
  'hpos+dform+dpos',
  'hform+dform+dpos',
  'hform+hpos+dform',
  'hform+hpos+dpos',
  'hform+dform',
  'hpos+dpos']]

In [11]:
# Fresh loader/dataset helpers for the development data.
dl = dp.data_loader.DataLoader()
ds = dp.dataset.Dataset()

# Read the development file configured as dp.config.ENG_DEV_GOLD.
df_splits = dl.read_conll(Path(dp.config.ENG_DEV_GOLD))

dev_sentences = ds.sentences_from_splits(df_splits, test_sentence = False)
# Build the dev trees from the dev sentences. This previously iterated over
# `sentences`, which yields the training trees under the dev name.
dev_trees = [s.to_tree() for s in dev_sentences]

In [8]:
# Feature extractor for the arc-standard templates over a single sentence/tree
# pair, fed the start configuration followed by the oracle's configurations.
# Depends on start_config and configs_sequence from the oracle cell earlier in
# this section; the NameError recorded under this cell comes from a kernel in
# which that cell had not been run.
fe = dp.feature_extraction.FeatureExtraction([sentences[1]], [trees[1]], temps,"arc_standard" ,
                                             use_templates = ["nivre", "nivre_bigram"],
                                             configs = [[start_config]+configs_sequence])

NameError: name 'start_config' is not defined

In [None]:
# Tokens of the sentence the features are extracted from (cell not executed).
sentences[1].tokens

In [42]:
# Feature ids for one configuration: 16 integers, matching the 12 + 4
# arc_standard templates registered earlier.
fe.extract_feature_arcstandard(sentences[1], configs_sequence[1])

[0, 1, 2, 19, 20, 5, 21, 7, 8, 9, 23, 24, 29, 30, 31, 15]

In [43]:
# Feature ids for every configuration in the sequence, one 16-element row each.
fe.extract_feature_arcstandard_full(sentences[1], configs_sequence)

[[16, 17, 18, 19, 20, 5, 21, 22, 8, 9, 23, 24, 25, 26, 27, 28],
 [0, 1, 2, 19, 20, 5, 21, 7, 8, 9, 23, 24, 29, 30, 31, 15],
 [32, 17, 33, 34, 35, 36, 6, 22, 37, 38, 23, 24, 39, 40, 41, 42],
 [0, 1, 2, 34, 35, 36, 6, 7, 8, 9, 23, 24, 43, 44, 45, 46],
 [47, 48, 49, 50, 51, 5, 52, 22, 37, 38, 10, 11, 53, 54, 55, 56],
 [0, 1, 2, 34, 35, 36, 52, 7, 8, 9, 23, 24, 43, 44, 45, 46],
 [47, 48, 49, 57, 58, 59, 60, 22, 37, 38, 10, 11, 61, 62, 63, 64],
 [0, 1, 2, 34, 35, 36, 60, 7, 37, 38, 23, 24, 43, 44, 45, 46],
 [65, 66, 67, 68, 69, 70, 60, 7, 8, 9, 23, 24, 71, 72, 73, 74],
 [0, 1, 2, 75, 76, 77, 60, 7, 37, 38, 10, 11, 71, 72, 73, 74]]

In [31]:
# fe = dp.feature_extraction.FeatureExtraction(sentences, trees, temps,"eisner" ,use_templates = ["unigram"],configs = None)

# Eisner Perceptron

In [7]:
# Eisner setup: potential trees plus a feature extractor over the "unigram" and
# "bigram" eisner templates. This rebinds `eis` and `fe` from earlier sections.
eis = models.eisner.Eisner()
# NOTE(review): potential trees are built for every sentence, while only the
# first ten sentences are passed to FeatureExtraction -- confirm the mismatch
# is intended.
potential_trees = eis.make_potential_trees(sentences)
fe = dp.feature_extraction.FeatureExtraction(sentences[:10],
                                             potential_trees, 
                                             temps,
                                             "eisner",
                                             use_templates = ["unigram",
                                                              "bigram"
                                                             ],
                                             configs = None)
# eis.fit(no_tokens, ml_score)

100%|██████████| 10/10 [00:00<00:00, 60.36it/s]


In [8]:
# Number of entries in the extractor's vocabulary; passed to EisnerPerceptron below.
vocab_size = len(fe.vocab)

In [9]:
# Evaluation helper handed to the perceptron.
eva = dp.evaluation.Evaluation()

In [12]:
# Perceptron built around the Eisner decoder, wired with the vocabulary size,
# decoder, feature extractor and evaluator, plus the dev sentences as gold data.
eisner_perceptron = models.eisner.EisnerPerceptron(vocab_size,
                                                   eis,
                                                   fe,
                                                   eva,
                                                   dev_gold_sentences = dev_sentences,
                                                   )

In [13]:
# eisner_perceptron.train(sentences,
#                         trees,
#                         potential_trees,
#                         epochs = 100,
#                         path = dp.config.OUTPUT)

In [14]:
# Potential trees for whatever dev_sentences currently holds.
dev_potential_trees = eis.make_potential_trees(dev_sentences)

In [16]:
# Predict trees for the dev sentences.
# NOTE(review): load_from_path points at dp.config.OUTPUT, presumably where a
# previous training run stored its weights (the train call above is commented
# out) -- confirm.
predicted_trees = eisner_perceptron.test(dev_potential_trees, dev_gold_sentences=dev_sentences,
                      load_from_path = dp.config.OUTPUT, test = True)

100%|██████████| 1083/1083 [01:34<00:00, 11.43it/s]


In [19]:
# Fresh loader/dataset helpers for the test data.
dl = dp.data_loader.DataLoader()
ds = dp.dataset.Dataset()

# Read the file configured as dp.config.ENG_TEST.
df_splits = dl.read_conll(Path(dp.config.ENG_TEST))

# NOTE(review): the test split is stored under the dev_* names and replaces the
# dev data loaded earlier; the following cells read dev_sentences and therefore
# operate on the test set.
dev_sentences = ds.sentences_from_splits(df_splits, test_sentence = False)
dev_trees = [s.to_tree() for s in dev_sentences]

In [20]:
# Potential trees for whatever dev_sentences currently holds.
dev_potential_trees = eis.make_potential_trees(dev_sentences)

In [21]:
# Predict trees for the sentences currently bound to dev_sentences.
# NOTE(review): load_from_path is "" here but dp.config.OUTPUT in the earlier
# test call -- presumably this reuses weights already in memory; confirm.
predicted_trees = eisner_perceptron.test(dev_potential_trees, dev_gold_sentences=dev_sentences,
                       load_from_path = "", test = True)

100%|██████████| 1382/1382 [01:53<00:00, 12.23it/s]


In [22]:
# Pair each sentence with its predicted tree by position and take the head
# information from the tree.
predicted_sentences = [dev_sentences[i].get_head_info_from(predicted_trees[i]) for i in range(len(dev_sentences))]

In [23]:
# Attach the predicted heads to each sentence before converting to splits.
# This line used to sit after the splits were built, where it had no effect on
# the written file; computing it first makes the cell self-contained.
predicted_sentences = [dev_sentences[i].get_head_info_from(predicted_trees[i]) for i in range(len(dev_sentences))]
df_splits = ds.splits_from_sentences(predicted_sentences)

# Add two placeholder columns filled with "_" to every split before writing.
# NOTE(review): presumably this pads to the column count the .conll06 format
# expects -- confirm.
dummy_splits = []
for split in df_splits:
    split["dummy1"] = "_"
    split["dummy2"] = "_"
    dummy_splits.append(split)

# Write the predictions into the configured output directory.
dl.to_conll(dp.config.OUTPUT / Path("english_prediction_test.conll06"), dummy_splits)



# Transition Perceptron

In [44]:
# Fresh loader/dataset helpers for this section.
dl = dp.data_loader.DataLoader()
ds = dp.dataset.Dataset()

# Back to the 1K training file; this rebinds `sentences` and `trees` from the
# full training set loaded in the feature-extraction section.
df_splits = dl.read_conll(Path(dp.config.ENG_TRAIN_1K))

sentences = ds.sentences_from_splits(df_splits, test_sentence = False)
trees = [s.to_tree() for s in sentences]

In [45]:
# Fresh loader/dataset helpers for the development data.
dl = dp.data_loader.DataLoader()
ds = dp.dataset.Dataset()

# Read the development file configured as dp.config.ENG_DEV_GOLD.
df_splits = dl.read_conll(Path(dp.config.ENG_DEV_GOLD))

dev_sentences = ds.sentences_from_splits(df_splits, test_sentence = False)
# Build the dev trees from the dev sentences. This previously iterated over
# `sentences`, which yields the training trees under the dev name.
dev_trees = [s.to_tree() for s in dev_sentences]

In [46]:
from pathlib import Path

In [47]:
# Tiny debugging sample: just the first sentence and its tree.
d_sentences = [sentences[0]]
d_trees = [trees[0]]

# Larger alternative: the first 100 sentences/trees repeated ten times.
# d_sentences = sentences[:100] * 10
# d_trees = trees[:100] * 10

In [48]:
# Separate registry holding only the arc-standard templates:
# 12 single stack/buffer features followed by 4 combined ones.
template = dp.template.Templates()
template.add_template(
    "arc_standard",
    "nivre",
    [
        "S[0]-form",
        "S[0]-pos",
        "S[0]-lemma",
        "B[0]-form",
        "B[0]-lemma",
        "B[0]-pos",
        "B[1]-pos",
        "S[1]-pos",
        "ld(S[0])",
        "rd(S[0])",
        "ld(B[0])",
        "rd(B[0])",
    ],
)
template.add_template(
    "arc_standard",
    "nivre_bigram",
    [
        "S[0]-form+S[0]-pos+B[0]-form+B[0]-pos",
        "S[0]-form+S[0]-pos+B[0]-form",
        "S[0]-form+B[0]-form+B[0]-pos",
        "S[0]-form+S[0]-pos+B[0]-pos",
    ],
)

## Train the arc-standard perceptron

In [49]:
# Arc-standard perceptron with its own Evaluation helper.
# NOTE(review): dev_sentences is given d_sentences (the single training sentence
# picked above), not the dev set loaded from ENG_DEV_GOLD -- confirm this is
# intended beyond debugging.
arcperceptron = models.arc_standard.ArcStandardPerceptron(evaluation = dp.evaluation.Evaluation(),
                                                         dev_sentences = d_sentences
                                                         )

In [None]:
# Train on the first 100 sentences/trees for 20 epochs with the arc-standard
# templates (cell not executed in this export).
# NOTE(review): `w` presumably holds the learned weights -- confirm against train().
w = arcperceptron.train(sentences[:100],trees[:100], template, epochs = 20)