
# Group Project - Transfer Quality Estimation

In [1]:
# pip install openkiwi==0.1.2
import kiwi
import torch
import numpy as np

import torch.nn as nn
import torch.optim as optim

import shutil

import os
import csv
import glob

from kiwi.data.utils import read_file
from pathlib import Path
from more_itertools import all_equal, flatten

import subprocess

import yaml
from ipywidgets import interact, fixed, Textarea
from functools import partial
%load_ext yamlmagic

In [2]:
# define a function to read my data
def readData(file):

    tsv_file = open(file)
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    tsv_content = [row[0] for row in read_tsv]
    
#     keys    = tsv_content[0]
#     content = tsv_content[1:]
    
    return tsv_content

In [3]:
def myStrReformat(an_input, tag = 'tags'):
    '''
    tag is a string, indicate where the data is from.
    It has different format if from different sources.
    
    ['tags', 'gap_tags', 'sentence_scores']
    
    'tags' and 'gap_tags' have the same format.
    'sentence_scores' has different format. 
    
    an_input,
        the input data for the tags. 
    
    
    '''
    str_tag = []
    for a_group in an_input:

        # 'sentence_scores' tags are with different format. 
        if tag == 'sentence_scores':
            a_group_str = str(a_group)
            
        # below is for 'tags' and 'gap_tags'    
        else:
            a_group_str = []
            for a_tag in a_group: a_group_str.append(str(a_tag))

        str_tag.append(a_group_str)
        
        

    return str_tag

In [4]:
def concat(probabilities, prob_sep='|', token_sep='\n', sentence_sep='\n\n'):
    flat_probabilities = [list(flatten(probs)) for probs in probabilities]
#     if not all_equal([len(p) for p in flat_probabilities]):
#         logger.error('Number of tokens do not match.')
#         return None

    probs_per_token_sentence_file = [
        list(zip(*parallel_probs)) for parallel_probs in zip(*probabilities)
    ]

    content_str = sentence_sep.join(
        [
            token_sep.join(
                [prob_sep.join(map(str, tokens)) for tokens in sentence]
            )
            for sentence in probs_per_token_sentence_file
        ]
    )
    content_str += sentence_sep  # Add a trailing newline before EOF.

    return content_str

In [5]:
def concat_tags(tags_read, word_gap='\t', sentence_gap ='\n'):
    '''
    Within each sentence, separate the words with ' '; between each sentence,
    separate the sentences with '\n'.


    tags_read,
    a list, each element is a sublist,
    within sublist, each element is a string of a prediction score.

    '''

    sentence_level_join = [ word_gap.join( a_sentence ) for a_sentence in tags_read]

    all_sentences_join = sentence_gap.join( sentence_level_join )
    all_sentences_join += sentence_gap




    return all_sentences_join

In [6]:
def save_config(yaml_config, name):
    """ Writes yaml config to file"""
    with open(name, 'w') as outfile:
        yaml.dump(yaml_config, outfile, default_flow_style=False)

## Replicate WMT18 result as our baseline, word level and setence level using en_de.nmt data

In [9]:
# directory
KIWI         = os.getcwd()
# KIWI = KIWI + '/baselineWMT18'
# which_target = '/target_1'
# which_output = '/tags'

SCRIPT_DIR = KIWI + '/baseline'
# LINEAR_DIR=  SCRIPT_DIR + '/linear'

# RUN_DIR     = SCRIPT_DIR + '/estimator'
WORD_DIR   = KIWI + '/data/word-level/en_de.nmt'
SENT_DIR    = KIWI + '/data/sentence_level/en_de'

PREDICTIONS_DIR = 'predictions_baseline'

In [10]:
MODEL_DIR = os.listdir(SCRIPT_DIR)
test_source = WORD_DIR+'/dev.src'
test_target = WORD_DIR+'/dev.mt'
test_alignment = WORD_DIR + '/dev.src-mt.alignments'
test_source = readData(test_source)
test_target = readData(test_target)
test_alignment = readData(test_alignment)
examples = {'source': test_source,'target': test_target}
examples2 = {'source': test_source,'target': test_target, 'alignments':test_alignment}

In [11]:
# make 'linear' as the folder appear the last.
MODEL_DIR.remove('linear')
MODEL_DIR.append('linear')

In [13]:
## produce output for ape, nuqe and estimators
## use linear model to score stacked output for APE, nuqe and estimators


for MODEL_NAME in MODEL_DIR:
    if MODEL_NAME == 'ape':
        RUN_DIR = os.listdir(SCRIPT_DIR+'/'+MODEL_NAME)
        for RUN_NAME in RUN_DIR:
            if RUN_NAME !='.DS_Store':
                SIDE = SCRIPT_DIR+'/'+MODEL_NAME+'/'+RUN_NAME
                OUTPUT_DIR =  PREDICTIONS_DIR +'/'+MODEL_NAME+'/'+RUN_NAME
                print('Copying ape ' +RUN_NAME+ ' model predictions')
                copyape = shutil.copytree(SIDE, OUTPUT_DIR) 
                print ('Copied ape ' +RUN_NAME)
                print('=========================================')
                
    if MODEL_NAME == 'estimator':
        RUN_DIR = os.listdir(SCRIPT_DIR+'/'+MODEL_NAME)
        for RUN_NAME in RUN_DIR:
            if RUN_NAME !='.DS_Store':

                configfile = SCRIPT_DIR+'/' +MODEL_NAME +'/'+ RUN_NAME+'/predict.yaml'
                modelfile = SCRIPT_DIR+'/' +MODEL_NAME +'/'+ RUN_NAME+'/model.torch'
                model = kiwi.load_model(modelfile)
                modelout = model.predict(examples)
                
                for tag in list(modelout.keys()):
                    
                    print('making estimator ' +RUN_NAME+ ' '+ tag + ' model predictions')

                    all_sentences_join = concat_tags(    myStrReformat(modelout[tag], tag=tag)  )

                    outpath = PREDICTIONS_DIR +'/'+MODEL_NAME+'/'+RUN_NAME + '/dev'
                    myfile = outpath+'/'+tag
                    

                    isExist = os.path.exists(outpath)

                    if not isExist:
                        # Create a new directory because it does not exist 
                        os.makedirs(outpath)
                    #     print("The new directory is created!")

                    Path(myfile).write_text(all_sentences_join)     
                    
                    if tag == 'sentence_scores':
                        sentence_space = read_file(myfile)
                        sentence_no_space_to_write = concat_tags(sentence_space, word_gap='', sentence_gap ='\n')
                        myoutput = outpath+'/'+tag+'_nospace'
                        Path(myoutput).write_text(sentence_no_space_to_write)

                    print('=========================================')
                    
                    
                                
    if MODEL_NAME == 'nuqe':
        RUN_DIR = os.listdir(SCRIPT_DIR+'/'+MODEL_NAME)
        for RUN_NAME in RUN_DIR:
            if RUN_NAME !='.DS_Store':

                configfile = SCRIPT_DIR+'/' +MODEL_NAME +'/'+ RUN_NAME+'/predict.yaml'
                modelfile = SCRIPT_DIR+'/' +MODEL_NAME +'/'+ RUN_NAME+'/model.torch'
                model = kiwi.load_model(modelfile)
                modelout = model.predict(examples2)
                
                for tag in list(modelout.keys()):
                    
                    print('making nuqe ' +RUN_NAME+ ' '+ tag + ' model predictions')

                    all_sentences_join = concat_tags(    myStrReformat(modelout[tag], tag=tag)  )

                    outpath = PREDICTIONS_DIR +'/'+MODEL_NAME+'/'+RUN_NAME + '/dev'
                    myfile = outpath+'/'+tag
                    

                    isExist = os.path.exists(outpath)

                    if not isExist:
                        # Create a new directory because it does not exist 
                        os.makedirs(outpath)
                    #     print("The new directory is created!")

                    Path(myfile).write_text(all_sentences_join)

                    print('=========================================')
                    
    if MODEL_NAME == 'linear':
        ## stack probability for linear model
        
        print('stacking probablities for linear model')  
        
        APE_mt_tags     = PREDICTIONS_DIR + '/ape/mt_pe/dev/tags'
        APE_source_tags = PREDICTIONS_DIR + '/ape/src_pe/dev/tags'
        target1_tags    = PREDICTIONS_DIR + '/estimator/target_1/dev/tags' 
        target2_tags    = PREDICTIONS_DIR + '/estimator/target_2/dev/tags' 
        target3_tags    = PREDICTIONS_DIR + '/estimator/target_3/dev/tags' 
        target4_tags    = PREDICTIONS_DIR + '/estimator/target_4/dev/tags' 
        nuqe_tags       = PREDICTIONS_DIR + '/nuqe/target/dev/tags' 

        predicted_file = [APE_mt_tags, APE_source_tags, target1_tags, target2_tags, target3_tags, target4_tags, nuqe_tags]

        probabilities = [read_file(file) for file in predicted_file]

        lengths = [len(prob) for prob in probabilities]

        stacked_probabilities = concat(probabilities)

        outpath = PREDICTIONS_DIR +'/' +MODEL_NAME +'/dev'
        myfile = outpath+'/tags.stacked'

        isExist = os.path.exists(outpath)

        if not isExist:
            # Create a new directory because it does not exist 
            os.makedirs(outpath)
        #     print("The new directory is created!")

        Path(myfile).write_text(stacked_probabilities)

        print('=========================================')   
        
        print('=====predicting using linear model')  
        
        myshell = SCRIPT_DIR + '/run_linear.sh'
        myshell
        subprocess.run([myshell], shell = True)
        print('=========================================')   
                    
                    

stacking probablities for linear model
=====predicting using linear model
print LINEAR_OUT_DEV /Users/roseyao/Documents/GIT/7643DeepLearning/Project_Final/predictions_baseline/linear/dev
Predicting with model linear
Generating dev set predictions...


## Below are all the evaluation results for baseline model.

In [16]:
%%yaml stackedtag_evaluate
# ==========  Stacked Linear Model (target tags)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-target: data/word-level/en_de.nmt/dev.tags

# File path for the prediction files
pred-target: predictions_baseline/linear/dev/tags

<IPython.core.display.Javascript object>

In [17]:
save_config(stackedtag_evaluate, 'stackedtag_evaluate.yml')
!kiwi evaluate --config stackedtag_evaluate.yml

---------------------------------------------------------------------------
Word-level scores for tags:
File                                    F1_mult      F1_OK        F1_BAD   
predictions_baseline/linear/dev/tags    0.44114      0.91768      0.48072  


In [18]:
%%yaml targettags_evaluate
# ==========  Average Ensemble Model (target tags)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-target: data/word-level/en_de.nmt/dev.tags

# File path for the prediction files
pred-target: [predictions_baseline/estimator/target_1/dev/tags,
              predictions_baseline/estimator/target_2/dev/tags,
              predictions_baseline/estimator/target_3/dev/tags,
              predictions_baseline/estimator/target_4/dev/tags,
              predictions_baseline/ape/src_pe/dev/tags,
              predictions_baseline/ape/mt_pe/dev/tags,
              predictions_baseline/nuqe/target/dev/tags]

<IPython.core.display.Javascript object>

In [19]:
save_config(targettags_evaluate, 'targettags_evaluate.yml')
!kiwi evaluate --config targettags_evaluate.yml

------------------------------------------------------------------
Word-level scores for tags:
File (predictions_baseline)    F1_mult      F1_OK        F1_BAD   
*ensemble*                     0.42984      0.93154      0.46143  
estimator/target_2/dev/tags    0.39154      0.91794      0.42654  
estimator/target_1/dev/tags    0.38646      0.9063       0.42642  
estimator/target_4/dev/tags    0.38601      0.89551      0.43105  
estimator/target_3/dev/tags    0.38268      0.91024      0.42042  
ape/src_pe/dev/tags            0.37535      0.87182      0.43054  
nuqe/target/dev/tags           0.31824      0.90716      0.35080  
ape/mt_pe/dev/tags             0.10961      0.93035      0.11782  


In [20]:
%%yaml gaptags_evaluate
# ==========  Average Ensemble Model (gap tags)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-target: data/word-level/en_de.nmt/dev.tags

# File path for the prediction files
pred-gaps: [predictions_baseline/estimator/target_1/dev/gap_tags,
              predictions_baseline/estimator/target_2/dev/gap_tags,
              predictions_baseline/estimator/target_3/dev/gap_tags,
              predictions_baseline/estimator/target_4/dev/gap_tags,
              predictions_baseline/ape/src_pe/dev/gap_tags,
              predictions_baseline/ape/mt_pe/dev/gap_tags,
              predictions_baseline/nuqe/gaps/dev/gap_tags]
                        

<IPython.core.display.Javascript object>

In [22]:
save_config(gaptags_evaluate, 'gaptags_evaluate.yml')
!kiwi evaluate --config gaptags_evaluate.yml  

----------------------------------------------------------------------
Word-level scores for gap_tags:
File (predictions_baseline)        F1_mult      F1_OK        F1_BAD   
*ensemble*                         0.24596      0.98889      0.24872  
ape/src_pe/dev/gap_tags            0.21709      0.97691      0.22222  
estimator/target_3/dev/gap_tags    0.20058      0.97785      0.20513  
estimator/target_2/dev/gap_tags    0.19112      0.98109      0.19481  
ape/mt_pe/dev/gap_tags             0.18978      0.98749      0.19218  
estimator/target_4/dev/gap_tags    0.18868      0.9825       0.19204  
estimator/target_1/dev/gap_tags    0.18662      0.97599      0.19121  
nuqe/gaps/dev/gap_tags             0.16613      0.98256      0.16908  


In [23]:
%%yaml sourcetags_evaluate
# ==========  Average Ensemble Model (source tags)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-source: data/word-level/en_de.nmt/dev.src_tags

# File path for the prediction files
pred-source: [predictions_baseline/estimator/source/dev/source_tags ,
              predictions_baseline/ape/src_pe/dev/source_tags,
              predictions_baseline/ape/mt_pe/dev/source_tags,
              predictions_baseline/nuqe/source/dev/source_tags]
                        

<IPython.core.display.Javascript object>

In [24]:
save_config(sourcetags_evaluate, 'sourcetags_evaluate.yml')
!kiwi evaluate --config sourcetags_evaluate.yml  

-----------------------------------------------------------------------
Word-level scores for source_tags:
File (predictions_baseline)         F1_mult      F1_OK        F1_BAD   
ape/src_pe/dev/source_tags          0.34533      0.86644      0.39856  
*ensemble*                          0.30920      0.93068      0.33223  
nuqe/source/dev/source_tags         0.30304      0.8977       0.33758  
estimator/source/dev/source_tags    0.29327      0.89974      0.32595  
ape/mt_pe/dev/source_tags           0.09245      0.92956      0.09945  


In [25]:
%%yaml sentencescores_evaluate
# ==========  Average Ensemble Model (sentence scores)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-sents: data/sentence_level/en_de/dev.nmt.hter

# File path for the prediction files
pred-sents: [predictions_baseline/estimator/target_1/dev/sentence_scores_nospace,
              predictions_baseline/estimator/target_2/dev/sentence_scores_nospace,
              predictions_baseline/estimator/target_3/dev/sentence_scores_nospace,
              predictions_baseline/estimator/target_4/dev/sentence_scores_nospace]


<IPython.core.display.Javascript object>

In [26]:
save_config(sentencescores_evaluate, 'sentencescores_evaluate.yml')
!kiwi evaluate --config sentencescores_evaluate.yml  

----------------------------------------------------------------------------
Sentence-level scoring:
File (predictions_baseline/estimator)    Pearson r    MAE          RMSE     
*ensemble*                               0.52135      0.11146      0.16236  
target_4/dev/sentence_scores_nospace     0.49223      0.14070      0.17561  
target_1/dev/sentence_scores_nospace     0.49075      0.11220      0.16649  
target_2/dev/sentence_scores_nospace     0.49053      0.11263      0.17098  
target_3/dev/sentence_scores_nospace     0.47417      0.10782      0.17175  
----------------------------------------------------------------------------
Sentence-level ranking:
File (predictions_baseline/estimator)    Spearman r    DeltaAvg 
*ensemble*                               0.56869       0.08266  
target_2/dev/sentence_scores_nospace     0.55191       0.08088  
target_4/dev/sentence_scores_nospace     0.54020       0.07987  
target_3/dev/sentence_scores_nospace     0.53244       0.07856

## Transfer learning 1: Adding WMT2019 and 2020 data to transfer learn predictor estimator models 1-4

In [86]:
# directory
KIWI         = os.getcwd()
# KIWI = KIWI + '/transfer2'
# which_target = '/target_1'
# which_output = '/tags'

SCRIPT_DIR = KIWI + '/transfer1'
# LINEAR_DIR=  SCRIPT_DIR + '/linear'

# RUN_DIR     = SCRIPT_DIR + '/estimator'
WORD_DIR   = KIWI + '/data/word-level/en_de.nmt'
SENT_DIR    = KIWI + '/data/sentence_level/en_de'

PREDICTIONS_DIR = 'predictions_transfer1'
PREDICTIONS_BASE = 'predictions_baseline'

MODEL_DIR = os.listdir(SCRIPT_DIR)

MODEL_DIR.remove('linear')
MODEL_DIR.append('linear')

In [87]:
## produce output for ape, nuqe and estimators
## use linear model to score stacked output for APE, nuqe and estimators


for MODEL_NAME in MODEL_DIR:

                
    if MODEL_NAME == 'estimator':
        RUN_DIR = os.listdir(SCRIPT_DIR+'/'+MODEL_NAME)
        for RUN_NAME in RUN_DIR:
            if RUN_NAME !='.DS_Store':

#                 configfile = SCRIPT_DIR+'/' +MODEL_NAME +'/'+ RUN_NAME+'/predict.yaml'
                modelfile = SCRIPT_DIR+'/' +MODEL_NAME +'/'+ RUN_NAME+'/model.torch'
                model = kiwi.load_model(modelfile)
                modelout = model.predict(examples)
                
                for tag in list(modelout.keys()):
                    
                    print('making estimator ' +RUN_NAME+ ' '+ tag + ' model predictions')

                    all_sentences_join = concat_tags(    myStrReformat(modelout[tag], tag=tag)  )

                    outpath = PREDICTIONS_DIR +'/'+MODEL_NAME+'/'+RUN_NAME + '/dev'
                    myfile = outpath+'/'+tag
                    

                    isExist = os.path.exists(outpath)

                    if not isExist:
                        # Create a new directory because it does not exist 
                        os.makedirs(outpath)
                    #     print("The new directory is created!")

                    Path(myfile).write_text(all_sentences_join)     
                    
                    if tag == 'sentence_scores':
                        sentence_space = read_file(myfile)
                        sentence_no_space_to_write = concat_tags(sentence_space, word_gap='', sentence_gap ='\n')
                        myoutput = outpath+'/'+tag+'_nospace'
                        Path(myoutput).write_text(sentence_no_space_to_write)

                    print('=========================================')
                                      
                    
    if MODEL_NAME == 'linear':
        ## stack probability for linear model
        
        print('stacking probablities for linear model')  
        
        APE_mt_tags     = PREDICTIONS_BASE + '/ape/mt_pe/dev/tags'
        APE_source_tags = PREDICTIONS_BASE + '/ape/src_pe/dev/tags'
        target1_tags    = PREDICTIONS_DIR + '/estimator/target_1/dev/tags' 
        target2_tags    = PREDICTIONS_DIR + '/estimator/target_2/dev/tags' 
        target3_tags    = PREDICTIONS_DIR + '/estimator/target_3/dev/tags' 
        target4_tags    = PREDICTIONS_DIR + '/estimator/target_4/dev/tags' 
        nuqe_tags       = PREDICTIONS_BASE + '/nuqe/target/dev/tags' 

        predicted_file = [APE_mt_tags, APE_source_tags, target1_tags, target2_tags, target3_tags, target4_tags, nuqe_tags]

        probabilities = [read_file(file) for file in predicted_file]

        lengths = [len(prob) for prob in probabilities]

        stacked_probabilities = concat(probabilities)

        outpath = PREDICTIONS_DIR +'/' +MODEL_NAME +'/dev'
        myfile = outpath+'/tags.stacked'

        isExist = os.path.exists(outpath)

        if not isExist:
            # Create a new directory because it does not exist 
            os.makedirs(outpath)
        #     print("The new directory is created!")

        Path(myfile).write_text(stacked_probabilities)

        print('=========================================')   
        
        print('=====predicting using linear model')  
        
        myshell = SCRIPT_DIR + '/run_linear.sh'
        myshell
        subprocess.run([myshell], shell = True)
        print('=========================================')   

making estimator target_2 tags model predictions
making estimator target_2 gap_tags model predictions
making estimator target_2 sentence_scores model predictions
making estimator target_3 tags model predictions
making estimator target_3 gap_tags model predictions
making estimator target_3 sentence_scores model predictions
making estimator target_4 tags model predictions
making estimator target_4 gap_tags model predictions
making estimator target_4 sentence_scores model predictions
making estimator target_1 tags model predictions
making estimator target_1 gap_tags model predictions
making estimator target_1 sentence_scores model predictions
stacking probablities for linear model
=====predicting using linear model
Predicting with model linear
Generating dev set predictions...


## Below are all the evaluation results for transfer1 model.

In [88]:
%%yaml stackedtag_evaluate_1
# ==========  Stacked Linear Model (target tags)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-target: data/word-level/en_de.nmt/dev.tags

# File path for the prediction files
pred-target: predictions_transfer1/linear/dev/tags

<IPython.core.display.Javascript object>

In [89]:
save_config(stackedtag_evaluate_1, 'stackedtag_evaluate_1.yml')
!kiwi evaluate --config stackedtag_evaluate_1.yml

----------------------------------------------------------------------------
Word-level scores for tags:
File                                     F1_mult      F1_OK        F1_BAD   
predictions_transfer1/linear/dev/tags    0.42483      0.90334      0.47029  


In [90]:
%%yaml targettags_evaluate_1
# ==========  Average Ensemble Model (target tags)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-target: data/word-level/en_de.nmt/dev.tags

# File path for the prediction files
pred-target: [predictions_transfer1/estimator/target_1/dev/tags,
              predictions_transfer1/estimator/target_2/dev/tags,
              predictions_transfer1/estimator/target_3/dev/tags,
              predictions_transfer1/estimator/target_4/dev/tags,
              predictions_baseline/ape/src_pe/dev/tags,
              predictions_baseline/ape/mt_pe/dev/tags,
              predictions_baseline/nuqe/target/dev/tags]

<IPython.core.display.Javascript object>

In [91]:
save_config(targettags_evaluate_1, 'targettags_evaluate_1.yml')
!kiwi evaluate --config targettags_evaluate_1.yml

----------------------------------------------------------------------------------------
Word-level scores for tags:
File                                                 F1_mult      F1_OK        F1_BAD   
*ensemble*                                           0.43472      0.92355      0.47071  
predictions_baseline/ape/src_pe/dev/tags             0.37535      0.87182      0.43054  
predictions_transfer1/estimator/target_1/dev/tags    0.37388      0.89471      0.41787  
predictions_transfer1/estimator/target_4/dev/tags    0.36550      0.88431      0.41332  
predictions_transfer1/estimator/target_2/dev/tags    0.36233      0.89648      0.40417  
predictions_transfer1/estimator/target_3/dev/tags    0.33581      0.85707      0.39182  
predictions_baseline/nuqe/target/dev/tags            0.31824      0.90716      0.35080  
predictions_baseline/ape/mt_pe/dev/tags              0.10961      0.93035      0.11782  


In [92]:
%%yaml gaptags_evaluate_1
# ==========  Average Ensemble Model (gap tags)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-target: data/word-level/en_de.nmt/dev.tags

# File path for the prediction files
pred-gaps: [predictions_transfer1/estimator/target_1/dev/gap_tags,
              predictions_transfer1/estimator/target_2/dev/gap_tags,
              predictions_transfer1/estimator/target_3/dev/gap_tags,
              predictions_transfer1/estimator/target_4/dev/gap_tags,
              predictions_baseline/ape/src_pe/dev/gap_tags,
              predictions_baseline/ape/mt_pe/dev/gap_tags,
              predictions_baseline/nuqe/gaps/dev/gap_tags]

<IPython.core.display.Javascript object>

In [93]:
save_config(gaptags_evaluate_1, 'gaptags_evaluate_1.yml')
!kiwi evaluate --config gaptags_evaluate_1.yml 

--------------------------------------------------------------------------------------------
Word-level scores for gap_tags:
File                                                     F1_mult      F1_OK        F1_BAD   
*ensemble*                                               0.21880      0.986        0.22191  
predictions_baseline/ape/src_pe/dev/gap_tags             0.21709      0.97691      0.22222  
predictions_baseline/ape/mt_pe/dev/gap_tags              0.18978      0.98749      0.19218  
predictions_transfer1/estimator/target_4/dev/gap_tags    0.17476      0.96675      0.18078  
predictions_transfer1/estimator/target_1/dev/gap_tags    0.16961      0.97037      0.17479  
predictions_baseline/nuqe/gaps/dev/gap_tags              0.16613      0.98256      0.16908  
predictions_transfer1/estimator/target_3/dev/gap_tags    0.15792      0.96009      0.16449  
predictions_transfer1/estimator/target_2/dev/gap_tags    0.13772      0.96027      0.14342  


In [94]:
%%yaml sentencescores_evaluate_1
# ==========  Average Ensemble Model (sentence scores)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-sents: data/sentence_level/en_de/dev.nmt.hter

# File path for the prediction files
pred-sents: [predictions_transfer1/estimator/target_1/dev/sentence_scores_nospace,
              predictions_transfer1/estimator/target_2/dev/sentence_scores_nospace,
              predictions_transfer1/estimator/target_3/dev/sentence_scores_nospace,
              predictions_transfer1/estimator/target_4/dev/sentence_scores_nospace]

<IPython.core.display.Javascript object>

In [95]:
save_config(sentencescores_evaluate_1, 'sentencescores_evaluate_1.yml')
!kiwi evaluate --config sentencescores_evaluate_1.yml 

-----------------------------------------------------------------------------
Sentence-level scoring:
File (predictions_transfer1/estimator)    Pearson r    MAE          RMSE     
*ensemble*                                0.50296      0.13766      0.17886  
target_1/dev/sentence_scores_nospace      0.47432      0.12782      0.17843  
target_4/dev/sentence_scores_nospace      0.47317      0.12826      0.17359  
target_2/dev/sentence_scores_nospace      0.46344      0.14811      0.19245  
target_3/dev/sentence_scores_nospace      0.45695      0.16438      0.20597  
-----------------------------------------------------------------------------
Sentence-level ranking:
File (predictions_transfer1/estimator)    Spearman r    DeltaAvg 
*ensemble*                                0.54707       0.08267  
target_3/dev/sentence_scores_nospace      0.51913       0.08107  
target_1/dev/sentence_scores_nospace      0.51788       0.07898  
target_4/dev/sentence_scores_nospace      0.51657       0.07919 

## Transfer learning 2: common words

In [98]:
# directory
KIWI         = os.getcwd()
# KIWI = KIWI + '/transfer2'
# which_target = '/target_1'
# which_output = '/tags'

SCRIPT_DIR = KIWI + '/transfer2'
# LINEAR_DIR=  SCRIPT_DIR + '/linear'

# RUN_DIR     = SCRIPT_DIR + '/estimator'
WORD_DIR   = KIWI + '/data/word-level/en_de.nmt'
SENT_DIR    = KIWI + '/data/sentence_level/en_de'

PREDICTIONS_DIR = 'predictions_transfer2'
PREDICTIONS_BASE = 'predictions_baseline'

MODEL_DIR = os.listdir(SCRIPT_DIR)

MODEL_DIR.remove('linear')
MODEL_DIR.append('linear')

In [99]:
## produce output for ape, nuqe and estimators
## use linear model to score stacked output for APE, nuqe and estimators


for MODEL_NAME in MODEL_DIR:
                
    if MODEL_NAME == 'estimator':
        RUN_DIR = os.listdir(SCRIPT_DIR+'/'+MODEL_NAME)
        for RUN_NAME in RUN_DIR:
            if RUN_NAME !='.DS_Store':

#                 configfile = SCRIPT_DIR+'/' +MODEL_NAME +'/'+ RUN_NAME+'/predict.yaml'
                modelfile = SCRIPT_DIR+'/' +MODEL_NAME +'/'+ RUN_NAME+'/model.torch'
                model = kiwi.load_model(modelfile)
                modelout = model.predict(examples)
                
                for tag in list(modelout.keys()):
                    
                    print('making estimator ' +RUN_NAME+ ' '+ tag + ' model predictions')

                    all_sentences_join = concat_tags(    myStrReformat(modelout[tag], tag=tag)  )

                    outpath = PREDICTIONS_DIR +'/'+MODEL_NAME+'/'+RUN_NAME + '/dev'
                    myfile = outpath+'/'+tag
                    

                    isExist = os.path.exists(outpath)

                    if not isExist:
                        # Create a new directory because it does not exist 
                        os.makedirs(outpath)
                    #     print("The new directory is created!")

                    Path(myfile).write_text(all_sentences_join)     
                    
                    if tag == 'sentence_scores':
                        sentence_space = read_file(myfile)
                        sentence_no_space_to_write = concat_tags(sentence_space, word_gap='', sentence_gap ='\n')
                        myoutput = outpath+'/'+tag+'_nospace'
                        Path(myoutput).write_text(sentence_no_space_to_write)

                    print('=========================================')
                    
                    
    if MODEL_NAME == 'linear':
        ## stack probability for linear model
        
        print('stacking probablities for linear model')  
        
        APE_mt_tags     = PREDICTIONS_BASE + '/ape/mt_pe/dev/tags'
        APE_source_tags = PREDICTIONS_BASE + '/ape/src_pe/dev/tags'
        target1_tags    = PREDICTIONS_DIR + '/estimator/target_1/dev/tags' 
        target2_tags    = PREDICTIONS_DIR + '/estimator/target_2/dev/tags' 
        target3_tags    = PREDICTIONS_DIR + '/estimator/target_3/dev/tags' 
        target4_tags    = PREDICTIONS_DIR + '/estimator/target_4/dev/tags' 
        nuqe_tags       = PREDICTIONS_BASE + '/nuqe/target/dev/tags' 

        predicted_file = [APE_mt_tags, APE_source_tags, target1_tags, target2_tags, target3_tags, target4_tags, nuqe_tags]

        probabilities = [read_file(file) for file in predicted_file]

        lengths = [len(prob) for prob in probabilities]

        stacked_probabilities = concat(probabilities)

        outpath = PREDICTIONS_DIR +'/' +MODEL_NAME +'/dev'
        myfile = outpath+'/tags.stacked'

        isExist = os.path.exists(outpath)

        if not isExist:
            # Create a new directory because it does not exist 
            os.makedirs(outpath)
        #     print("The new directory is created!")

        Path(myfile).write_text(stacked_probabilities)

        print('=========================================')   
        
        print('=====predicting using linear model')  
        
        myshell = SCRIPT_DIR + '/run_linear.sh'
        myshell
        subprocess.run([myshell], shell = True)
        print('=========================================')   
                    

making estimator target_2 tags model predictions
making estimator target_2 gap_tags model predictions
making estimator target_2 sentence_scores model predictions
making estimator target_3 tags model predictions
making estimator target_3 gap_tags model predictions
making estimator target_3 sentence_scores model predictions
making estimator target_4 tags model predictions
making estimator target_4 gap_tags model predictions
making estimator target_4 sentence_scores model predictions
making estimator target_1 tags model predictions
making estimator target_1 gap_tags model predictions
making estimator target_1 sentence_scores model predictions
stacking probablities for linear model
=====predicting using linear model
Predicting with model linear
Generating dev set predictions...


## Below are all the evaluation results for transfer 2 model.

In [100]:
%%yaml stackedtag_evaluate_2
# ==========  Stacked Linear Model (target tags)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-target: data/word-level/en_de.nmt/dev.tags

# File path for the prediction files
pred-target: predictions_transfer2/linear/dev/tags

<IPython.core.display.Javascript object>

In [101]:
save_config(stackedtag_evaluate_2, 'stackedtag_evaluate_2.yml')
!kiwi evaluate --config stackedtag_evaluate_2.yml

----------------------------------------------------------------------------
Word-level scores for tags:
File                                     F1_mult      F1_OK        F1_BAD   
predictions_transfer2/linear/dev/tags    0.43780      0.9167       0.47758  


In [102]:
%%yaml targettags_evaluate_2
# ==========  Average Ensemble Model (target tags)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-target: data/word-level/en_de.nmt/dev.tags

# File path for the prediction files
pred-target: [predictions_transfer2/estimator/target_1/dev/tags,
              predictions_transfer2/estimator/target_2/dev/tags,
              predictions_transfer2/estimator/target_3/dev/tags,
              predictions_transfer2/estimator/target_4/dev/tags,
              predictions_baseline/ape/src_pe/dev/tags,
              predictions_baseline/ape/mt_pe/dev/tags,
              predictions_baseline/nuqe/target/dev/tags]

<IPython.core.display.Javascript object>

In [103]:
save_config(targettags_evaluate_2, 'targettags_evaluate_2.yml')
!kiwi evaluate --config targettags_evaluate_2.yml

----------------------------------------------------------------------------------------
Word-level scores for tags:
File                                                 F1_mult      F1_OK        F1_BAD   
*ensemble*                                           0.42335      0.92813      0.45613  
predictions_transfer2/estimator/target_1/dev/tags    0.38250      0.90322      0.42348  
predictions_transfer2/estimator/target_4/dev/tags    0.37735      0.89539      0.42143  
predictions_baseline/ape/src_pe/dev/tags             0.37535      0.87182      0.43054  
predictions_transfer2/estimator/target_2/dev/tags    0.37347      0.91874      0.40651  
predictions_transfer2/estimator/target_3/dev/tags    0.36672      0.89434      0.41004  
predictions_baseline/nuqe/target/dev/tags            0.31824      0.90716      0.35080  
predictions_baseline/ape/mt_pe/dev/tags              0.10961      0.93035      0.11782  


In [104]:
%%yaml gaptags_evaluate_2
# ==========  Average Ensemble Model (gap tags)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-target: data/word-level/en_de.nmt/dev.tags

# File path for the prediction files
pred-gaps: [predictions_transfer2/estimator/target_1/dev/gap_tags,
              predictions_transfer2/estimator/target_2/dev/gap_tags,
              predictions_transfer2/estimator/target_3/dev/gap_tags,
              predictions_transfer2/estimator/target_4/dev/gap_tags,
              predictions_baseline/ape/src_pe/dev/gap_tags,
              predictions_baseline/ape/mt_pe/dev/gap_tags,
              predictions_baseline/nuqe/gaps/dev/gap_tags]

<IPython.core.display.Javascript object>

In [105]:
save_config(gaptags_evaluate_2, 'gaptags_evaluate_2.yml')
!kiwi evaluate --config gaptags_evaluate_2.yml 

--------------------------------------------------------------------------------------------
Word-level scores for gap_tags:
File                                                     F1_mult      F1_OK        F1_BAD   
*ensemble*                                               0.24508      0.98838      0.24796  
predictions_baseline/ape/src_pe/dev/gap_tags             0.21709      0.97691      0.22222  
predictions_baseline/ape/mt_pe/dev/gap_tags              0.18978      0.98749      0.19218  
predictions_transfer2/estimator/target_2/dev/gap_tags    0.18098      0.98068      0.18455  
predictions_transfer2/estimator/target_4/dev/gap_tags    0.17802      0.97022      0.18349  
predictions_transfer2/estimator/target_1/dev/gap_tags    0.17711      0.97671      0.18133  
predictions_transfer2/estimator/target_3/dev/gap_tags    0.17636      0.9759       0.18071  
predictions_baseline/nuqe/gaps/dev/gap_tags              0.16613      0.98256      0.16908  


In [106]:
%%yaml sentencescores_evaluate_2
# ==========  Average Ensemble Model (sentence scores)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-sents: data/sentence_level/en_de/dev.nmt.hter

# File path for the prediction files
pred-sents: [predictions_transfer2/estimator/target_1/dev/sentence_scores_nospace,
              predictions_transfer2/estimator/target_2/dev/sentence_scores_nospace,
              predictions_transfer2/estimator/target_3/dev/sentence_scores_nospace,
              predictions_transfer2/estimator/target_4/dev/sentence_scores_nospace]

<IPython.core.display.Javascript object>

In [107]:
save_config(sentencescores_evaluate_2, 'sentencescores_evaluate_2.yml')
!kiwi evaluate --config sentencescores_evaluate_2.yml 

-----------------------------------------------------------------------------
Sentence-level scoring:
File (predictions_transfer2/estimator)    Pearson r    MAE          RMSE     
*ensemble*                                0.52278      0.10593      0.16235  
target_1/dev/sentence_scores_nospace      0.50445      0.11024      0.16505  
target_4/dev/sentence_scores_nospace      0.48922      0.11831      0.16874  
target_2/dev/sentence_scores_nospace      0.47634      0.10695      0.17234  
target_3/dev/sentence_scores_nospace      0.46578      0.10996      0.17062  
-----------------------------------------------------------------------------
Sentence-level ranking:
File (predictions_transfer2/estimator)    Spearman r    DeltaAvg 
*ensemble*                                0.56673       0.08207  
target_2/dev/sentence_scores_nospace      0.54344       0.07984  
target_4/dev/sentence_scores_nospace      0.54297       0.07934  
target_1/dev/sentence_scores_nospace      0.53576       0.07847 

## Transfer learning 3: small sample based on euclidean distance

In [108]:
# directory
KIWI         = os.getcwd()
# KIWI = KIWI + '/transfer2'
# which_target = '/target_1'
# which_output = '/tags'

SCRIPT_DIR = KIWI + '/transfer3'
# LINEAR_DIR=  SCRIPT_DIR + '/linear'

# RUN_DIR     = SCRIPT_DIR + '/estimator'
WORD_DIR   = KIWI + '/data/word-level/en_de.nmt'
SENT_DIR    = KIWI + '/data/sentence_level/en_de'

PREDICTIONS_DIR = 'predictions_transfer3'
PREDICTIONS_BASE = 'predictions_baseline'

MODEL_DIR = os.listdir(SCRIPT_DIR)

MODEL_DIR.remove('linear')
MODEL_DIR.append('linear')

In [109]:
## produce output for ape, nuqe and estimators
## use linear model to score stacked output for APE, nuqe and estimators


for MODEL_NAME in MODEL_DIR:
                
    if MODEL_NAME == 'estimator':
        RUN_DIR = os.listdir(SCRIPT_DIR+'/'+MODEL_NAME)
        for RUN_NAME in RUN_DIR:
            if RUN_NAME !='.DS_Store':

#                 configfile = SCRIPT_DIR+'/' +MODEL_NAME +'/'+ RUN_NAME+'/predict.yaml'
                modelfile = SCRIPT_DIR+'/' +MODEL_NAME +'/'+ RUN_NAME+'/model.torch'
                model = kiwi.load_model(modelfile)
                modelout = model.predict(examples)
                
                for tag in list(modelout.keys()):
                    
                    print('making estimator ' +RUN_NAME+ ' '+ tag + ' model predictions')

                    all_sentences_join = concat_tags(    myStrReformat(modelout[tag], tag=tag)  )

                    outpath = PREDICTIONS_DIR +'/'+MODEL_NAME+'/'+RUN_NAME + '/dev'
                    myfile = outpath+'/'+tag
                    

                    isExist = os.path.exists(outpath)

                    if not isExist:
                        # Create a new directory because it does not exist 
                        os.makedirs(outpath)
                    #     print("The new directory is created!")

                    Path(myfile).write_text(all_sentences_join)     
                    
                    if tag == 'sentence_scores':
                        sentence_space = read_file(myfile)
                        sentence_no_space_to_write = concat_tags(sentence_space, word_gap='', sentence_gap ='\n')
                        myoutput = outpath+'/'+tag+'_nospace'
                        Path(myoutput).write_text(sentence_no_space_to_write)

                    print('=========================================')
                                        
                    
    if MODEL_NAME == 'linear':
        ## stack probability for linear model
        
        print('stacking probablities for linear model')  
        
        APE_mt_tags     = PREDICTIONS_BASE + '/ape/mt_pe/dev/tags'
        APE_source_tags = PREDICTIONS_BASE + '/ape/src_pe/dev/tags'
        target1_tags    = PREDICTIONS_DIR + '/estimator/target_1/dev/tags' 
        target2_tags    = PREDICTIONS_DIR + '/estimator/target_2/dev/tags' 
        target3_tags    = PREDICTIONS_DIR + '/estimator/target_3/dev/tags' 
        target4_tags    = PREDICTIONS_DIR + '/estimator/target_4/dev/tags' 
        nuqe_tags       = PREDICTIONS_BASE + '/nuqe/target/dev/tags' 

        predicted_file = [APE_mt_tags, APE_source_tags, target1_tags, target2_tags, target3_tags, target4_tags, nuqe_tags]

        probabilities = [read_file(file) for file in predicted_file]

        lengths = [len(prob) for prob in probabilities]

        stacked_probabilities = concat(probabilities)

        outpath = PREDICTIONS_DIR +'/' +MODEL_NAME +'/dev'
        myfile = outpath+'/tags.stacked'

        isExist = os.path.exists(outpath)

        if not isExist:
            # Create a new directory because it does not exist 
            os.makedirs(outpath)
        #     print("The new directory is created!")

        Path(myfile).write_text(stacked_probabilities)

        print('=========================================')   
        
        print('=====predicting using linear model')  
        
        myshell = SCRIPT_DIR + '/run_linear.sh'
        myshell
        subprocess.run([myshell], shell = True)
        print('=========================================')   
                    

making estimator target_2 tags model predictions
making estimator target_2 gap_tags model predictions
making estimator target_2 sentence_scores model predictions
making estimator target_3 tags model predictions
making estimator target_3 gap_tags model predictions
making estimator target_3 sentence_scores model predictions
making estimator target_4 tags model predictions
making estimator target_4 gap_tags model predictions
making estimator target_4 sentence_scores model predictions
making estimator target_1 tags model predictions
making estimator target_1 gap_tags model predictions
making estimator target_1 sentence_scores model predictions
stacking probablities for linear model
=====predicting using linear model
Predicting with model linear
Generating dev set predictions...


## Below are all the evaluation results for transfer 4 model.

In [110]:
%%yaml stackedtag_evaluate_3
# ==========  Stacked Linear Model (target tags)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-target: data/word-level/en_de.nmt/dev.tags

# File path for the prediction files
pred-target: predictions_transfer3/linear/dev/tags

<IPython.core.display.Javascript object>

In [111]:
save_config(stackedtag_evaluate_3, 'stackedtag_evaluate_3.yml')
!kiwi evaluate --config stackedtag_evaluate_3.yml

----------------------------------------------------------------------------
Word-level scores for tags:
File                                     F1_mult      F1_OK        F1_BAD   
predictions_transfer3/linear/dev/tags    0.43880      0.91742      0.47830  


In [112]:
%%yaml targettags_evaluate_3
# ==========  Average Ensemble Model (target tags)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-target: data/word-level/en_de.nmt/dev.tags

# File path for the prediction files
pred-target: [predictions_transfer3/estimator/target_1/dev/tags,
              predictions_transfer3/estimator/target_2/dev/tags,
              predictions_transfer3/estimator/target_3/dev/tags,
              predictions_transfer3/estimator/target_4/dev/tags,
              predictions_baseline/ape/src_pe/dev/tags,
              predictions_baseline/ape/mt_pe/dev/tags,
              predictions_baseline/nuqe/target/dev/tags]

<IPython.core.display.Javascript object>

In [113]:
save_config(targettags_evaluate_3, 'targettags_evaluate_3.yml')
!kiwi evaluate --config targettags_evaluate_3.yml

----------------------------------------------------------------------------------------
Word-level scores for tags:
File                                                 F1_mult      F1_OK        F1_BAD   
*ensemble*                                           0.42097      0.93072      0.45230  
predictions_transfer3/estimator/target_4/dev/tags    0.37832      0.9049       0.41808  
predictions_baseline/ape/src_pe/dev/tags             0.37535      0.87182      0.43054  
predictions_transfer3/estimator/target_1/dev/tags    0.37397      0.89699      0.41691  
predictions_transfer3/estimator/target_2/dev/tags    0.36687      0.91339      0.40165  
predictions_transfer3/estimator/target_3/dev/tags    0.36580      0.90551      0.40397  
predictions_baseline/nuqe/target/dev/tags            0.31824      0.90716      0.35080  
predictions_baseline/ape/mt_pe/dev/tags              0.10961      0.93035      0.11782  


In [114]:
%%yaml gaptags_evaluate_3
# ==========  Average Ensemble Model (gap tags)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-target: data/word-level/en_de.nmt/dev.tags

# File path for the prediction files
pred-gaps: [predictions_transfer3/estimator/target_1/dev/gap_tags,
              predictions_transfer3/estimator/target_2/dev/gap_tags,
              predictions_transfer3/estimator/target_3/dev/gap_tags,
              predictions_transfer3/estimator/target_4/dev/gap_tags,
              predictions_baseline/ape/src_pe/dev/gap_tags,
              predictions_baseline/ape/mt_pe/dev/gap_tags,
              predictions_baseline/nuqe/gaps/dev/gap_tags]

<IPython.core.display.Javascript object>

In [115]:
save_config(gaptags_evaluate_3, 'gaptags_evaluate_3.yml')
!kiwi evaluate --config gaptags_evaluate_3.yml 

--------------------------------------------------------------------------------------------
Word-level scores for gap_tags:
File                                                     F1_mult      F1_OK        F1_BAD   
*ensemble*                                               0.25272      0.98899      0.25554  
predictions_baseline/ape/src_pe/dev/gap_tags             0.21709      0.97691      0.22222  
predictions_baseline/ape/mt_pe/dev/gap_tags              0.18978      0.98749      0.19218  
predictions_transfer3/estimator/target_3/dev/gap_tags    0.18477      0.97854      0.18882  
predictions_transfer3/estimator/target_2/dev/gap_tags    0.18188      0.97857      0.18587  
predictions_transfer3/estimator/target_4/dev/gap_tags    0.17807      0.97755      0.18216  
predictions_transfer3/estimator/target_1/dev/gap_tags    0.17684      0.97828      0.18077  
predictions_baseline/nuqe/gaps/dev/gap_tags              0.16613      0.98256      0.16908  


In [116]:
%%yaml sentencescores_evaluate_3
# ==========  Average Ensemble Model (sentence scores)  ==========
# Example file for configuring the evaluation pipeline
#
# The input type for prediction files (Probabilities[probs] or tags)
# type: tags
 
# The format of gold files (wmt17/wmt18)
format: wmt18

# Format of predictions (wmt17/wmt18). Either they predict gaps or not.
pred-format: wmt17

# File path for the reference files
gold-sents: data/sentence_level/en_de/dev.nmt.hter

# File path for the prediction files
pred-sents: [predictions_transfer3/estimator/target_1/dev/sentence_scores_nospace,
              predictions_transfer3/estimator/target_2/dev/sentence_scores_nospace,
              predictions_transfer3/estimator/target_3/dev/sentence_scores_nospace,
              predictions_transfer3/estimator/target_4/dev/sentence_scores_nospace]

<IPython.core.display.Javascript object>

In [117]:
save_config(sentencescores_evaluate_3, 'sentencescores_evaluate_3.yml')
!kiwi evaluate --config sentencescores_evaluate_3.yml 

-----------------------------------------------------------------------------
Sentence-level scoring:
File (predictions_transfer3/estimator)    Pearson r    MAE          RMSE     
*ensemble*                                0.50753      0.11215      0.16433  
target_4/dev/sentence_scores_nospace      0.49114      0.10923      0.16677  
target_1/dev/sentence_scores_nospace      0.48519      0.11469      0.16632  
target_2/dev/sentence_scores_nospace      0.47103      0.11225      0.16892  
target_3/dev/sentence_scores_nospace      0.45226      0.12416      0.17172  
-----------------------------------------------------------------------------
Sentence-level ranking:
File (predictions_transfer3/estimator)    Spearman r    DeltaAvg 
*ensemble*                                0.56914       0.08264  
target_2/dev/sentence_scores_nospace      0.55594       0.08118  
target_4/dev/sentence_scores_nospace      0.55583       0.07905  
target_3/dev/sentence_scores_nospace      0.54291 