<a href="https://colab.research.google.com/github/AaronDebattista09/ICS5200/blob/main/MGEC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation

These cells execute some preparatory steps. Ideally, you should interface to a Google Drive so that Marian NMT can save model checkpoints if and when required.

In [None]:
# Mount Google Drive at /content/drive (remounting if it is already mounted).
# Later cells read and write checkpoints under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Create the working directories /content/GIT and /content/GIT/ICS5200,
# then make /content/GIT the current directory for the cells that follow.
%cd /content

!mkdir -p GIT/
!mkdir -p GIT/ICS5200

%cd GIT

We need to install git-lfs, otherwise we might miss out on very large files.

In [None]:
# Initialise a repository in the current directory, install the git-lfs
# package from APT and run `git lfs install`.
!git init
!apt-get install git-lfs
!git lfs install

In [None]:
!rm -rf /content/GIT/ICS5200

In [None]:
!git clone https://github.com/AaronDebattista09/ICS5200.git

# Build

These cells execute the steps required to build Marian, including all pre-requisites and the reinstallation of CMake.

In [None]:
# Python packages used by the notebook. No versions are pinned, so every run
# installs whatever is current on PyPI.

# Remote debugger (remote-pdb), plus boto3 and urllib3.
!pip install remote-pdb
!pip install boto3
!pip install urllib3

# fairseq and the packages installed alongside it.
# NOTE(review): `hydra` and `hydra-core` are separate PyPI distributions;
# confirm that the plain `hydra` package is actually required here.
!pip install fairseq
!pip install omegaconf
!pip install hydra
!pip install hydra-core --upgrade
!pip install bitarray
!pip install sentencepiece

In [None]:
%cd /content/GIT

In [None]:
!git clone https://github.com/grammatical/pretraining-bea2019.git
!git clone https://github.com/rsennrich/subword-nmt.git

In [None]:
# System packages installed from APT before building (compiler toolchain, Boost, protobuf, OpenSSL, gperftools).
!sudo apt-get install git cmake build-essential libboost-system-dev libprotobuf10 protobuf-compiler libprotobuf-dev openssl libssl-dev libgoogle-perftools-dev 

In [None]:
!sudo apt remove --purge --auto-remove cmake
!sudo apt update && \
!sudo apt install -y software-properties-common lsb-release && \
!sudo apt clean all

!get -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
!sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ bionic main"

!sudo apt update
!sudo apt install kitware-archive-keyring
!sudo rm /etc/apt/trusted.gpg.d/kitware.gpg

!sudo apt update
!sudo apt install cmake

In [None]:
# Reinstall libtcmalloc-minimal4, then copy its shared object to the
# libtcmalloc.so.4 file name in the same directory.
!sudo apt-get remove libtcmalloc-minimal4
!sudo apt-get install libtcmalloc-minimal4
!cp /usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 /usr/lib/x86_64-linux-gnu/libtcmalloc.so.4

In [None]:
!cp "/content/GIT/ICS5200/Overwrite/Makefile" "/content/GIT/pretraining-bea2019/systems/tools/Makefile"

In [None]:
# Delete any previous errant/jfleg/m2scorer directories before running `make all`
# with the Makefile in systems/tools.
!rm -rf /content/GIT/pretraining-bea2019/systems/tools/errant
!rm -rf /content/GIT/pretraining-bea2019/systems/tools/jfleg
!rm -rf /content/GIT/pretraining-bea2019/systems/tools/m2scorer

%cd /content/GIT/pretraining-bea2019/systems/tools
!make all -f /content/GIT/pretraining-bea2019/systems/tools/Makefile

# Delete any previous marian-dev directory and run the `marian-dev` target on its own.
!rm -rf /content/GIT/pretraining-bea2019/systems/tools/marian-dev
!make marian-dev -f /content/GIT/pretraining-bea2019/systems/tools/Makefile


# Datasets

These cells load up data from Korpus Malti V3. They do not need to be executed since the processed files are already in the Git repository.

### MLRS Corpus

In [None]:
# Upgrade the packaging tools, then install spaCy, its small English pipeline
# (en_core_web_sm) and the spacy-langdetect extension used for language filtering.
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm
!pip install spacy-langdetect

In [None]:
!rm -rf /content/Downloads

In [None]:
%cd /content/

# Download locations, relative to /content: archives go to DL_ZIP and the
# extracted files to DL_FILES.
DL = "Downloads"
DL_ZIP = DL + "/zip"
DL_FILES = DL + "/files"

# Create the directories and remove any archives left in DL_ZIP.
!mkdir -p $DL
!mkdir -p $DL_ZIP
!mkdir -p $DL_FILES
!rm -rf $DL_ZIP/*

In [None]:
# gdown downloads the corpus archives from Google Drive.
!pip install gdown

%cd /content/$DL_ZIP

# One archive per corpus category, each saved under the name given to -O.
# &export=download
!gdown 'https://drive.google.com/u/0/uc?id=1EphgAV1WI5gQRqfFW502Nd2NfZmAuYDR&export=download&confirm=t' -O Academic.zip
!gdown 'https://drive.google.com/u/0/uc?id=1EWFF5eXq7QyFlfVTrja7wQ3Zzyf8CFt2&export=download&confirm=t' -O Culture.zip
!gdown 'https://drive.google.com/u/0/uc?id=1EpsLRf5bwtT4qL9CSoCZw4dw1pQJmxOg&export=download&confirm=t' -O European.zip 
!gdown 'https://drive.google.com/u/0/uc?id=1EMOxIu2nXRyIh7akCH6Tvu6ZxF_l-3j7&export=download&confirm=t' -O Law.zip
!gdown 'https://drive.google.com/u/0/uc?id=1ETH8get-DeSDDEoIxDrfb0kZC6yUnKX3&export=download&confirm=t' -O News.zip
!gdown 'https://drive.google.com/u/0/uc?id=1Epg8uHzZnWyRtyhLZEw2f8vyV2Cr8EFu&export=download&confirm=t' -O Opinion.zip
!gdown 'https://drive.google.com/u/0/uc?id=1EL_hI1I-LErAt3y4urX8p_zEBcorPuhw&export=download&confirm=t' -O Parliament.zip
!gdown 'https://drive.google.com/u/0/uc?id=1EAptpKju7NE7Gp0D9Gc0tO2Rtq5uEE8a&export=download&confirm=t' -O Religion.zip
!gdown 'https://drive.google.com/u/0/uc?id=1EDjFM7XP0UI-eHtaFJwsazAStk5wYG8h&export=download&confirm=t' -O Sport.zip

%cd /content/$DL_FILES

# One extraction directory per category.
!mkdir -p Academic
!mkdir -p Culture
!mkdir -p European
!mkdir -p Law
!mkdir -p News
!mkdir -p Opinion
!mkdir -p Parliament
!mkdir -p Religion
!mkdir -p Sport

In [None]:
# Empty the extraction directory (this removes everything under DL_FILES,
# including its sub-directories) and make sure unzip is available.
!rm -rf /content/$DL_FILES/*
!sudo apt-get install unzip

%cd /content/$DL_ZIP

# Extract every archive into its own directory under DL_FILES;
# -o overwrites existing files without prompting.
!unzip -o Academic -d /content/$DL_FILES/Academic/ 
!unzip -o Culture -d /content/$DL_FILES/Culture/
!unzip -o European -d /content/$DL_FILES/European/
!unzip -o Law -d /content/$DL_FILES/Law/
!unzip -o News -d /content/$DL_FILES/News/
!unzip -o Opinion -d /content/$DL_FILES/Opinion/
!unzip -o Parliament -d /content/$DL_FILES/Parliament/
!unzip -o Religion -d /content/$DL_FILES/Religion/
!unzip -o Sport -d /content/$DL_FILES/Sport/

In [None]:
import os
import os.path

# Collect every .txt file below the extraction directory.
mlrs_file_list = []

for dirpath, dirnames, filenames in os.walk("/content/" + DL_FILES):
  for filename in filenames:
    if filename.endswith(".txt"):
      mlrs_file_list.append(os.path.join(dirpath, filename))

# Rename  <name.with.dots>.txt  ->  <name_with_dots>.xml.
# Only the file name is rewritten: replacing dots in the whole path would also
# alter directory names and point the rename at a directory that does not exist.
for file in mlrs_file_list:
  folder, base_name = os.path.split(file)
  os.rename(file, os.path.join(folder, base_name[:-4].replace('.', '_') + '.xml'))

In [None]:
# Collect the renamed .xml files.
mlrs_file_list = []

for dirpath, dirnames, filenames in os.walk("/content/" + DL_FILES):
  for filename in filenames:
    if filename.endswith(".xml"):
      mlrs_file_list.append(os.path.join(dirpath, filename))

# Give every file a single <document> root element and escape bare ampersands
# so that the XML parser in the next cell accepts it.
for file in mlrs_file_list:
  with open(file, "r+") as f:
    old = f.read() # read everything in the file

    # Re-running the cell must not wrap (and re-escape) a file a second time.
    if old.startswith("<document>"):
      print("Already wrapped: " + file)
      continue

    print("Updating: " + file)
    f.seek(0) # rewind
    f.write("<document>" + old.replace('&','&amp;') + "</document>")

In [None]:
import xml.etree.ElementTree as ET

# One entry per <s> element: a list of rows, each row being the tab-separated
# fields of one line of the element's text.
mlrs_tokens_array = []

for xml_file in filter(lambda x: x, mlrs_file_list): # I can edit the filter if needed...

  print("Processing: " + xml_file)
  tree = ET.parse(xml_file)
  root = tree.getroot()

  # A file with no content has no child element under <document>; skip it
  # instead of failing on root[0].
  if len(root) == 0:
    continue

  for p in root[0].findall('p'):
    for s in p.findall('s'):
      # ElementTree reports the text of an empty <s/> as None.
      rows = (s.text or '').split('\n')
      mlrs_sentence_array = [row.split('\t') for row in rows if row != '']
      mlrs_tokens_array.append(mlrs_sentence_array)

In [None]:
# Rebuild each sentence as plain text: take the first field of every row,
# strip the unwanted characters, drop tokens that end up empty and join the
# remainder with single spaces.
mlrs_clean_text = []
for sentence in mlrs_tokens_array:
  words = []
  for fields in sentence:
    word = str(fields[0])
    for unwanted in ('\x93', '\x94', '…'):
      word = word.replace(unwanted, '')
    if word:
      words.append(word)
  mlrs_clean_text.append(" ".join(words))

In [None]:
import re

def check_to_remove(input_string):
  """Filter predicate: return True to keep a row, False to drop it.

  A row is dropped when it contains no ASCII letter at all, when it ends with
  a double quote, or when it starts with one of the listed title prefixes.
  """
  drop_patterns = (
    r'^[^a-zA-Z]*$',
    r'.*"$',
    r'^(Sinfonija|4 Improvviżi op|Ouverture|It- Trota " op).*$',
  )
  return not any(re.search(pattern, input_string) for pattern in drop_patterns)

def eliminate_text(input_string):
  """Strip speaker labels, vote interjections and parenthesised text from a row.

  The substitutions are applied in order:
    * a leading "MR SPEAKER :" label;
    * " Onor. Membri : Aye|No|Iva " interjections, including the space on
      either side;
    * a leading "ONOR...:" label (greedy: up to the last colon in the row);
    * a leading "THE CHAIRMAN :" label;
    * everything from the first "(" to the last ")".

  Returns the result with leading and trailing whitespace removed.
  """
  output_string = input_string

  # re.sub leaves the string unchanged when the pattern does not match, so no
  # separate re.search guard is needed. All patterns are raw strings so that
  # sequences such as `\(` reach the regex engine untouched.
  for pattern in (
      r'^MR SPEAKER :',
      r'( Onor\. Membri : Aye )',
      r'( Onor\. Membri : No )',
      r'( Onor\. Membri : Iva )',
      r'^ONOR.+:',
      r'^THE CHAIRMAN :',
      r'\(.*\)',
  ):
    output_string = re.sub(pattern, '', output_string)

  return output_string.strip()

# Drop the rows rejected by check_to_remove, then strip labels from the rest.
kept_rows = filter(check_to_remove, mlrs_clean_text)
mlrs_regex_clean_text = [eliminate_text(row) for row in kept_rows]

for t in mlrs_regex_clean_text:
  print(t)

kept_count = len(mlrs_regex_clean_text)
total_count = len(mlrs_clean_text)
print("\n\nSanitised using Regex. Kept {0}/{1} rows.".format(kept_count, total_count))

In [None]:
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

def get_lang_detector(nlp, name):
    """Factory callback for the language detector component.

    Both arguments are ignored; a new LanguageDetector is returned each time.
    """
    return LanguageDetector()

# Register the callback under the factory name that add_pipe refers to below.
Language.factory("language_detector", func=get_lang_detector)

# Load the small English pipeline and append the detector as its last component.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('language_detector', last=True)

In [None]:
# Run the language detector over every sanitised row. Each result is the
# detector's dictionary for the row, extended with the row itself under "text".
mlrs_lang_text = []

total_rows = len(mlrs_regex_clean_text)
count = 0
for count, text in enumerate(mlrs_regex_clean_text, start=1):
  doc = nlp(text)
  detect_language = doc._.language
  detect_language["text"] = text
  mlrs_lang_text.append(detect_language)

  # Report progress every 500 rows and once more at the very end.
  if count % 500 == 0 or count == total_rows:
    print("Processed: {0}/{1}".format(count, total_rows))


In [None]:
# Remove English text: a row is kept unless it was detected as English with a
# score above 0.9.
mlrs_filtered_text = [
  entry['text']
  for entry in mlrs_lang_text
  if entry['language'] != 'en' or entry['score'] <= 0.9
]

for text in mlrs_filtered_text:
  print(text)

# Replication

This section can be skipped. However, running these cells will perform a simple English correction using one of the Marian models that were used in the original study.

In [None]:
%cd /content/GIT/pretraining-bea2019/systems/model.lowresource
!./download.sh

In [None]:
%cd /content/GIT/pretraining-bea2019/systems/

# Start from an empty output directory for the low-resource run.
!rm -rf /content/GIT/pretraining-bea2019/systems/runs/generation/low_resource/
!mkdir -p /content/GIT/pretraining-bea2019/systems/runs/generation/low_resource/

# Write a one-sentence input file (with a deliberate misspelling) and pass it
# to run.sh together with the model directory and the output path.
!echo "Mary had a litle lamb ." > runs/generation/low_resource/test_lr.in 
!./run.sh model.lowresource \
  runs/generation/low_resource/test_lr.in \
  runs/generation/low_resource/test_lr.out  -d 0 1

In [None]:
!mkdir -p /content/GIT/pretraining-bea2019/systems/runs/generation/low_resource

# Score the input/output pair from the previous cell with marian-scorer.
# NOTE(review): `vocab.{spm,spm}` presumably relies on shell brace expansion to
# pass vocab.spm twice — confirm it survives IPython's own `{...}` handling.
!./tools/marian-dev/build/marian-scorer -m model.lowresource/rl1.npz \
 -v model.lowresource/vocab.{spm,spm} \
 --n-best --n-best-feature R2L1 \
 --workspace 6000 --mini-batch-words 4000 \
 -t runs/generation/low_resource/test_lr.in runs/generation/low_resource/test_lr.out.nbest0

# Configurations for Implementation

You can activate/deactivate certain adaptations from here. 

* **source_word_corruption** - Toggles the removal of words from source files.
* **domain_error_adaptation** - Toggles the use of synthesised files.
* **large_vocabs** - Toggle the use of larger vocabularies. REQUIRES domain_error_adaptation to be turned on.
* **tied_embeddings** - Toggles the use of tied embeddings.
* **pretrained** - Set value in (0,1,2).
  * **0** - No pretraining.
  * **1** - Pretrain from BERTu.
  * **2** - Pretrain from mBERTu.
* **load_eval_sets** - Adds the evaluation sets to the training data.

#### Preconfigurations 

These preconfigurations are based on the experiments undertaken in the study.

| Experiment          | Src.Corr | DE.Adapt | Lrg.Vocab | Tied.Emb | Pt.BERTu | Pt.mBERTu | Eval.Sets |
| ------------------- | -------- | -------- | --------- | -------- | -------- | --------- | --------- |
| Baselines           |    N     |    N     |     N     |    N     |    N     |     N     |     N     |
| Base+Src.Corr       |    Y     |    N     |     N     |    N     |    N     |     N     |     N     |
| Base+Tied.Emb       |    N     |    N     |     N     |    Y     |    N     |     N     |     N     |
| Base+T.E+BERTu      |    N     |    N     |     N     |    Y     |    Y     |     N     |     N     |
| Base+T.E+mBERTu     |    N     |    N     |     N     |    Y     |    N     |     Y     |     N     |
| DE.Adapt            |    N     |    Y     |     N     |    N     |    N     |     N     |     N     |
| DE.Adapt+Lrg.Vocab  |    N     |    Y     |     Y     |    N     |    N     |     N     |     N     |
| DE.Adapt+T.E        |    N     |    Y     |     N     |    Y     |    N     |     N     |     N     |
| DE.Adapt+T.E+BERTu  |    N     |    Y     |     N     |    Y     |    Y     |     N     |     N     |
| DE.Adapt+T.E+mBERTu |    N     |    Y     |     N     |    Y     |    N     |     Y     |     N     |
| Final               |    N     |    Y     |     N     |    Y     |    Y     |     N     |     Y     |

In [None]:
# Experiment switches, exposed as Colab form fields. Their meaning is described
# in the list and table above; `pretrained` takes 0 (none), 1 (BERTu) or 2 (mBERTu).
source_word_corruption = False #@param {type:"boolean"}
domain_error_adaptation = True #@param {type:"boolean"}
large_vocabs = False  #@param {type:"boolean"}
tied_embeddings = True #@param {type:"boolean"}
pretrained = 1 #@param {type:"integer"}
load_eval_sets = True #@param {type:"boolean"} 

# Implementation (Maltese)

These cells will setup the Maltese GEC system and all its prerequisites.

#### Prerequisites

In [None]:
# Create training/bpe for the BPE files ...
%cd /content/GIT/pretraining-bea2019/training/

!mkdir -p bpe

# ... and systems/model.lowresource.mt/log for the Marian log files.
%cd /content/GIT/pretraining-bea2019/systems/

!mkdir -p model.lowresource.mt/log


In [None]:
!pip install sentence-splitter
!pip install sentencepiece
!pip install --upgrade git+https://github.com/cisnlp/simalign.git#egg=simalign

In [None]:
%cd /content/GIT/ICS5200/Tools/

This is a helper function to facilitate tokenisation. If the local tokeniser code ever becomes deprecated, setting **use_rest** to True will call the tokeniser through an online API instead, but be warned that this approach is much slower.

In [None]:
from tokenisation import MTRegex, MTParTokenizer, MTWordTokenizer, MTSentenceTokenizer, MTSentencePieceTokenizer
import urllib.parse
import requests
from IPython.display import clear_output

tokeniser = MTWordTokenizer();
tokeniser_api_link = "https://mlrs.research.um.edu.mt/tools/mlrsapi/tokenise?text="

def tokenise(path, use_rest=False):
  """Tokenise a text file line by line.

  Args:
    path: Path of the text file to read.
    use_rest: When True, each line is sent to the online tokeniser at
      `tokeniser_api_link`; otherwise the module-level `tokeniser` is used.

  Returns:
    A list with one entry per line of the file, each entry being the
    tokeniser's output for that line.

  Raises:
    requests.HTTPError: if the online tokeniser answers with an error status.
  """
  tokens = []

  # Read as UTF-8 explicitly rather than relying on the platform default;
  # undecodable bytes are replaced instead of raising.
  with open(path, 'r', encoding='utf-8', errors='replace') as f:
    lines = f.readlines()

  for count, line in enumerate(lines, start=1):

    if use_rest:
      query = tokeniser_api_link + urllib.parse.quote(line)
      # A timeout keeps an unresponsive server from hanging the cell, and an
      # error status is raised here rather than surfacing as a confusing
      # failure while decoding the response body.
      response = requests.get(query, timeout=60)
      response.raise_for_status()
      tokens.append(response.json()['result'])
    else:
      tokens.append(tokeniser.tokenize(line))

    print("{0}/{1}".format(count, len(lines)))

  # Replace the per-line progress output with a single summary line.
  clear_output()
  print("Processed: " + path)

  return tokens


#### Tokenisation

In [None]:
import glob

# The curated sentence pairs are always part of the data.
src_tokens = tokenise("/content/GIT/ICS5200/QariTalProvi_Batch_1/Curated/src.txt")
trg_tokens = tokenise("/content/GIT/ICS5200/QariTalProvi_Batch_1/Curated/trg.txt")

# Domain error adaptation appends the synthesised Common Voice and MLRS pairs.
if domain_error_adaptation:
  for synth_src in ("/content/GIT/ICS5200/Common Voice/Synthesized/src.txt",
                    "/content/GIT/ICS5200/MLRS/Synthesized/src.txt"):
    src_tokens = src_tokens + tokenise(synth_src)

  for synth_trg in ("/content/GIT/ICS5200/Common Voice/Synthesized/trg.txt",
                    "/content/GIT/ICS5200/MLRS/Synthesized/trg.txt"):
    trg_tokens = trg_tokens + tokenise(synth_trg)

# Optionally append the evaluation sets; the files on each side are processed
# in sorted file-name order.
if load_eval_sets:
  lst_eval_src = sorted(glob.glob("/content/GIT/ICS5200/QariTalProvi_Batch_2/Training/Converted/Source/*.txt"))
  lst_eval_trg = sorted(glob.glob("/content/GIT/ICS5200/QariTalProvi_Batch_2/Training/Converted/Target/*.txt"))

  eval_src_tokens = []
  for eval_src_file in lst_eval_src:
    eval_src_tokens.extend(tokenise(eval_src_file))

  eval_trg_tokens = []
  for eval_trg_file in lst_eval_trg:
    eval_trg_tokens.extend(tokenise(eval_trg_file))

  src_tokens = src_tokens + eval_src_tokens
  trg_tokens = trg_tokens + eval_trg_tokens

In [None]:
len(trg_tokens)

In [None]:
# Write the source side as one space-joined sentence per line. The file is
# written as UTF-8, like the train/valid split files.
with open('/content/GIT/pretraining-bea2019/training/src_tokenised.txt', 'w', encoding='UTF-8') as f:
  c = 0
  for line in src_tokens:
    # write() takes the finished string; writelines() would iterate over it
    # one character at a time.
    f.write(" ".join(line) + "\n")
    c += 1

  print("Wrote {0} lines to file.".format(c))

In [None]:
# Write the target side as one space-joined sentence per line. The file is
# written as UTF-8, like the train/valid split files.
with open('/content/GIT/pretraining-bea2019/training/trg_tokenised.txt', 'w', encoding='UTF-8') as f:
  c = 0
  for line in trg_tokens:
    # write() takes the finished string; writelines() would iterate over it
    # one character at a time.
    f.write(" ".join(line) + "\n")
    c += 1

  print("Wrote {0} lines to file.".format(c))

#### Test/Validate Splits

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the sentence pairs for validation. The split is seeded so
# that re-running the notebook reproduces the same train/valid files; 1111 is
# the value also passed to Marian's --seed in the training cell.
x_train, x_test, y_train, y_test = train_test_split(
    src_tokens, trg_tokens, test_size=0.10, random_state=1111)

print("X Train: " + str(len(x_train)))
print("X Test: " + str(len(x_test)))
print("Y Train: " + str(len(y_train)))
print("Y Test: " + str(len(y_test)))

# Write each part as one space-joined sentence per line.
for split_path, split_tokens in (
    ("/content/GIT/pretraining-bea2019/training/train.src", x_train),
    ("/content/GIT/pretraining-bea2019/training/train.trg", y_train),
    ("/content/GIT/pretraining-bea2019/training/valid.src", x_test),
    ("/content/GIT/pretraining-bea2019/training/valid.trg", y_test),
):
  with open(split_path, "w", encoding="UTF-8") as f:
    f.writelines(" ".join(t) + "\n" for t in split_tokens)

In [None]:
# Check samples: print the first five lines and the line count of each split file.
for sample_path in (
    "/content/GIT/pretraining-bea2019/training/train.src",
    "/content/GIT/pretraining-bea2019/training/train.trg",
    "/content/GIT/pretraining-bea2019/training/valid.src",
    "/content/GIT/pretraining-bea2019/training/valid.trg",
):
  # Read-only access is enough here; nothing is written back.
  with open(sample_path, "r", encoding="UTF-8") as conn:
    lines = conn.readlines()

  print("Sampling: " + sample_path)
  for line in lines[0:5]:
    print(line, end="")

  print("Lines: " + str(len(lines)))
  print()


#### BPE Encoding

In [None]:
!mkdir -p /content/GIT/pretraining-bea2019/training/bpe

In [None]:
# Run subword-nmt's learn_bpe.py once per split file, reading the file on
# stdin and saving whatever the script prints under training/bpe/.
# NOTE(review): learn_bpe.py presumably emits learned merge operations rather
# than BPE-segmented text — confirm that these outputs are meant to be codes files.
!/content/GIT/subword-nmt/subword_nmt/learn_bpe.py  < /content/GIT/pretraining-bea2019/training/train.src > /content/GIT/pretraining-bea2019/training/bpe/train.bpe.src
!/content/GIT/subword-nmt/subword_nmt/learn_bpe.py  < /content/GIT/pretraining-bea2019/training/train.trg > /content/GIT/pretraining-bea2019/training/bpe/train.bpe.trg
!/content/GIT/subword-nmt/subword_nmt/learn_bpe.py < /content/GIT/pretraining-bea2019/training/valid.src > /content/GIT/pretraining-bea2019/training/bpe/valid.bpe.src
!/content/GIT/subword-nmt/subword_nmt/learn_bpe.py < /content/GIT/pretraining-bea2019/training/valid.trg > /content/GIT/pretraining-bea2019/training/bpe/valid.bpe.trg

In [None]:
# Print the first five lines and the line count of each BPE file.
for sample_path in (
    "/content/GIT/pretraining-bea2019/training/bpe/train.bpe.src",
    "/content/GIT/pretraining-bea2019/training/bpe/train.bpe.trg",
    "/content/GIT/pretraining-bea2019/training/bpe/valid.bpe.src",
    "/content/GIT/pretraining-bea2019/training/bpe/valid.bpe.trg",
):
  # Read-only access is enough here; nothing is written back.
  with open(sample_path, "r", encoding="UTF-8") as conn:
    lines = conn.readlines()

  print("Sampling: " + sample_path)
  for line in lines[0:5]:
    print(line, end="")

  print("Lines: " + str(len(lines)))
  print()

In [None]:
# Print the number of lines in each BPE file (train src/trg, then valid src/trg).
for bpe_path in (
    "/content/GIT/pretraining-bea2019/training/bpe/train.bpe.src",
    "/content/GIT/pretraining-bea2019/training/bpe/train.bpe.trg",
    "/content/GIT/pretraining-bea2019/training/bpe/valid.bpe.src",
    "/content/GIT/pretraining-bea2019/training/bpe/valid.bpe.trg",
):
  # Read-only, and decoded as UTF-8 like the other checks on these files.
  with open(bpe_path, "r", encoding="UTF-8") as f:
    print(len(f.readlines()))

#### Building Vocabularies with SentencePiece

In [None]:
# Choose the vocabulary sizes for the active configuration. The larger sizes
# only apply when domain error adaptation is on; large_vocabs takes precedence
# over tied_embeddings.
if domain_error_adaptation and large_vocabs:
  VOCAB_SIZE_SRC, VOCAB_SIZE_TRG = 30776, 22444
elif domain_error_adaptation and tied_embeddings:
  VOCAB_SIZE_SRC, VOCAB_SIZE_TRG = 22000, 22000
else:
  VOCAB_SIZE_SRC, VOCAB_SIZE_TRG = 14506, 13882

print("Vocabulary sizes set to: \nSOURCE: {0}\nTARGET: {1}".format(VOCAB_SIZE_SRC,VOCAB_SIZE_TRG))

In [None]:
!mkdir -p /content/GIT/pretraining-bea2019/training/vocab

In [None]:
# Location of the marian-vocab binary and of the directory receiving the vocabularies.
MARIAN_VOCAB = "/content/GIT/pretraining-bea2019/systems/tools/marian-dev/build/marian-vocab"
VOCAB_DIR = "/content/GIT/pretraining-bea2019/training/vocab"

# Plain-text train/valid split files.
TRAIN_SRC = "/content/GIT/pretraining-bea2019/training/train.src"
TRAIN_TRG = "/content/GIT/pretraining-bea2019/training/train.trg"
VALID_SRC = "/content/GIT/pretraining-bea2019/training/valid.src"
VALID_TRG = "/content/GIT/pretraining-bea2019/training/valid.trg"

# Their counterparts under training/bpe/.
TRAIN_BPE_SRC = "/content/GIT/pretraining-bea2019/training/bpe/train.bpe.src"
TRAIN_BPE_TRG = "/content/GIT/pretraining-bea2019/training/bpe/train.bpe.trg"
VALID_BPE_SRC = "/content/GIT/pretraining-bea2019/training/bpe/valid.bpe.src"
VALID_BPE_TRG = "/content/GIT/pretraining-bea2019/training/bpe/valid.bpe.trg"


# Pipe train+valid through marian-vocab: source only, target only, then both sides together.
!cat $TRAIN_SRC $VALID_SRC | $MARIAN_VOCAB > $VOCAB_DIR/vocab.src.yml
!cat $TRAIN_TRG $VALID_TRG | $MARIAN_VOCAB > $VOCAB_DIR/vocab.trg.yml
!cat $TRAIN_SRC $VALID_SRC $TRAIN_TRG $VALID_TRG | $MARIAN_VOCAB > $VOCAB_DIR/vocab.src_trg.yml

# The same three vocabularies built from the files under training/bpe/.
!cat $TRAIN_BPE_SRC $VALID_BPE_SRC | $MARIAN_VOCAB > $VOCAB_DIR/vocab.bpe.src.yml
!cat $TRAIN_BPE_TRG $VALID_BPE_TRG | $MARIAN_VOCAB > $VOCAB_DIR/vocab.bpe.trg.yml
!cat $TRAIN_BPE_SRC $VALID_BPE_SRC $TRAIN_BPE_TRG $VALID_BPE_TRG | $MARIAN_VOCAB > $VOCAB_DIR/vocab.bpe.src_trg.yml

We need a slight hack here because Marian needs to recognise a particular file extension so that it could add its own custom required tokens to the vocabularies. These tokens are mandatory otherwise the pipeline wouldn't work.

In [None]:
!rm -rf /content/GIT/pretraining-bea2019/training/vocab/hack_needed/*
!rm -rf /content/GIT/pretraining-bea2019/training/vocab/spm*

In [None]:
# Work inside training/vocab/hack_needed so that the SentencePiece outputs land there.
%cd /content/GIT/pretraining-bea2019/training/vocab

!mkdir -p hack_needed
%cd hack_needed

# Remove the outputs of any earlier run.
!rm -rf /content/GIT/pretraining-bea2019/training/vocab/hack_needed/*
!rm -rf /content/GIT/pretraining-bea2019/training/vocab/spm*

# SentencePiece command-line tools located in the Marian build directory.
SPM_DECODE = "/content/GIT/pretraining-bea2019/systems/tools/marian-dev/build/spm_decode"
SPM_ENCODE = "/content/GIT/pretraining-bea2019/systems/tools/marian-dev/build/spm_encode"
SPM_EXPORT_VOCAB = "/content/GIT/pretraining-bea2019/systems/tools/marian-dev/build/spm_export_vocab"
SPM_NORMALIZE = "/content/GIT/pretraining-bea2019/systems/tools/marian-dev/build/spm_normalize"
SPM_TRAIN = "/content/GIT/pretraining-bea2019/systems/tools/marian-dev/build/spm_train"

# Strip a trailing carriage return from every line of the tokenised files.
!sed 's/\r$//' /content/GIT/pretraining-bea2019/training/src_tokenised.txt > \
   /content/GIT/pretraining-bea2019/training/src_tokenised_prep.txt
!sed 's/\r$//' /content/GIT/pretraining-bea2019/training/trg_tokenised.txt > \
   /content/GIT/pretraining-bea2019/training/trg_tokenised_prep.txt

# Train one word-level SentencePiece model per side with the configured vocabulary sizes.
!$SPM_TRAIN --input /content/GIT/pretraining-bea2019/training/src_tokenised_prep.txt --vocab_size $VOCAB_SIZE_SRC  --model_prefix spm.src --model_type=word
!$SPM_TRAIN --input /content/GIT/pretraining-bea2019/training/trg_tokenised_prep.txt --vocab_size $VOCAB_SIZE_TRG  --model_prefix spm.trg --model_type=word

# Copy the results one level up under the .yml/.spm extensions used by the training command.
!cp spm.src.vocab ../spm.src.yml
!cp spm.src.model ../spm.src.spm
!cp spm.trg.vocab ../spm.trg.yml
!cp spm.trg.model ../spm.trg.spm

#### Source Word Corruption

This part of the Notebook generates a corrupted source file if source word corruption is active.

In [None]:
if source_word_corruption:

  %cd /content/GIT/ICS5200/Tools/

  from synthesis import Synthesizer, SynthesisStrategy

  with open("/content/GIT/pretraining-bea2019/training/train.src", "r") as f:
    src_train_tokens = list(map(lambda x: x.replace('\n', '').split(" "), f.readlines()))
    
    print("\nSAMPLE FROM SOURCE\n")
    for x in src_train_tokens[0:10]:
      print(" ".join(x))

  print("\n")
  synth = Synthesizer(src_train_tokens)
  synth.synthesize(SynthesisStrategy.ORGANISED_DROPOUT, sentence_seed=1, token_seed=1, dropout_modulus=10)

  print("\nSAMPLE FROM SYNTHESISER\n")
  for x in synth.data[0:10]:
    print(" ".join(x))

  with open("/content/GIT/pretraining-bea2019/training/train.corrupt.src", "w") as f:
    
    for swd_tokens in synth.data:
      f.writelines(" ".join(swd_tokens) + "\n")

  print("\nWrote to file...")

# HuggingFace BERT

This part of the Notebook sources the pretrained BERT models. Note that the size of the embedding vectors needs to be reworked to fit the smaller vocabulary sizes of the Maltese GEC.

In [None]:
!pip install huggingface_hub

In [None]:
# Empty (or create) systems/model.bert, then move to the Tools directory,
# which holds the HF2M converter imported in the next cell.
!rm -rf /content/GIT/pretraining-bea2019/systems/model.bert/*
!mkdir -p /content/GIT/pretraining-bea2019/systems/model.bert
%cd /content/GIT/ICS5200/Tools

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# auth_token = "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"

from HF2M import ModelConverter

for bert in ['BERTu','mBERTu']:
  tokenizer = AutoTokenizer.from_pretrained("MLRS/{0}".format(bert))
  model = AutoModelForMaskedLM.from_pretrained("MLRS/{0}".format(bert))
  model.resize_token_embeddings(VOCAB_SIZE_SRC)

  !rm -rf /content/GIT/pretraining-bea2019/systems/model.lowresource.mt/$bert.npz
  %cd /content/GIT/ICS5200/Tools/

  MC = ModelConverter()
  MC.feed_model(model)
  MC.translate_model("/content/GIT/pretraining-bea2019/systems/model.bert/{0}.npz".format(bert))

# Training

If you need to, you can run the command hereunder to clear any existing model files in your folders.

In [None]:
# Delete everything in model.lowresource.mt (this includes its log directory)
# and recreate the empty log directory.
!rm -rf /content/GIT/pretraining-bea2019/systems/model.lowresource.mt/*
%cd /content/GIT/pretraining-bea2019/systems/model.lowresource.mt/
!mkdir -p log

* **KEY** - Determines which architecture the model shall be based on. You have three options:
  * **AMUN** - Nematus equivalent transformer architecture
  * **S2S** - Sequence-to-Sequence architecture
  * **Transformer** - Vaswani-style transformer architecture
* **NETWORK_NAME_SUFFIX** - This is just a friendly suffix so that you could distinguish different models in your Google Drive. The folder names will always be a concatenation of the KEY and the NETWORK_NAME_SUFFIX
* **EPOCH_INTERVAL** - This number determines after how many epochs the model will save a checkpoint to the Google Drive. Applicable only if **ITERATE_ON_EPOCH** is set to True.
* **UPDATE_INTERVAL** - This number determines after how many batch updates the model will save a checkpoint to the Google Drive. Applicable only if **ITERATE_ON_EPOCH** is set to False.
* **ITERATE_ON_EPOCH** - Toggle between iterating on epochs or batch updates.

*Note: We recommend that you do not save checkpoints based on epochs! Epochs can become very long when dealing with EncDec architectures and it could take days to elapse a single epoch, especially when working with the larger models such as the Pretrained ones. You can leave the settings for **ITERATE_ON_EPOCH**, **EPOCH_INTERVAL** and **UPDATE_INTERVAL** exactly as is (False, 1, 500).*

In [None]:
# Architecture key; see the list above for the accepted values.
KEY = "AMUN" #@param {type:"string"}

# Checkpoint folder name on Google Drive: KEY followed by the suffix.
NETWORK_NAME_SUFFIX = "_TEST" #@param {type:"string"}
NETWORK_NAME = KEY + NETWORK_NAME_SUFFIX

# Checkpoint interval, in epochs or in batch updates depending on ITERATE_ON_EPOCH.
EPOCH_INTERVAL = 1 #@param {type:"integer"}
UPDATE_INTERVAL = 500 #@param {type:"integer"}

ITERATE_ON_EPOCH = False #@param {type:"boolean"}

# Make sure the checkpoint folder for this network exists on Google Drive.
!mkdir -p /content/drive/MyDrive/MARIAN-CHECKPOINTS
!mkdir -p /content/drive/MyDrive/MARIAN-CHECKPOINTS/$NETWORK_NAME

%cd /content/GIT/pretraining-bea2019/systems

# Unit label used when printing ITER and ITER_MAX.
units = "epochs" if ITERATE_ON_EPOCH else "updates"

If you are starting training, then you can set **ITER** to 0. If you are continuing from a different day, then check the last checkpoint that was generated in your Google Drive and input that number into **ITER**. The notebook will then copy the checkpoint from Google Drive back into the current session, and Marian will resume training from that checkpoint.

**ITER_MAX** determines the point when training will stop.

*Note: Both **ITER** and **ITER_MAX** need to be numbers that make sense. If working with epochs, these would be smaller integers. But since we are working with batch iterations, they are larger integers.*

In [None]:
#@title Set Iteration Parameters
ITER =  0#@param
ITER_MAX = 500 #@param

# Summarise the run. The architecture line prints KEY from the configuration
# cell; no variable called TYPE is defined anywhere before this point.
print("NETWORK: {0}".format(NETWORK_NAME))
print("TYPE: {0}".format(KEY))
print("START: {0} {1}".format(ITER, units))
print("END: {0} {1}".format(ITER_MAX, units))
print("")

In [None]:
# Pretraining forces the S2S architecture; otherwise the lower-cased KEY is used.
ARCH_PARAM = KEY.lower() if not pretrained else 's2s'

# Train on the corrupted source file when source word corruption is enabled.
if source_word_corruption:
  TRAIN_FROM_PARAM = "train.corrupt.src"
else:
  TRAIN_FROM_PARAM = "train.src"

# For any model to work, it must at least have the target embeddings tied (--tied-embeddings).
# The adaptation (tied-embeddings-all) ties both the source and the target embeddings.
if tied_embeddings:
  TIED_EMBEDDINGS_PARAM = "tied-embeddings-all"
else:
  TIED_EMBEDDINGS_PARAM = "tied-embeddings"

# Extra command-line argument pointing Marian at the converted BERT weights:
# 1 selects BERTu, 2 selects mBERTu, anything else adds nothing.
PRETRAINED_PARAM = {
  1: "--pretrained-model model.bert/BERTu.npz",
  2: "--pretrained-model model.bert/mBERTu.npz",
}.get(pretrained, "")

# Embedding dimension: 768 with pretraining, 512 without.
DIM_EMB_PARAM = 512 if not pretrained else 768

print("ARCHITECTURE_PARAM: {0}{1}".format(ARCH_PARAM, " (..must use S2S when pretraining)" if pretrained else ""))
print("TRAIN_FROM_PARAM: {0}".format(TRAIN_FROM_PARAM))
print("TIED_EMBEDDINGS_PARAM: {0}".format(TIED_EMBEDDINGS_PARAM))
print("DIM_EMB_PARAM: {0}".format(DIM_EMB_PARAM))
print("PRETRAINED_PARAM: {0}".format(PRETRAINED_PARAM))
print("\nVocabulary sizes set to: \nSOURCE: {0}\nTARGET: {1}".format(VOCAB_SIZE_SRC,VOCAB_SIZE_TRG))

In [None]:
# COPY FROM DRIVE
# When resuming (ITER != 0), copy the checkpoint folder for that iteration back
# into model.lowresource.mt. Folder names are ITER zero-padded to 2 digits for
# epochs and to 5 digits for updates, matching the names used by the training loop.
if ITER != 0:
  SRC = str(ITER).zfill(2) if ITERATE_ON_EPOCH else str(ITER).zfill(5)
  !cp -r /content/drive/MyDrive/MARIAN-CHECKPOINTS/$NETWORK_NAME/$SRC/* /content/GIT/pretraining-bea2019/systems/model.lowresource.mt

Please pay close attention to platform-specific configurations like **cpu-threads**!

Also, don't forget to monitor **ITER**. If you commenced training, that value would update.

In [None]:
%cd /content/GIT/pretraining-bea2019/systems
from IPython.display import clear_output

print("ITER: {0}".format(ITER))
print("ITER_MAX: {0}".format(ITER_MAX))

# Train in slices: each pass advances ITER by one interval, runs Marian until
# that point (--after) and copies the model directory to Google Drive.
while(ITER < ITER_MAX):

  # LAST_CHKP/NEXT_CHKP are the zero-padded folder names before and after this
  # slice; AFTER is the stopping point handed to Marian ("<n>e" or "<n>u").
  if ITERATE_ON_EPOCH: 
    LAST_CHKP = str(ITER).zfill(2)
    ITER += EPOCH_INTERVAL
    NEXT_CHKP = str(ITER).zfill(2)
    AFTER = "{0}e".format(ITER)
  else:
    LAST_CHKP = str(ITER).zfill(5)
    ITER += UPDATE_INTERVAL
    NEXT_CHKP = str(ITER).zfill(5)
    AFTER = "{0}u".format(ITER)

    # --ignore-model-config \  --no-restore-corpus \    --dim-vocabs 22000 \
    # --pretrained-model model.bert/mBERTu.npz \

  # The architecture, training source, embedding size, tied-embeddings flag and
  # pretrained model come from the parameter cell above.
  # NOTE(review): --valid-freq, --save-freq and --disp-freq are fixed at 500 and
  # do not follow UPDATE_INTERVAL; --cpu-threads is platform specific.
  !tools/marian-dev/build/marian --model model.lowresource.mt/model.npz \
    --type $ARCH_PARAM \
    --cpu-threads 8 \
    $PRETRAINED_PARAM \
    --train-sets ../training/$TRAIN_FROM_PARAM ../training/train.trg \
    --valid-sets ../training/valid.src ../training/valid.trg \
    --vocabs ../training/vocab/spm.src.spm ../training/vocab/spm.trg.spm \
    --dim-emb $DIM_EMB_PARAM \
    --mini-batch 4 \
    --maxi-batch 100 \
    --max-length 100 \
    -w 10000 \
    --layer-normalization \
    --dropout-rnn 0.2 \
    --dropout-src 0.1 \
    --dropout-trg 0.1 \
    --$TIED_EMBEDDINGS_PARAM \
    --no-restore-corpus \
    --after $AFTER \
    --seed 1111 \
    --overwrite --keep-best --exponential-smoothing \
    --normalize=1 --beam-size 6 \
    --valid-freq 500 \
    --save-freq 500u \
    --disp-freq 500 \
    --disp-label-counts \
    --learn-rate 0.0001 \
    --lr-report \
    --optimizer-params 0.9 0.98 1e-08 \
    --clip-norm 5 \
    --early-stopping 5 \
    --valid-metrics ce-mean-words cross-entropy perplexity bleu \
    --cost-type=ce-mean-words \
    --valid-reset-stalled \
    --log model.lowresource.mt/log/train.log \
    --valid-log model.lowresource.mt/log/valid.log \
    --log-level trace

  # clear_output()
  # Copy the whole model directory to Drive under the new checkpoint name.
  !mkdir -p /content/drive/MyDrive/MARIAN-CHECKPOINTS/$NETWORK_NAME/$NEXT_CHKP
  !cp -r /content/GIT/pretraining-bea2019/systems/model.lowresource.mt/* /content/drive/MyDrive/MARIAN-CHECKPOINTS/$NETWORK_NAME/$NEXT_CHKP

  print("LAST: {0} - THIS: {1} - PARAM: {2}".format(LAST_CHKP, NEXT_CHKP, AFTER))
  print("SAVE FOLDER: {0}/{1}".format(NETWORK_NAME, NEXT_CHKP))

# Trained Models

These cells copy over the trained models from our study.

In [None]:
%cd /content/
!mkdir -p TrainedModels

In [None]:
%cd TrainedModels

In [None]:
# Download the trained models from Google Drive into the current directory
# (TrainedModels). No -O is given, so gdown chooses each file name.
!gdown https://drive.google.com/uc?id=1E009yHoVW8F1WJc_u1gemuSc8MkvP9Qy
!gdown https://drive.google.com/uc?id=1rtdAqRdRzlMrM2fcVP4dyQdjshSszm10
!gdown https://drive.google.com/uc?id=1HP0gaDcHmh52tQk0PjTznYuJuXrueiiK
!gdown https://drive.google.com/uc?id=1o7GROxHfjirzZasvqJjIWL4_KpZJArn3
!gdown https://drive.google.com/uc?id=1iM47ATQOL0ffBWQILc3TyhqIHTcZul5e
!gdown https://drive.google.com/uc?id=12HfCmJXFisbn6nOqtW6kx-YjKoFnPWz2
!gdown https://drive.google.com/uc?id=1K7zrm4PbaqWX2QG7b7ATHrANB-ZmIA96
!gdown https://drive.google.com/uc?id=1EO1c3jl4qbfBGVHUB37rgnSBZPGXjoJD
!gdown https://drive.google.com/uc?id=1U_jdvJoEE8SIIjAxIRyGSSMts_W-LR2g
!gdown https://drive.google.com/uc?id=1UAsJOjkjjBvwrJWibp2Q--fjJzoefbDS
!gdown https://drive.google.com/uc?id=1cisBb5XhrJmjeFep6Zs9f3Qk3T2JZPYv
!gdown https://drive.google.com/uc?id=1sggRQV17q2n2QrKjlOu2_Ah-5IWpJjum
!gdown https://drive.google.com/uc?id=1Q_4s-9Sq2q0j_YnTt4oFVhAq5FOuWc0c

# Evaluation

This part of the Notebook is dedicated to the Evaluation phases of the study.

#### Recall Models

In [None]:
# Mount Google Drive at /content/drive so the evaluation cells can read and write under MyDrive.
# NOTE(review): force_remount=True presumably remounts even when Drive is already mounted — confirm.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Drive folder that the "Recall Models" cells copy extra checkpoints from.
!mkdir -p /content/drive/MyDrive/MARIAN-CHECKPOINTS/EVAL-MODELS

# Local evaluation workspace; the relative paths below resolve against /content.
%cd /content/
!mkdir -p Evaluation
!mkdir -p Evaluation/Models
!mkdir -p Evaluation/Decoder
!mkdir -p Evaluation/Vocabs
!mkdir -p Evaluation/Source
!mkdir -p Evaluation/Target
# Drive folders later referenced as MARIAN_DECODE_DIR and MARIAN_VOCAB_DIR.
!mkdir -p /content/drive/MyDrive/MARIAN-DECODE
!mkdir -p /content/drive/MyDrive/MARIAN-VOCABS

During training, we placed all the best performing models in a folder called EVAL-MODELS. We have provided these models via the links in the previous section, so they are instead located in the *TrainedModels* folder.

In [None]:
!cp -a "/content/TrainedModels" "/content/Evaluation/Models/"

In [None]:
# Optional: also copy the contents of Drive's EVAL-MODELS folder into Evaluation/Models/.
# You can skip this if you don't have any.
!cp -a "/content/drive/MyDrive/MARIAN-CHECKPOINTS/EVAL-MODELS/." "/content/Evaluation/Models/"

In [None]:
# Copy the contents of the repository's "Evaluation Vocabs" folder into the local Vocabs folder,
# which is where the decoding cell reads its *.spm vocabularies from.
!cp -a "/content/GIT/ICS5200/Evaluation Vocabs/." "/content/Evaluation/Vocabs/"

The list hereunder will determine which models will be evaluated. For the sake of demonstration, only *Final-Model* has been included, but all of the other models can be passed here. They can be added by appending elements to the **decoding_models** list. For example, this is how we can add "Tied-Embeddings.npz": 

['Final-Model','Tied-Embeddings']

In [None]:
# Models to decode and score, named without the ".npz" extension (the decoding cell appends it).
decoding_models = ['Final-Model']

#### Decoding

In [None]:
# Switch to the repository's Tools folder before importing `tokenisation` below.
# NOTE(review): presumably that module lives in this folder — confirm.
%cd /content/GIT/ICS5200/Tools/

In [None]:
# Google Drive folders created in the "Recall Models" section.
# MARIAN_DECODE_DIR is where decoder output is persisted to / recovered from.
MARIAN_DECODE_DIR = "/content/drive/MyDrive/MARIAN-DECODE"
MARIAN_VOCAB_DIR = "/content/drive/MyDrive/MARIAN-VOCABS"

In [None]:
# Install sentence_splitter and sentencepiece. sentencepiece is also installed in the Build
# section, so that line only matters when this section is run on its own.
!pip install sentence_splitter
!pip install sentencepiece

We technically already loaded the tokeniser, but we included this for convenience in the event that validation is being performed on its own.

In [None]:
from tokenisation import MTRegex, MTParTokenizer, MTWordTokenizer, MTSentenceTokenizer, MTSentencePieceTokenizer
import urllib.parse
import requests
from IPython.display import clear_output

tokeniser = MTWordTokenizer()
tokeniser_api_link = "https://mlrs.research.um.edu.mt/tools/mlrsapi/tokenise?text="

def tokenise(path, use_rest=False):
  """Tokenise every line of a text file.

  Args:
    path: Path of the text file to read.
    use_rest: When True, each line is URL-quoted and sent to the REST endpoint
      in ``tokeniser_api_link``, and the ``result`` field of the JSON reply is
      used. Otherwise the local ``MTWordTokenizer`` instance is used.

  Returns:
    A list with one entry per input line; each entry is whatever the chosen
    tokeniser returned for that line.

  Raises:
    requests.HTTPError: If the REST endpoint answers with an error status.
  """
  tokens = []

  # Read explicitly as UTF-8, the same encoding the notebook uses when it writes
  # the tokenised files back out; undecodable bytes are replaced rather than fatal.
  with open(path, 'r', encoding='utf-8', errors='replace') as f:
    lines = f.readlines()

  for count, line in enumerate(lines, start=1):

    if use_rest:
      # Single source of truth for the endpoint instead of a second hard-coded URL.
      response = requests.get(tokeniser_api_link + urllib.parse.quote(line))
      # Surface HTTP failures directly instead of a confusing JSON/KeyError later.
      response.raise_for_status()
      tokens.append(response.json()['result'])
    else:
      tokens.append(tokeniser.tokenize(line))

    print("{0}/{1}".format(count, len(lines)))

  # Replace the per-line progress output with a single summary line.
  clear_output()
  print("Processed: " + path)

  return tokens


Some general steps related to setup.

In [None]:
# The directory-setup cell below uses paths relative to /content (Evaluation/Decoder/...).
%cd /content/

In [None]:
import os

model_directories = []

!rm -rf Evaluation/Decoder/*

for file in os.listdir("/content/Evaluation/Models/"):
  DIR_NAME = file.replace('.npz', '')

  model_directories.append(DIR_NAME)
  !mkdir -p Evaluation/Decoder/$DIR_NAME
  !mkdir -p Evaluation/Decoder/$DIR_NAME/Log
  !mkdir -p Evaluation/Decoder/$DIR_NAME/Output

In [None]:
import glob

# Preview the target-side files of the selected split. This cell sits before the cell that
# assigns `full_set`, so fall back to that cell's default (False -> 'Subset') when the name is
# not defined yet, instead of failing with a NameError on a fresh kernel.
preview_split = 'Converted' if globals().get('full_set', False) else 'Subset'
target_preview = glob.glob('/content/GIT/ICS5200/QariTalProvi_Batch_2/Training/{0}/Target/*.txt'.format(preview_split))
target_preview

In [None]:
import glob

# Local evaluation workspace folders (every value keeps its trailing slash).
EVALS_DIR = '/content/Evaluation/Decoder/'
MODELS_DIR = '/content/Evaluation/Models/'
SOURCE_DIR = '/content/Evaluation/Source/'
TARGET_DIR = '/content/Evaluation/Target/'
REFERENCES_DIR = '/content/Evaluation/References/'
M2_DIR = "/content/Evaluation/M2/"
SCORES_DIR = "/content/Evaluation/Scores/"

# Marian executables; despite the *_DIR names these are paths to binaries in the build folder.
DECODER_DIR = '/content/GIT/pretraining-bea2019/systems/tools/marian-dev/build/marian-decoder'
SCORER_DIR = '/content/GIT/pretraining-bea2019/systems/tools/marian-dev/build/marian-scorer'

# True selects the 'Converted' folder, False the 'Subset' folder.
full_set = False
split_folder = 'Converted' if full_set else 'Subset'

# Sorted so that source and target files can be paired by position.
source_listdir = sorted(glob.glob('/content/GIT/ICS5200/QariTalProvi_Batch_2/Training/{0}/Source/*.txt'.format(split_folder)))
target_listdir = sorted(glob.glob('/content/GIT/ICS5200/QariTalProvi_Batch_2/Training/{0}/Target/*.txt'.format(split_folder)))

In [None]:
!rm -rf $SOURCE_DIR/*
!rm -rf $TARGET_DIR/*

for source_file in source_listdir:
  token_list = tokenise(source_file)

  new_file = SOURCE_DIR + source_file.split("/")[-1:][0].replace('txt','tokenised.in')
  
  with open(new_file, "w", encoding="utf-8") as f:
      print("Writing to file: {0}".format(new_file))

      f.write("\n".join(list(map(lambda x: " ".join(x), token_list))))

for target_file in target_listdir:
  token_list = tokenise(target_file)

  new_file = TARGET_DIR + target_file.split("/")[-1:][0].replace('txt','tokenised.ref')
  
  with open(new_file, "w", encoding="utf-8") as f:
      print("Writing to file: {0}".format(new_file))

      f.write("\n".join(list(map(lambda x: " ".join(x), token_list))))


In [None]:
# Sorted list of the tokenised source files in SOURCE_DIR; these are the decoder's inputs.
tokenised_source_listdir = sorted(glob.glob(SOURCE_DIR + '/*.tokenised.in'))

#### Translation

Requires Marian build.

In [None]:
# GENERATE
%cd /content

from os.path import exists

generate_from_scratch = False

for model_directory in decoding_models:
  
  CURRENT_MODEL_DIR = MODELS_DIR + model_directory + ".npz"
  OUTPUT_DIR = EVALS_DIR + model_directory + "/Output/"
  LOG_DIR = EVALS_DIR + model_directory + "/Log/"
  SRC_VOCAB = "/content/Evaluation/Vocabs/bench.src.spm" if model_directory in  ['Benchmark-AMUN','Benchmark-S2S','Benchmark-Transformer', 'SRC-Corruption','Benchmark-Tied-Embeddings','Benchmark-Pretrained-BERTu','Benchmark-Pretrained-mBERTu'] \
    else "/content/Evaluation/Vocabs/22K.src.spm"
  TRG_VOCAB = "/content/Evaluation/Vocabs/bench.trg.spm" if model_directory in  ['Benchmark-AMUN','Benchmark-S2S','Benchmark-Transformer', 'SRC-Corruption','Benchmark-Tied-Embeddings','Benchmark-Pretrained-BERTu','Benchmark-Pretrained-mBERTu'] \
    else "/content/Evaluation/Vocabs/22K.trg.spm"

  print("*** {0} ***".format(model_directory))
  print("Model File: {0}".format(CURRENT_MODEL_DIR))
  print("Output Dir: {0}".format(OUTPUT_DIR))
  print("  Log File: {0}\n".format(LOG_DIR))

  c = 0

  if generate_from_scratch:

    for s, t in zip(tokenised_source_listdir, target_listdir):
      c += 1

      print("Processing pair {2}:\n\tSORUCE: {0}\n\tTARGET: {1}".format(s, t, c))
      LOG_FILE = LOG_DIR + "PAIR-{0:03}.log".format(c)
      OUTPUT_FILE = OUTPUT_DIR + "PAIR-{0:03}.out".format(c)

      !$DECODER_DIR -m $CURRENT_MODEL_DIR -n --cpu-threads 5  -i '$s' -o $OUTPUT_FILE --log $LOG_FILE --vocabs $SRC_VOCAB $TRG_VOCAB 
      print("\tLogged to {0}".format(LOG_FILE))
    
    print("\tPersisting to Google Drive... {0}/{1}".format(MARIAN_DECODE_DIR, model_directory))

    !mkdir -p $MARIAN_DECODE_DIR/$model_directory
    !cp $OUTPUT_DIR/* $MARIAN_DECODE_DIR/$model_directory

  else:

    print("Recovering from Google Drive...")
    !cp  $MARIAN_DECODE_DIR/$model_directory/* $OUTPUT_DIR/

### ERRANT Scoring

### Preparation

In [None]:
# Change into the repository's Tools folder before importing the M2 reference generator.
# NOTE(review): presumably M2_reference_generation.py lives in this folder — confirm.
%cd /content/GIT/ICS5200/Tools

from M2_reference_generation import generate_m2_reference

In [None]:
# Return to /content for the remaining ERRANT cells.
%cd /content/

In [None]:
# Download spaCy's English model via the "en" shortcut.
# NOTE(review): presumably needed by ERRANT, which is installed in the next cell — confirm the
# shortcut and the install order still work with the spaCy version that ERRANT pulls in.
!python -m spacy download en

In [None]:
# Provides the errant_parallel and errant_compare commands used in the cells below.
!pip install errant

### Generate Reference Files

We generate MaxMatch (M2) reference files from our target folder.

In [None]:
import glob

REFERENCES_DIR = "/content/Evaluation/References/"
!rm -rf $REFERENCES_DIR
!mkdir -p $REFERENCES_DIR

for filepath in glob.glob(TARGET_DIR + "*"):
  filename = filepath.split("/")[-1:][0].replace("ref", "errant.ref")
  print("Generating: {0}".format(filename))
  generate_m2_reference(filepath, REFERENCES_DIR + filename)

In [None]:
# Re-point target_listdir at the *.ref files in TARGET_DIR and print each with its 1-based position.
target_listdir = sorted(glob.glob(TARGET_DIR + "*.ref"))

c = 0
for c, x in enumerate(target_listdir, start=1):
  print('{0}: {1}'.format(c,x))

In [None]:
# PAD empty translations

def _pad_empty_lines(text, placeholder='<NA>'):
  """Replace every empty line of text with placeholder.

  A trailing newline makes str.split('\n') yield one final empty string that is
  not a sentence. It is dropped first so that no extra placeholder line is
  appended to the file. The result is joined without a trailing newline.
  NOTE(review): assumes the decoder terminates every translation, including the
  last, with a newline — confirm against an actual .out file.
  """
  lines = text.split('\n')
  if lines and lines[-1] == '':
    lines.pop()
  return '\n'.join(line if line else placeholder for line in lines)

for model_directory in decoding_models:
  OUTPUT_DIR = EVALS_DIR + model_directory + "/Output/"

  output_listdir = sorted(glob.glob(OUTPUT_DIR + "*.out"))

  for o in output_listdir:
    with open(o, "r", encoding="utf-8") as f:
      padded = _pad_empty_lines(f.read())

    # Rewrite the file in place with the padded content.
    with open(o, "w", encoding="utf-8") as f:
      f.write(padded)

### Compare Translations

We compare our model-generated translations against our reference files.

In [None]:
import glob

for model_directory in decoding_models:

  OUTPUT_DIR = EVALS_DIR + model_directory + "/Output/"
  
  output_listdir = glob.glob(OUTPUT_DIR + "*.out")
  reference_listdir = glob.glob(TARGET_DIR + "*.tokenised.ref")

  output_listdir.sort()
  reference_listdir.sort()

  !rm -rf $M2_DIR$model_directory
  !mkdir -p $M2_DIR$model_directory
  for o in output_listdir:

    filename = o.split("/")[-1:][0].replace("out", "tagged.m2")
    iter = filename.split(".")[0][-3:]

    r = reference_listdir[int(iter)-1]

    print("Comparing {0} to {1}...".format(o, r))
    !errant_parallel -orig "$o" -cor "$r" -out "$M2_DIR$model_directory/$filename"

### Scoring

Produce the scores for each comparison exercise.

In [None]:
import glob

!mkdir -p $SCORES_DIR

for term in ['dt', 'ds', 'cs']:

  print("\nSCHEME: {0}".format(term))

  for model_directory in decoding_models:

    SCORES_FILE = SCORES_DIR + model_directory + ".{0}.scores".format(term)
    !rm -rf $SCORES_FILE

    tagged_listdir = glob.glob(M2_DIR + model_directory + "/*.tagged.m2")
    references_listdir = glob.glob(REFERENCES_DIR + "/*.errant.ref" )

    tagged_listdir.sort()
    references_listdir.sort()

    c = 0
    for t in tagged_listdir:

      iter = t.split("/")[-1:][0].split(".")[0][-3:]
      r = references_listdir[int(iter)-1]

      c += 1
      print("Scoring - {2:03} - {0} against {1}".format(t, r, c))

      !echo $t >> $SCORES_FILE
      !errant_compare -hyp "$t" -ref "$r" -$term >> $SCORES_FILE


In [None]:
# Drive folder that receives the score files when `persist` is enabled below.
!mkdir -p /content/drive/MyDrive/MARIAN-SCORES

Produce the total scores across all the files for the model. Set **persist** to True to save the output on Google Drive.

In [None]:
# When True, the aggregation cell copies the score, aggregate and summary files to Google Drive.
persist = False #@param {type:"boolean"}

In [None]:
import csv

schemes = {'dt': 'Token-Based Error Detection'
  , 'ds': 'Span-Based Error Detection'
  , 'cs': 'Span-Based Error Correction'}

for term in ['dt', 'ds', 'cs']:

  for model_directory in decoding_models:
    score_file = SCORES_DIR + model_directory + '.{0}.scores'.format(term)

    aggregates = []

    with open(score_file, "r", encoding="utf-8") as f:
      lines = list(map(lambda x: x.replace('\n', ''), f.readlines()))

      counter = 0
      current = {}
      for i in range(0, len(lines)):

        line = lines[i]

        if counter == 0:
          current['file'] = line
        elif counter == 4:
          arr_val = line.split('\t')
          if len(arr_val) < 6:
            current['TP'] = 0
            current['FP'] = 0
            current['FN'] = 0
            current['Prec'] = 0
            current['Rec'] = 0
            current['F0.5'] = 0
          else:
            current['TP'] = arr_val[0]
            current['FP'] = arr_val[1]
            current['FN'] = arr_val[2]
            current['Prec'] = arr_val[3]
            current['Rec'] = arr_val[4]
            current['F0.5'] = arr_val[5]
        elif counter == 6:
          aggregates.append(current)
          current = {}
          counter = -1

        counter += 1

    score_file_agg = score_file.replace("scores", "agg")

    with open(score_file_agg, "w", encoding="utf-8") as f:
      dict_writer = csv.DictWriter(f, aggregates[0].keys())
      dict_writer.writeheader()
      dict_writer.writerows(list(filter(lambda x: "/content/Evaluation" in x['file'], aggregates)))
    
    score_summary = score_file.replace("scores", "summary")
    
    TP = sum(list(map(lambda x: int(x['TP']), aggregates)))
    FP = sum(list(map(lambda x: int(x['FP']), aggregates)))
    FN = sum(list(map(lambda x: int(x['FN']), aggregates)))
    Prec = TP/(TP+FP)
    Rec = TP/(TP+FN)
    FScore = 1.25*((Prec*Rec)/(0.25*Prec+Rec))

    prompts = [
      "{0}".format(schemes[term]),
      "TP: {0}".format(TP),
      "FP: {0}".format(FP),
      "FN: {0}".format(FN),
      "Precision: {0}".format(Prec),
      "Recall: {0}".format(Rec),
      "F0.5 Score: {0}".format(FScore)]

    with open(score_summary, "w", encoding="utf-8") as f:
      for prompt in prompts:
        print(prompt)
        f.write(prompt)

    print()

    if persist:
      !mkdir -p /content/drive/MyDrive/MARIAN-SCORES/$model_directory/
      !mkdir -p /content/drive/MyDrive/MARIAN-SCORES/$model_directory/SCORES/
      !mkdir -p /content/drive/MyDrive/MARIAN-SCORES/$model_directory/AGGREGATES/
      !mkdir -p /content/drive/MyDrive/MARIAN-SCORES/$model_directory/SUMMARY/
      !cp '$score_file' '/content/drive/MyDrive/MARIAN-SCORES/$model_directory/SCORES/'
      !cp '$score_file_agg' '/content/drive/MyDrive/MARIAN-SCORES/$model_directory/AGGREGATES/'
      !cp '$score_summary' '/content/drive/MyDrive/MARIAN-SCORES/$model_directory/SUMMARY/'
