Source: https://github.com/ImperialNLP/NLPLabs/blob/c724834960345085690802233966682bc3321723/lab06/lab06_solutions.ipynb

In [2]:
%%bash

# Download the corpus
URL="https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/tok"

for split in "train" "val" "test_2016_flickr"; do
    for lang in en de fr; do
        # Name of the file on the server.
        fname="${split}.lc.norm.tok.${lang}"
        # Name of the file on disk: "<split>.<lang>", with the
        # "_2016_flickr" suffix stripped from the test split.
        target="${split/_2016_flickr/}.${lang}"

        # Test for the file we actually write. Testing the remote name
        # never matches, so every run would download everything again.
        if [ ! -f "${target}" ]; then
            echo "Downloading $fname"
            if ! wget -q "${URL}/$fname" -O "${target}"; then
                # wget -O creates the output file even when the transfer
                # fails; remove it so the next run retries the download.
                echo "Failed to download $fname" >&2
                rm -f "${target}"
            fi
        fi
    done
done
echo

# Print the first 10 lines with line numbers of
# the English and French training data
cat -n train.en | head -n10
echo
cat -n train.fr | head -n10
echo

Downloading train.lc.norm.tok.en
Downloading train.lc.norm.tok.de
Downloading train.lc.norm.tok.fr
Downloading val.lc.norm.tok.en
Downloading val.lc.norm.tok.de
Downloading val.lc.norm.tok.fr
Downloading test_2016_flickr.lc.norm.tok.en
Downloading test_2016_flickr.lc.norm.tok.de
Downloading test_2016_flickr.lc.norm.tok.fr

     1	two young , white males are outside near many bushes .
     2	several men in hard hats are operating a giant pulley system .
     3	a little girl climbing into a wooden playhouse .
     4	a man in a blue shirt is standing on a ladder cleaning a window .
     5	two men are at the stove preparing food .
     6	a man in green holds a guitar while the other man observes his shirt .
     7	a man is smiling at a stuffed lion
     8	a trendy girl talking on her cellphone while gliding slowly down the street .
     9	a woman with a large purse is walking by a gate .
    10	boys dancing on poles in the middle of the night .

     1	deux jeunes hommes blancs sont dehors

In [11]:
import numpy as np

In [3]:
class Multi30K:
    """A dataset wrapper for Multi30K.

    Holds two parallel lists of sentences (source and target) read from
    line-aligned text files. Items are *indices*, not sentences:
    ``__getitem__`` returns the index it is given and ``collate_fn`` turns
    a list of such indices into a tokenized batch.

    Args:
        tokenizer: object exposing ``prepare_seq2seq_batch``; only used by
            ``collate_fn``.
        src_file: path to the source-language file, one sentence per line.
        trg_file: path to the target-language file, one sentence per line.

    Raises:
        AssertionError: if the two files have a different number of lines.
    """
    def __init__(self, tokenizer, src_file, trg_file):
        self.tokenizer = tokenizer
        self.src_sents, self.trg_sents = self.read_sentences(src_file, trg_file)

    @staticmethod
    def _read_lines(path):
        """Return the lines of ``path`` with surrounding whitespace stripped."""
        # Decode explicitly as UTF-8: the corpus contains non-ASCII
        # characters and the default encoding of open() depends on the
        # platform locale.
        with open(path, encoding='utf-8') as f:
            return [line.strip() for line in f]

    def read_sentences(self, src_file, trg_file):
        """Read both sides of the corpus and check they are line-aligned.

        Returns:
            A ``(src_sents, trg_sents)`` tuple of two equally long lists of
            strings.
        """
        src_sents = self._read_lines(src_file)
        trg_sents = self._read_lines(trg_file)

        assert len(src_sents) == len(trg_sents), "Files are not aligned!"
        return src_sents, trg_sents

    def collate_fn(self, idx):
        """Build a tokenized batch from an iterable of sentence indices.

        Args:
            idx: iterable of integer positions into the sentence lists.

        Returns:
            Whatever ``tokenizer.prepare_seq2seq_batch`` returns for the
            selected source/target sentences.
        """
        src_texts = [self.src_sents[i] for i in idx]
        trg_texts = [self.trg_sents[i] for i in idx]

        output = self.tokenizer.prepare_seq2seq_batch(src_texts=src_texts,
                                                      tgt_texts=trg_texts,
                                                      max_length=128,
                                                      max_target_length=128,
                                                      return_tensors='pt',
                                                      truncation=True)
        return output

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.src_sents)

    def __getitem__(self, idx):
        # Items are the indices themselves; the sentences are looked up and
        # tokenized per batch in collate_fn.
        return idx

In [4]:
from transformers import EncoderDecoderModel, MarianMTModel, MarianTokenizer, BartModel, BartConfig, BertConfig, BartForCausalLM,Trainer,TrainingArguments
# Load the pretrained 'Helsinki-NLP/opus-mt-en-de' MarianMT checkpoint
# (fetched on first use, as the download progress bars below show).
model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
# Bare last expression: the notebook prints the model's module tree.
model

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1335.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=297928209.0, style=ProgressStyle(descri…




MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(58101, 512, padding_idx=58100)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(58101, 512, padding_idx=58100)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0): MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
   

In [5]:
def main_mt(model_name='Helsinki-NLP/opus-mt-en-de',
            src_file='train.en',
            trg_file='train.de',
            output_dir='./experiment/mt',
            save_dir='./models/mt_marianmt/'):
    """Fine-tune a pretrained MarianMT model on a parallel corpus and save it.

    Called with no arguments this uses the same checkpoint, files and
    directories that were previously hard-coded.

    Args:
        model_name: checkpoint passed to ``from_pretrained`` for both the
            tokenizer and the model.
        src_file: source-language training file, one sentence per line.
        trg_file: target-language training file, line-aligned with
            ``src_file``.
        output_dir: directory handed to ``TrainingArguments`` for
            checkpoints written during training.
        save_dir: directory the final model is saved to.
    """

    ## QUESTION 5 ##

    mt_tokenizer = MarianTokenizer.from_pretrained(model_name)
    # Only the training set is needed here; the test set is loaded by the
    # evaluation cell, so it is no longer read (and required to exist) here.
    mt_dataset = Multi30K(mt_tokenizer, src_file, trg_file)

    model = MarianMTModel.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=0.00005,
        logging_steps=5000,
        save_steps=10000,
        num_train_epochs=1,
        per_device_train_batch_size=2
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=mt_dataset,
        # The dataset yields indices; its collate_fn looks up the sentences
        # and tokenizes them per batch.
        data_collator=mt_dataset.collate_fn
    )

    trainer.train()

    ## when you already trained your model and want to start from a checkpoint
    #trainer.train("./experiment/mt/checkpoint-40000")

    trainer.save_model(save_dir)

In [6]:
# Run the fine-tuning defined above: trains for one epoch and saves the
# resulting model under './models/mt_marianmt/'.
main_mt()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=768489.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=796845.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1273232.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=42.0, style=ProgressStyle(description_w…




    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 2. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Step,Training Loss


In [9]:
import sacrebleu

def evaluate_mt(model, mt_tokenizer, mt_test_dataset):
    """Translate every test sentence and return the mean per-sentence BLEU.

    Each source sentence is translated on its own with ``model.generate``,
    scored against its single reference with ``sacrebleu.corpus_bleu``, and
    the scores are averaged.

    NOTE(review): the result is the average of sentence-level scores, which
    is not the same number as one corpus-level BLEU computed over all
    hypotheses at once. Kept as is so existing results stay comparable.

    Args:
        model: model exposing ``eval()`` and ``generate(**batch)``.
        mt_tokenizer: tokenizer exposing ``prepare_seq2seq_batch`` and
            ``decode``.
        mt_test_dataset: object with aligned ``src_sents`` / ``trg_sents``
            lists of strings.

    Returns:
        The average of the per-sentence BLEU scores.
    """
    model.eval()

    # Inputs must live on the same device as the model's weights. Not every
    # model object has a `.device` attribute, hence the guarded lookup.
    device = getattr(model, "device", None)

    scores = []
    for src_text, ref_text in zip(mt_test_dataset.src_sents,
                                  mt_test_dataset.trg_sents):
        # The tokenizer expects a list of sentences; a bare string is not
        # the documented input and can be treated as a sequence of
        # characters by some tokenizer versions.
        batch = mt_tokenizer.prepare_seq2seq_batch([src_text], return_tensors="pt")
        if device is not None and hasattr(batch, "to"):
            batch = batch.to(device)

        translated = model.generate(**batch)
        hypotheses = [mt_tokenizer.decode(t, skip_special_tokens=True)
                      for t in translated]

        # References are a list of reference streams, each a list with one
        # entry per hypothesis: here one stream holding one sentence.
        score = sacrebleu.corpus_bleu(hypotheses, [[ref_text]], force=True).score
        scores.append(score)

    return np.average(np.asarray(scores))

In [10]:
# Reload the fine-tuned weights from the directory main_mt() saves to.
model = MarianMTModel.from_pretrained('./models/mt_marianmt/')

# The tokenizer is loaded from the original checkpoint name rather than
# from the saved-model directory.
mt_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
mt_test_dataset = Multi30K(mt_tokenizer, 'test.en', 'test.de')

# evaluate_mt uses numpy as `np`, so the `import numpy as np` cell must
# have been executed before this one.
bleu = evaluate_mt(model,mt_tokenizer, mt_test_dataset)

print(bleu)

NameError: name 'np' is not defined

In [12]:
# Inspect the test dataset object; Multi30K defines no __repr__, so the
# notebook shows the default object representation.
mt_test_dataset

<__main__.Multi30K at 0x7fe838894a90>