In [3]:
import os
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import BertConfig, BertTokenizer, BertModel

# Pick the compute device: the first CUDA GPU when one is visible to PyTorch,
# otherwise the CPU. The choice is printed so it shows up in the cell output.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [4]:
# Load the multilingual BERT checkpoint and expose its input-embedding matrix
# (`weights`) and token -> id mapping (`vocab`) as notebook globals.
MODEL_NAME = "bert-base-multilingual-cased"

config = BertConfig.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)
# A tokenizer has no device; the former `device=device` keyword was only stored
# as a stray init kwarg, so it is no longer passed.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# `.cpu()` keeps the NumPy conversion valid even if the model is later moved to
# a GPU before this cell is re-run.
weights = model.get_input_embeddings().weight.detach().cpu().numpy()
vocab = tokenizer.get_vocab()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
import pickle
# Re-tokenise every text file in `base_data_dir` with the BERT tokenizer and
# write the resulting token strings (space separated, one input line per output
# line) to a file of the same name under `encoded_data_dir`.
base_data_dir = "./data/tokenized_data"
encoded_data_dir = "data/encoded_data/"
MAX_LENGTH = 16  # every line is truncated / padded to exactly this many tokens

# The output directory may not exist on a fresh checkout.
os.makedirs(encoded_data_dir, exist_ok=True)

for data_file in os.listdir(base_data_dir):
    source_path = os.path.join(base_data_dir, data_file)
    # Skip sub-directories (e.g. `.ipynb_checkpoints`), which cannot be opened as text.
    if not os.path.isfile(source_path):
        continue
    print("Process", data_file)

    # The corpora contain non-ASCII text, so do not depend on the platform
    # default encoding.
    with open(source_path, 'r', encoding='utf-8') as file:
        data = file.read().splitlines()

    if data:
        tokenized_input = tokenizer.batch_encode_plus(
            data,
            add_special_tokens=True,
            max_length=MAX_LENGTH,
            padding='max_length',
            truncation=True,
        )
        encoded_ids = tokenized_input['input_ids']
    else:
        # Nothing to encode for an empty file; still emit an (empty) output file.
        encoded_ids = []

    with open(os.path.join(encoded_data_dir, data_file), 'w', encoding='utf-8') as f:
        # Stream the lines out instead of growing one large string in memory.
        f.writelines(
            " ".join(tokenizer.convert_ids_to_tokens(ids)) + "\n"
            for ids in encoded_ids
        )

Process valid.fa
Process test.fa
Process valid.en
Process train.en
Process test.en
Process train.fa


In [5]:
# Export the BERT input embeddings as a plain-text embedding file:
# a "<num_words> <dim>" header followed by one "<token> v1 v2 ... vN" line per
# vocabulary entry.
word_embeddings = []  # not populated here; kept so later cells referencing it still work
EMBED_DIM = 512  # only the first EMBED_DIM components of each vector are exported

embed_dim = min(EMBED_DIM, weights.shape[1])
# The vocabulary is multilingual, so write UTF-8 explicitly rather than relying
# on the platform default encoding.
with open("./data/bert_weights.txt", 'w', encoding='utf-8') as f:
    # fairseq's embedding parser skips the first line as a header (see the
    # `utils.parse_embedding` reference); without a header the first vocabulary
    # entry would be silently dropped. NOTE(review): confirm for the installed
    # fairseq version.
    f.write(f"{len(vocab)} {embed_dim}\n")
    for word, index in vocab.items():
        # Slice before converting so only the exported components become Python floats.
        values = weights[index, :embed_dim].tolist()
        f.write(word + " " + " ".join(map(str, values)) + "\n")

In [12]:
import torch
import torch.nn as nn
from fairseq import utils
from fairseq.models import FairseqEncoder
from fairseq.models import FairseqDecoder
from fairseq.models import FairseqEncoderDecoderModel, register_model
from fairseq.models.lstm import LSTMModel

In [13]:
def _parse_bool_flag(value):
    """Convert a command-line value such as ``"false"`` or ``"1"`` into a bool.

    ``argparse``'s ``type=bool`` treats every non-empty string (including
    ``"False"``) as ``True``; this parser interprets the text instead.

    Raises:
        ValueError: if the text is not a recognised boolean spelling (argparse
            reports this as an invalid argument value).
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0"):
        return False
    raise ValueError(f"expected a boolean value, got {value!r}")


@register_model('simple_lstm')
class SimpleLSTMModel(LSTMModel):
    """``LSTMModel`` variant whose encoder and decoder parameters can be frozen.

    Registered with fairseq under the model name ``simple_lstm``. Note that
    re-running this cell in the same kernel registers the name a second time.
    """

    @staticmethod
    def add_args(parser):
        """Add the parent model's options plus ``--freeze`` to ``parser``."""
        # Zero-argument super() does not work inside a staticmethod (there is
        # no implicit first argument), so call the parent class explicitly.
        LSTMModel.add_args(parser)
        parser.add_argument('--freeze', type=_parse_bool_flag, default=True,
                            help='freeze layers')

    @classmethod
    def build_model(cls, args, task):
        """Build the model and, when ``args.freeze`` is set, freeze its weights.

        Args:
            args: parsed options; ``args.freeze`` controls freezing.
            task: the fairseq task, forwarded unchanged to the parent builder.

        Returns:
            The model instance created by the parent ``build_model``.
        """
        # super() already binds `cls` in a classmethod; passing it again would
        # shift every argument by one position.
        model = super().build_model(args, task)
        if args.freeze:
            # NOTE(review): this disables gradients for every encoder and
            # decoder parameter -- confirm that leaving nothing trainable in
            # these modules is intended.
            for module in (model.encoder, model.decoder):
                for p in module.parameters():
                    p.requires_grad = False
        # The builder must hand the constructed model back to its caller.
        return model


ValueError: Cannot register duplicate model (simple_lstm)

In [14]:
from fairseq.models import register_model_architecture

@register_model_architecture('simple_lstm', 'tutorial_simple_lstm')
def tutorial_simple_lstm(args):
    """Fill in default hyper-parameters for the ``tutorial_simple_lstm`` architecture.

    Args:
        args: option namespace, updated in place; values already present are
            left untouched.
    """
    # The model reads `args.freeze` (the `--freeze` option), so the default has
    # to be set under that name; True mirrors the option's declared default.
    args.freeze = getattr(args, "freeze", True)
    # Legacy attribute name, kept unchanged in case other code still reads it.
    args.freeze_layers = getattr(args, "freeze_layers", False)

ValueError: Cannot register duplicate model architecture (tutorial_simple_lstm)

In [2]:
# Train a translation model on the binarised data in ./data/data_bin_bert/ with
# the embedding file ./data/bert_weights.txt supplied for both the encoder and
# the decoder (--encoder-embed-path / --decoder-embed-path) and all embeddings
# shared. BLEU is evaluated during training and selects the best checkpoint;
# training stops after 5 epochs.
# NOTE(review): this uses `--arch lstm`, not the `tutorial_simple_lstm`
# architecture registered earlier in this notebook -- confirm that is intended.
# NOTE(review): --save-dir / --tensorboard-logdir are identical to the run
# without the embedding paths further down; confirm the two runs cannot pick up
# or overwrite each other's checkpoints and logs.
!fairseq-train \
    "./data/data_bin_bert/" \
    --arch lstm --share-decoder-input-output-embed \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 0.0025 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.25 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
    --max-tokens 4096 \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok moses \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
    --fp16 --memory-efficient-fp16 \
    --max-epoch 5 \
    --encoder-embed-path ./data/bert_weights.txt \
    --decoder-embed-path ./data/bert_weights.txt \
    --share-all-embeddings \
    --save-dir ./data/data_bin_bert/checkpoints/ \
    --tensorboard-logdir ./data/data_bin_bert/logs

2023-06-02 00:11:00 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': './data/data_bin_bert/logs', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'comm

In [3]:
# Same training command as the run that passes ./data/bert_weights.txt, but
# without --encoder-embed-path, --decoder-embed-path and --share-all-embeddings,
# i.e. no embedding file is supplied. Optimiser, schedule, BLEU evaluation and
# the 5-epoch limit are unchanged.
# NOTE(review): --save-dir and --tensorboard-logdir point at the same
# directories as that other run; confirm the checkpoints directory is cleared
# (or changed) between the two runs so they stay independent.
!fairseq-train \
    "./data/data_bin_bert/" \
    --arch lstm --share-decoder-input-output-embed \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 0.0025 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.25 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
    --max-tokens 4096 \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok moses \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
    --fp16 --memory-efficient-fp16 \
    --max-epoch 5 \
    --save-dir ./data/data_bin_bert/checkpoints/ \
    --tensorboard-logdir ./data/data_bin_bert/logs

2023-06-02 00:15:30 | INFO | fairseq_cli.train | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 100, 'log_format': None, 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': './data/data_bin_bert/logs', 'wandb_project': None, 'azureml_logging': False, 'seed': 1, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': True, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'on_cpu_convert_precision': False, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'amp': False, 'amp_batch_retries': 2, 'amp_init_scale': 128, 'amp_scale_window': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': False, 'suppress_crashes': False, 'use_plasma_view': False, 'plasma_path': '/tmp/plasma'}, 'comm