In [40]:
import datasets
import torch

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from transformers import BartTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

import warnings
warnings.filterwarnings("ignore")

## English semantic features data

In [2]:
english_features_raw = pd.read_excel('./final_words_2017.xlsx')
english_features = english_features_raw[['cue', 'translated']]
english_features = english_features.drop_duplicates()
english_features

Unnamed: 0,cue,translated
0,abandon,desert
1,abandon,give
2,abandon,leave
5,abandon,up
6,abandon,withdraw
...,...,...
69271,true,honest
69274,true,real
69277,true,right
69280,true,truth


In [21]:
english_features_raw

Unnamed: 0,where,cue,feature,translated,frequency_feature,frequency_translated,n,normalized_feature,normalized_translated,pos_cue,pos_feature,pos_translated,a1,a2,a3,FSG,BSG,word_list,school_code
0,b,abandon,desert,desert,9,9,60,15.000000,15.000000,verb,noun,noun,0,0,0,,,mturk,4.0
1,b,abandon,give,give,19,19,60,31.666667,31.666667,verb,verb,verb,0,0,0,,,mturk,4.0
2,b,abandon,leave,leave,26,32,60,43.333333,53.333333,verb,verb,verb,0,0,0,,,mturk,4.0
3,b,abandon,leaving,leave,1,32,60,1.666667,53.333333,verb,verb,verb,present_participle,0,0,,,mturk,4.0
4,b,abandon,left,leave,5,32,60,8.333333,53.333333,verb,adjective,verb,past_tense,0,0,,,mturk,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69279,b,true,rightly,right,1,21,60,1.666667,35.000000,adjective,adjective,adjective,characteristic,0,0,0.014,0.000,mturk,4.0
69280,b,true,truth,truth,10,10,60,16.666667,16.666667,adjective,noun,noun,0,0,0,,,mturk,4.0
69281,b,true,unfaithful,faith,1,13,60,1.666667,21.666667,adjective,adjective,noun,not,characteristic,0,0.014,0.000,mturk,4.0
69282,b,true,unreal,real,1,57,60,1.666667,95.000000,adjective,adjective,adjective,not,0,0,0.021,0.043,mturk,4.0


In [72]:
# split training set and validation set
split_ratio = 0.8
random_seed = 7 # R

training_set, validation_set = train_test_split(english_features, train_size=split_ratio, random_state=random_seed)

validation_set

Unnamed: 0,cue,translated
39338,mink_coat,black
22236,esteem,person
30944,hindsight,look
3474,aware,wake
5176,belt,fat
...,...,...
41123,neck,thin
62870,tiptoe,toe
33707,jealousy,covet
24802,flap,open


In [66]:
training_set = training_set.to_dict(orient='records')
validation_set = validation_set.to_dict(orient='records')

validation_set

[{'cue': 'mink_coat', 'translated': 'black'},
 {'cue': 'esteem', 'translated': 'person'},
 {'cue': 'hindsight', 'translated': 'look'},
 {'cue': 'aware', 'translated': 'wake'},
 {'cue': 'belt', 'translated': 'fat'},
 {'cue': 'advise', 'translated': 'guide'},
 {'cue': 'roof', 'translated': 'sun'},
 {'cue': 'cliff', 'translated': 'edge'},
 {'cue': 'swerve', 'translated': 'car'},
 {'cue': 'cavern', 'translated': 'ground'},
 {'cue': 'seagull', 'translated': 'fly'},
 {'cue': 'ox', 'translated': 'leg'},
 {'cue': 'cone', 'translated': 'ice'},
 {'cue': 'goldfish', 'translated': 'fin'},
 {'cue': 'north', 'translated': 'compass'},
 {'cue': 'dress', 'translated': 'cover'},
 {'cue': 'caress', 'translated': 'soft'},
 {'cue': 'vulture', 'translated': 'wing'},
 {'cue': 'malt', 'translated': 'chocolate'},
 {'cue': 'prom', 'translated': 'dress'},
 {'cue': 'build', 'translated': 'produce'},
 {'cue': 'challenge', 'translated': 'force'},
 {'cue': 'channel', 'translated': 'water'},
 {'cue': 'hurt', 'transla

## Tokenization: BART pre-trained model

In [5]:
bart = BartTokenizer.from_pretrained("facebook/bart-large")

token_test = bart.encode('Hello world!')
print(token_test)

string = bart.decode(token_test, skip_special_tokens=True)
print(string)

[0, 31414, 232, 328, 2]
Hello world!


In [15]:
batch_encode = bart(['WordNet', 'basic level'])
print(batch_encode)

def batchDecode(inputs, skip_special_tokens):
    batch_decode = []
    for item in inputs:
        decode = bart.decode(item, skip_special_tokens=skip_special_tokens)
        batch_decode.append(decode)
    return batch_decode

batch_decode = batchDecode(batch_encode['input_ids'], skip_special_tokens=True)
print(batch_decode)

{'input_ids': [[0, 44051, 15721, 2], [0, 42607, 672, 2]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1]]}
['WordNet', 'basic level']


In [100]:
def tokenize(record):
    norms = list(record['cue'])
    model_input = bart(norms)
    targets = list(record['translated'])
    model_input['target'] = bart(targets)['input_ids']
    return model_input

In [101]:
tokenize(training_set[:2])

{'input_ids': [[0, 30919, 48226, 2], [0, 34153, 2]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1]], 'target': [[0, 3998, 16231, 2], [0, 8292, 2]]}

In [102]:
batchDecode([[0, 30919, 48226, 2], [0, 3998, 16231, 2]], True)

['squirrel', 'climb']

In [116]:
tokenized_training = tokenize(training_set)
tokenized_validation = tokenize(validation_set)

tokenized_validation

{'input_ids': [[0, 119, 4291, 1215, 29582, 2], [0, 29704, 2], [0, 298, 2028, 32764, 2], [0, 24590, 2], [0, 24187, 2], [0, 28006, 1496, 2], [0, 1001, 1116, 2], [0, 3998, 4822, 2], [0, 4184, 14477, 2], [0, 3245, 12170, 2], [0, 1090, 1073, 5023, 2], [0, 4325, 2], [0, 33666, 2], [0, 16472, 9106, 2], [0, 25407, 2], [0, 36220, 2], [0, 3245, 5224, 2], [0, 705, 17898, 2], [0, 119, 3967, 2], [0, 12501, 2], [0, 23411, 2], [0, 25324, 20526, 2], [0, 27681, 2], [0, 298, 7363, 2], [0, 7210, 4405, 2], [0, 35349, 19471, 2], [0, 1073, 13802, 2], [0, 225, 17952, 2], [0, 22776, 2], [0, 28636, 2], [0, 2911, 415, 18198, 687, 2], [0, 757, 30771, 2], [0, 20378, 1792, 25364, 2], [0, 2871, 4468, 2], [0, 4308, 29, 2], [0, 17536, 2], [0, 16424, 2], [0, 438, 7100, 594, 2], [0, 22617, 2990, 2], [0, 571, 1073, 2], [0, 6622, 2], [0, 18116, 2], [0, 13523, 2], [0, 2463, 2], [0, 14785, 15394, 2], [0, 9996, 44610, 2], [0, 10868, 2], [0, 29582, 2], [0, 6406, 2], [0, 8645, 2], [0, 3153, 459, 2], [0, 6298, 11173, 2], [0, 3

In [117]:
class ForT5Dataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        input_ids = torch.tensor(self.inputs["input_ids"][index]).squeeze()
        target_ids = torch.tensor(self.targets["target"][index]).squeeze()

        return {"input_ids": input_ids, "labels": target_ids}

In [118]:
tokenized_training = ForT5Dataset(tokenized_training, tokenized_training)
tokenized_validation = ForT5Dataset(tokenized_validation, tokenized_validation)

In [119]:
tokenized_validation.__getitem__(25)

{'input_ids': tensor([    0, 35349, 19471,     2]),
 'labels': tensor([    0, 35349,     2])}

## Fine-tune the model

In [125]:
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")

batch_size = 8
args = Seq2SeqTrainingArguments(
    'BART-fine-tuned',
    evaluation_strategy = 'epoch',
    learning_rate = 2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=8,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=False
)

data_collator = DataCollatorForSeq2Seq(bart, model=model)

loading configuration file https://huggingface.co/facebook/bart-large/resolve/main/config.json from cache at /export/scratch2/home/haochen/.cache/huggingface/transformers/3f12fb71b844fcb7d591fdd4e55027da90d7b5dd6aa5430ad00ec6d76585f26c.bc22f15dc7ba074ee0a60bdd34c5f2fe3b6d746f89e765303376c51aff04e260
Model config BartConfig {
  "_name_or_path": "facebook/bart-large",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_bos_token_id"

In [136]:
metric = datasets.load_metric('sacrebleu')

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = batchDecode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, bart.pad_token_id)
    decoded_labels = batchDecode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != bart.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [87]:
tokenized_validation

{'input_ids': [[0, 119, 4291, 1215, 29582, 2], [0, 29704, 2], [0, 298, 2028, 32764, 2], [0, 24590, 2], [0, 24187, 2], [0, 28006, 1496, 2], [0, 1001, 1116, 2], [0, 3998, 4822, 2], [0, 4184, 14477, 2], [0, 3245, 12170, 2], [0, 1090, 1073, 5023, 2], [0, 4325, 2], [0, 33666, 2], [0, 16472, 9106, 2], [0, 25407, 2], [0, 36220, 2], [0, 3245, 5224, 2], [0, 705, 17898, 2], [0, 119, 3967, 2], [0, 12501, 2], [0, 23411, 2], [0, 25324, 20526, 2], [0, 27681, 2], [0, 298, 7363, 2], [0, 7210, 4405, 2], [0, 35349, 19471, 2], [0, 1073, 13802, 2], [0, 225, 17952, 2], [0, 22776, 2], [0, 28636, 2], [0, 2911, 415, 18198, 687, 2], [0, 757, 30771, 2], [0, 20378, 1792, 25364, 2], [0, 2871, 4468, 2], [0, 4308, 29, 2], [0, 17536, 2], [0, 16424, 2], [0, 438, 7100, 594, 2], [0, 22617, 2990, 2], [0, 571, 1073, 2], [0, 6622, 2], [0, 18116, 2], [0, 13523, 2], [0, 2463, 2], [0, 14785, 15394, 2], [0, 9996, 44610, 2], [0, 10868, 2], [0, 29582, 2], [0, 6406, 2], [0, 8645, 2], [0, 3153, 459, 2], [0, 6298, 11173, 2], [0, 3

In [137]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_training,
    eval_dataset=tokenized_validation,
    data_collator=data_collator,
    tokenizer=bart,
    compute_metrics=compute_metrics
)

In [138]:
trainer.train()

***** Running training *****
  Num examples = 3
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 8


Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 3
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=8, training_loss=0.8964371085166931, metrics={'train_runtime': 24.21, 'train_samples_per_second': 0.991, 'train_steps_per_second': 0.33, 'total_flos': 203166056448.0, 'train_loss': 0.8964371085166931, 'epoch': 8.0})