In [165]:
import datasets
import torch

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from transformers import BartTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer

import warnings
warnings.filterwarnings("ignore")

In [136]:
# WordNet only needs to be downloaded once (or again to update it); uncomment the download line below when needed
import nltk
# nltk.download('wordnet', download_dir='./')
nltk.data.path.append('../corpora_size')
from nltk.corpus import wordnet as wn

# One to One Generation

## English semantic features data

In [166]:
# Load the raw norms spreadsheet, then keep each distinct (cue, translated) pair once.
english_features_raw = pd.read_excel('./final_words_2017.xlsx')
english_features = english_features_raw[['cue', 'translated']].drop_duplicates()
english_features

Unnamed: 0,cue,translated
0,abandon,desert
1,abandon,give
2,abandon,leave
5,abandon,up
6,abandon,withdraw
...,...,...
69271,true,honest
69274,true,real
69277,true,right
69280,true,truth


In [71]:
english_features_raw

Unnamed: 0,where,cue,feature,translated,frequency_feature,frequency_translated,n,normalized_feature,normalized_translated,pos_cue,pos_feature,pos_translated,a1,a2,a3,FSG,BSG,word_list,school_code
0,b,abandon,desert,desert,9,9,60,15.000000,15.000000,verb,noun,noun,0,0,0,,,mturk,4.0
1,b,abandon,give,give,19,19,60,31.666667,31.666667,verb,verb,verb,0,0,0,,,mturk,4.0
2,b,abandon,leave,leave,26,32,60,43.333333,53.333333,verb,verb,verb,0,0,0,,,mturk,4.0
3,b,abandon,leaving,leave,1,32,60,1.666667,53.333333,verb,verb,verb,present_participle,0,0,,,mturk,4.0
4,b,abandon,left,leave,5,32,60,8.333333,53.333333,verb,adjective,verb,past_tense,0,0,,,mturk,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69279,b,true,rightly,right,1,21,60,1.666667,35.000000,adjective,adjective,adjective,characteristic,0,0,0.014,0.000,mturk,4.0
69280,b,true,truth,truth,10,10,60,16.666667,16.666667,adjective,noun,noun,0,0,0,,,mturk,4.0
69281,b,true,unfaithful,faith,1,13,60,1.666667,21.666667,adjective,adjective,noun,not,characteristic,0,0.014,0.000,mturk,4.0
69282,b,true,unreal,real,1,57,60,1.666667,95.000000,adjective,adjective,adjective,not,0,0,0.021,0.043,mturk,4.0


In [72]:
# Split the (cue, translated) pairs into a training part and a validation part.
# train_size takes the 0.8 ratio and random_state takes the fixed seed.
split_ratio = 0.8
random_seed = 7

training_set, validation_set = train_test_split(
    english_features,
    train_size=split_ratio,
    random_state=random_seed,
)

validation_set

Unnamed: 0,cue,translated
39338,mink_coat,black
22236,esteem,person
30944,hindsight,look
3474,aware,wake
5176,belt,fat
...,...,...
41123,neck,thin
62870,tiptoe,toe
33707,jealousy,covet
24802,flap,open


In [73]:
# Convert both splits into lists of {'cue': ..., 'translated': ...} records.
# The names are rebound from DataFrame to list, so this cell cannot be
# re-run without first re-running the split cell above it.
training_set = training_set.to_dict(orient='records')
validation_set = validation_set.to_dict(orient='records')

validation_set

[{'cue': 'mink_coat', 'translated': 'black'},
 {'cue': 'esteem', 'translated': 'person'},
 {'cue': 'hindsight', 'translated': 'look'},
 {'cue': 'aware', 'translated': 'wake'},
 {'cue': 'belt', 'translated': 'fat'},
 {'cue': 'advise', 'translated': 'guide'},
 {'cue': 'roof', 'translated': 'sun'},
 {'cue': 'cliff', 'translated': 'edge'},
 {'cue': 'swerve', 'translated': 'car'},
 {'cue': 'cavern', 'translated': 'ground'},
 {'cue': 'seagull', 'translated': 'fly'},
 {'cue': 'ox', 'translated': 'leg'},
 {'cue': 'cone', 'translated': 'ice'},
 {'cue': 'goldfish', 'translated': 'fin'},
 {'cue': 'north', 'translated': 'compass'},
 {'cue': 'dress', 'translated': 'cover'},
 {'cue': 'caress', 'translated': 'soft'},
 {'cue': 'vulture', 'translated': 'wing'},
 {'cue': 'malt', 'translated': 'chocolate'},
 {'cue': 'prom', 'translated': 'dress'},
 {'cue': 'build', 'translated': 'produce'},
 {'cue': 'challenge', 'translated': 'force'},
 {'cue': 'channel', 'translated': 'water'},
 {'cue': 'hurt', 'transla

## Tokenization: BART pre-trained model

In [74]:
bart = BartTokenizer.from_pretrained("facebook/bart-large")

token_test = bart.encode('Hello world!')
print(token_test)

string = bart.decode(token_test, skip_special_tokens=True)
print(string)

loading file https://huggingface.co/facebook/bart-large/resolve/main/vocab.json from cache at /export/scratch2/home/haochen/.cache/huggingface/transformers/0d6fc8b2ef1860c1f8f0baff4b021e3426cc7d11b153f98e563b799603ee2f25.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05
loading file https://huggingface.co/facebook/bart-large/resolve/main/merges.txt from cache at /export/scratch2/home/haochen/.cache/huggingface/transformers/6e75e35f0bdd15870c98387e13b93a8e100237eb33ad99c36277a0562bd6d850.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/facebook/bart-large/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/facebook/bart-large/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/facebook/bart-large/resolve/main/tokenizer_config.json from cache at /export/scratch2/home/haochen/.cache/huggingface/transformers/1abf196c889c24daca2909359ca2090e5fcbfa21a9e

[0, 31414, 232, 328, 2]
Hello world!


In [75]:
batch_encode = bart(['WordNet', 'basic level prediction'])
print(batch_encode)

def batchDecode(inputs, skip_special_tokens):
    """Decode every token-id sequence in ``inputs`` back into text.

    Each item is handed individually to ``bart.decode`` (the notebook-level
    BART tokenizer) with ``skip_special_tokens`` forwarded unchanged.

    Returns:
        list: the decoded strings, in the same order as ``inputs``.
    """
    return [
        bart.decode(token_ids, skip_special_tokens=skip_special_tokens)
        for token_ids in inputs
    ]

batch_decode = batchDecode(batch_encode['input_ids'], skip_special_tokens=True)
print(batch_decode)

{'input_ids': [[0, 44051, 15721, 2], [0, 42607, 672, 16782, 2]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1, 1]]}
['WordNet', 'basic level prediction']


In [76]:
def tokenize(record):
    """Tokenize the cue and the feature text of every record with ``bart``.

    Args:
        record: iterable of mappings that each provide a 'cue' and a
            'translated' entry.

    Returns:
        dict: ``{'input_ids': [...], 'targets': [...]}`` where the i-th
        entries hold the token ids of the i-th record's cue and of its
        'translated' value respectively.
    """
    cue_ids = []
    target_ids = []
    for entry in record:
        cue_ids.append(bart(entry['cue'])['input_ids'])
        target_ids.append(bart(entry['translated'])['input_ids'])
    return {'input_ids': cue_ids, 'targets': target_ids}

In [77]:
training_set[1]

{'cue': 'secure', 'translated': 'lock'}

In [78]:
tokenize(training_set[:2])

{'input_ids': [[0, 30919, 48226, 2], [0, 34153, 2]],
 'targets': [[0, 3998, 16231, 2], [0, 8292, 2]]}

In [79]:
batchDecode([[0, 34153, 2], [0, 8292, 2]], True)

['secure', 'lock']

In [None]:
tokenized_training = tokenize(training_set)
tokenized_validation = tokenize(validation_set)

tokenized_validation

In [None]:
class ForT5Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenized inputs with tokenized targets.

    Despite the class name, this notebook feeds it BART token ids produced by
    ``tokenize``.

    Args:
        inputs: mapping whose 'input_ids' entry is a list of token-id lists.
        targets: mapping whose 'targets' entry is a list of token-id lists,
            aligned index-by-index with ``inputs['input_ids']``.
    """

    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        # Measure the list that __getitem__ actually reads labels from, so a
        # targets mapping holding only the 'targets' key works as well.
        return len(self.targets['targets'])

    def __getitem__(self, index):
        """Return ``{'input_ids': tensor, 'labels': tensor}`` for one example."""
        # squeeze() drops size-1 dimensions, e.g. if an entry is nested as [[...]].
        input_ids = torch.tensor(self.inputs['input_ids'][index]).squeeze()
        target_ids = torch.tensor(self.targets['targets'][index]).squeeze()

        return {'input_ids': input_ids, 'labels': target_ids}

In [None]:
tokenized_training_dataset = ForT5Dataset(tokenized_training, tokenized_training)
tokenized_validation_dataset = ForT5Dataset(tokenized_validation, tokenized_validation)

In [None]:
tokenized_training_dataset.__len__()

## Fine-tune the model

In [None]:
# Start from the pre-trained BART-large checkpoint for the one-to-one task.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")

batch_size = 8
args = Seq2SeqTrainingArguments(
    output_dir='BART-fine-tuned',
    evaluation_strategy='epoch',
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    save_total_limit=2,
    fp16=False,
    push_to_hub=False,
)

# The collator receives both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(bart, model=model)

In [None]:
metric = datasets.load_metric('sacrebleu')

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Returns:
        tuple: ``(preds, labels)`` where ``preds`` is a list of stripped
        strings and ``labels`` is a list of one-element lists, each holding
        one stripped label string.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute the BLEU score and mean generated length for an evaluation pass.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays; ``preds`` may
            also be a tuple, in which case its first element is used.

    Returns:
        dict: ``{'bleu': ..., 'gen_len': ...}``, both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 cannot be decoded, so map it to the pad token in the predictions
    # as well as in the labels before decoding.
    preds = np.where(preds != -100, preds, bart.pad_token_id)
    decoded_preds = batchDecode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, bart.pad_token_id)
    decoded_labels = batchDecode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-pad tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != bart.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
# Wire the model, arguments, datasets, collator and metric function together
# for the one-to-one task.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_training_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=bart,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
pred = trainer.predict(tokenized_validation_dataset)
pred.predictions.shape

## Test the model

In [None]:
predictions = pred.predictions

In [None]:
sample = 349
batchDecode([tokenized_validation['input_ids'][sample], pred.predictions[sample]], True)

In [None]:
def test_dataset(test_list):
    """Wrap raw cue strings in a ``ForT5Dataset``.

    ``tokenize`` needs a 'translated' value for every record, so each cue is
    given a placeholder: its 1-based position rendered as a string.

    Args:
        test_list: list of cue strings.

    Returns:
        ForT5Dataset: dataset built from the tokenized cues and placeholders.
    """
    cue_frame = pd.DataFrame(test_list, columns=['cue'])
    placeholder_targets = pd.Series(
        [str(position) for position in range(1, len(cue_frame) + 1)],
        name='translated',
    )
    records = pd.concat([cue_frame, placeholder_targets], axis=1).to_dict(orient='records')
    tokenized = tokenize(records)
    return ForT5Dataset(tokenized, tokenized)

In [None]:
# Decode one ad-hoc cue next to the one-to-one model's prediction for it.
test = ["Haochen Wang"]
testing_dataset = test_dataset(test)
sample = 0
batchDecode(
    [
        testing_dataset[sample]['input_ids'],
        trainer.predict(testing_dataset).predictions[sample],
    ],
    True,
)

# One to Some Generation

## Flatten the training data

In [None]:
def group_flatten(dataframe):
    """Collapse all features of each cue into one space-separated string.

    Args:
        dataframe: DataFrame with a 'cue' and a 'translated' column, one
            (cue, feature) pair per row. It is not modified.

    Returns:
        DataFrame with columns 'cue' and 'translated' and one row per
        distinct cue, in order of first appearance. 'translated' holds the
        cue's features in row order, each followed by a single space (so the
        string ends with a trailing space). Rows with a missing cue or
        feature are ignored.
    """
    pairs = dataframe.dropna(subset=['cue', 'translated'])

    # One pass over the rows; dict insertion order keeps the cues in order of
    # first appearance. Grouping on the cue itself (rather than label lookups
    # followed by de-duplicating the flattened text) keeps a cue with a single
    # feature as one whole word and keeps distinct cues that happen to share
    # the same feature string.
    features_by_cue = {}
    for cue, word in zip(pairs['cue'], pairs['translated']):
        features_by_cue.setdefault(cue, []).append(str(word))

    return pd.DataFrame({
        'cue': list(features_by_cue),
        'translated': [
            ''.join(word + ' ' for word in words)
            for words in features_by_cue.values()
        ],
    })

In [None]:
english_features_flatten = group_flatten(english_features)
english_features_flatten

## Produce the dataset

In [None]:
def produceDataset(dataset_raw, split_ratio=0.8, random_seed=7):
    """Split a (cue, translated) frame and wrap both parts as datasets.

    Args:
        dataset_raw: DataFrame with 'cue' and 'translated' columns.
        split_ratio: value passed to ``train_test_split`` as ``train_size``
            (default 0.8).
        random_seed: value passed to ``train_test_split`` as
            ``random_state`` (default 7).

    Returns:
        tuple: ``(training_dataset, validation_dataset)``, both
        ``ForT5Dataset`` instances built from the tokenized records.
    """
    training_frame, validation_frame = train_test_split(
        dataset_raw, train_size=split_ratio, random_state=random_seed
    )

    # tokenize() consumes lists of {'cue': ..., 'translated': ...} records.
    tokenized_training = tokenize(training_frame.to_dict(orient='records'))
    tokenized_validation = tokenize(validation_frame.to_dict(orient='records'))

    # Each tokenized dict carries both 'input_ids' and 'targets', so it is
    # passed as both the inputs and the targets of the dataset.
    tokenized_training_dataset = ForT5Dataset(tokenized_training, tokenized_training)
    tokenized_validation_dataset = ForT5Dataset(tokenized_validation, tokenized_validation)
    return tokenized_training_dataset, tokenized_validation_dataset

In [None]:
tokenized_flatten_training_dataset, tokenized_flatten_validation_dataset = produceDataset(english_features_flatten)
tokenized_flatten_validation_dataset.__len__()

## Fine-tune the model

In [None]:
# Reload the pre-trained checkpoint for the one-to-some (flattened) task;
# this rebinds the notebook-level name `model`.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large")

batch_size = 8
args_flatten = Seq2SeqTrainingArguments(
    output_dir='BART-fine-tuned_flatten',
    evaluation_strategy='epoch',
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    save_total_limit=2,
    fp16=False,
    push_to_hub=False,
)

# Rebuild the collator so it is tied to the freshly loaded model.
data_collator = DataCollatorForSeq2Seq(bart, model=model)

In [None]:
# Trainer for the flattened (one cue -> several features) datasets.
trainer_flatten = Seq2SeqTrainer(
    model=model,
    args=args_flatten,
    data_collator=data_collator,
    train_dataset=tokenized_flatten_training_dataset,
    eval_dataset=tokenized_flatten_validation_dataset,
    tokenizer=bart,
    compute_metrics=compute_metrics,
)

In [None]:
trainer_flatten.train()

## Test the model

In [104]:
# Decode one cue next to the flattened model's generated feature string.
test = ["adjustable_wrench"]
testing_dataset = test_dataset(test)
sample = 0
test_translation = batchDecode(
    [
        testing_dataset[sample]['input_ids'],
        trainer_flatten.predict(testing_dataset).predictions[sample],
    ],
    True,
)
test_translation

***** Running Prediction *****
  Num examples = 1
  Batch size = 8


['adjustable_wrench', 'adjust change change hold hold hold screw tool ']

In [107]:
translations = test_translation[1].split()
translations_set_len = len(set(translations))
translations_set_len

5

# Fit The Basic Level Dataset

In [112]:
# Read the pre-processed structural data (the "all agreed" data, per the
# original note) and keep only the Synsets/domain_x/norm columns, the base
# features and the vote target.
structral_data = pd.read_csv('../corpora_size/size_differential_features.csv', index_col=None)
base_feature = [
    'nrdirhypers_x',
    'nrhypos_x',
    'nrpartrels_normalised_x',
    'depthfromtopsynset_normalised_x',
    'glosslength_normalised_x',
    'minwordlength_x',
    'nroflemmas_x',
    'polyscore_max_x',
]
target = ['vote_x']
structral_data = structral_data[['Synsets', 'domain_x', 'norm'] + base_feature + target]
structral_data

Unnamed: 0,Synsets,domain_x,norm,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,polyscore_max_x,vote_x
0,Synset('adjustable_wrench.n.01'),tool,adjustable_wrench,1,7,0.0,1.012903,0.563173,17,2,1,nb
1,Synset('allen_wrench.n.01'),tool,allen_wrench,1,0,0.0,1.012903,0.391092,12,1,1,nb
2,Synset('alligator_wrench.n.01'),tool,alligator_wrench,1,0,0.0,1.012903,1.517437,16,1,1,nb
3,Synset('awl.n.01'),tool,awl,1,2,15.7,0.911613,0.985552,3,1,1,b
4,Synset('backsaw.n.01'),tool,backsaw,1,0,0.0,1.114194,1.110701,7,2,1,nb
...,...,...,...,...,...,...,...,...,...,...,...,...
834,Synset('ballet_skirt.n.01'),garm,ballet_skirt,1,0,0.0,0.947552,0.578283,4,2,2,nb
835,Synset('mess_jacket.n.01'),garm,mess_jacket,1,0,0.0,1.158120,1.652238,11,3,1,nb
836,Synset('long_johns.n.01'),garm,long_johns,1,0,0.0,1.052836,0.479149,10,1,1,nb
837,Synset('undies.n.01'),garm,undies,1,0,0.0,1.158120,0.280880,6,1,1,nb


In [128]:
def batchPredictDecode(dataframe):
    """Generate feature text for every value in the 'norm' column.

    The values are wrapped with ``test_dataset``, run through
    ``trainer_flatten.predict`` and decoded with special tokens skipped.

    Note:
        The decoded strings are written into ``dataframe`` itself as a new
        'raw_translation' column; the same (mutated) object is returned.
    """
    cue_list = list(dataframe['norm'])
    generated_ids = trainer_flatten.predict(test_dataset(cue_list)).predictions
    dataframe['raw_translation'] = batchDecode(generated_ids, True)
    return dataframe

In [130]:
# batchPredictDecode adds the 'raw_translation' column to the frame it is
# given and returns that same object, so translated_data and structral_data
# refer to one DataFrame after this call.
translated_data = batchPredictDecode(structral_data)
translated_data.to_csv('./translated_raw.csv', index=False)

translated_data

***** Running Prediction *****
  Num examples = 839
  Batch size = 8


Unnamed: 0,Synsets,domain_x,norm,nrdirhypers_x,nrhypos_x,nrpartrels_normalised_x,depthfromtopsynset_normalised_x,glosslength_normalised_x,minwordlength_x,nroflemmas_x,polyscore_max_x,vote_x,raw_translation
0,Synset('adjustable_wrench.n.01'),tool,adjustable_wrench,1,7,0.0,1.012903,0.563173,17,2,1,nb,adjust change change hold hold hold screw tool
1,Synset('allen_wrench.n.01'),tool,allen_wrench,1,0,0.0,1.012903,0.391092,12,1,1,nb,break hold hold hold metal screw tool
2,Synset('alligator_wrench.n.01'),tool,alligator_wrench,1,0,0.0,1.012903,1.517437,16,1,1,nb,animal claw claw claw chew chew chew eat teeth...
3,Synset('awl.n.01'),tool,awl,1,2,15.7,0.911613,0.985552,3,1,1,b,cloth cloth hand knit knit wool
4,Synset('backsaw.n.01'),tool,backsaw,1,0,0.0,1.114194,1.110701,7,2,1,nb,cut blade blade blade cut edge edge hand heave...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,Synset('ballet_skirt.n.01'),garm,ballet_skirt,1,0,0.0,0.947552,0.578283,4,2,2,nb,act ballet ballet ballet dance dance dance dre...
835,Synset('mess_jacket.n.01'),garm,mess_jacket,1,0,0.0,1.158120,1.652238,11,3,1,nb,clean cloth cloth cloth cover mess mess
836,Synset('long_johns.n.01'),garm,long_johns,1,0,0.0,1.052836,0.479149,10,1,1,nb,bend cloth cloth cloth cover cloth cloth comfo...
837,Synset('undies.n.01'),garm,undies,1,0,0.0,1.158120,0.280880,6,1,1,nb,cloth cloth cloth cover cloth cover comfort co...


In [154]:
def batchLemmaPredictDecode(row):
    """Generate feature text for every lemma of the synset named in ``row``.

    Args:
        row: mapping/Series whose 'Synsets' entry is the textual form
            ``Synset('<name>')``, e.g. ``Synset('adjustable_wrench.n.01')``.

    Returns:
        list: one decoded string per lemma name of that WordNet synset.
    """
    synset_repr = row['Synsets']
    # Drop the leading "Synset('" (8 characters) and the trailing "')" (2 characters).
    synset_name = synset_repr[8:-2]
    lemma_names = [lemma.name() for lemma in wn.synset(synset_name).lemmas()]
    generated_ids = trainer_flatten.predict(test_dataset(lemma_names)).predictions
    return batchDecode(generated_ids, True)

In [None]:
# Work on a copy so the lemma column is added to a separate frame.
# apply(..., axis=1) calls batchLemmaPredictDecode once per row, i.e. one
# trainer_flatten.predict call per row.
translated_lemmas_data = structral_data.copy()
translated_lemmas_data['raw_translation_lemmas'] = translated_lemmas_data.apply(batchLemmaPredictDecode, axis=1)

In [None]:
# NOTE(review): this writes to the same path as the earlier translated_data
# export and therefore replaces that file — confirm this is intended.
translated_lemmas_data.to_csv('./translated_raw.csv', index=False)