In [28]:
import datasets
import transformers
from datasets import load_dataset
from evaluate import load

In [29]:
train_dataset = load_dataset('json', data_files="data/naive_random.json", field="train",split="train")
val_dataset = load_dataset('json', data_files="data/naive_random.json", field="val",split="train")
test_dataset = load_dataset('json', data_files="data/naive_random.json", field="test",split="train")


In [30]:

test_dataset[0]

{'idx': 135430,
 'lengths_punctuation': [','],
 'number': 0,
 'clue': 'Achy shaking stopped by iodine, salt and kaolin',
 'soln': 'chinaclay',
 'across_or_down': '',
 'id': '',
 'creator': 'Arachne',
 'type': 'cryptic',
 'unique_clue_id': '',
 'orig_lengths': '5,4',
 'pos': [0, 0],
 'lengths': [5, 4],
 'dataset': '',
 'soln_with_spaces': 'china clay'}

In [31]:
## Some data statistics
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
unique_answers = np.unique(train_dataset['soln'])

unique_answers = pd.DataFrame(unique_answers)


unique_type = np.unique(train_dataset['type'])
print(f'Total number of unique types is: {len(unique_type)}')

print(f' total number of examples: {len(train_dataset)},    number of unique answers: {len(unique_answers)}')
# unique_answers.hist()

# histogram = []
# for x in unique_answers:
#     v = len(train_dataset.filter(lambda example: example["soln"] == x))
#     histogram.append([x, v])





Total number of unique types is: 1
 total number of examples: 85428,    number of unique answers: 42099


In [16]:
# Concat clue, with length. 
def concat_length(example):

    example["clue"] = f'{example["clue"]} ({example["orig_lengths"]})   '

    return example

In [17]:
train_dataset = train_dataset.map(concat_length)
val_dataset = val_dataset.map(concat_length)
test_dataset = test_dataset.map(concat_length)


In [18]:
train_dataset = train_dataset.select_columns(["clue", "soln", "soln_with_spaces"])
val_dataset = val_dataset.select_columns(["clue", "soln" , "soln_with_spaces"])
test_dataset = test_dataset.select_columns(["clue", "soln" , "soln_with_spaces"])


In [19]:
train_dataset[:2]

{'clue': ['Suffering to grasp edge of plant (8)   ',
  'Honour Ben and Noel with new order (7)   '],
 'soln': ['agrimony', 'ennoble'],
 'soln_with_spaces': ['agrimony', 'ennoble']}

In [20]:
model_checkpoint = "t5-base"
metric = load("rouge")
acc_metric = load("accuracy")


In [21]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
max_input_length = 1024
max_target_length = 32
def preprocess_function(examples):
    inputs = [doc for doc in examples["clue"]]
    model_inputs = tokenizer(inputs, padding= 'longest', truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["soln_with_spaces"], padding= 'longest', truncation=True)

    model_inputs["labels"] = labels["input_ids"]    
    return model_inputs

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [22]:
tokenized_train= train_dataset.map(preprocess_function, batched=True)
tokenized_val= val_dataset.map(preprocess_function, batched=True)
tokenized_test= test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/85428 [00:00<?, ? examples/s]

Map:   0%|          | 0/28476 [00:00<?, ? examples/s]

Map:   0%|          | 0/28476 [00:00<?, ? examples/s]

In [23]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /home/abdelrahman.sadallah/local/cuda-11.7/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.9
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/abdelrahman.sadallah/.conda/envs/nlp/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)


In [24]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Extract a few results
    result = {key: value * 100 for key, value in result.items()}
    

##############################################################################
    ## Calculating accuracy:

    flatten_pred = np.array(predictions).flatten()
    flatten_labels = labels.flatten()
    accuracy_result= acc_metric.compute(references=flatten_pred, predictions=flatten_labels)
    accuracy_result = {key: value * 100 for key, value in accuracy_result.items()}
    result.update(accuracy_result)

###############################################################################3


    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [25]:
train_batch_size = 256
val_batch_size = 128
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"experiments/{model_name}-finetuned-random",
    evaluation_strategy = "steps",
    save_strategy="steps",
    save_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=val_batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    load_best_model_at_end=True,
    metric_for_best_model=" ",
    report_to="tensorboard"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [26]:
trainer.train()

  0%|          | 0/3340 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.2037, 'learning_rate': 1.7077844311377248e-05, 'epoch': 1.5}




  0%|          | 0/223 [00:00<?, ?it/s]

{'eval_loss': 1.2065181732177734, 'eval_rouge1': 0.1868, 'eval_rouge2': 0.0097, 'eval_rougeL': 0.1892, 'eval_rougeLsum': 0.1877, 'eval_accuracy': 67.6942, 'eval_gen_len': 5.5279, 'eval_runtime': 55.0614, 'eval_samples_per_second': 517.168, 'eval_steps_per_second': 4.05, 'epoch': 1.5}
{'loss': 1.25, 'learning_rate': 1.4101796407185631e-05, 'epoch': 2.99}




  0%|          | 0/223 [00:00<?, ?it/s]

{'eval_loss': 1.1525862216949463, 'eval_rouge1': 0.3234, 'eval_rouge2': 0.0132, 'eval_rougeL': 0.3251, 'eval_rougeLsum': 0.3233, 'eval_accuracy': 66.7794, 'eval_gen_len': 5.7106, 'eval_runtime': 61.0898, 'eval_samples_per_second': 466.133, 'eval_steps_per_second': 3.65, 'epoch': 2.99}
{'loss': 1.2575, 'learning_rate': 1.1119760479041916e-05, 'epoch': 4.49}




  0%|          | 0/223 [00:00<?, ?it/s]

{'eval_loss': 1.1971290111541748, 'eval_rouge1': 0.4953, 'eval_rouge2': 0.0184, 'eval_rougeL': 0.4948, 'eval_rougeLsum': 0.4944, 'eval_accuracy': 68.1005, 'eval_gen_len': 5.4426, 'eval_runtime': 59.8522, 'eval_samples_per_second': 475.772, 'eval_steps_per_second': 3.726, 'epoch': 4.49}


KeyboardInterrupt: 

In [32]:
# test 

model_name = "experiments/t5-base-finetuned-random/checkpoint-1500"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [39]:
max_input_length = 512

idx = 0

c = 0
for idx in range(50):
    clue = train_dataset[idx]['clue'] + f' ({train_dataset[idx]["orig_lengths"]})'
    inputs = clue
    inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
    output = model.generate(**inputs, num_beams=10, do_sample=True)
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]
    label  = train_dataset[idx]['soln_with_spaces']

    print(f' Clue is {clue}, label is {label}, predictions is {predicted_title}')

    if label == predicted_title:
        c +=1

print(c*2) 




 Clue is Suffering to grasp edge of plant (8), label is agrimony, predictions is herbaceous
 Clue is Honour Ben and Noel with new order (7), label is ennoble, predictions is benevolent
 Clue is Bit the royal we love? Cheers! (4), label is iota, predictions is sneeze
 Clue is Chemist curtailed mixture to one with a blood deficiency (8), label is ischemia, predictions is rheumatologist
 Clue is Performer's part is temporary part (7), label is artiste, predictions is emcee
 Clue is Bearded old party leader sat for artist (7), label is opposed, predictions is artist
 Clue is License prepared to help mum (7), label is silence, predictions is nanny
 Clue is Young man alien kicked on the way out (6), label is bucket, predictions is adolescent
 Clue is Omar, crazy about end of Rubaiyat, beginning to dine with Fitzgerald on sausage (10), label is mortadella, predictions is iroquoise
 Clue is Resolved to be firm (7), label is decided, predictions is resolute
 Clue is Miser books sequel to unfini