In [1]:
!pip install datasets==2.15.0
!pip install transformers[torch]
!pip install nltk 
!pip install accelerate -U
!pip install torch
!pip install sentencepiece
!pip install matplotlib
!pip install sacrebleu

Collecting datasets==2.15.0
  Obtaining dependency information for datasets==2.15.0 from https://files.pythonhosted.org/packages/e2/cf/db41e572d7ed958e8679018f8190438ef700aeb501b62da9e1eed9e4d69a/datasets-2.15.0-py3-none-any.whl.metadata
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets==2.15.0)
  Obtaining dependency information for pyarrow-hotfix from https://files.pythonhosted.org/packages/e4/f4/9ec2222f5f5f8ea04f66f184caafd991a39c8782e31f5b0266f101cb68ca/pyarrow_hotfix-0.6-py3-none-any.whl.metadata
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting fsspec[http]<=2023.10.0,>=2023.1.0 (from datasets==2.15.0)
  Obtaining dependency information for fsspec[http]<=2023.10.0,>=2023.1.0 from https://files.pythonhosted.org/packages/e8/f6/3eccfb530aac90ad1301c582da228e4763f19e719ac8200752a4841b0b2d/fsspec-2023.10.0-py3-none-any.whl.metadata
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Down

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import MT5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
from transformers import EarlyStoppingCallback
import sacrebleu



In [3]:
model_checkpoint = 'google/mt5-small'
# model_checkpoint = 'NepaliAI/mt5-small-finetuned-Nepali-Health-50k'

task = "NepaliAI/Nepali-HealthChat"

from datasets import load_dataset
raw_datasets = load_dataset(task)

splitted_datasets = raw_datasets['train'].train_test_split(test_size=0.1)

if model_checkpoint in ['google/mt5-small','google/mt5-base','NepaliAI/mt5-small-finetuned-Nepali-Health-50k']:
    prefix = "answer: "
else:
    prefix = ""
    
max_input_length = 512
max_target_length = 512 # base = 1024 (max)

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["Question"]]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    labels = tokenizer(text_target=examples["Answer"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
tokenized_datasets = splitted_datasets.map(preprocess_function, batched=True,remove_columns=["Question", "Answer"])
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Downloading readme:   0%|          | 0.00/105 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.78M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.87M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/49560 [00:00<?, ? examples/s]

Map:   0%|          | 0/5507 [00:00<?, ? examples/s]

In [4]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 49560
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5507
    })
})

In [5]:
#see for compute_metrics# https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization.py#L718
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    # rougeLSum expects newline after each sentence
#     preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
#     labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
     # Wrap each element in the decoded_labels list with another list
    decoded_labels = [[label] for sublist in decoded_labels for label in sublist]

    # Compute BLEU score using sacrebleu library
    bleu_score = sacrebleu.corpus_bleu(decoded_preds, decoded_labels).score

    result = {"bleu": bleu_score}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [6]:
model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint)
batch_size = 2
args = Seq2SeqTrainingArguments(
    "NFT",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    
#     evaluation_strategy = "steps",
#     save_strategy="steps",
    
#     eval_steps=200,#increase this to 600
#     save_steps=600,#1200
    
    learning_rate=2e-4,
    optim="adafactor",
    
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    
    weight_decay=0.01,
    gradient_accumulation_steps=8,
    
    save_total_limit=3,
    num_train_epochs=5,
    
    predict_with_generate=True,
    load_best_model_at_end=True,
    
    generation_max_length=128,#decrease this to 50 
    fp16=False,
    report_to="tensorboard",
)

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=compute_metrics if args.predict_with_generate else None,
)

In [None]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
0,2.2173,1.942202,52.9424,125.4801
2,1.9321,1.750227,40.3977,126.5613
4,1.8797,1.716162,26.1323,126.8117


In [None]:
model.save_pretrained("NFT")
tokenizer.save_pretrained("NFT")

In [None]:
# Load the trained model
model = MT5ForConditionalGeneration.from_pretrained("NFT")

# Load the tokenizer for generating new output
tokenizer = AutoTokenizer.from_pretrained("NFT")

In [None]:
input_text = "answer: म मेरो भावना र आफैले केहि चीजहरू मार्फत जाँदैछु। म भर्खरै सुत्छु र म कसरी बेकार छु र म यहाँ कसरी हुनुहुँदैन भनेर सोच्छु तर म केही गर्दैन। मैले कहिल्यै आत्महत्या गर्ने प्रयास गरेको छैन वा सोचेको छैन। म सधैं मेरो समस्याहरू समाधान गर्न चाहन्छु, तर म यसको वरिपरि कहिल्यै पुगिन। म कसरी सबैका लागि बेकार भएको मेरो भावनालाई परिवर्तन गर्न सक्छु?"
inputs = tokenizer(input_text,return_tensors='pt',max_length=256,truncation=True)
print(f'input_text: {input_text}')
print(f'tokenized_inputs: {inputs}')
generated_text = model.generate(**inputs,max_length=256,min_length=128,length_penalty=12.0,num_beams=20,top_p=0.95,top_k=90,do_sample=True,temperature=0.7,num_return_sequences=1,no_repeat_ngram_size=3)
# generated_text = model.generate(**inputs,max_length=128,min_length=90,length_penalty=12.0,num_beams=5,do_sample=True,num_return_sequences=1,no_repeat_ngram_size=4)

generated_text
generated_response = tokenizer.batch_decode(generated_text,skip_special_tokens=True)[0]
tokens = generated_response.split(" ")
filtered_tokens = [token for token in tokens if not token.startswith("<extra_id_")]
print(' '.join(filtered_tokens))

In [None]:
input_text = "answer: म मेरो छातीको बीचमा तीव्र दुखाइ महसुस गर्छु र मैले फ्याँक्नु पर्ने महसुस हुन्छ। यो लगभग 5 वा 6 पटक भएको छ, सामान्यतया मैले दुखाइ भएको ठाउँमा तताउने प्याड राखेको छु र केहि समय पछि यो हट्छ, तर यस पटक त्यस्तो छैन। के यो नराम्रो कुरा हो?"
inputs = tokenizer(input_text,return_tensors='pt',max_length=256,truncation=True)
print(f'input_text: {input_text}')
print(f'tokenized_inputs: {inputs}')
generated_text = model.generate(**inputs,max_length=256,min_length=128,length_penalty=4.0,num_beams=5,top_p=0.95,top_k=1500,do_sample=True,temperature=0.7,num_return_sequences=1,no_repeat_ngram_size=4)
# generated_text = model.generate(**inputs,max_length=128,min_length=90,length_penalty=12.0,num_beams=5,do_sample=True,num_return_sequences=1,no_repeat_ngram_size=4)

generated_text
generated_response = tokenizer.batch_decode(generated_text,skip_special_tokens=True)[0]
tokens = generated_response.split(" ")
filtered_tokens = [token for token in tokens if not token.startswith("<extra_id_")]
print(' '.join(filtered_tokens))

In [None]:
input_text = "answer: नमस्ते.....मलाई बलियो खोकी लागेको छ, केवल स्पष्ट सेतो फोम आउँदैछ, मेरो छाती जब म खोक्छु। मेरो नाक वा मेरो मुखबाट धेरै आवाज आउँछ, म निदाउन सक्दिन, किनभने। म नेटी बर्तन प्रयोग गरेर नियमित गर्छु। म पनि धेरै रिसाउँछु। म पाँच वर्षको लागि प्रिड्रिसन लिइरहेको छु। किनभने मलाई फोक्सोको समस्या छ ।"
inputs = tokenizer(input_text,return_tensors='pt',max_length=256,truncation=True)
print(f'input_text: {input_text}')
print(f'tokenized_inputs: {inputs}')
generated_text = model.generate(**inputs,max_length=256,min_length=128,length_penalty=4.0,num_beams=5,top_p=0.95,top_k=150,do_sample=True,temperature=0.7,num_return_sequences=1,no_repeat_ngram_size=6)
# generated_text = model.generate(**inputs,max_length=128,min_length=90,length_penalty=12.0,num_beams=5,do_sample=True,num_return_sequences=1,no_repeat_ngram_size=4)

generated_text
generated_response = tokenizer.batch_decode(generated_text,skip_special_tokens=True)[0]
tokens = generated_response.split(" ")
filtered_tokens = [token for token in tokens if not token.startswith("<extra_id_")]
print(' '.join(filtered_tokens))

In [None]:
# generation_hyperparameters = {
#     "max_length": 256,
#     "min_length": 128,
#     "length_penalty": 4.0,
#     "num_beams": 5,
#     "top_p": 0.95,
#     "top_k": 150,
#     "do_sample": True,
#     "temperature": 0.7,
#     "num_return_sequences": 1,
#     "no_repeat_ngram_size": 3,
# }
# metadata = {
#     "hyperparameters": generation_hyperparameters,
# }


# # Push the model to the Hugging Face Model Hub with metadata
# model.push_to_hub("NepaliAI/mt5-small-finetuned-Nepali-Health-50k", use_auth_token="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", commit_message="Fine-tuned model with generation hyperparameters", metadata=metadata)

# # Push the tokenizer to the Hugging Face Model Hub with metadata
# tokenizer.push_to_hub("NepaliAI/mt5-small-finetuned-Nepali-Health-50k", use_auth_token="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", commit_message="Fine-tuned tokenizer with generation hyperparameters", metadata=metadata)

In [None]:
results = trainer.evaluate()
metrics = results.metrics
keys = list(metrics.keys())
values = [metrics[key] for key in keys]
print(values)

In [None]:
results