In [None]:
# get packages inatalled and ready to use
! pip install datasets transformers accelerate
! pip install evaluate
import evaluate
import transformers

In [None]:
# Log in to the Hugging Face Hub. The credentials are used later in this
# notebook, where the trainer is created with push_to_hub=True and
# trainer.push_to_hub() is called.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Pick the GLUE task to fine-tune on; it must be one of GLUE_TASKS.
GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]
task = "cola"
# Fail fast with a clear message on a mistyped task name: the dataset, the
# metric and the text-column lookup further down are all keyed off `task`.
if task not in GLUE_TASKS:
    raise ValueError(f"Unknown GLUE task {task!r}; expected one of {GLUE_TASKS}")

# Select the pretrained checkpoint and the per-device batch size.
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

# "mnli-mm" is loaded through the "mnli" configuration; it only differs later
# on, where the validation_mismatched split is chosen for evaluation.
import datasets
actual_task = "mnli" if task == "mnli-mm" else task

# Load the GLUE dataset for the selected task.
dataset = datasets.load_dataset("glue", actual_task)

In [None]:
# Load the GLUE metric for the selected task. actual_task is used (not task)
# so that "mnli-mm" resolves to the "mnli" metric configuration.
metric = evaluate.load('glue', actual_task)

In [None]:
#get things needed to view datasets
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

#view datasets to ensure it's successfully downloaded
def show_random_elements(dataset, num_examples=10):
    """Display a random sample of distinct rows from ``dataset`` as an HTML table.

    Columns whose feature type is a ``datasets.ClassLabel`` are shown with
    their label names instead of the integer ids.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature type.
        num_examples: Number of distinct rows to show (default 10).

    Raises:
        AssertionError: If ``num_examples`` exceeds the number of rows.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws the distinct indices in one call, replacing the
    # manual draw-and-retry loop with its repeated list membership checks.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Bind the names up front so the mapping does not rely on the
            # loop variable through a closure.
            label_names = typ.names
            df[column] = df[column].map(lambda i: label_names[i])
    display(HTML(df.to_html()))
# Preview random training rows (the function's default num_examples is used).
show_random_elements(dataset["train"])

In [None]:
# Sanity-check the metric: score 64 random 0/1 predictions against 64 random
# 0/1 labels. The draws are unseeded, so the score changes on every run; the
# cell is only meant to show that metric.compute works.
import numpy as np

fake_preds = np.random.randint(0, 2, size=(64,))
fake_labels = np.random.randint(0, 2, size=(64,))
metric.compute(predictions=fake_preds, references=fake_labels)

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
# use_fast=True requests the fast tokenizer implementation.
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [None]:
# Example: tokenize a pair of sentences in a single call.
tokenizer("Hello, this one sentence!", "And this sentence goes with it.")

In [None]:
# Map each GLUE task to the dataset column(s) that hold its text.
# The second entry is None for tasks with a single sentence per example.
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}
# Column names for the selected task.
sentence1_key, sentence2_key = task_to_keys[task]
# Print the text of the first training example as a quick look at the raw data.
if sentence2_key is None:
    print(f"Sentence: {dataset['train'][0][sentence1_key]}")
else:
    print(f"Sentence 1: {dataset['train'][0][sentence1_key]}")
    print(f"Sentence 2: {dataset['train'][0][sentence2_key]}")
    
def preprocess_function(examples):
    """Tokenize a batch of examples for the selected task.

    Reads the text column named by ``sentence1_key`` and, when it is not
    None, the one named by ``sentence2_key`` from ``examples`` and passes
    them to the notebook-level ``tokenizer`` with ``truncation=True``.

    Args:
        examples: Mapping from column name to the values of that column.

    Returns:
        Whatever ``tokenizer`` returns for the given text column(s).
    """
    text_columns = [examples[sentence1_key]]
    # Two-sentence tasks pass the second column as the paired text.
    if sentence2_key is not None:
        text_columns.append(examples[sentence2_key])
    return tokenizer(*text_columns, truncation=True)
# Try the preprocessing on the first five training examples.
preprocess_function(dataset['train'][:5])

In [None]:
# Apply preprocess_function to the whole dataset; batched=True hands it
# batches of examples instead of one example at a time.
encoded_dataset = dataset.map(preprocess_function, batched=True)
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Number of model outputs: 3 for the mnli variants, 1 for stsb, 2 for all other tasks.
num_labels = 3 if task.startswith("mnli") else 1 if task=="stsb" else 2

In [None]:
# Load the pretrained checkpoint as a sequence-classification model with
# num_labels outputs.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

In [None]:
# Metric used to pick the best checkpoint: pearson for stsb,
# matthews_correlation for cola, accuracy for every other task.
metric_name = "pearson" if task == "stsb" else "matthews_correlation" if task == "cola" else "accuracy"
# Last path component of the checkpoint id, used to name the training output.
model_name = model_checkpoint.split("/")[-1]

# Training configuration: output name, per-epoch evaluation and checkpoint
# saving, learning rate, batch sizes, number of epochs, weight decay,
# reloading the best checkpoint at the end, and pushing to the Hub.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; the install at the top is unpinned, so confirm the keyword
# against the installed version.
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=True,
)
def compute_metrics(eval_pred):
    """Turn raw model outputs into task predictions and score them.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds the
            raw model outputs, one row per example.

    Returns:
        The result of ``metric.compute`` for the derived predictions and the
        given labels.
    """
    raw_outputs, references = eval_pred
    if task == "stsb":
        # stsb uses the first output column as the prediction as-is.
        task_predictions = raw_outputs[:, 0]
    else:
        # Every other task predicts the index of the largest output per row.
        task_predictions = np.argmax(raw_outputs, axis=1)
    return metric.compute(predictions=task_predictions, references=references)
# Pick the evaluation split: mnli-mm uses validation_mismatched, mnli uses
# validation_matched, every other task uses validation.
validation_key = "validation_mismatched" if task == "mnli-mm" else "validation_matched" if task == "mnli" else "validation"

# Assemble the Trainer from the model, the training arguments, the tokenized
# train and evaluation splits, the tokenizer and the metric function.
# NOTE(review): recent transformers releases appear to deprecate the
# `tokenizer` argument in favour of `processing_class` - confirm against the
# installed version.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run the fine-tuning.
trainer.train()

In [None]:
# Evaluate on the evaluation split; which metric is reported depends on the
# task (matthews_correlation for cola, pearson for stsb, accuracy otherwise).
trainer.evaluate()

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub.
trainer.push_to_hub()