<a href="https://colab.research.google.com/github/wandb/examples/blob/master/lightning/projects/yahoo-answers-classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src="https://i.imgur.com/vnejHGh.png" width="800">
<!--- @wandbcode{huggingface_tables} -->

# Yahoo! Answers Classification

## Installation and set-up

In [None]:
%%capture
!pip install transformers datasets wandb

In [None]:
from datasets import load_dataset
from datasets import load_metric
import numpy as np
import wandb

# Log in to Weights & Biases up front, before the training cell creates a run.
wandb.login()

## Data preparation

We'll download the
[Yahoo! Answers Dataset](https://paperswithcode.com/sota/text-classification-on-yahoo-answers),
which includes questions and answers from the now-defunct
[Yahoo! Answers forum](https://en.wikipedia.org/wiki/Yahoo!_Answers).


In [None]:
# Load the dataset by name; the result is indexed by split ("train", "test") in the cells below.
dataset = load_dataset("yahoo_answers_topics")
# Show the first training record so the available fields are visible.
dataset["train"][0]

We'll keep only the first tenth of the test split so that logging validation data is faster.

In [None]:
# Keep only the leading tenth of the test split (integer division on the row count).
full_test_split = dataset['test']
rows_to_keep = range(full_test_split.num_rows // 10)
dataset['test'] = full_test_split.select(rows_to_keep)

The task is to predict the category of the question, AKA its `topic`.

In [None]:
# Rename the target column from `topic` to `labels`, the name the later cells use.
# The guard keeps this cell re-runnable: after the first execution there is no
# `topic` column left, so an unguarded rename/lookup would fail on a second run.
if 'topic' in dataset['train'].features:
    dataset = dataset.rename_column('topic', 'labels')

# Distinct class ids found in the training split, and how many there are.
label_list = dataset['train'].unique('labels')
num_labels = len(label_list)

## Training the model and logging to W&B


This cell sets up logging of validation data during training,
so that we can see model outputs and not just loss and accuracy.

In [None]:
from wandb.sdk.integration_utils.data_logging import ValidationDataLogger

# automatically log model to W&B at the end
%env WANDB_LOG_MODEL=true

accuracy_metric = load_metric("accuracy")

validation_targets = [dataset['test'].features['labels'].int2str(x) for x in dataset['test']['labels']]
validation_logger = ValidationDataLogger(inputs=dataset["test"][:],
                                         targets=validation_targets)

def compute_metrics(eval_pred):
    """Log the predicted label names and return the accuracy metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores and is reduced with ``argmax`` along axis 1.

    Returns:
        Whatever ``accuracy_metric.compute`` returns for the predicted class
        ids against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    # map class ids (0, 1, 2, ...) to their label names (Health, Science…)
    label_feature = dataset['test'].features['labels']
    predicted_names = []
    for class_id in predicted_ids:
        predicted_names.append(label_feature.int2str(class_id.item()))

    # send the readable predictions to the validation logger
    validation_logger.log_predictions(predicted_names)

    # the metric object scores the integer ids, not the names
    return accuracy_metric.compute(predictions=predicted_ids, references=references)

In [None]:
###
# TrainingArguments -- configure training and logging
###

from transformers import Trainer, TrainingArguments

import os

# Start the W&B run that training reports to. The entity defaults to "wandb" as
# before, but can be redirected by setting WANDB_ENTITY before running this cell
# (e.g. `%env WANDB_ENTITY=my-team`), since an explicit `entity=` argument would
# otherwise always win over the environment.
run = wandb.init(
    entity=os.environ.get("WANDB_ENTITY", "wandb"),
    project="yahoo-answers-topics-transformers",
)

args = TrainingArguments(  # docs: https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
    report_to='wandb',                    # enable logging to W&B
    output_dir='topic_classification',    # set output directory
    overwrite_output_dir=True,            # is it okay to overwrite files there?
    evaluation_strategy='steps',          # check evaluation metrics on a given # of steps
    dataloader_num_workers=2,             # additional dataloading processes
    logging_steps=50,                     # how often do we log?
    logging_first_step=True,              # do we log at the start of training?
    eval_steps=50,                        # how often do we run evaluation?
    eval_accumulation_steps=1,            # prediction steps to accumulate on device before moving outputs to CPU
    load_best_model_at_end=True,          # do we reload the best checkpoint when training ends?
    metric_for_best_model='accuracy',     # how do we judge the best model?
    # hyperparameters
    max_steps=500,                        # how long should we train for?
    per_device_train_batch_size=8,        # batch size, increase to saturate GPU memory
    learning_rate=5e-6,                   # optimizer learning rate, increase when you increase batch_size
    weight_decay=0.,                      # weight decay regularization penalty
    adam_epsilon=1e-8,                    # epsilon hyperparameter for Adam
    adafactor=False,                      # use AdaFactor instead of AdamW
    max_grad_norm=1.,                     # gradient clipping maximum
    lr_scheduler_type="linear",           # learning rate adjustment schedule
)

###
# AutoModel and AutoTokenizer -- configure model
###

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

# model options here:
#  https://huggingface.co/transformers/pretrained_models.html

# try these: "bert-base-uncased", "squeezebert/squeezebert-mnli-headless",
#            "distilbert-base-uncased", "xlnet-base-cased", "distilroberta-base"
#            "allenai/longformer-base-4096",

# Store the checkpoint name in the run config, then read it back from the config
# so the model and tokenizer are built from the recorded value.
wandb.config["model_string"] = "distilbert-base-uncased"
checkpoint_name = wandb.config.model_string

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=num_labels,
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)


def tokenize_titles(batch):
    """Tokenize the `question_title` field of a batch with truncation enabled."""
    return tokenizer(batch['question_title'], truncation=True)


# Apply the tokenizer to every split, one batch at a time.
tokenized_dataset = dataset.map(tokenize_titles, batched=True)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

###
# 🏃‍♀️ Run Training 🏃‍♂️
###

try:
    trainer.train()
except BaseException:
    # Training failed or was interrupted: still close the W&B run (marked with a
    # non-zero exit code) so it is not left open in the kernel, then re-raise.
    run.finish(exit_code=1)
    raise
else:
    run.finish()