<a href="https://colab.research.google.com/github/2dot71mily/youtube_captions_corrections/blob/main/notebooks/youtube_captions_multi_classification_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install git+https://github.com/huggingface/transformers.git

!pip install datasets
!pip3 install wandb


In [2]:
import transformers
import datasets
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import load_dataset

from collections import Counter

import numpy as np
import torch

import wandb
import pdb

In [3]:
# Paste your wandb API key in below 
!wandb login --relogin

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
%env WANDB_PROJECT=ytmc_temp
%env TASK_NAME=test
%env WANDB_WATCH='all'

env: WANDB_PROJECT=ytmc_temp
env: TASK_NAME=test
env: WANDB_WATCH='all'


In [5]:
# Dataset parameters (https://huggingface.co/datasets/youtube_caption_corrections)
IDX_COL_NAME = 'video_ids'
LABEL_COL_NAME = 'diff_type'

CLASSES_BINARY = ["NO_DIFF", "DIFF"]
CLASSES_MULTI_WEIGHT_MAP = {
    "NO_DIFF":1,
    "CASE_DIFF":1,
    "PUNCUATION_DIFF":1,
    "CASE_AND_PUNCUATION_DIFF":1,
    "STEM_BASED_DIFF":1,
    "DIGIT_DIFF":1,
    "INTRAWORD_PUNC_DIFF":1,
    "UNKNOWN_TYPE_DIFF":10,
}
CLASSES_MULTI = list(CLASSES_MULTI_WEIGHT_MAP.keys())

In [40]:
# Training / model hyper-parameters
TESTING = False
CLASSIFICATION_TYPE = 'multi' # from ['binary', 'multi']
ADD_MANUAL_WEIGHT = False

BATCH_SIZE = 32
LR=3e-5
EPOCHS = 4
MAX_SEGMENT_SIZE = 512
N_SPECIAL_TOKENS = 2
LR_SCHEDULER = 'constant' #  from ['constant', 'cosine', 'linear']
WEIGHT_DECAY = 0.1

MANUAL_WEIGHTS = list(CLASSES_MULTI_WEIGHT_MAP.values()) if ADD_MANUAL_WEIGHT else []
TRAINING_NAME = \
    f"LR{LR}{LR_SCHEDULER}_BS{BATCH_SIZE}_seg{MAX_SEGMENT_SIZE}_WD{WEIGHT_DECAY}_E{EPOCHS}"

CLASSES = CLASSES_MULTI if CLASSIFICATION_TYPE == 'multi' else CLASSES_BINARY

MODEL_CHECKPOINT = "distilbert-base-uncased"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
def get_segment(raw_data, max_size):

    def segment(feature):
        segments = []
        for row in feature:
            segments += [row[i : i + max_size] for i in range(0, len(row), max_size)]
            if len(feature) % max_size != 0:  # drop if last sub-segment is smaller
              segments = segments[:-1]
        return segments

    segmented_data = {}
    for feature in raw_data:
        if feature != IDX_COL_NAME: 
            segmented_data[feature] = segment(raw_data[feature])
    return segmented_data


def tokenize_and_align_labels(batch, tokenizer):

    # Label and value alignment as some tokens get broken down in to sub-words
    def align_values(feature):

        aligned_segments = []
        for i, value in enumerate(batch[feature]):
            # Get new indexes after some tokens get broken down in to sub-words
            word_ids = tokenized_seq.word_ids(batch_index=i)

            if feature == LABEL_COL_NAME:
                if num_classes == 2:
                    # Change labels into binary if we only want two classes
                    aligned_values = [
                        -100 if word_idx is None else 1 if value[word_idx] > 0 else 0
                        for word_idx in word_ids
                    ]
                else:
                    aligned_values = [
                        -100 if word_idx is None else value[word_idx]
                        for word_idx in word_ids
                    ]
            # String sequences get string special token
            else:
                aligned_values = [
                    "##SPECIAL" if word_idx is None else value[word_idx]
                    for word_idx in word_ids
                ]
            aligned_segments.append(aligned_values)
        return aligned_segments

    # tokenizer returns dict with keys: input_ids, attention_mask
    tokenized_seq = tokenizer(
        batch["default_seq"], truncation=True, is_split_into_words=True
    )

    num_classes = len(CLASSES)
    for feature in batch:
        tokenized_seq[feature] = align_values(feature)

    tokenized_seq["labels"] = tokenized_seq[LABEL_COL_NAME]
    return tokenized_seq



In [8]:
def compute_metrics(p):
    predictions, labels = p

    labels = labels.flatten()
    mask = labels != -100
    labels = labels[mask]

    predictions = np.argmax(predictions, axis=2)
    predictions = predictions.flatten()
    predictions = predictions[mask]

    metrics = {}
    precision = []
    recall = []
    for class_idx, class_name in enumerate(CLASSES):
        precision.append(get_class_precision(predictions, labels, class_idx))
        metrics[f"{class_name}_precision"] = precision[class_idx]
        recall.append(get_class_recall(predictions, labels, class_idx))
        metrics[f"{class_name}_recall"] = recall[class_idx]

    class_weights = get_weights(
        torch.tensor(labels), manual_weights=MANUAL_WEIGHTS, device='cpu'
    )

    weights = [class_weights[l].item() for l in labels if l != -100]
    weighted_true_positives = [
        int(p == l) * w 
        for (p, l, w) in zip(predictions, labels, weights) 
        if l != -100
    ]

    metrics["weighted_accuracy"] = (
        np.array(weighted_true_positives).sum() / np.array(weights).sum()
    )
    metrics["weighted_precision"] = np.array(
        [c_pr * c_w for c_pr, c_w in zip(precision, class_weights)]
    ).sum()
    metrics["weighted_recall"] = np.array(
        [c_re * c_w for c_re, c_w in zip(recall, class_weights)]
    ).sum()

    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(
          probs=None,
          preds=predictions, y_true=labels,
          class_names=CLASSES)
    })
    return metrics

In [9]:
def get_class_precision(predictions, labels, class_type):
    class_fp = [
        int(p == class_type and l != class_type)
        for (p, l) in zip(predictions, labels)
        if l != 100
    ]
    tp_sum = get_class_true_positives(predictions, labels, class_type)
    return tp_sum / (tp_sum + np.array(class_fp).sum())


def get_class_recall(predictions, labels, class_type):
    class_fn = [
        int(p != class_type and l == class_type)
        for (p, l) in zip(predictions, labels)
        if l != 100
    ]
    tp_sum = get_class_true_positives(predictions, labels, class_type)
    return tp_sum / (tp_sum + np.array(class_fn).sum())


def get_class_true_positives(predictions, labels, class_type):
    class_tp = [
        int(p == class_type and l == class_type)
        for (p, l) in zip(predictions, labels)
        if l != 100
    ]
    return np.array(class_tp).sum()


def get_weights(labels, manual_weights=[], device='cpu'):
    class_cnts = torch.bincount(
        labels[labels != -100], minlength=len(CLASSES)
    ).float()
    weights = torch.where(class_cnts > 0, 1 / class_cnts, class_cnts)
    if manual_weights:
        weights = weights * \
            torch.tensor(manual_weights, device=torch.device(device)).float()
    sum = weights.sum(dtype=torch.float)
    final_weights = weights / sum
    return torch.where(weights > 0, final_weights, weights)


In [10]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.weighted_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

    def weighted_loss(self, logits, labels, num_labels=len(CLASSES)):
        class_weights = get_weights(
            labels.view(-1), manual_weights=MANUAL_WEIGHTS, device=DEVICE)
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        return loss_fct(logits.view(-1, num_labels), labels.view(-1))

In [11]:
if TESTING:
    yt_dataset = load_dataset("youtube_caption_corrections", split='train[:01%]')
else:
    yt_dataset = load_dataset("youtube_caption_corrections")['train']

Downloading builder script:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset youtube_caption_corrections/default (download: 212.17 MiB, generated: 339.49 MiB, post-processed: Unknown size, total: 551.66 MiB) to /root/.cache/huggingface/datasets/youtube_caption_corrections/default/0.0.0/8fae144c23187d43bc6909569c1221160f250f471c9e41211a1ed35e8d121655...


Downloading data files:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.69M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.57M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/4 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/10769 [00:00<?, ? examples/s]

Dataset youtube_caption_corrections downloaded and prepared to /root/.cache/huggingface/datasets/youtube_caption_corrections/default/0.0.0/8fae144c23187d43bc6909569c1221160f250f471c9e41211a1ed35e8d121655. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
yt_dataset

Dataset({
    features: ['video_ids', 'default_seq', 'correction_seq', 'diff_type'],
    num_rows: 10769
})

In [13]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [14]:
# segments may be truncated at tokenizer stage if single token splits per BPE
seg_kwargs = {"max_size": MAX_SEGMENT_SIZE - N_SPECIAL_TOKENS} 

segment_dataset = yt_dataset.map(
    get_segment,
    batched=True,
    remove_columns=[IDX_COL_NAME], # Only feature that is not a sequence
    fn_kwargs=seg_kwargs,
)

  0%|          | 0/11 [00:00<?, ?ba/s]

In [15]:
segment_dataset

Dataset({
    features: ['default_seq', 'correction_seq', 'diff_type'],
    num_rows: 27925
})

In [16]:
tok_kwargs = {"tokenizer": tokenizer}

tokenized_dataset = segment_dataset.map(
    tokenize_and_align_labels, batched=True, fn_kwargs=tok_kwargs,
)

  0%|          | 0/28 [00:00<?, ?ba/s]

In [17]:
tokenized_dataset

Dataset({
    features: ['default_seq', 'correction_seq', 'diff_type', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 27925
})

In [18]:
split_dataset = tokenized_dataset.shuffle(seed=42).train_test_split(test_size=0.05)

In [19]:
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT, num_labels=len(CLASSES)
)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [41]:
args = TrainingArguments(
    TRAINING_NAME,
    evaluation_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=3,
    logging_steps=10,
    lr_scheduler_type=LR_SCHEDULER
)

data_collator = DataCollatorForTokenClassification(tokenizer)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [21]:
trainer = WeightedTrainer(
    model,
    args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"], 
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [22]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: correction_seq, diff_type, default_seq. If correction_seq, diff_type, default_seq are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 26528
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3316
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33memilylearning[0m (use `wandb login --relogin` to force relogin)


Epoch,Training Loss,Validation Loss,No Diff Precision,No Diff Recall,Case Diff Precision,Case Diff Recall,Puncuation Diff Precision,Puncuation Diff Recall,Case And Puncuation Diff Precision,Case And Puncuation Diff Recall,Stem Based Diff Precision,Stem Based Diff Recall,Digit Diff Precision,Digit Diff Recall,Intraword Punc Diff Precision,Intraword Punc Diff Recall,Unknown Type Diff Precision,Unknown Type Diff Recall,Weighted Accuracy,Weighted Precision,Weighted Recall
1,0.8028,0.799863,0.98451,0.521631,0.291867,0.774884,0.430378,0.759876,0.131511,0.767587,0.012487,0.79572,0.130862,0.971525,0.051979,0.815599,0.059935,0.347348,0.719271,0.060507,0.804837
2,0.7514,0.754341,0.981293,0.616283,0.311193,0.819741,0.467788,0.789155,0.171735,0.733555,0.01587,0.797665,0.148427,0.960502,0.074576,0.80532,0.088908,0.352339,0.73432,0.07596,0.799355
3,0.6667,0.743227,0.983864,0.583776,0.320716,0.784357,0.453193,0.800011,0.15817,0.772567,0.019373,0.782101,0.150431,0.955603,0.067982,0.827086,0.069175,0.43685,0.742794,0.073983,0.80419
4,0.6008,0.781391,0.982957,0.63768,0.311921,0.799672,0.448268,0.809729,0.15493,0.78066,0.0249,0.697471,0.167006,0.955297,0.081775,0.814389,0.091316,0.409102,0.738,0.08279,0.763783


Saving model checkpoint to mweightsFalse_multi_    LR3e-05constant_BS32_seg512_WD0.1_E4/checkpoint-500
Configuration saved in mweightsFalse_multi_    LR3e-05constant_BS32_seg512_WD0.1_E4/checkpoint-500/config.json
Model weights saved in mweightsFalse_multi_    LR3e-05constant_BS32_seg512_WD0.1_E4/checkpoint-500/pytorch_model.bin
tokenizer config file saved in mweightsFalse_multi_    LR3e-05constant_BS32_seg512_WD0.1_E4/checkpoint-500/tokenizer_config.json
Special tokens file saved in mweightsFalse_multi_    LR3e-05constant_BS32_seg512_WD0.1_E4/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: correction_seq, diff_type, default_seq. If correction_seq, diff_type, default_seq are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1397
  Batch size = 32
Saving 

TrainOutput(global_step=3316, training_loss=0.7364848468933519, metrics={'train_runtime': 3047.349, 'train_samples_per_second': 34.821, 'train_steps_per_second': 1.088, 'total_flos': 1.386536580415488e+16, 'train_loss': 0.7364848468933519, 'epoch': 4.0})

## Results

In [23]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: correction_seq, diff_type, default_seq. If correction_seq, diff_type, default_seq are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1397
  Batch size = 32


{'epoch': 4.0,
 'eval_CASE_AND_PUNCUATION_DIFF_precision': 0.15492957746478872,
 'eval_CASE_AND_PUNCUATION_DIFF_recall': 0.7806598879435568,
 'eval_CASE_DIFF_precision': 0.3119207454536346,
 'eval_CASE_DIFF_recall': 0.7996724890829694,
 'eval_DIGIT_DIFF_precision': 0.1670056739107162,
 'eval_DIGIT_DIFF_recall': 0.9552969993876301,
 'eval_INTRAWORD_PUNC_DIFF_precision': 0.08177513355998058,
 'eval_INTRAWORD_PUNC_DIFF_recall': 0.8143893591293833,
 'eval_NO_DIFF_precision': 0.982957405008801,
 'eval_NO_DIFF_recall': 0.6376797182515019,
 'eval_PUNCUATION_DIFF_precision': 0.44826758662066896,
 'eval_PUNCUATION_DIFF_recall': 0.8097285190470169,
 'eval_STEM_BASED_DIFF_precision': 0.024900156277131446,
 'eval_STEM_BASED_DIFF_recall': 0.6974708171206225,
 'eval_UNKNOWN_TYPE_DIFF_precision': 0.09131592363904152,
 'eval_UNKNOWN_TYPE_DIFF_recall': 0.4091024448016242,
 'eval_loss': 0.7813906669616699,
 'eval_runtime': 33.1136,
 'eval_samples_per_second': 42.188,
 'eval_steps_per_second': 1.329,
 'e

In [46]:
%%capture
!pip install huggingface_hub
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
!sudo apt-get install git-lfs

from huggingface_hub import notebook_login

In [47]:
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [48]:
trainer.push_to_hub??

In [55]:
TRAINING_NAME

'LR3e-05constant_BS32_seg512_WD0.1_E4'

In [52]:
#trainer.push_to_hub('youtube_captions_error_classification')
# HTTPError: 400 Client Error: Bad Request for url: https://huggingface.co/api/repos/create - Only regular characters and '-', '_', '.' are accepted. '--' and '..' are forbidden. '-' and '.' cannot start or end the name. The name cannot end with ".git". Max length is 96.