<a href="https://colab.research.google.com/github/AlbertBannister/cricinfo-commentary-scraper/blob/main/cricket_ner_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab filesystem. Later cells read the
# annotated data and the starting checkpoint from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers accelerate datasets evaluate seqeval

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.0-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:0

In [None]:
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer

In [None]:
# Notebook-wide settings.
# NOTE(review): `itask` looks like a typo for `task` — confirm before renaming.
itask = "ner" # Should be one of "ner", "pos" or "chunk"
# Hub id of the base model; passed to AutoTokenizer.from_pretrained.
model_checkpoint = "bert-base-uncased"
# Presumably the per-device batch size — TODO confirm where it is consumed.
batch_size = 16
# Presumably the maximum tokenized sequence length — TODO confirm where it is consumed.
MAX_LENGTH = 512

In [None]:
# Read the annotated JSONL file from Drive. The "json" loader exposes all records
# under a single "train" split, which is then divided into train/test subsets
# with a fixed seed so the split is reproducible.
data = load_dataset(
    "json",
    data_files="/content/drive/MyDrive/Colab Notebooks/data/NLP/cricket_ner.jsonl",
)
dataset = data["train"].train_test_split(seed=42)
dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'meta', '_input_hash', '_task_hash', '_is_binary', 'spans', 'tokens', '_view_id', 'answer', '_timestamp', '_annotator_id', '_session_id'],
        num_rows: 2643
    })
    test: Dataset({
        features: ['text', 'meta', '_input_hash', '_task_hash', '_is_binary', 'spans', 'tokens', '_view_id', 'answer', '_timestamp', '_annotator_id', '_session_id'],
        num_rows: 881
    })
})

In [None]:
# Entity types used in the annotations. Ids start at 1 so that the BIO label ids
# derived from them (2*id - 1 and 2*id) leave 0 free for the "O" label.
labels = ["SHOT", "DELIVERY", "LINE", "LENGTH"]
tag2id = dict(zip(labels, range(1, len(labels) + 1)))
id2tag = {idx: tag for tag, idx in tag2id.items()}
id2tag

{1: 'SHOT', 2: 'DELIVERY', 3: 'LINE', 4: 'LENGTH'}

In [None]:
# BIO label vocabulary: "O" is 0, and every entity type with id v gets
# B-<type> = 2*v - 1 and I-<type> = 2*v. All B- entries are inserted before
# the I- entries, which fixes the iteration order of both dictionaries.
label2id = {"O": 0}
label2id.update({f"B-{tag}": 2 * idx - 1 for tag, idx in tag2id.items()})
label2id.update({f"I-{tag}": 2 * idx for tag, idx in tag2id.items()})
id2label = {idx: name for name, idx in label2id.items()}
id2label

{0: 'O',
 1: 'B-SHOT',
 3: 'B-DELIVERY',
 5: 'B-LINE',
 7: 'B-LENGTH',
 2: 'I-SHOT',
 4: 'I-DELIVERY',
 6: 'I-LINE',
 8: 'I-LENGTH'}

In [None]:

# Tokenizer for the base checkpoint named in the configuration cell.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Preview: tokenize one training sentence (with character offsets) and show
# the resulting token strings, including the special tokens.
sample_text = dataset["train"][42]["text"]
tokenized = tokenizer(sample_text, return_offsets_mapping=True)
words = tokenizer.convert_ids_to_tokens(tokenized["input_ids"])
words

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

['[CLS]',
 'fires',
 'that',
 'with',
 'lively',
 'pace',
 'outside',
 'off',
 'stump',
 ',',
 'the',
 'ball',
 'climbs',
 'after',
 'pitching',
 ',',
 'left',
 'alone',
 '[SEP]']

In [None]:
def get_token_role_in_span(token_start: int, token_end: int, span_start: int, span_end: int):
    """
    Classify where a token sits relative to one annotated span.

    Args:
      - token_start, token_end: Start and end offset of the token
      - span_start, span_end: Start and end offset of the span
    Returns:
      - "N" if the token's offset range is empty (end <= start), as happens
        for tokens that do not map to any text
      - "O" if the token is not fully contained in the span
      - "B" if the token is contained in the span and starts exactly at its start
      - "I" if the token is contained in the span and starts after its start
    """
    token_is_empty = token_end <= token_start
    if token_is_empty:
        return "N"

    # A token only counts when it lies entirely within the span boundaries.
    fully_inside = span_start <= token_start and token_end <= span_end
    if not fully_inside:
        return "O"

    return "B" if token_start == span_start else "I"



def tokenize_and_adjust_labels(sample):
    """
    Tokenize one annotated sample and derive a BIO label id for every token.

    Args:
        - sample (dict): {"text": "...", "spans": [{"start": ..., "end": ..., "label": ...}, ...]}
          `start`/`end` are offsets into `text`; `spans` may be empty or None.
    Returns:
        - dict: the tokenizer output (including `offset_mapping`) plus "labels",
          a list with one id per token:
            * -100 for tokens whose offset range is empty (e.g. the special
              tokens added by the tokenizer), so they are ignored downstream
            * label2id["B-<label>"] / label2id["I-<label>"] for tokens at the
              beginning of / inside a span
            * label2id["O"] for every other token
    """
    # Keep the start and end positions of tokens with `return_offsets_mapping`.
    # `truncation=True` is passed without `max_length`, so the tokenizer's own
    # default length limit applies.
    tokenized = tokenizer(sample["text"],
                          return_offsets_mapping=True,
                          truncation=True
                          )

    spans = sample.get("spans") or []
    outside_id = label2id["O"]

    labels = []
    for token_start, token_end in tokenized["offset_mapping"]:
        # Tokens without text must be masked regardless of the spans. Doing this
        # check outside the span loop also covers samples that have no spans at
        # all; previously their special tokens were left labelled as "O".
        # This mirrors the "N" rule of get_token_role_in_span.
        if token_end <= token_start:
            labels.append(-100)
            continue

        label_id = outside_id
        for span in spans:
            role = get_token_role_in_span(token_start, token_end, span["start"], span["end"])
            if role in ("B", "I"):
                # No early exit: if spans overlap, the last matching span wins,
                # as in the original implementation.
                label_id = label2id[f"{role}-{span['label']}"]
        labels.append(label_id)

    return {**tokenized, "labels": labels}

In [None]:
# Tokenize and label every example in both splits. The original annotation
# columns are dropped by `map`; `offset_mapping` is only needed while the labels
# are being computed, so it is removed afterwards.
original_columns = dataset["train"].column_names
tokenized_ds = dataset.map(tokenize_and_adjust_labels, remove_columns=original_columns)
tokenized_ds = tokenized_ds.remove_columns("offset_mapping")
tokenized_ds


Map:   0%|          | 0/2643 [00:00<?, ? examples/s]

Map:   0%|          | 0/881 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2643
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 881
    })
})

In [None]:
from transformers import DataCollatorForTokenClassification
# Collator that batches tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Preview: collate the first ten training examples and inspect the label tensor.
preview_rows = [tokenized_ds["train"][row] for row in range(10)]
batch = data_collator(preview_rows)
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    7,    0,    0,    0,    5,    0,    1,    0,    0,
            0,    0,    0,    0,    0,    0, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100],
        [-100,    7,    0,    5,    6,    6,    0,    0,    0,    0,    0,    0,
            0,    0,    1, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -1

In [None]:
import evaluate
# Load the "seqeval" metric through `evaluate`; compute_metrics calls
# metric.compute(...) on this object.
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
import numpy as np

def compute_metrics(eval_preds):
    """
    Turn raw evaluation outputs into overall precision/recall/F1/accuracy.

    Args:
        - eval_preds: a pair `(logits, labels)`; the predicted class of each
          token is the argmax of `logits` over the last axis.
    Returns:
        - dict with keys "precision", "recall", "f1" and "accuracy", taken from
          the corresponding "overall_*" entries returned by `metric.compute`.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Walk predictions and references together, skipping every position whose
    # reference label is the ignore index (-100), and map ids to label names.
    true_labels = []
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept_labels = []
        kept_predictions = []
        for pred_id, label_id in zip(pred_row, label_row):
            if label_id == -100:
                continue
            kept_labels.append(id2label[label_id])
            kept_predictions.append(id2label[pred_id])
        true_labels.append(kept_labels)
        true_predictions.append(kept_predictions)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: all_metrics[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

In [None]:
from transformers import AutoModelForTokenClassification

# Initialise the token-classification model from a checkpoint stored on Drive
# (presumably a masked-LM checkpoint, going by the directory name — confirm).
# The classification head is not part of that checkpoint and is newly
# initialised, sized and named according to the label maps passed here.
mlm_checkpoint_dir = "/content/drive/MyDrive/my_models/cricket_mlm/checkpoint-6111"
model = AutoModelForTokenClassification.from_pretrained(
    mlm_checkpoint_dir, id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at /content/drive/MyDrive/my_models/cricket_mlm/checkpoint-6111 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import classification_report

In [None]:
from transformers import TrainingArguments

# Training configuration.
# - The batch size comes from the `batch_size` constant in the configuration
#   cell instead of a second hard-coded 16.
# - Evaluation and checkpointing both happen once per epoch, which is what
#   `load_best_model_at_end` requires. With it, the weights with the best
#   validation F1 are restored after training rather than whatever the last
#   epoch produced (validation loss keeps rising in later epochs).
# - `save_total_limit` caps the number of checkpoints kept on disk; the best
#   checkpoint is always retained.
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
)

In [None]:
from transformers import Trainer

# Wire the model, data, collator and metric function into a Trainer and run
# fine-tuning; the train split is used for optimisation and the test split for
# the per-epoch evaluation. The value returned by `train()` is the cell output.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=compute_metrics,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.170943,0.800792,0.852086,0.825643,0.951248
2,No log,0.156595,0.824513,0.873156,0.848138,0.959272
3,No log,0.182237,0.824554,0.857564,0.840735,0.956081
4,0.133400,0.172323,0.85288,0.867257,0.860008,0.961946
5,0.133400,0.171599,0.851657,0.887906,0.869404,0.963636
6,0.133400,0.205351,0.850826,0.867678,0.85917,0.961289
7,0.027400,0.213833,0.858948,0.887906,0.873187,0.963495
8,0.027400,0.214118,0.864122,0.887063,0.875442,0.96387
9,0.027400,0.224374,0.865408,0.888748,0.876923,0.964011
10,0.005600,0.226973,0.866009,0.887906,0.876821,0.963823


TrainOutput(global_step=1660, training_loss=0.05036748118070235, metrics={'train_runtime': 450.8236, 'train_samples_per_second': 58.626, 'train_steps_per_second': 3.682, 'total_flos': 905967063382392.0, 'train_loss': 0.05036748118070235, 'epoch': 10.0})

In [None]:
from transformers import pipeline

# Sanity check: run the fine-tuned model on one unseen commentary line.
# The pipeline is placed on the device the trained model already lives on,
# rather than a hard-coded GPU index, so this cell also works on a CPU runtime.
token_classifier = pipeline(
    "token-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=trainer.model.device,
)

sample_commentary = (
    "Lots happening that ball, ends up with overthrows. "
    "Ball landed outside off and turned away extravagantly, "
    "Blundell cut it but had to reach out, so he toed it towards mid-off. "
    "The fielder, Islam, charged in and threw the non-striker's stumps direct. "
    "It edged the outer half of the stumps and deflected too fine for the keeper. "
    "Back-up fielder's chase ended in vain"
)
token_classifier(sample_commentary)

[{'entity_group': 'LINE',
  'score': 0.99950343,
  'word': 'outside off',
  'start': 63,
  'end': 74},
 {'entity_group': 'DELIVERY',
  'score': 0.9906617,
  'word': 'turned away',
  'start': 79,
  'end': 90},
 {'entity_group': 'SHOT',
  'score': 0.9943463,
  'word': 'cut',
  'start': 115,
  'end': 118}]

In [None]:
# Persist the fine-tuned model to Drive. The absolute path matches the mount
# point used by the other cells, so the target does not depend on the current
# working directory. `trainer.save_model` also writes the tokenizer that was
# passed to the Trainer, so the directory can be loaded on its own later.
ner_output_dir = "/content/drive/MyDrive/my_models/cricket_ner_bert_after_pretraining_v1"
trainer.save_model(ner_output_dir)