<a href="https://colab.research.google.com/github/AlbertBannister/cricinfo-commentary-scraper/blob/main/cricket_ner_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/7.9 MB[0m [31m4.7 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/7.9 MB[0m [31m23.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.9/7.9 MB[0m [31m78.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.9/7.9 MB[0m [31m76.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [21]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
from pathlib import Path
import pandas as pd

drive_path = Path("/content/drive/MyDrive")
model_path = drive_path / "my_models/cricket_ner_bert_after_pretraining_v0"

In [22]:
df = pd.read_csv(drive_path / "Colab Notebooks/data/NLP/inference_demo.csv", index_col=0)
data = df.text.to_list()
df.head(5)

Unnamed: 0,match_slug,text,inningNumber,oversUnique,oversActual,overNumber,ballNumber,batsmanRuns,legbyes,byes,wides,noballs,isFour,isSix,isWicket,dismissalType,wagonX,wagonY,wagonZone,length
0,west-indies-vs-pakistan-1st-t20i-645645,"flighted offbreak, neatly punched to long-on f...",1,0.01,0.1,1,1,1,0,0,0,0,False,False,False,0,206,329,4,52
1,west-indies-vs-pakistan-1st-t20i-645645,drills the first ball he faces down the ground...,1,0.02,0.2,1,2,1,0,0,0,0,False,False,False,0,138,328,5,83
2,west-indies-vs-pakistan-1st-t20i-645645,"beaten in the flight, and we have timbahhhhh. ...",1,0.03,0.3,1,3,0,0,0,0,0,False,False,True,2,0,0,0,246
3,west-indies-vs-pakistan-1st-t20i-645645,"full and flat, defended to cover",1,0.04,0.4,1,4,0,0,0,0,0,False,False,False,0,110,159,7,32
4,west-indies-vs-pakistan-1st-t20i-645645,pushed wide of mid-off for a single,1,0.05,0.5,1,5,1,0,0,0,0,False,False,False,0,116,226,6,35


In [23]:
labels = [
    "SHOT",
    "DELIVERY",
    "LINE",
    "LENGTH"
    ]
tag2id = {tag: i+1 for i, tag in enumerate(labels)}
id2tag = {v:k for k, v in tag2id.items()}
id2tag

{1: 'SHOT', 2: 'DELIVERY', 3: 'LINE', 4: 'LENGTH'}

In [24]:
label2id = {
    'O': 0,
    **{f'B-{k}': 2*v - 1 for k, v in tag2id.items()},
    **{f'I-{k}': 2*v for k, v in tag2id.items()}
}
id2label = {v:k for k, v in label2id.items()}
id2label

{0: 'O',
 1: 'B-SHOT',
 3: 'B-DELIVERY',
 5: 'B-LINE',
 7: 'B-LENGTH',
 2: 'I-SHOT',
 4: 'I-DELIVERY',
 6: 'I-LINE',
 8: 'I-LENGTH'}

In [25]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained(
    model_path,
    id2label=id2label,
    label2id=label2id,
)

In [26]:
# Replace this with your own checkpoint
token_classifier = pipeline(
    "token-classification", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=0
)

In [27]:
preds = token_classifier(data)
preds[:5]

[[{'entity_group': 'LENGTH',
   'score': 0.9992643,
   'word': 'flighted',
   'start': 0,
   'end': 8},
  {'entity_group': 'DELIVERY',
   'score': 0.99837303,
   'word': 'offbreak',
   'start': 9,
   'end': 17},
  {'entity_group': 'SHOT',
   'score': 0.99967396,
   'word': 'punched',
   'start': 26,
   'end': 33}],
 [{'entity_group': 'SHOT',
   'score': 0.99965775,
   'word': 'drills',
   'start': 0,
   'end': 6}],
 [{'entity_group': 'DELIVERY',
   'score': 0.9485887,
   'word': 'flight',
   'start': 14,
   'end': 20},
  {'entity_group': 'SHOT',
   'score': 0.99765444,
   'word': 'shortens',
   'start': 97,
   'end': 105},
  {'entity_group': 'LENGTH',
   'score': 0.9963894,
   'word': 'length',
   'start': 110,
   'end': 116},
  {'entity_group': 'DELIVERY',
   'score': 0.9985881,
   'word': 'offbreak',
   'start': 191,
   'end': 199},
  {'entity_group': 'DELIVERY',
   'score': 0.940809,
   'word': 'turn',
   'start': 218,
   'end': 222},
  {'entity_group': 'LINE',
   'score': 0.9986398

In [28]:
for text, prediction in zip(data, preds):
  print(text)
  label_pretty = [" " for _ in range(len(text))]
  for label in prediction:
    label_pretty[label["start"]: label["end"]] = ["-" for _ in range(label["end"] - label["start"])]
    label_pretty[label["end"] - 1] = "|"
    label_pretty[label["start"]: label["start"] + len(label["entity_group"])] = [c for c in label["entity_group"]]


  print("".join(label_pretty))

flighted offbreak, neatly punched to long-on for one
LENGTH-| DELIVERY         SHOT--|                   
drills the first ball he faces down the ground. There is a long-on in place, though
SHOT-|                                                                             
beaten in the flight, and we have timbahhhhh. Hafeez has seen Charles advancing, I think, and he shortens his length. Charles still appears in charge, but he plays the defensive shot for the offbreak. The ball doesn't turn, and hits the off stump
              DELIVERY                                                                           SHOT---|     LENGTH                                                                           DELIVERY                   DELIVERY           LINE----|
full and flat, defended to cover
LENGTH   DELIVESHOT---|         
pushed wide of mid-off for a single
SHOT-|                             
on a length, just outside off, defended off the front foot
     LENGTH       LINE------|  SHOT