# Library imports and Constants Definitions

## Import

In [None]:
%pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/280.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2


In [None]:
%pip install datasets seqeval evaluate transformers transformers[torch]

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/510.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/510.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     

In [None]:
import pandas as pd
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
import ast
from transformers import DataCollatorForTokenClassification
import evaluate
import numpy as np
from transformers import pipeline
from transformers import Trainer
from transformers import TrainingArguments
from transformers import AutoModelForTokenClassification
from seqeval.metrics import classification_report


## Constants

In [None]:
# --- Dataset location ---
PATH_TRAIN_DS = "/content/train.csv"
PATH_DS = PATH_TRAIN_DS  # the file that is actually read by the loading cell

# --- Column / feature names ---
TOKEN_NAME = "tokens"      # column with the token list of each sentence
NER_NAME = "ner_tag"       # column with the per-token tag list
HF_LABEL_NAME = "labels"   # key that receives the aligned labels after tokenization
DS_FEATURES = [TOKEN_NAME, NER_NAME]

# --- Label inventory and base checkpoint ---
NER_LABELS = [
    "o",
    "GTAG",
    "DEV",
    "PUB",
    "OS",
    "DATE",
]
PRE_TRAINED_TRANSFORMER_MODEL = "bert-base-cased"

In [None]:
# Index -> label lookup; indices follow the order of NER_LABELS.
idx2label = dict(enumerate(NER_LABELS))
idx2label

{0: 'o', 1: 'GTAG', 2: 'DEV', 3: 'PUB', 4: 'OS', 5: 'DATE'}

In [None]:
# Inverse lookup (label -> index), obtained by flipping idx2label.
label2idx = {label: idx for idx, label in idx2label.items()}
label2idx

{'o': 0, 'GTAG': 1, 'DEV': 2, 'PUB': 3, 'OS': 4, 'DATE': 5}

In [None]:
# Derive the class count from the label list so the two can never drift apart.
nlabels = len(NER_LABELS)

In [None]:
def string2list(s):
    """Parse the text of a Python literal (e.g. "['a', 'b']") into the object it denotes.

    Thin wrapper around ``ast.literal_eval``; it is passed to ``read_csv`` as
    the converter for the token and tag columns.
    """
    return ast.literal_eval(s)

# Dataset

## Preprocessing

In [None]:
# Read only the token/tag columns; both are run through string2list so their
# text is turned back into Python objects while loading.
# NOTE(review): nrows caps the read at 50000 rows, yet the recorded output of a
# later cell reports 204000 rows - the saved outputs look stale; confirm.
ds_raw = pd.read_csv(
    PATH_DS,
    usecols=DS_FEATURES,
    converters={TOKEN_NAME: string2list, NER_NAME: string2list},
    nrows=50000,
)

In [None]:
def _tags_to_ids(sentence_tags):
    """Map every tag name of one sentence to its integer id via label2idx."""
    return [label2idx[tag] for tag in sentence_tags]


# Overwrites the tag column in place. Re-running this cell without reloading
# ds_raw would look up integer ids in label2idx (whose keys are strings) and
# raise KeyError.
ds_raw[NER_NAME] = ds_raw[NER_NAME].apply(_tags_to_ids)

# BERT

## Further Preprocessing

In [None]:
# Wrap the preprocessed DataFrame in a Hugging Face Dataset and display its summary.
ds_hf = Dataset.from_pandas(ds_raw)
ds_hf

Dataset({
    features: ['tokens', 'ner_tag'],
    num_rows: 204000
})

In [None]:
# Load the tokenizer that belongs to the pre-trained checkpoint named in the constants.
hf_tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_TRANSFORMER_MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per tokenizer output position.

    Args:
        labels: sequence of labels, indexable by word id.
        word_ids: for each output position, the index of the word it came
            from, or ``None``.

    Returns:
        A list as long as ``word_ids``: ``-100`` wherever the word id is
        ``None``, otherwise ``labels[word_id]`` (every sub-token of a word
        repeats that word's label).
    """
    return [-100 if word_id is None else labels[word_id] for word_id in word_ids]


In [None]:
def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split sentences and attach per-position labels.

  Args:
    examples: batch exposing the TOKEN_NAME (token lists) and NER_NAME
      (tag-id lists) entries.

  Returns:
    The tokenizer output for the batch, with an extra HF_LABEL_NAME entry
    holding, for each sentence, the labels aligned to its tokenizer positions.
  """
  encoded = hf_tokenizer(examples[TOKEN_NAME], is_split_into_words=True, truncation=True)
  encoded[HF_LABEL_NAME] = [
      align_labels_with_tokens(word_labels, encoded.word_ids(batch_idx))
      for batch_idx, word_labels in enumerate(examples[NER_NAME])
  ]
  return encoded

In [None]:
# Tokenize the whole dataset in batches; the mapped function also adds the aligned label column.
ds_hf_raw_tokenized = ds_hf.map(tokenize_and_align_labels,batched=True)

Map:   0%|          | 0/204000 [00:00<?, ? examples/s]

In [None]:
# 80/10/10 train/validation/test split. A fixed seed keeps the three subsets
# identical across kernel restarts, so metrics are comparable between runs.
ds_hf_train_test = ds_hf_raw_tokenized.train_test_split(test_size=0.2, seed=42)
# Halve the 20% hold-out: one half becomes validation, the other the test set.
ds_hf_test_val = ds_hf_train_test['test'].train_test_split(test_size=0.5, seed=42)
ds_hf_tokenized = DatasetDict({
    'train': ds_hf_train_test['train'],
    'test': ds_hf_test_val['test'],
    'validation': ds_hf_test_val['train']})

## MODEL

In [None]:
# Batch collator for token classification, built around the same tokenizer used for preprocessing.
data_collator = DataCollatorForTokenClassification(tokenizer=hf_tokenizer)

In [None]:
# seqeval metric object; compute_metrics reads it as a global.
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_preds):
  """Turn raw evaluation output into overall seqeval scores.

  Args:
    eval_preds: pair ``(logits, labels)``; positions whose label is -100 are
      excluded from scoring.

  Returns:
    dict with the keys "precision", "recall", "f1" and "accuracy", taken from
    the corresponding ``overall_*`` entries of the seqeval result.
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)

  # Reference tag names, skipping ignored (-100) positions.
  true_labels = []
  for label_row in labels:
    true_labels.append([NER_LABELS[tag_id] for tag_id in label_row if tag_id != -100])

  # Predicted tag names at exactly the positions kept above.
  true_predictions = []
  for pred_row, label_row in zip(predictions, labels):
    true_predictions.append(
        [NER_LABELS[pred_id] for pred_id, tag_id in zip(pred_row, label_row) if tag_id != -100]
    )

  # NOTE(review): seqeval is built around IOB-style tags, whereas NER_LABELS are
  # bare names - confirm the entity-level scores are meaningful for this scheme.
  all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
  return {
      "precision": all_metrics["overall_precision"],
      "recall": all_metrics["overall_recall"],
      "f1": all_metrics["overall_f1"],
      "accuracy": all_metrics["overall_accuracy"],
  }

In [None]:
# Pre-trained encoder with a token-classification head sized to the label set;
# both label maps are handed over so the model can report label names.
model = AutoModelForTokenClassification.from_pretrained(
    PRE_TRAINED_TRANSFORMER_MODEL,
    num_labels=nlabels,
    id2label=idx2label,
    label2id=label2idx,
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Single-epoch fine-tuning; evaluation and checkpointing both happen once per epoch.
args = TrainingArguments(
    "bert-ner-steam",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
)

In [None]:
# Fine-tune on the train split and evaluate on the validation split with the
# settings in `args`; compute_metrics supplies the seqeval scores.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_hf_tokenized["train"],
    eval_dataset=ds_hf_tokenized["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=hf_tokenizer
)
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0013,0.0003,0.999842,0.999887,0.999865,0.999957




TrainOutput(global_step=20400, training_loss=0.001285304039620849, metrics={'train_runtime': 1733.9825, 'train_samples_per_second': 94.119, 'train_steps_per_second': 11.765, 'total_flos': 1848459688168032.0, 'train_loss': 0.001285304039620849, 'epoch': 1.0})

In [None]:
# Persist the fine-tuned model; the inference cell reloads it from this same path.
trainer.save_model("/content/test-model")

In [None]:

# Reload the saved model from disk as an inference pipeline and try one query.
model_checkpoint = "/content/test-model"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("Show me Mature games with Old School apocalypse themes.")

[{'entity_group': 'o',
  'score': 0.99999887,
  'word': 'Show me',
  'start': 0,
  'end': 7},
 {'entity_group': 'GTAG',
  'score': 0.99999785,
  'word': 'Mature',
  'start': 8,
  'end': 14},
 {'entity_group': 'o',
  'score': 0.99999887,
  'word': 'games with',
  'start': 15,
  'end': 25},
 {'entity_group': 'GTAG',
  'score': 0.9999981,
  'word': 'Old School',
  'start': 26,
  'end': 36},
 {'entity_group': 'o',
  'score': 0.9999987,
  'word': 'apocalypse themes.',
  'start': 37,
  'end': 55}]

# BI-LSTM

In [None]:
from sklearn import svm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report
import tqdm
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import InputLayer, TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow as tf


In [None]:
# Total number of tokens: flatten the per-sentence token lists and count the entries.
(ds_raw[TOKEN_NAME].explode().values).size

521480

In [None]:
# Build the vocabulary in first-seen order. A plain set() of strings iterates in
# an order that varies between interpreter sessions (string hash randomization),
# which would silently change word2idx from run to run; dict.fromkeys
# de-duplicates while keeping a reproducible order.
words = list(dict.fromkeys(ds_raw[TOKEN_NAME].explode().values))
words.append("ENDPAD")  # padding token; must stay last, padding uses index num_words-1
num_words = len(words)

In [None]:
num_words

4618

In [None]:
# Distinct tag ids that actually occur in the loaded data.
# NOTE(review): the recorded output shows num_tags == 5 while nlabels is 6, so
# one label apparently never occurs in this data - confirm which count is intended.
tags = list(set(ds_raw[NER_NAME].explode().values))
num_tags = len(tags)

In [None]:
num_tags

5

In [None]:

def sentence_integrate(data):
  """Pair every token with its tag, sentence by sentence.

  Args:
    data: table exposing the TOKEN_NAME and NER_NAME columns, each holding
      one list per sentence.

  Returns:
    A list of sentences, each a list of ``(token, tag)`` tuples.
  """
  token_lists = data[TOKEN_NAME].values.tolist()
  tag_lists = data[NER_NAME].values.tolist()
  return [list(zip(tokens, tags)) for tokens, tags in zip(token_lists, tag_lists)]

sentences = sentence_integrate(ds_raw)

In [None]:
# Two-way vocabulary lookup; a word's index is its position in `words`.
idx2word = dict(enumerate(words))
word2idx = {word: idx for idx, word in idx2word.items()}

In [None]:
max_len = 20  # fixed sequence length fed to the network

# Token ids per sentence, right-padded with the index of "ENDPAD" (the last vocabulary entry).
X = [[word2idx[word] for word, _ in sentence] for sentence in sentences]
X = pad_sequences(sequences=X, maxlen=max_len, padding="post", value=num_words - 1)

# Tag ids per sentence, right-padded with 0.
# NOTE(review): 0 is also the id of the real "o" label, so padded positions are
# indistinguishable from genuine "o" tokens in y - confirm this is intended.
y = [[tag for _, tag in sentence] for sentence in sentences]
y = pad_sequences(sequences=y, maxlen=max_len, padding="post", value=0)

In [None]:
# 80/20 train/test split of the padded sequences; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
#build model
# Embedding -> spatial dropout -> BiLSTM -> per-timestep softmax over the NER labels.
# NOTE: this rebinds `model`, which earlier in the notebook held the BERT classifier.

model = keras.Sequential()
model.add(InputLayer((max_len,)))  # one-element shape tuple; `(max_len)` is just an int
# NOTE(review): the embedding width is set to max_len (20); that looks
# coincidental rather than intended - confirm.
model.add(Embedding(input_dim=num_words, output_dim=max_len, input_length=max_len))
model.add(SpatialDropout1D(0.1))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
# Classification head. Without it the network emits the raw 200-unit BiLSTM
# state per timestep, so the cross-entropy loss and the later argmax would range
# over 200 pseudo-classes instead of the actual label set.
model.add(TimeDistributed(Dense(len(NER_LABELS), activation="softmax")))

model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")



In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Only TensorBoard logging is active; ModelCheckpoint and EarlyStopping are imported but not used here.
callbacks = [tensorboard_callback]


# One training epoch; the held-out test split is also passed as validation data.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test,y_test),
    batch_size=32,
    epochs=1,
    callbacks=callbacks,
    verbose=1
)




In [None]:
# Predict on the test split, then reduce the last axis to the highest-scoring index per position.
y_pred = model.predict(x_test)
y_pred = np.argmax(y_pred, axis=-1)



In [None]:
# Alias of y_test (same object, no copy) used by the scoring cells.
y_test_cr = y_test

In [None]:
# Loads the seqeval metric again; this rebinds the `metric` global created for the BERT section.
metric = evaluate.load("seqeval")

In [None]:
  # Keep only real tokens when scoring. The previous `l < nlabels` guard never
  # filtered anything: labels are padded with 0 (a valid label id), so every
  # padded position was scored as a correct "o" and inflated the metrics.
  # Padding is recognised from the inputs instead, where padded slots carry the
  # id of the "ENDPAD" token (num_words - 1).
  pad_idx = num_words - 1
  keep_mask = x_test != pad_idx
  true_labels = [
      [NER_LABELS[l] for l, keep in zip(label, mask) if keep]
      for label, mask in zip(y_test_cr, keep_mask)
  ]
  true_predictions = [
      [NER_LABELS[p] for p, keep in zip(prediction, mask) if keep]
      for prediction, mask in zip(y_pred, keep_mask)
  ]

In [None]:
  # Score the BiLSTM predictions against the references with seqeval.
  all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

In [None]:
# Overall BiLSTM scores, using the same four keys that compute_metrics returns.
{
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
}

{'precision': 0.9677756367097717,
 'recall': 0.9586284610431423,
 'f1': 0.9631803320355372,
 'accuracy': 0.99358}

In [None]:
# Qualitative check on one test sentence (index 23): predict on a batch of one
# and take the highest-scoring index at every position.
p = model.predict(np.array([x_test[23]]))
p = np.argmax(p, axis=-1)

# Print word / true label / predicted label side by side; padded positions are included.
print("{:15}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" *30)
for w, true, pred in zip(x_test[23], y_test[23], p[0]):
    print("{:15}{}\t{}".format(words[w], idx2label[true], idx2label[pred]))

Word           True 	 Pred

------------------------------
List           o	o
all            o	o
War            GTAG	GTAG
Point          GTAG	GTAG
&              GTAG	GTAG
Click          GTAG	GTAG
games          o	o
with           o	o
Chess          GTAG	GTAG
support.       o	o
ENDPAD         o	o
ENDPAD         o	o
ENDPAD         o	o
ENDPAD         o	o
ENDPAD         o	o
ENDPAD         o	o
ENDPAD         o	o
ENDPAD         o	o
ENDPAD         o	o
ENDPAD         o	o
