# Token classification (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2
Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 65 not upgraded.


In [59]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "/kaggle/working/bert-propaganda-ner/checkpoint-13125"

# Token-classification pipeline backed by the checkpoint above. Its output
# entries carry 'entity_group', 'score', 'word', 'start' and 'end' keys.
token_classifier = pipeline(
    task="token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple",
)

In [60]:
def get_pred(txt):
    """Run the notebook-level ``token_classifier`` on ``txt`` and return its output unchanged."""
    predictions = token_classifier(txt)
    return predictions

In [61]:
from collections import defaultdict
import json

def load_test(fname):
    """Load an unlabeled JSONL test file into a DataFrame.

    Each non-blank line of ``fname`` must be a JSON object with the keys
    ``id``, ``text`` and ``type``.

    Args:
        fname: Path of the UTF-8 encoded JSONL file.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``text`` and ``type``,
        one row per distinct ``id`` in order of first appearance. When an
        ``id`` occurs more than once, the values of its last occurrence are
        kept. A file without records yields an empty frame that still has
        the three columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    columns = ['id', 'text', 'type']
    rows_by_id = {}

    with open(fname, 'r', encoding="utf-8") as inf:
        for line in inf:
            # Tolerate blank lines (e.g. stray empty lines at the end of the file).
            if not line.strip():
                continue
            jobj = json.loads(line)
            # Keyed by id: a repeated id replaces the earlier record but keeps its position.
            rows_by_id[jobj['id']] = {col: jobj[col] for col in columns}

    # Naming the columns explicitly keeps the schema stable even with zero rows.
    return pd.DataFrame(list(rows_by_id.values()), columns=columns)

def load_test_gold(fname):
    """Load a labeled (gold) JSONL test file into a DataFrame.

    Each non-blank line of ``fname`` must be a JSON object with the keys
    ``id``, ``text``, ``type`` and ``labels``.

    Args:
        fname: Path of the UTF-8 encoded JSONL file.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``text``, ``type`` and
        ``labels``, one row per distinct ``id`` in order of first appearance.
        When an ``id`` occurs more than once, the values of its last
        occurrence are kept. A file without records yields an empty frame
        that still has the four columns.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    columns = ['id', 'text', 'type', 'labels']
    rows_by_id = {}

    with open(fname, 'r', encoding="utf-8") as inf:
        for line in inf:
            # Tolerate blank lines (e.g. stray empty lines at the end of the file).
            if not line.strip():
                continue
            jobj = json.loads(line)
            # Keyed by id: a repeated id replaces the earlier record but keeps its position.
            rows_by_id[jobj['id']] = {col: jobj[col] for col in columns}

    # Naming the columns explicitly keeps the schema stable even with zero rows.
    return pd.DataFrame(list(rows_by_id.values()), columns=columns)

In [62]:
import pandas as pd
# Read the task 1 test split into a DataFrame (one row per record id).
test = load_test(
    fname='/kaggle/input/araieval/araieval/araieval24_task1_test.jsonl'
)

In [64]:
# Replace every entry of the 'text' column with the result of
# preprocess_arabic_text; the raw text is overwritten in place.
# NOTE(review): preprocess_arabic_text is not defined in the visible cells
# (the execution count jumps from 62 to 64) — make sure its defining cell is
# kept so the notebook survives Restart & Run All.
# NOTE(review): if the preprocessing adds or removes characters, the start/end
# offsets predicted afterwards refer to the preprocessed text rather than the
# text stored in the input file — confirm this matches what the scorer expects.
test['text'] = test['text'].apply(preprocess_arabic_text)

In [66]:
# Call get_pred once per text and keep each returned span list in a new
# 'pred_labels' column.
# NOTE(review): the tokenizer reports that no maximum length is configured, so
# inputs are not truncated — presumably very long texts could exceed what the
# model accepts; confirm.
test['pred_labels'] = test['text'].map(get_pred)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [67]:
# Show the predicted spans of the first test record as a sanity check.
test['pred_labels'].iat[0]

[{'entity_group': 'Loaded_Language',
  'score': 0.26992783,
  'word': 'تخ',
  'start': 192,
  'end': 194},
 {'entity_group': 'Loaded_Language',
  'score': 0.38468197,
  'word': '##ريبي',
  'start': 194,
  'end': 198}]

In [72]:
import json

# Build one submission record per test row: {'id': ..., 'labels': [span, ...]}.
records = []

# The row index is not needed; `_` also avoids shadowing the builtin `id`.
for _, rec in test.iterrows():
    source_text = rec['text']
    labels = []
    for pred in rec['pred_labels']:
        start, end = pred['start'], pred['end']
        # The pipeline's 'word' is rebuilt from tokens and can carry word-piece
        # markers such as '##', so report the exact characters of the span from
        # the text that was classified. Fall back to 'word' only when the
        # prediction carries no character offsets.
        if start is None or end is None:
            span_text = pred['word']
        else:
            span_text = source_text[start:end]
        labels.append({
            'start': start,
            'end': end,
            'technique': pred['entity_group'],
            'text': span_text,
        })

    records.append({'id': rec['id'], 'labels': labels})

In [73]:
# Serialize the records as JSON Lines: one JSON object per line, with
# non-ASCII characters written as-is instead of \u escapes.
with open('task1_cuet_sstm.jsonl', 'w', encoding='utf-8') as out_file:
    for record in records:
        out_file.write(json.dumps(record, ensure_ascii=False) + '\n')