In [1]:
import os
import sys
import logging
import importlib
import shutil
import torch
import pickle
import json
import random
import numpy as np
from argparse import Namespace

def load_object(name, kwargs=None):
    """
    Load objects dynamically given the object name and its arguments
    :param name: str - dotted path "package.module.attribute" of the class or
                 function to call; everything before the last "." is imported
                 as a module, the remainder is looked up on that module
    :param kwargs: dict or None - keyword arguments forwarded to the callable;
                   None (the default) or an empty dict calls it with no arguments
    :return: object - whatever the resolved callable returns
    :raises ValueError: if name does not contain a module part before a "."
    :raises ImportError: if the module part cannot be imported
    :raises AttributeError: if the module has no attribute with that name
    """
    module_path, _, attr_name = name.rpartition(".")
    if not module_path:
        # Without a module part there is nothing to import; fail with a
        # message that names the offending value instead of an unpacking error.
        raise ValueError(
            "Expected a dotted path like 'package.module.Name', got %r" % (name,)
        )

    module = importlib.import_module(module_path)
    factory = getattr(module, attr_name)
    # Treat a missing kwargs mapping as "no arguments" rather than crashing on **None.
    return factory(**(kwargs or {}))


In [2]:


def load_checkpoint(model_path):
    """
    Load model given the model path
    :param model_path: str - directory holding the saved model; this function
                       reads "tag_vocab.pkl" and "args.json" from it and hands
                       its "checkpoints" entry to the tagger's load()
    :return: tagger - arabiner.trainers.BaseTrainer - the tagger model
             vocab - torchtext.vocab.Vocab - indexed tags
             train_config - argparse.Namespace - training configurations
             (the concrete classes are whatever args.json / tag_vocab.pkl
             name; the types above are the ones the project documents)
    :raises FileNotFoundError: if tag_vocab.pkl or args.json is missing
    """
    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Only point model_path at checkpoints you produced or fully trust.
    with open(os.path.join(model_path, "tag_vocab.pkl"), "rb") as fh:
        tag_vocab = pickle.load(fh)

    # Load train configurations from checkpoint.
    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    train_config = Namespace()
    with open(os.path.join(model_path, "args.json"), "r", encoding="utf-8") as fh:
        train_config.__dict__ = json.load(fh)

    # Initialize the loss function, not used for inference, but evaluation
    loss = load_object(train_config.loss["fn"], train_config.loss["kwargs"])

    # Load BERT tagger
    model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"])
    # The network is always wrapped in DataParallel, with or without a GPU.
    # NOTE(review): presumably the saved weights were written from a wrapped
    # model as well - confirm before removing the wrapper.
    model = torch.nn.DataParallel(model)

    if torch.cuda.is_available():
        model = model.cuda()

    # Update arguments for the tagger
    # Attach the model, loss (used for evaluations cases).
    # This mutates the kwargs dict inside train_config, so the returned
    # configuration also carries the live model and loss objects.
    trainer_kwargs = train_config.trainer_config["kwargs"]
    trainer_kwargs["model"] = model
    trainer_kwargs["loss"] = loss

    tagger = load_object(train_config.trainer_config["fn"], trainer_kwargs)
    tagger.load(os.path.join(model_path, "checkpoints"))
    return tagger, tag_vocab, train_config

In [3]:
import torch
from transformers import AutoTokenizer
import os
import json
from types import SimpleNamespace


# Load the checkpoint
# `model_path` is the directory passed to load_checkpoint(), which reads
# "tag_vocab.pkl" and "args.json" from it and loads weights from its
# "checkpoints" entry.
# NOTE(review): the output recorded for this cell is
# "FileNotFoundError: ... './content/output/tag_vocab.pkl'", i.e. this relative
# path did not resolve from the kernel's working directory at run time.
# Confirm the intended location (e.g. whether a leading "./" is wanted) before re-running.
model_path = "./content/output/"
tagger, tag_vocab, train_config = load_checkpoint(model_path)

# Function to tokenize and tag a sentence
def predict_sentence(sentence, tagger, tag_vocab, tokenizer):
    """
    Tag a single sentence and merge "##" continuation pieces back into words.

    :param sentence: str - raw text to tag
    :param tagger: object with a ``predict`` method; it is called with the list
                   of token ids and is assumed to return one tag index per id
                   (TODO confirm against the trainer class actually loaded)
    :param tag_vocab: object whose ``itos`` maps a tag index to its tag string
    :param tokenizer: callable tokenizer whose result exposes "input_ids" with
                      the batch dimension first, and which provides
                      ``convert_ids_to_tokens``
    :return: list of dict - one ``{"word": str, "entity": str}`` per merged
             word, in input order. A merged word keeps the tag of its first
             piece. Special tokens the tokenizer inserts are not filtered out.
    """
    # Tokenize the sentence
    tokens = tokenizer(sentence, truncation=True, padding=True, max_length=512, return_tensors="pt")

    # Only the first (and only) row of the batch is used, as plain Python ints.
    input_ids = tokens["input_ids"][0].tolist()

    # Make predictions using the tagger
    predictions = tagger.predict(input_ids)

    # Convert predicted tag IDs to tag strings
    tags = [tag_vocab.itos[prediction] for prediction in predictions]

    # Resolve all ids to token strings in one call instead of one call per id.
    pieces = tokenizer.convert_ids_to_tokens(input_ids)

    entities = []
    # zip stops at the shorter sequence, so surplus tokens or tags are dropped.
    for piece, tag in zip(pieces, tags):
        is_continuation = piece.startswith("##")
        if is_continuation and entities:
            # Glue the continuation onto the word it belongs to; its own tag is ignored.
            entities[-1]["word"] += piece[2:]
        else:
            # A continuation piece with no preceding word (sequence starts with
            # "##...") opens a new entry instead of raising IndexError.
            word = piece[2:] if is_continuation else piece
            entities.append({"word": word, "entity": tag})

    return entities

# Example usage
# Tags one Arabic sentence with the tagger and tag vocabulary loaded earlier in
# this cell and prints one "word<TAB>tag" line per entry that
# predict_sentence() returns.
text = 'محمد محمد محمد أبو تريكة (مواليد 7 نوفمبر 1978) لاعب كرة قدم دولي مصري سابق'
# NOTE(review): "your-pretrained-model" looks like a placeholder rather than a
# real model identifier. Replace it with the tokenizer that matches the
# checkpoint's encoder (presumably recorded in the training configuration - TODO confirm).
tokenizer = AutoTokenizer.from_pretrained("your-pretrained-model")
predictions = predict_sentence(text, tagger, tag_vocab, tokenizer)

# Both fields are concatenated as strings, separated by a tab.
for item in predictions:
    print(item["word"] + "\t" + item["entity"])

  from .autonotebook import tqdm as notebook_tqdm


FileNotFoundError: [Errno 2] No such file or directory: './content/output/tag_vocab.pkl'