In [1]:
from transformers import AutoTokenizer

# Checkpoint identifier; later cells reuse `xlm_model` when loading models.
xlm_model = "xlm-roberta-base"

# Build the tokenizer that belongs to that checkpoint.
xlm_tokenizer = AutoTokenizer.from_pretrained(
    xlm_model,
)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Sample Amharic sentence used as the running example in the cells below.
amharic_text = "አማርኛ ቋንቋ እጅግ በጣም ጥሩ ነው።"

# Split the sentence into the tokenizer's token strings and display them.
tokens = xlm_tokenizer.tokenize(
    amharic_text,
)
print("Tokens:", tokens)



Tokens: ['▁አማርኛ', '▁ቋንቋ', '▁እጅግ', '▁በጣም', '▁ጥሩ', '▁ነው።']


In [3]:
# Look up the integer ID of every token string produced by `tokenize`.
input_ids = xlm_tokenizer.convert_tokens_to_ids(
    tokens,
)
print("Input IDs:", input_ids)

Input IDs: [129050, 87917, 70912, 36059, 83144, 7914]


In [5]:
# Encode the raw sentence in a single call, requesting PyTorch tensors ("pt")
# with padding and truncation switched on. Compared with the manual
# token -> ID conversion, the recorded output shows one extra ID at each end
# (0 and 2), plus an attention mask of the same length.
inputs = xlm_tokenizer(
    amharic_text,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

print("Input IDs:", inputs["input_ids"])
print("Attention Mask:", inputs["attention_mask"])


Input IDs: tensor([[     0, 129050,  87917,  70912,  36059,  83144,   7914,      2]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1]])


In [6]:
from transformers import AutoModelForSequenceClassification

import torch

# Load XLM-Roberta with a sequence-classification head and 3 output labels
# (the head type can be swapped for token classification to do NER).
# The classifier weights are newly initialised for this checkpoint (see the
# warning emitted on load), so the logits are not meaningful until the model
# has been fine-tuned on a downstream task.
model = AutoModelForSequenceClassification.from_pretrained(xlm_model, num_labels=3)
model.eval()  # explicitly select inference mode (no dropout)

# This cell only inspects the forward-pass output, so run it without autograd
# tracking: no gradient graph is built and the printed tensor has no grad_fn.
with torch.no_grad():
    outputs = model(**inputs)

# Logits (raw, unnormalised model predictions): one score per label.
logits = outputs.logits
print("Logits:", logits)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Logits: tensor([[-0.3776,  0.2017, -0.3907]], grad_fn=<AddmmBackward0>)


In [7]:
from transformers import AutoModelForTokenClassification

import torch

# Load XLM-Roberta with a token-classification (NER-style) head and 3 labels.
# NOTE: this rebinds `model`, `outputs` and `logits`, replacing the
# sequence-classification objects created in the previous cell.
# The classifier weights are newly initialised for this checkpoint (see the
# warning emitted on load), so the logits are not meaningful until the model
# has been fine-tuned on a downstream task.
model = AutoModelForTokenClassification.from_pretrained(xlm_model, num_labels=3)
model.eval()  # explicitly select inference mode (no dropout)

# This cell only inspects the forward-pass output, so run it without autograd
# tracking: no gradient graph is built and the printed tensor has no grad_fn.
with torch.no_grad():
    outputs = model(**inputs)

# Logits for each token: one row of 3 label scores per input position
# (including the two extra IDs the tokenizer adds around the sentence).
logits = outputs.logits
print("Logits for tokens:", logits)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Logits for tokens: tensor([[[-0.5625,  0.6909, -0.0972],
         [-0.5691,  0.3497, -0.0522],
         [-0.6140,  0.2838, -0.0574],
         [-0.4983,  0.4948, -0.0401],
         [-0.5481,  0.4104,  0.0268],
         [-0.5108,  0.4200, -0.0450],
         [-0.5517,  0.3558, -0.0216],
         [-0.5647,  0.6419, -0.0750]]], grad_fn=<ViewBackward0>)


In [12]:
!pip install langdetect


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     ------------------- ---------------- 524.3/981.5 kB 762.0 kB/s eta 0:00:01
     ---------------------------- ------- 786.4/981.5 kB 762.0 kB/s eta 0:00:01
     ---------------------------- ------- 786.4/981.5 kB 762.0 kB/s eta 0:00:01
     ---------------------------- ------- 786.4/981.5 kB 762.0 kB/s eta 0:00:01
     ------------------------------------ 981.5/981.5 kB 605.5 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: langdetect
  Buildin