In [2]:
# Install necessary libraries
!pip install transformers datasets seqeval torch accelerate shap lime



In [5]:
from datasets import load_dataset

# Load dataset in CoNLL format
dataset = load_dataset(
    "conll2003",
    data_files={"train": "/content/labeled_data1.conll", "validation": "/content/labeled_data2.conll"}
)

# Inspect the dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [7]:
from datasets import load_dataset
from transformers import AutoTokenizer # import the AutoTokenizer class

# Load the tokenizer here, before the function is called.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def tokenize_and_align_labels(examples):
    # Now 'tokenizer' is defined and accessible within the function
    tokenized_inputs = tokenizer(
        examples["tokens"],  # Input tokens
        truncation=True,     # Truncate sequences that exceed the model's max length
        padding="max_length",  # Apply padding to the max sequence length
        is_split_into_words=True,  # Tokens are already split
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):  # Iterate over labels
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map token to word
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:  # Special tokens like [CLS] and [SEP]
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only assign label to the first subword
                label_ids.append(label[word_idx])
            else:  # For subwords, append -100
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the updated function
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

In [8]:
!pip install evaluate



In [None]:
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments, AutoTokenizer
# Import load_metric from evaluate
from evaluate import load

metric = load("seqeval")

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
import numpy as np

# Extract unique NER tags from the dataset
unique_labels = set()  # Initialize an empty set
for example in dataset["train"]:
  unique_labels.update(example["ner_tags"])
label_list = list(unique_labels)  # Convert the set to a list


models = ["xlm-roberta-base", "distilbert-base-multilingual-cased", "bert-base-multilingual-cased"]
results = {}

for model_name in models:
  model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(unique_labels))
  # Assuming 'tokenized_datasets' is a dictionary-like object with 'train' and 'validation' keys
  train_dataset = tokenized_datasets['train']
  val_dataset = tokenized_datasets['validation']
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics
    )
  trainer.train()
  eval_results = trainer.evaluate()
  results[model_name] = eval_results

# Print comparison results
for model_name, metrics in results.items():
  print(f"Model: {model_name}")
  print(metrics)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mabrhambayih733[0m ([33mabrhambayih733-kifya[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [None]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

In [65]:
!pip install shap lime



In [None]:
import shap

# Example prediction
text_sample = ["በ 100 ብር እቃው ተሸጠ"]
tokenized_sample = tokenizer(text_sample, return_tensors="pt")

# Get model predictions
outputs = model(**tokenized_sample)
predictions = outputs.logits.argmax(dim=-1)

# SHAP Explainer
explainer = shap.Explainer(model)
shap_values = explainer(tokenized_sample)

# Visualize
shap.initjs()
shap.summary_plot(shap_values, feature_names=text_sample)

In [None]:
from lime.lime_text import LimeTextExplainer

explainer = LimeTextExplainer(class_names=unique_labels)

# Modify the lambda function to include truncation and padding
def predict_proba(texts):
  inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
  outputs = model(**inputs)
  # Assuming your model outputs logits, apply softmax to get probabilities
  probas = torch.nn.functional.softmax(outputs.logits, dim=-1)
  # Get the probabilities for the target class (index 0 in this case)
  return probas.detach().numpy()

explanation = explainer.explain_instance(sample_sentence, predict_proba, num_features=10)
explanation.show_in_notebook()