**Question Answering Task**

In [2]:
# Install necessary libraries
!pip install datasets transformers evaluate seqeval



Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=6503889a6ee3ddbf5b6e96108e2de7185716d576db1de08ea5e6acc3b908ba59
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval, evaluate
Successfully installed evaluate-0.4.5 seqeval-1.2.2


In [9]:
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    pipeline,
    default_data_collator
)

# Report which accelerator this session can see.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Load the "squad" dataset and the checkpoint that will be fine-tuned.
dataset = load_dataset("squad")
model_name = "distilbert-base-uncased-distilled-squad"

# The model and its tokenizer both come from the same checkpoint name.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and label each answer span.

    Args:
        examples: Batch dict with parallel lists under "question", "context"
            and "answers". Each "answers" entry is a dict holding an
            "answer_start" list (character offsets into the context) and a
            "text" list; only the first answer of each example is used.

    Returns:
        The tokenizer output with "start_positions" and "end_positions"
        added (one token index per example) and "offset_mapping" removed.
        An example is labelled (0, 0) when it has no answer or when the
        answer does not lie entirely inside the (possibly truncated) context.
    """
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]
    answers = examples["answers"]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True
    )

    start_positions = []
    end_positions = []

    for i, offset in enumerate(inputs["offset_mapping"]):
        answer = answers[i]

        # Examples without any annotated answer get the (0, 0) label.
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])  # exclusive end offset
        sequence_ids = inputs.sequence_ids(i)

        # Locate the first and last token that belong to the context
        # (sequence id 1); question, special and padding tokens are skipped.
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        context_end = len(offset) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # Answer not fully contained in the kept part of the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Start token: the last context token that begins at or before
        # start_char. Scanning past it and stepping back avoids selecting a
        # preceding token that merely *ends* where the answer starts.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # End token: the first context token (from the right) that ends at or
        # after end_char. This avoids selecting a following token that merely
        # *starts* where the answer ends.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs.pop("offset_mapping")
    return inputs

# Tokenize every split of the dataset; the original columns are dropped so that
# only the tokenizer outputs and the span labels remain.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Subsample the first 2000 training examples for fast training.
train_subset = tokenized_dataset["train"].select(range(2000))

training_args = TrainingArguments(
    output_dir="./qa_model",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    report_to="none",
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer
# (the recorded run of this cell emitted that deprecation warning).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    processing_class=tokenizer,
    data_collator=default_data_collator,
)

trainer.train()
trainer.save_model("./qa_model")

# Reload the saved checkpoint through a pipeline; use the first GPU when one
# is available, otherwise fall back to the CPU (-1).
qa_pipeline = pipeline(
    "question-answering",
    model="./qa_model",
    tokenizer="./qa_model",
    device=0 if torch.cuda.is_available() else -1,
)

# Smoke-test the fine-tuned model on a single hand-written example.
result = qa_pipeline({
    "context": "Kaggle is an online platform for data science competitions.",
    "question": "What is Kaggle?"
})
print("Answer:", result["answer"])


Using device: cuda


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss


Device set to use cuda:0


Answer: an online platform for data science competitions.


**Named Entity Recognition (NER) Task**

In [12]:
# Install necessary libraries
!pip install datasets transformers evaluate seqeval





In [14]:
pip install kaggle




In [17]:
from datasets import load_dataset

# Load the JSON file at /content/train.json as the "train" split.
# NOTE(review): absolute path - presumably a Colab upload location; confirm
# before running this notebook anywhere else.
dataset = load_dataset("json", data_files={"train": "/content/train.json"})
# Keep a direct handle on the train split for the cells that follow.
train_dataset = dataset["train"]


Generating train split: 0 examples [00:00, ? examples/s]

In [20]:
# Collect every distinct label string that occurs in the training examples and
# sort them so that the label <-> id assignment is deterministic.
unique_labels = sorted(
    set(label for example in train_dataset for label in example["labels"])
)

# Forward (label -> id) and reverse (id -> label) lookup tables; ids are the
# positions of the labels in the sorted list.
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))

# Convert string labels to integer IDs
def encode_labels(example):
    """Add an integer-encoded copy of the example's labels.

    Each string in example["labels"] is looked up in the module-level
    `label2id` mapping and the resulting ids are stored under "ner_tags".
    The example dict is modified in place and also returned.
    """
    tag_ids = []
    for label_name in example["labels"]:
        tag_ids.append(label2id[label_name])
    example["ner_tags"] = tag_ids
    return example

# Run encode_labels over the dataset so it gains the integer "ner_tags" column;
# the name train_dataset is rebound to the mapped result.
train_dataset = train_dataset.map(encode_labels)


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [21]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Checkpoint used for the NER section; these assignments rebind `model_name`,
# `tokenizer` and `model` to the token-classification setup.
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The number of output labels follows the label vocabulary, and both lookup
# tables are handed to the model alongside it.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to the tokens.

    Args:
        examples: Batch dict with "tokens" (a list of word lists) and
            "ner_tags" (a list of per-word integer label lists).

    Returns:
        The tokenizer output with a "labels" entry added. For each word only
        its first token carries the word's label; special tokens, padding and
        the remaining tokens of a word are set to -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        padding="max_length",
    )

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for word in encoded.word_ids(batch_index=batch_idx):
            # A token gets a real label only when it opens a new word.
            opens_new_word = word is not None and word != last_word
            row.append(word_labels[word] if opens_new_word else -100)
            last_word = word
        aligned_labels.append(row)

    encoded["labels"] = aligned_labels
    return encoded

# Keep only the first 2000 examples, then tokenize them in batches with
# tokenize_and_align_labels. This rebinds `tokenized_dataset`, which the
# question-answering section also assigned.
tokenized_dataset = train_dataset.select(range(2000)).map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [23]:
import numpy as np
import evaluate

# Load the "seqeval" metric through the evaluate library; compute_metrics
# below calls metric.compute on the decoded label sequences.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw token-classification outputs into overall seqeval scores.

    Args:
        p: A (predictions, labels) pair. The predictions are arg-maxed over
            axis 2 to obtain one label id per token; positions whose label
            is -100 are ignored.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's overall_* results.
    """
    logits, label_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    # Reference tag names, skipping the ignored (-100) positions.
    true_labels = []
    for label_row in label_ids:
        true_labels.append([id2label[tag] for tag in label_row if tag != -100])

    # Predicted tag names at exactly the positions kept above.
    true_predictions = []
    for pred_row, label_row in zip(predicted_ids, label_ids):
        true_predictions.append(
            [id2label[guess] for guess, tag in zip(pred_row, label_row) if tag != -100]
        )

    results = metric.compute(predictions=true_predictions, references=true_labels)

    report = {}
    for short_name in ("precision", "recall", "f1", "accuracy"):
        report[short_name] = results["overall_" + short_name]
    return report


Downloading builder script: 0.00B [00:00, ?B/s]

In [24]:
from transformers import TrainingArguments, Trainer, default_data_collator

# Hyperparameters for a single short fine-tuning epoch; logging to external
# trackers is disabled via report_to="none".
training_args = TrainingArguments(
    output_dir="./ner_model",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    report_to="none"
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer
# (the recorded run of this cell emitted that deprecation warning).
# NOTE(review): compute_metrics is wired in but no eval_dataset is passed, so
# presumably no metrics are reported during this run - confirm whether a
# held-out split should be added.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.save_model("./ner_model")


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Step,Training Loss


In [28]:
from transformers import pipeline

# Reload the saved NER checkpoint through a pipeline. The device is chosen the
# same way as for the question-answering pipeline: the first GPU when CUDA is
# available, otherwise the CPU (-1). A hard-coded device=0 fails on CPU-only
# machines.
ner_pipeline = pipeline(
    "ner",
    model="./ner_model",
    tokenizer="./ner_model",
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1
)

# Hand-written sample sentence used as a smoke test of the fine-tuned model.
text = (
    "Nathalie Sylla presented a mind map in April 2021. Her email is nathalie.sylla@example.com "
    "and she works at Les Éditions d'Organisation."
)
entities = ner_pipeline(text)
print("Entities:", entities)



Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entities: [{'entity_group': 'NAME_STUDENT', 'score': np.float32(0.8122435), 'word': 'Nathalie Sylla', 'start': 0, 'end': 14}, {'entity_group': 'NAME_STUDENT', 'score': np.float32(0.4499068), 'word': 'nathal', 'start': 63, 'end': 70}]
