In [1]:
from datasets import load_dataset

squad = load_dataset("squad", split="train")
squad = squad.train_test_split(test_size=0.2, seed=42)

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [3]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

In [4]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")




Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# ✅ Use GPU if available
import torch
from transformers import Trainer, TrainingArguments

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"There are {torch.cuda.device_count()} GPU(s) available.")
    print("Device name:", torch.cuda.get_device_name(0))
    model.to(device)
else:
    device = torch.device("cpu")
    print("No GPU found, using CPU.")

# ✅ Training configuration (optimized for disk usage)
training_args = TrainingArguments(
    output_dir="drill04_qa_model",       # folder to save model
    eval_strategy="epoch",         # correct parameter name
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    save_strategy="epoch",               # save checkpoint only once per epoch
    save_total_limit=1,                  # keep only the last checkpoint
    load_best_model_at_end=True,         # optional, keeps best checkpoint
    push_to_hub=False,
    report_to="none",                    # disable wandb or tensorboard logs
    logging_dir=None,                    # avoid creating logging folders
)

# ✅ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,                 # fixed from "processing_class"
    data_collator=data_collator,
)

# ✅ Try to resume training if a checkpoint exists
import os
last_checkpoint = None
if os.path.isdir(training_args.output_dir):
    from transformers.trainer_utils import get_last_checkpoint
    last_checkpoint = get_last_checkpoint(training_args.output_dir)

if last_checkpoint:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("Starting training from scratch...")
    trainer.train()


There are 1 GPU(s) available.
Device name: NVIDIA GeForce RTX 4060


  trainer = Trainer(


Starting training from scratch...


Epoch,Training Loss,Validation Loss
1,1.3756,1.23928
2,1.1109,1.132986
3,0.9865,1.111809
4,0.871,1.124387


In [6]:
# Save the model and tokenizer
trainer.save_model("./drill04_qa_model")
tokenizer.save_pretrained("./drill04_qa_model")

('./drill04_qa_model\\tokenizer_config.json',
 './drill04_qa_model\\special_tokens_map.json',
 './drill04_qa_model\\vocab.txt',
 './drill04_qa_model\\added_tokens.json',
 './drill04_qa_model\\tokenizer.json')

In [8]:
max_length = 384
stride = 128


def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

validation_dataset = squad["test"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=squad["test"].column_names,
)
len(squad["test"]), len(validation_dataset)

(17520, 17688)

In [9]:
from tqdm.auto import tqdm
import collections
import numpy as np
import evaluate
metric = evaluate.load("squad")

n_best = 20
max_answer_length = 30

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [10]:
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, squad["test"])

100%|██████████| 17520/17520 [00:26<00:00, 661.58it/s]


{'exact_match': 63.37328767123287, 'f1': 77.44755063039003}

In [None]:
Matched_answer=0
for n in range(0, 200):
    question = squad['test']['question'][n]
    context=squad['test']['context'][n]
    answer=squad['test']['answers'][n]
    
    from transformers import pipeline
    question_answerer = pipeline("question-answering", 
                                 model="./drill04_qa_model",
                                tokenizer="./drill04_qa_model",
                                fp16=True)

    result = question_answerer(question=question, context=context)
    if answer['text'][0]==result['answer']:
        Matched_answer+=1
    print(answer['text'][0],result['answer'],Matched_answer)

print(Matched_answer/200*100)

Device set to use cuda:0
Device set to use cuda:0


Hunting Hunting 1


Device set to use cuda:0


10 Downing Street 10 Downing Street 2
Top 20 Hot 100 Songwriters Top 20 2


Device set to use cuda:0
Device set to use cuda:0


Taksim Square Sultanahmet Square 2
Manhattan Manhattan 3


Device set to use cuda:0
Device set to use cuda:0


reality Theravada 3
President of the Republic President of the Republic 4


Device set to use cuda:0
Device set to use cuda:0


sound economic policymaking and good governance provide a sound economic policymaking and good governance 4
murder by government murder 4


Device set to use cuda:0
Device set to use cuda:0


Vogue Vogue 5
60% 60% 6


Device set to use cuda:0
Device set to use cuda:0


on school collapses school collapses 6
14 14 7


Device set to use cuda:0
Device set to use cuda:0


strangers strangers 8
3,000 3,850,000 8


Device set to use cuda:0
Device set to use cuda:0


Romantic era Romantic era 9
because it was caused by a different fault. a different fault 9


Device set to use cuda:0
Device set to use cuda:0


Beyoncé Beyoncé 10
cyberctm.com cyberctm.com 11


Device set to use cuda:0
Device set to use cuda:0


January 7, 2012 January 7, 2012 12


Device set to use cuda:0


three three 13


Device set to use cuda:0


Governments Governments 14


Device set to use cuda:0


2,791 km (1,734 mi) 2,791 km (1,734 mi) 15


Device set to use cuda:0


three times three 15


Device set to use cuda:0


very destructive XI 15


Device set to use cuda:0


the judges judges 15


Device set to use cuda:0


Warsaw Warsaw, which after 1815 became part of Congress Poland 15


Device set to use cuda:0


Albert Grzymała Julian Fontana 15


Device set to use cuda:0


Paul Volcker Paul Volcker 16


Device set to use cuda:0


35,000 35,000 17


Device set to use cuda:0


John Field John Field 18


Device set to use cuda:0


Foxy Brown Tell 'Em Why U Madd 18


Device set to use cuda:0


Wii Remote the motion sensors and built-in speaker of the Wii Remote 18


Device set to use cuda:0


1759 1759 19


Device set to use cuda:0


eight eight 20


Device set to use cuda:0


48 48 21


Device set to use cuda:0


House of Deréon House of Deréon 22


Device set to use cuda:0


six six 23


Device set to use cuda:0


supervising agencies the supervising agencies 23


Device set to use cuda:0


Do I Make You Proud My Destiny 23


Device set to use cuda:0


a working fluid is heated by the concentrated sunlight lenses or mirrors and tracking systems 23


Device set to use cuda:0


Joseph Broussard Joseph Broussard 24


Device set to use cuda:0


the process of his/her appointment and dismissal functions and powers 24


Device set to use cuda:0


language language 25


Device set to use cuda:0


the Hôtel de France on the Rue Lafitte 38 Rue de la Chaussée-d'Antin 25


Device set to use cuda:0


1838 1838 26


Device set to use cuda:0


Soha Ali Khan Soha Ali Khan 27


Device set to use cuda:0


Ai Weiwei Ai Weiwei 28


Device set to use cuda:0


outside outside 29


Device set to use cuda:0


north north 30


Device set to use cuda:0


JFK International Airport Grand Central Terminal and Pennsylvania Station 30


Device set to use cuda:0


Jax Jax 31


Device set to use cuda:0


Józef Elsner Józef Elsner 32


Device set to use cuda:0


the Album Era Album 32


Device set to use cuda:0


20 km deep 240 km long by 20 km deep 32


Device set to use cuda:0


Midna's helmet helmet 32


Device set to use cuda:0


Gautama Buddha Gautama Buddha 33


Device set to use cuda:0


the Ming dynasty the Tümed Mongols 33


Device set to use cuda:0


advertising theater, finance 33


Device set to use cuda:0


iTunes Store Universal Music Group 33


Device set to use cuda:0


Ford Motor Company Coca Cola 33


Device set to use cuda:0


Warsaw Conservatory Paris 33


Device set to use cuda:0


CDS credit default swaps 33


Device set to use cuda:0


$20 million $20 million 34


Device set to use cuda:0


Patrick Hillery Patrick Hillery 35


Device set to use cuda:0


identifies a path to this cessation the complete cessation of dukkha is possible 35


Device set to use cuda:0


within 72 hours of the main quake. 72 hours 35


Device set to use cuda:0


18,498 18,498 36


Device set to use cuda:0


19 countries 19 36


Device set to use cuda:0


the Ming court missions to the Ming court 36


Device set to use cuda:0


1964 1964 37


Device set to use cuda:0


IMF IMF 38


Device set to use cuda:0


fast economic growth with increasing consumption and purchase of new automobiles fast economic growth 38


Device set to use cuda:0


the US military the US military 39


Device set to use cuda:0


among the most critically acclaimed "most important artist of any art form 39


Device set to use cuda:0


progressive familiar and traditional 39


Device set to use cuda:0


the Death Star the Death Star 40


Device set to use cuda:0


cyber cyber 41


Device set to use cuda:0


Ögedei Khan Ögedei Khan 42


Device set to use cuda:0


American pragmatism American pragmatism 43


Device set to use cuda:0


345 345 44


Device set to use cuda:0


20th century 20th century 45


Device set to use cuda:0


that he was being investigated on suspicion of the crime of inciting subversion he was being investigated on suspicion of the crime of inciting subversion 45


Device set to use cuda:0


Timothy Geithner Timothy Geithner 46


Device set to use cuda:0


understanding understanding of cognition, emotion, motivation, and similar psychological processes 46


Device set to use cuda:0


1901 1901 47


Device set to use cuda:0


the National Library of Australia National Library of Australia 47


Device set to use cuda:0


ischaemic heart disease and cerebrovascular disease ischaemic heart disease and cerebrovascular disease 48


Device set to use cuda:0


man genes to acculturation 48


Device set to use cuda:0


1945 1945 49


Device set to use cuda:0


Hizb ut-Tahrir Hizb ut-Tahrir 50


Device set to use cuda:0


Lady Gaga Lady Gaga 51


Device set to use cuda:0


new motorways new motorways 52


Device set to use cuda:0


foreign correspondents Beijing citizen 52


Device set to use cuda:0


18th century 18th century 53


Device set to use cuda:0


J. B. Lippincott J. B. Lippincott 54


Device set to use cuda:0


Taoiseach Irish Taoiseach 54


Device set to use cuda:0


regulatory framework regulatory framework 55


Device set to use cuda:0


Kelly Clarkson Kelly Clarkson 56


Device set to use cuda:0


Aspiro Tidal 56


Device set to use cuda:0


Chicago's Divinity School Chicago's Divinity School 57


Device set to use cuda:0


26 17 57


Device set to use cuda:0


Silicon Alley Silicon Alley 58


Device set to use cuda:0


Milan operatic 58


Device set to use cuda:0


the Buddhist community the Buddhist 58


Device set to use cuda:0


Solar Total Energy Project (STEP) in Shenandoah, Georgia, USA Solar Total Energy Project 58


Device set to use cuda:0


its Fighting Irish football team Fighting Irish football team 58


Device set to use cuda:0


emic cultural relativism 58


Device set to use cuda:0


four fourth 58


Device set to use cuda:0


how the universe works Process and Reality 58


Device set to use cuda:0


1997–2007 1997–2007 59


Device set to use cuda:0


2014 2009 59


Device set to use cuda:0


Wilhelm Würfel Wilhelm Würfel 60


Device set to use cuda:0


DONDA Adidas 60


Device set to use cuda:0


hydropower plants on its rivers hydropower plants 60


Device set to use cuda:0


David Cook Cook and Archuleta. David Cook 60


Device set to use cuda:0


King's College King's College 61


Device set to use cuda:0


tathāgatagarbha East Asian Buddhism 61


Device set to use cuda:0


$144.5 billion $144.5 billion 62


Device set to use cuda:0


Politecnico Politecnico 63


Device set to use cuda:0


Farrah Franklin Farrah Franklin and Michelle Williams 63


Device set to use cuda:0


Eon's official social media accounts. Eon 63


Device set to use cuda:0


the 2008 World Music Awards 2008 63


Device set to use cuda:0


iTunes third-party 63


Device set to use cuda:0


57.6% 57.6% 64


Device set to use cuda:0


Organization for the Harmonization of Business Law in Africa Organization for the Harmonization of Business Law in Africa 65


Device set to use cuda:0


five five 66


Device set to use cuda:0


virus plasmids 66


Device set to use cuda:0


33% 33% 67


Device set to use cuda:0


Danny Gokey Danny Gokey 68


Device set to use cuda:0


2 GB 128 GB 68


Device set to use cuda:0


Theravada Buddhism Theravada 68


Device set to use cuda:0


Tidal Tidal 69


Device set to use cuda:0


$1.57 million $1.57 million 70


Device set to use cuda:0


Jay Z Jay Z 71


Device set to use cuda:0


1920 1920 72


Device set to use cuda:0


17 17 73


Device set to use cuda:0


Carrefour Carrefour 74


Device set to use cuda:0


southwestern China southwestern China 75


Device set to use cuda:0


1,032,949 1,032,949 76


Device set to use cuda:0


about 1% about 1% 77


Device set to use cuda:0


2 two 77


Device set to use cuda:0


1995 1995 78


Device set to use cuda:0


The Recording Industry Association of America The Recording Industry Association of America (RIAA) 78


Device set to use cuda:0


Kimat Kimat 79
data storage external data storage devices 79


Device set to use cuda:0
Device set to use cuda:0


Nālandā University Nālandā University 80


Device set to use cuda:0


nicotine nicotine 81
1903 1903 82


Device set to use cuda:0
Device set to use cuda:0


the Dominican Republic Dominican Republic, Jamaica, Haiti, and Trinidad and Tobago 82
Carrie Underwood Faith Hill 82


Device set to use cuda:0
Device set to use cuda:0


January 19, 2011 January 19, 2011 83
The word genocide is the combination of the Greek prefix geno- (meaning tribe or race) and caedere (the Latin word for to kill). caedere (the Latin word for to kill). 83


Device set to use cuda:0
Device set to use cuda:0


tandem repeats and interspersed repeats tandem repeats and interspersed repeats 84
2003 2003 85


Device set to use cuda:0
Device set to use cuda:0


November 22 November 22 86


Device set to use cuda:0


time and space time and space 87
Simon Cowell Simon Cowell 88


Device set to use cuda:0
Device set to use cuda:0


The Jesuits The Jesuits 89
2003 2003 90


Device set to use cuda:0
Device set to use cuda:0


Uighur activists Uighur 90


Device set to use cuda:0


intelligence knowledge for applying to solve problems 90


Device set to use cuda:0


15 July 2007 15 July 2007 91


Device set to use cuda:0


Paul Krugman Paul Krugman 92


Device set to use cuda:0


Blenheim Palace in Oxfordshire Blenheim Palace 92


Device set to use cuda:0


1524 1524 93


Device set to use cuda:0


DioGuardi DioGuardi 94


Device set to use cuda:0


1990s and 2000s 1990s and 2000s 95


Device set to use cuda:0


"Runaway" Runaway 95


Device set to use cuda:0


to seek out the Karmapa to seek out the Karmapa 96


Device set to use cuda:0


Van Ness Avenue waterfront pier 96


Device set to use cuda:0


East Asian East Asian 97


Device set to use cuda:0


three six 97


Device set to use cuda:0


4.0 3.3 and 3.4 97


Device set to use cuda:0


New York Knicks the New York Liberty 97


Device set to use cuda:0


weak bass response weak bass 97


Device set to use cuda:0


several concerts concerts 97


Device set to use cuda:0


1958 1958 98


Device set to use cuda:0


gaejang-guk gaejang-guk 99


Device set to use cuda:0


śūnyatā śūnyatā 100


Device set to use cuda:0


satellite images of the quake-stricken areas satellite images 100


Device set to use cuda:0


Jimmy Iovine Jimmy Iovine 101


Device set to use cuda:0


2006 September 12, 2006 101


Device set to use cuda:0


t located 15 km southwest of Dushanbe Ayni Air Base 101


Device set to use cuda:0


flashback flashback 102


Device set to use cuda:0


Carroll O'Connor All in the Family (played by Carroll O'Connor 102


Device set to use cuda:0


Heartless 808s & Heartbreak 102


Device set to use cuda:0


four four 103


Device set to use cuda:0


1898 1911 103


Device set to use cuda:0


BBC Radio 1 BBC Radio 1 104


Device set to use cuda:0


December 2009 December 2009 105


Device set to use cuda:0


Fr. Zahm Fr. Zahm 106


Device set to use cuda:0


83,000 square feet 83,000 106


Device set to use cuda:0


Father John Francis O'Hara Holy Cross Father John Francis O'Hara 106


Device set to use cuda:0


June 2002 June 2002 107


Device set to use cuda:0


artificial intelligence artificial intelligence 108


Device set to use cuda:0


Kara DioGuardi Kara DioGuardi 109


Device set to use cuda:0


Brick game Brick 109


Device set to use cuda:0


American Society for Microbiology (ASM), American Public Health Association (APHA) and the American Medical Association (AMA) the EU 109
Community colleges community school model 109
54.50000000000001


In [None]:
question = "Which car costs $20,000?"
context="The blue car costs $20,000 and the red car costs $25,000."
print(question)
from transformers import pipeline
question_answerer = pipeline("question-answering", 
                             model="./drill04_qa_model",
                            tokenizer="./drill04_qa_model",
                            fp16=True)

result = question_answerer(question=question, context=context)
print(result['answer'])

Device set to use cuda:0


Which car costs $20,000?
The blue car
