In [None]:
!pip install transformers datasets huggingface_hub

In [None]:
import pandas as pd
import numpy as np

splits = {'train': 'data/train-00000-of-00001-ec54fbe500fc3b5c.parquet', 'validation': 'data/validation-00000-of-00001-3cf888b12fff1dd6.parquet'}
train_df = pd.read_parquet("hf://datasets/lucadiliello/newsqa/" + splits["train"])
valid_df = pd.read_parquet("hf://datasets/lucadiliello/newsqa/" + splits["validation"])

In [None]:
train_df.head()

In [None]:
train_df=train_df[0:30000]
train_df

In [None]:
train_df.loc[:,"labels"][2][0]["start"]

In [None]:
type(train_df.iloc[2][2])

In [None]:
train_df.iloc[1,0]

In [None]:
import torch

In [None]:


def read_data(df):

    contexts = []
    questions = []
    answers = []
    length=len(df)
    for i in range(length):
      context=df.iloc[i][0]
      question=df.iloc[i][1]
      answer=df.iloc[i][2][0]
      context={
          'text':context,
      }
      question={
          'text':question,
      }
      answer={
          'text':answer,
      }
      contexts.append(context)
      questions.append(question)
      answers.append(answer)

    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_data(train_df)
val_contexts, val_questions, val_answers = read_data(valid_df)

In [None]:
train_contexts[0]

In [None]:
device="cuda" if torch.cuda.is_available() else "cpu"

In [None]:
import huggingface_hub

from huggingface_hub import notebook_login

notebook_login()

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"



model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)




In [None]:
print(model.config)

### Preprocessing

In [None]:
def add_end_idx(answers, contexts,df):

 for i in range(len(df)):

   start_idx=df.loc[:,"labels"][i][0]["start"]

   end_idx=df.loc[:,"labels"][i][0]["end"]

   answers[i]["start_idx"]=str(start_idx[0])

   answers[i]["end_idx"]=str(end_idx[0])






add_end_idx(train_answers, train_contexts,train_df)

add_end_idx(val_answers, val_contexts,valid_df)

In [None]:
def tokenize_in_batches(tokenizer, contexts, questions, batch_size=1000):
    encodings = {'input_ids': [], 'attention_mask': [],"offset_mapping":[]}

    # Iterate over the data in chunks
    for i in range(0, len(contexts), batch_size):
        batch_contexts = contexts[i:i + batch_size]
        batch_questions = questions[i:i + batch_size]

        # Tokenize a small batch
        batch_encodings = tokenizer(
            batch_contexts,
            batch_questions,
            truncation=True,
            padding=True,
            return_tensors=None,
            return_offsets_mapping=True,
        )

        # Append results to the main encodings dictionary
        for key in encodings.keys():
          if key in batch_encodings:
            encodings[key].extend(batch_encodings[key])


    return encodings

In [None]:
train_contexts_list=[d['text'] for d in train_contexts]
train_questions_list=[d['text'] for d in train_questions]
val_contexts_list=[d['text'] for d in val_contexts]
val_questions_list=[d['text'] for d in val_questions]

train_encodings=tokenize_in_batches(tokenizer,train_contexts_list,train_questions_list,batch_size=4096)
val_encodings=tokenize_in_batches(tokenizer,val_contexts_list,val_questions_list,batch_size=4096)

### Token start and end positions

In [None]:
train_answers[0]

In [None]:
def add_token_positions(encodings, answers, tokenizer):
    start_positions = []
    end_positions = []

    # Iterate over the total number of examples
    for i in range(len(answers)):
        answer_start_char = int(answers[i]['start_idx'])
        answer_end_char = int(answers[i]['end_idx']) - 1 # inclusive end char

        # Get the offset mapping for the current example (i)
        offset_map = encodings['offset_mapping'][i]

        start_token = None
        end_token = None

        # 1. Manually find the START token index
        for token_idx, (start_char, end_char) in enumerate(offset_map):
            if start_char <= answer_start_char < end_char:
                start_token = token_idx

            # 2. Manually find the END token index

            if start_char <= answer_end_char < end_char:
                end_token = token_idx


        if start_token is None:
            start_token = tokenizer.model_max_length

        if end_token is None:
            end_token = tokenizer.model_max_length

        start_positions.append(start_token)
        end_positions.append(end_token)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers, tokenizer)
add_token_positions(val_encodings, val_answers, tokenizer)

In [None]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [None]:
print(f"Input IDs Length: {len(train_encodings['input_ids'])}")
for key, val in train_encodings.items():
    print(f"Length of {key}: {len(val)}")

In [None]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader=DataLoader(val_dataset,batch_size=16)

optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in tqdm(range(3)):
    total_train_loss=0
    for index,batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        total_train_loss+=loss.item()

        if index%400==0:
          print(f"gone through {index*16}/{len(train_loader)*16} training examples")

    avg_train_loss=total_train_loss/(len(train_loader))


    model.eval()
    with torch.inference_mode():
        total_test_loss=0
        for index,batch in enumerate(eval_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs[0]
            total_test_loss+=loss.item()

    avg_test_loss=total_test_loss/(len(eval_loader))

    print(f"epoch: {epoch} |  avg_train_loss : {avg_train_loss}  |   avg_test_loss : {avg_test_loss}")


## Approach 2: Since prev approach didn't work

In [1]:
import pandas as pd
import numpy as np
import huggingface_hub
import torch 
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
# 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

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:


splits = {'train': 'data/train-00000-of-00001-ec54fbe500fc3b5c.parquet', 'validation': 'data/validation-00000-of-00001-3cf888b12fff1dd6.parquet'}
train_df = pd.read_parquet("hf://datasets/lucadiliello/newsqa/" + splits["train"])
val_df=pd.read_parquet("hf://datasets/lucadiliello/newsqa/" + splits["validation"])

In [4]:
train_df.head()

Unnamed: 0,context,question,answers,key,labels
0,"NEW DELHI, India (CNN) -- A high court in nort...",What was the amount of children murdered?,[19],da0e6b66e04d439fa1ba23c32de07e50,"[{'end': [295], 'start': [294]}]"
1,"NEW DELHI, India (CNN) -- A high court in nort...",When was Pandher sentenced to death?,[February.],724f6eb9a2814e4fb2d7d8e4de846073,"[{'end': [269], 'start': [261]}]"
2,"NEW DELHI, India (CNN) -- A high court in nort...",The court aquitted Moninder Singh Pandher of w...,[rape and murder],d64cbb90e5134081acfa83d3e702408c,"[{'end': [638], 'start': [624]}]"
3,"NEW DELHI, India (CNN) -- A high court in nort...",who was acquitted,[Moninder Singh Pandher],fd7177ee6f1f4d62becd983a0305f503,"[{'end': [216], 'start': [195]}]"
4,"NEW DELHI, India (CNN) -- A high court in nort...",who was sentenced,[Moninder Singh Pandher],cd25c69f631349748ccdeccaace66463,"[{'end': [216], 'start': [195]}]"


In [None]:
!pip install --upgrade --force-reinstall transformers huggingface_hub

In [5]:
train_contexts,train_questions=train_df["context"],train_df["question"]
val_contexts,val_questions=val_df["context"],val_df["question"]

In [6]:
len(train_df)


74160

In [7]:
train_df.iloc[2,4][0]['start']


array([624])

In [8]:
train_df["labels"][2][0]

{'end': array([638]), 'start': array([624])}

In [9]:
def add_start_end_idx(dataframe):
    answers=[]
    for i in range(len(dataframe)):
        answer=dataframe.iloc[i,2][0]
        start_idx=dataframe.iloc[i,4][0]["start"]
        end_idx=dataframe.iloc[i,4][0]["end"]
        answers.append({
            'text':answer,
            'start_idx':start_idx,
            'end_idx':end_idx,
        })
    return answers

train_answers=add_start_end_idx(train_df)
val_answers=add_start_end_idx(val_df)

In [None]:
!pip install --upgrade --force-reinstall numpy scikit-learn

In [12]:
import transformers
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(list(train_contexts), list(train_questions), truncation=True, padding=True)
val_encodings = tokenizer(list(val_contexts), list(val_questions), truncation=True, padding=True)

In [15]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['start_idx'][0]))
        end_positions.append(encodings.char_to_token(i, answers[i]['end_idx'][0] - 1))
        # if None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [16]:
import torch

class NewsQA_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = NewsQA_Dataset(train_encodings)
val_dataset = NewsQA_Dataset(val_encodings)

In [17]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

2025-10-23 06:45:14.798196: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761201914.999642     174 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761201915.066973     174 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.


AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

AttributeError: _ARRAY_API not found

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm  
from torch.cuda.amp import autocast, GradScaler

scaler=GradScaler()
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)
epochs=5

for epoch in range(epochs):
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    total_loss=0
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        with autocast():
          outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
          loss = outputs[0]
        scaler.scale(loss).backward()
        scaler.step(optim)
        scaler.update()
        total_loss+=(loss.item())
        progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})

    avg_loss=total_loss/(len(train_loader))
    print(f"epoch : {epoch+1}  |   avg_train_loss : {avg_loss}")



  scaler=GradScaler()


Epoch 1/5:   0%|          | 0/4635 [00:00<?, ?it/s]

  with autocast():


epoch : 1  |   avg_train_loss : 2.2916560040322707


Epoch 2/5:   0%|          | 0/4635 [00:00<?, ?it/s]

epoch : 2  |   avg_train_loss : 1.5191193713982385


Epoch 3/5:   0%|          | 0/4635 [00:00<?, ?it/s]

epoch : 3  |   avg_train_loss : 1.1325321708553688


Epoch 4/5:   0%|          | 0/4635 [00:00<?, ?it/s]

epoch : 4  |   avg_train_loss : 0.8709159991212647


Epoch 5/5:   0%|          | 0/4635 [00:00<?, ?it/s]

epoch : 5  |   avg_train_loss : 0.6924096364548034


In [27]:
def exact_match_score(predictions,true_labels):
    return (predictions.strip().lower()==true_labels.strip().lower())

def f1_score(predictions,true_labels):
    predictions = predictions.lower().split()
    true_labels = true_labels.lower().split()
    matches=set(predictions) & set(true_labels)
    if not matches:
       return 0.0
    precision=len(matches)/len(predictions)
    recall=len(matches)/len(true_labels)
    return 2 * precision * recall / (precision + recall)
  
 

In [28]:
model.eval()
f1_scores, em_scores = [], []

with torch.no_grad():
    for item in tqdm(val_dataset, desc="Evaluating"):
        input_ids = item["input_ids"].unsqueeze(0).to(device)
        attention_mask = item["attention_mask"].unsqueeze(0).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs.start_logits, dim=1).item()
        end_pred = torch.argmax(outputs.end_logits, dim=1).item()

        start_true = item["start_positions"].item()
        end_true = item["end_positions"].item()

        pred_answer = tokenizer.decode(input_ids[0][start_pred:end_pred+1], skip_special_tokens=True).strip()
        true_answer = tokenizer.decode(input_ids[0][start_true:end_true+1], skip_special_tokens=True).strip()

        f1_scores.append(f1_score(pred_answer, true_answer))
        em_scores.append(exact_match_score(pred_answer, true_answer))

Evaluating:   0%|          | 0/4212 [00:00<?, ?it/s]

In [31]:
avg_em_score=np.mean(em_scores)
avg_f1_score=np.mean(f1_scores)
print(f"Average em_score: {avg_em_score : .4f}")
print(f"Average_f1_score: {avg_f1_score : .4f}")

Average em_score:  0.4036
Average_f1_score:  0.5332


In [33]:

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [34]:
output_dir = "./my_finetuned_model_local"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir) 

hub_model_name = "Anirudh/Roberta"


model.push_to_hub(hub_model_name)
tokenizer.push_to_hub(hub_model_name)

HfHubHTTPError: (Request ID: Root=1-68f9e4a4-4074126816fcf7da61789a4e;e4fc53d3-d094-4f2f-8a06-67611668e30b)

403 Forbidden: You don't have the rights to create a model under the namespace "Anirudh".
Cannot access content at: https://huggingface.co/api/repos/create.
Make sure your token has the correct permissions.