In [1]:
!pip install transformers



In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
from torch.utils.data import DataLoader, Dataset
import json
from tqdm import tqdm



In [3]:
!pip install datasets



In [4]:
# Load the NewsQA dataset
from datasets import load_dataset
newsqa_dataset = load_dataset('lucadiliello/newsqa')

Downloading and preparing dataset parquet/lucadiliello--newsqa to /root/.cache/huggingface/datasets/parquet/lucadiliello--newsqa-206550e86bcc3ded/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/29.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.63M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/lucadiliello--newsqa-206550e86bcc3ded/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

### **Get data 📁**

Let's extract our data and store them into some data structures.

In [5]:
def read_newsqa_data(dataset):
    contexts = []
    questions = []
    answers = []
    string_ans = []

    for item in dataset:
        context = item['context']
        question = item['question']
        answer = {'answer_start': item['labels'][0]['start'][0], 'answer_end': item['labels'][0]['end'][0]}  # Assuming there's only one answer
        string_answer = item['answers'][0]
        
        contexts.append(context)
        questions.append(question)
        answers.append(answer)
        string_ans.append(string_answer)
    return contexts, questions, answers, string_ans

In [6]:
train_contexts, train_questions, train_answers, train_str_ans = read_newsqa_data(newsqa_dataset['train'].select(list(range(50000))))
valid_contexts, valid_questions, valid_answers, valid_str_ans = read_newsqa_data(newsqa_dataset['validation'].select(list(range(1000))))

In [7]:
valid_str_ans[:5]

['three different videos',
 'getting his chest waxed,',
 'environmental',
 'his chest',
 'Harrison Ford']

### **Tokenization 🔢**

In [8]:
# Initialize the RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained('deepset/roberta-base-squad2')
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
valid_encodings = tokenizer(valid_contexts, valid_questions, truncation=True, padding=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Next we need to convert our character start/end positions to token start/end positions. Why is that? Because our words converted into tokens, so the answer start/end needs to show the index of start/end token which contains the answer and not the specific characters in the context.

In [9]:
# Convert character start/end positions to token start/end positions
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        char_start = answers[i]['answer_start']
        char_end = answers[i]['answer_end']

        token_start = encodings.char_to_token(i, char_start)
        token_end = encodings.char_to_token(i, char_end)

        start_positions.append(token_start)
        end_positions.append(token_end)

        if token_start is None:
            start_positions[-1] = tokenizer.model_max_length
        if token_end is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [10]:
add_token_positions(train_encodings, train_answers)

In [11]:
add_token_positions(valid_encodings, valid_answers)

In [12]:
class NewsQA_Dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

## Creating the dataset using the class

In [13]:
train_dataset = NewsQA_Dataset(train_encodings)
valid_dataset = NewsQA_Dataset(valid_encodings)

In [14]:
# Create dataloaders for training and validation
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16)

## Importing the model

In [15]:
# Initialize the RoBERTa model for question answering
model = AutoModelForQuestionAnswering.from_pretrained('deepset/roberta-base-squad2')

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [16]:
num_layers = model.config.num_hidden_layers
print(f"Number of layers: {num_layers}")

Number of layers: 12


In [17]:
num_layers_to_freeze = 8
for param in model.roberta.embeddings.parameters():
    param.requires_grad = False
for layer in model.roberta.encoder.layer[:num_layers_to_freeze]:
    for param in layer.parameters():
        param.requires_grad = False

In [18]:
# Check if GPU is available and move the model accordingly
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

### Model Hyperparameters

In [19]:
# Initialize the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)
# Training loop
num_epochs = 15



## Training the Model

In [20]:
model.train()

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}', dynamic_ncols=True):
        inputs = {key: value.to(device) for key, value in batch.items()}

        # Forward pass
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
    # Calculate and print the average loss for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1} - Avg Loss: {avg_loss:.4f}')


Epoch 1: 100%|██████████| 3125/3125 [24:33<00:00,  2.12it/s]


Epoch 1 - Avg Loss: 2.5670


Epoch 2: 100%|██████████| 3125/3125 [24:32<00:00,  2.12it/s]


Epoch 2 - Avg Loss: 1.7634


Epoch 3: 100%|██████████| 3125/3125 [24:31<00:00,  2.12it/s]


Epoch 3 - Avg Loss: 1.5220


Epoch 4: 100%|██████████| 3125/3125 [24:31<00:00,  2.12it/s]


Epoch 4 - Avg Loss: 1.3306


Epoch 5: 100%|██████████| 3125/3125 [24:31<00:00,  2.12it/s]


Epoch 5 - Avg Loss: 1.1518


Epoch 6: 100%|██████████| 3125/3125 [24:32<00:00,  2.12it/s]


Epoch 6 - Avg Loss: 1.0040


Epoch 7: 100%|██████████| 3125/3125 [24:32<00:00,  2.12it/s]


Epoch 7 - Avg Loss: 0.8881


Epoch 8: 100%|██████████| 3125/3125 [24:31<00:00,  2.12it/s]


Epoch 8 - Avg Loss: 0.7809


Epoch 9: 100%|██████████| 3125/3125 [24:31<00:00,  2.12it/s]


Epoch 9 - Avg Loss: 0.6928


Epoch 10: 100%|██████████| 3125/3125 [24:31<00:00,  2.12it/s]


Epoch 10 - Avg Loss: 0.6217


Epoch 11: 100%|██████████| 3125/3125 [24:30<00:00,  2.12it/s]


Epoch 11 - Avg Loss: 0.5658


Epoch 12: 100%|██████████| 3125/3125 [24:30<00:00,  2.13it/s]


Epoch 12 - Avg Loss: 0.5162


Epoch 13: 100%|██████████| 3125/3125 [24:30<00:00,  2.12it/s]


Epoch 13 - Avg Loss: 0.4784


Epoch 14: 100%|██████████| 3125/3125 [24:30<00:00,  2.13it/s]


Epoch 14 - Avg Loss: 0.4466


Epoch 15: 100%|██████████| 3125/3125 [24:30<00:00,  2.12it/s]

Epoch 15 - Avg Loss: 0.4182





## Saving the Model

In [21]:
# Save the fine-tuned model if needed
model.save_pretrained('local_fine_tuned_roberta_on_newsqa')
tokenizer.save_pretrained('local_fine_tuned_roberta_on_newsqa')

('local_fine_tuned_roberta_on_newsqa/tokenizer_config.json',
 'local_fine_tuned_roberta_on_newsqa/special_tokens_map.json',
 'local_fine_tuned_roberta_on_newsqa/vocab.json',
 'local_fine_tuned_roberta_on_newsqa/merges.txt',
 'local_fine_tuned_roberta_on_newsqa/added_tokens.json',
 'local_fine_tuned_roberta_on_newsqa/tokenizer.json')

In [22]:
 # Initialize the tokenizer and model
fine_tuned_tokenizer = AutoTokenizer.from_pretrained('local_fine_tuned_roberta_on_newsqa')
fine_tuned_model = AutoModelForQuestionAnswering.from_pretrained('local_fine_tuned_roberta_on_newsqa')

In [23]:
fine_tuned_model.to(device)

RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

# Inference

In [24]:
# Perform inference
question = "What is Ford getting waxed?"
context = "What could be more powerful than the tears of a Native American Indian? Wax on, wax off: Does it make you want to save the rainforests? Iron Eyes Cody was the face of the Keep American Beautiful campaign of 1971 whose tears marked the plight of the environment, but more importantly kept the problems of pollution in the minds of millions. From teary Native Americans to witty skits or doom-ladened eco-horror scenarios, the environmental campaign video then has long been a powerful tool for environmental groups to spread their message and raise pubic attention. The rise of YouTube and other video sharing web sites has now meant that individuals can broadcast their own eco-awareness messages and form their own social action networks. But what makes a good video and how much impact do they have? Is it better to be funny or shocking? When you see Harrison Ford getting his chest waxed, do you immediately think about saving the rainforests? Or does the sight of celebrity pontificating about the plight of the environment make you want to watch their next film rather calculate your carbon footprint. We've featured three different videos that we like and want to know which ones you think are the best. Watch the featured videos » Let us know which eco videos have got you going by using the Sound Off box below. Or, e-mail us at ecosolutions@cnn.com. We also want to feature your own environmental videos here on CNN's Eco Solutions. Use the iReport form to send in your film and you could find your environmental efforts make even more impact than Harrison Ford's chest."

In [25]:
# Tokenize the passage and question
inputs = fine_tuned_tokenizer.encode_plus(question, context, return_tensors="pt")
inputs.to(device)

# Perform inference
with torch.no_grad():
    outputs = fine_tuned_model(**inputs)
    start_idx = torch.argmax(outputs[0])
    end_idx = torch.argmax(outputs[1]) + 1

# Get the answer text from the passage
answer = fine_tuned_tokenizer.convert_tokens_to_string(fine_tuned_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_idx:end_idx]))

print("Question:", question)
print("Answer:", answer)

Question: What is Ford getting waxed?
Answer:  his chest


In [26]:
def get_prediction(context, question):
    
    model.eval()
    inputs = fine_tuned_tokenizer.encode_plus(question, context, return_tensors='pt',truncation=True).to(device)
    
    with torch.no_grad():
        outputs = fine_tuned_model(**inputs)

    answer_start = torch.argmax(outputs[0])
    answer_end = torch.argmax(outputs[1]) + 1

    answer = fine_tuned_tokenizer.convert_tokens_to_string(fine_tuned_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))
    return answer

def normalize_text(s):
  """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
  import string, re
  def remove_articles(text):
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    return re.sub(regex, " ", text)
  def white_space_fix(text):
    return " ".join(text.split())
  def remove_punc(text):
    exclude = set(string.punctuation)
    return "".join(ch for ch in text if ch not in exclude)
  def lower(text):
    return text.lower()

  return white_space_fix(remove_articles(remove_punc(lower(s))))

def exact_match(prediction, truth):
    return bool(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()

  # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
  if len(pred_tokens) == 0 or len(truth_tokens) == 0:
    return int(pred_tokens == truth_tokens)

  common_tokens = set(pred_tokens) & set(truth_tokens)

  # if there are no common tokens then f1 = 0
  if len(common_tokens) == 0:
    return 0

  prec = len(common_tokens) / len(pred_tokens)
  rec = len(common_tokens) / len(truth_tokens)

  return round(2 * (prec * rec) / (prec + rec), 2)

def question_answer(context, question,answer):
  prediction = get_prediction(context,question)
  em_score = exact_match(prediction, answer)
  f1_score = compute_f1(prediction, answer)

  '''print(f'Question: {question}')
  print(f'Prediction: {prediction}')
  print(f'True Answer: {answer}')
  print(f'Exact match: {em_score}')
  print(f'F1 score: {f1_score}\n')'''
    
  return f1_score

In [27]:
f1=0
for contexts, question, answer in zip(valid_contexts[:], valid_questions[:], valid_str_ans[:]):
    f1 += question_answer(contexts, question, answer)
avg_f1_score=f1/1000

In [28]:
print(f"Average F1 score={avg_f1_score}")

Average F1 score=0.6245899999999998
