In [3]:
!pip install -qq transformers[sentencepiece]==4.35.2 datasets==2.16.1 evaluate==0.4.1
!sudo apt-get install libomp-dev
!pip install -qq faiss-gpu


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libomp-14-dev libomp5-14
Suggested packages:
  libomp-14-doc
The following NEW packages will be installed:
  libomp-14-dev libomp-dev libomp5-14
0 upgraded, 3 newly installed, 0 to remove and 45 not upgraded.
Need to get 738 kB of archives.
After this operation, 8,991 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libomp5-14 amd64 1:14.0.0-1ubuntu1.1 [389 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libomp-14-dev amd64 1:14.0.0-1ubuntu1.1 [347 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libomp-dev amd64 1:14.0-55~exp2 [3,074 B]
Fetched 738 kB in 1s (1,011 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [4]:
import numpy as np
import collections
import torch
import faiss
import evaluate

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from tqdm.auto import tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [6]:
# Load the train and validation splits together as one combined Dataset.
DATASET_NAME = 'squad_v2'
raw_datasets = load_dataset(
    DATASET_NAME,
    split='train+validation',
)
raw_datasets


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 142192
})

In [7]:
# Remove the questions that have no answer (an empty answers['text'] list).
raw_datasets = raw_datasets.filter(
    lambda example: len(example['answers']['text']) > 0
)


Filter:   0%|          | 0/142192 [00:00<?, ? examples/s]

In [8]:
# The same checkpoint name is used for both the tokenizer and the encoder.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# Move the encoder to the selected device (GPU when available).
model = model.to(device)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [9]:
def cls_pooling(model_output):
  """Return the hidden state at sequence position 0 for every batch item.

  Args:
    model_output: Object exposing a ``last_hidden_state`` attribute that
      supports ``[:, 0]`` indexing (presumably shaped
      (batch, sequence, hidden) with the [CLS] token first; confirm against
      the model in use).

  Returns:
    ``model_output.last_hidden_state[:, 0]``.
  """
  hidden_states = model_output.last_hidden_state
  first_token_states = hidden_states[:, 0]
  return first_token_states

In [10]:
def get_embeddings(text_list):
  """Embed text with the module-level ``tokenizer`` and ``model``.

  Args:
    text_list: Text handed directly to ``tokenizer`` (callers in this
      notebook pass either a single string or a list of strings).

  Returns:
    The result of ``cls_pooling`` applied to the model output; the tensor
    lives on ``device``. The forward pass runs under ``torch.no_grad()``,
    so the result carries no autograd graph.
  """
  encoded_input = tokenizer(
      text_list, padding=True, truncation=True, return_tensors='pt')
  # Inputs must sit on the same device as the model.
  encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
  # Inference only: skip autograd bookkeeping so no graph is built or
  # retained for tensors that callers immediately detach.
  with torch.no_grad():
    model_output = model(**encoded_input)
  return cls_pooling(model_output)

In [11]:
EMBEDDING_COLUMN = 'question_embedding'


def _embed_question(example):
  """Return the new embedding column for a single dataset row."""
  question_vectors = get_embeddings(example['question'])
  # get_embeddings returns one row per input text; keep the only row as numpy.
  return {EMBEDDING_COLUMN: question_vectors.detach().cpu().numpy()[0]}


# Add an embedding of every question as a new column of the dataset.
embeddings_dataset = raw_datasets.map(_embed_question)


Map:   0%|          | 0/92749 [00:00<?, ? examples/s]

In [12]:
# Build a FAISS index over the embedding column so that nearest-neighbour
# lookups can be run against it.
embeddings_dataset.add_faiss_index(column=EMBEDDING_COLUMN)


  0%|          | 0/93 [00:00<?, ?it/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'question_embedding'],
    num_rows: 92749
})

In [22]:
input_question = 'When did Beyonce start becoming popular ?'

# Embed the query with the same function used for the indexed questions.
input_quest_embedding = get_embeddings([input_question]).cpu().detach().numpy()

TOP_K = 5
# `scores` holds one value per retrieved row; `samples` maps each column name
# to the list of values for those rows.
scores, samples = embeddings_dataset.get_nearest_examples(
    EMBEDDING_COLUMN, input_quest_embedding, k=TOP_K)
for idx, score in enumerate(scores):
  # "\t" separates the rank from the score (the previous literal printed
  # ": tScore" instead of a tab).
  print(f'Top {idx + 1}\tScore : {score}')
  print(f'Question : {samples["question"][idx]} ')
  print(f'Context : {samples["context"][idx]} ')
  print()


Top 1: tScore : 0.0
Question : When did Beyonce start becoming popular? 
Context : Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy". 

Top 2: tScore : 2.6135313510894775
Question : When did Beyoncé rise to fame? 
Context : Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record produce

In [23]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Checkpoint that the question-answering pipeline loads.
QA_MODEL_NAME = "Duchaha/distilbert-finetuned-squadv2"
pipe = pipeline(task="question-answering", model=QA_MODEL_NAME)

In [24]:
print(f'Input question : {input_question}')
for idx, score in enumerate(scores):
  question = samples["question"][idx]
  context = samples["context"][idx]
  # NOTE(review): the reader is asked the *retrieved* question, not
  # `input_question`; confirm that this is the intended behaviour.
  answer = pipe(question=question, context=context)
  # "\t" replaces the invalid escape "\ ", which printed a literal backslash
  # and triggers an invalid-escape-sequence warning.
  print(f'Top {idx + 1}\tScore : {score}')
  print(f'Context : {context}')
  print(f'Answer : {answer}')
  print()


Input question : When did Beyonce start becoming popular ?
Top 1\ tScore : 0.0
Context : Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Answer : {'score': 0.637919008731842, 'start': 276, 'end': 286, 'answer': 'late 1990s'}

Top 2\ tScore : 2.6135313510894775
Context : Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981)