In [13]:
from datasets import load_dataset, Dataset
import spacy
from tqdm.auto import tqdm
from collections import defaultdict
from nltk import sent_tokenize
import joblib
import numpy as np
import pandas as pd
import os
from openai import OpenAI
import random
from transformers import AutoTokenizer, DPRQuestionEncoder,DPRContextEncoder
import torch

In [2]:
!nvidia-smi

Wed Dec  6 16:42:34 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:00:05.0 Off |                    0 |
| N/A   54C    P0             227W / 400W |  34321MiB / 81920MiB |     83%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          On  | 00000000:00:06.0 Off |  

In [5]:
data = load_dataset("Seongill/nq_squad")["train"]
df = pd.DataFrame(data)

In [8]:
print("Before:", len(df))
df = df.drop_duplicates(subset=["question"], keep="first")
print("After:" , len(df))

Before: 97888
After: 97888


In [14]:
ctxs = list(set(df["context"].tolist()))
questions = df["question"].tolist()
questions_embed, ctxs_embed = [], []

tokenizer = AutoTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base").to("cuda")

for i in tqdm(range(0, len(questions), 1028), desc="Question Embedding..."):
    batch = questions[i:i+1028]
    output = tokenizer(batch, padding="max_length", truncation=True, max_length=128, return_tensors="pt").to("cuda")
    with torch.no_grad():
        embeddings = model(**output).pooler_output.detach().cpu().numpy() # [args.batch_size, hidden_dim]
    questions_embed.extend([emb for emb in embeddings])

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.weight', 'question_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Question Embedding...:   0%|          | 0/96 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Context Embedding...:   0%|          | 0/21 [00:00<?, ?it/s]

IndexError: list index out of range

In [26]:
ctxs_embed = []
tokenizer = AutoTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to("cuda")
for i in tqdm(range(0, len(ctxs), 1024), desc="Context Embedding..."):
    batch = ctxs[i:i+1024]
    output = tokenizer(batch, padding="max_length", truncation=True, max_length=512, return_tensors="pt").to("cuda")
    with torch.no_grad():
        embeddings = model(**output).pooler_output.detach().cpu().numpy() # [args.batch_size, hidden_dim]
    ctxs_embed.extend([emb for emb in embeddings])

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Context Embedding...:   0%|          | 0/21 [00:00<?, ?it/s]

In [27]:
text_to_emb = {}
for k, v in zip(ctxs, ctxs_embed):
    text_to_emb[k] = v / np.linalg.norm(v)
norm_ctxs_embed = np.array(ctxs_embed) / np.linalg.norm(ctxs_embed, axis=1, keepdims=True)

In [62]:
norm_questions_embed = np.array(questions_embed) / np.linalg.norm(questions_embed, axis=1, keepdims=True)

In [63]:
q_to_emb = {}
for k, v in zip(questions, questions_embed):
    q_to_emb[k] = v / np.linalg.norm(v)

In [48]:
answer = [d["text"][0] for d in df["answers"].tolist()]

In [52]:
for q, a in zip(questions, answer):
    if a == "n":
        print(q, a)
    if a == "in":
        print(q, a)

What conjunction would be used to join a person's surnames? in
n n
What letter represents the index of refraction? n


In [69]:
def random_select(text, answer):
    item = random.choice(ctxs)
    cnt = 0
    if answer in ["n", "in", ""]:
        return None
    while (item == text) or (answer in item):
        item = random.choice(ctxs)
        cnt += 1
        if cnt > 100:
            return None
    return item
def similarity_select(text, answer, question):
    c_emb = text_to_emb[text]
    q_emb = q_to_emb[question]
    c_output = np.matmul(norm_ctxs_embed, c_emb.T)
    q_output = np.matmul(norm_ctxs_embed, q_emb.T)
    mean_output = (c_output + q_output) / 2
    topk_idx = list(np.argpartition(mean_output, -20)[-20:])
    for idx in topk_idx:
        if answer not in ctxs[idx]:
            return ctxs[idx]

In [70]:
random_answer, similar_answer = [], []
for row in tqdm(df.iterrows()):
    text = row[1]["context"]
    answer = row[1]["answers"]["text"][0]
    question = row[1]["question"]
    random_answer.append(random_select(text, answer))
    similar_answer.append(similarity_select(text, answer,question))

0it [00:00, ?it/s]

In [71]:
#df["random_answer"] = random_answer
df["similar_answer_v2"] = similar_answer

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["similar_answer_v2"] = similar_answer


In [73]:
df = df.drop(["random_context", "similarity_context"], axis=1)
df

Unnamed: 0,id,title,context,question,answers,masked_query,query_embedding,random_answer,similar_answer,similar_answer_v2
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ...",To whom did the Virgin Mary allegedly appear i...,"[-0.04606574401259422, 0.015957484021782875, -...","Before the St. Elizabeth's flood (1421), the M...",In 2014 the Notre Dame student body consisted ...,The doctrines of the Assumption or Dormition o...
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe...",What is in front of [MASK]?,"[0.19923186302185059, 0.06610594689846039, 0.2...",The government broadened land ownership by ret...,In 2014 the Notre Dame student body consisted ...,"St. Patrick's Street, the main street of the c..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'...",[MASK] at [MASK] is beside to which structure?,"[0.3204971253871918, 0.20272424817085266, 0.22...",In the 1520s during the Protestant Reformation...,In 2014 the Notre Dame student body consisted ...,One of the most dramatic parts of the museum i...
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...,What is the Grotto at [MASK]?,"[0.020319189876317978, 0.21503326296806335, 0....","Domestic dogs inherited complex behaviors, suc...",In 2014 the Notre Dame student body consisted ...,"The structure known as ""Virgil's tomb"" is foun..."
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...,What sits on top of [MASK] at [MASK]?,"[0.21866606175899506, 0.32595109939575195, 0.0...","In modern-day Germany, the Holy Roman Empire c...",In 2014 the Notre Dame student body consisted ...,In the 18 years under the presidency of Edward...
...,...,...,...,...,...,...,...,...,...,...
98164,5737aafd1c456719005744fb,Force,"The pound-force has a metric counterpart, less...",What is the metric term less used than the New...,"{'text': ['kilogram-force', 'pound-force', 'ki...",What is the metric term less used than the [MA...,"[0.05857216566801071, 0.13753995299339294, -0....",The stated objective of most intellectual prop...,A unit load is defined as 100 mA in USB 1.x an...,"Since forces are perceived as pushes or pulls,..."
98165,5737aafd1c456719005744fc,Force,"The pound-force has a metric counterpart, less...",What is the kilogram-force sometimes reffered ...,"{'text': ['kilopond', 'kilopond', 'kilopond', ...",What is the kilogram-force sometimes reffered ...,"[0.13551265001296997, 0.04391534999012947, 0.1...",The serial format changed for the 2005 revival...,A unit load is defined as 100 mA in USB 1.x an...,The total energy of a system can be subdivided...
98166,5737aafd1c456719005744fd,Force,"The pound-force has a metric counterpart, less...",What is a very seldom used unit of mass in the...,"{'text': ['slug', 'metric slug', 'metric slug'...",What is a very seldom used unit of mass in the...,"[0.13060449063777924, 0.13672053813934326, -0....","Each cardinal takes on a titular church, eithe...",A unit load is defined as 100 mA in USB 1.x an...,A watt balance is an instrument for comparing ...
98167,5737aafd1c456719005744fe,Force,"The pound-force has a metric counterpart, less...",What seldom used term of a unit of force equal...,"{'text': ['kip', 'kip', 'kip', 'kip', 'kip'], ...",What seldom used term of a unit of force equal...,"[-0.10803636163473129, 0.0015136328293010592, ...","President Franklin D. Roosevelt promoted a ""go...",A unit load is defined as 100 mA in USB 1.x an...,A static equilibrium between two forces is the...


In [74]:
df["answer"] = df["answers"].apply(lambda x: x["text"][0])

In [75]:
dataset = Dataset.from_pandas(df)
dataset.push_to_hub("squad_missing_answer")

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

In [19]:
responses.choices[0].logprobs.tokens

['\n\n', 'As', ' of', ' January', ' ', '202', '2', ',', ' the', ' president']

In [21]:
responses.choices[0].logprobs.top_logprobs

[{'\n\n': -0.11258054,
  '\n': -2.5332043,
  ' \n\n': -4.746234,
  'The': -5.9145384,
  '\n\n\n': -6.4242067},
 {'As': -0.042069715,
  'The': -3.47413,
  'Currently': -5.265245,
  'Joe': -5.463491,
  'Donald': -8.244761},
 {' of': -0.009028853,
  ' an': -5.287681,
  ' a': -5.7988505,
  ' the': -8.499787,
  ' this': -8.600926},
 {' January': -0.78186524,
  ' ': -1.2735778,
  ' November': -2.7348604,
  ' October': -2.9311075,
  ' February': -3.7668085},
 {' ': -6.80205e-05,
  ',': -10.268138,
  ' of': -11.059773,
  '<|endoftext|>': -11.653062,
  '202': -12.60736},
 {'202': -0.08436187,
  '20': -2.5598962,
  '21': -6.6945915,
  '11': -8.0714855,
  '201': -8.331005},
 {'2': -0.5250918,
  '1': -0.90931386,
  '0': -5.188664,
  '3': -9.381581,
  '<|endoftext|>': -11.086154},
 {',': -0.00012296606,
  ' the': -10.333541,
  ',the': -10.737315,
  '.': -11.466929,
  '<|endoftext|>': -11.554799},
 {' the': -0.11537982,
  ' Joe': -2.236422,
  ' Joseph': -6.6280236,
  ' President': -7.9406114,
  ' Th

In [81]:
dataset["validation"][0]

{'id': '100303db73e4051089035f246d0aeef2b12c4e47',
 'title': 'Newcastle_upon_Tyne',
 'context': "Another green space in Newcastle is the Town Moor, lying immediately north of the city centre. It is larger than London's famous Hyde Park and Hampstead Heath put together and the freemen of the city have the right to graze cattle on it. The right incidentally extends to the pitch of St. James' Park, Newcastle United Football Club's ground, though this is not exercised, although the Freemen do collect rent for the loss of privilege. Honorary freemen include Bob Geldof, King Harald V of Norway, Bobby Robson, Alan Shearer, the late Nelson Mandela and the Royal Shakespeare Company. The Hoppings funfair, said to be the largest travelling funfair in Europe, is held here annually in June.",
 'question': 'Where is the Hoppings funfair held?',
 'answers': {'text': ['Town Moor'], 'answer_start': [40]},
 'metadata': {'split': 'validation', 'model_in_the_loop': 'Combined'}}

In [79]:
from datasets import load_dataset

dataset = load_dataset("adversarial_qa", 'adversarialQA')

Downloading data:   0%|          | 0.00/9.02M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [100]:
def make_ex():
    item = dataset.shuffle()["validation"][0]
    question = item["question"]
    context = item["context"]
    answer = item["answers"]["text"][0]
    print(f"Based on the context below, answer the question. You should just reply with answer string, not a whole sentence.\nContext: {context}\n\nQuestion: {question}\nAnswer: ")
    print()
    print()
    print(f"Answer: {answer}")

In [103]:
make_ex()

Based on the context below, answer the question. You should just reply with answer string, not a whole sentence.
Context:Genghis Khan is regarded as one of the prominent leaders in Mongolia's history. He is responsible for the emergence of the Mongols as a political and ethnic identity because there was no unified identity between the tribes that had cultural similarity. He reinforced many Mongol traditions and provided stability and unity during a time of almost endemic warfare between tribes. He is also given credit for the introduction of the traditional Mongolian script and the creation of the Ikh Zasag (Great Administration), the first written Mongolian law. "Ikh Zasag law adopted during Genghis Khan’s time in Mongolia had points to punish illegal matters related to corruption and bribery very heavily," Mongolian President Tsakhiagiin Elbegdorj noted. President Elbegdorj sees Genghis Khan as a leader from whom to learn for anti-corruption efforts as Genghis Khan sought equal prote

In [104]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Wed Dec  6 16:41:38 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:00:05.0 Off |                    0 |
| N/A   60C    P0             252W / 400W |  75000MiB / 81920MiB |     87%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM4-80GB          On  | 00000000:00:06.0 Off |  

In [105]:
!squeue

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
               475       gpu     test jeongseo  R    2:28:13      1 gpu-1
               477       gpu    bienc  yikyung  R    1:34:18      1 gpu-1
