# Install and Load Packages

In [20]:
!pip install transformers
!pip install datasets
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [21]:
import torch
import torch.nn as nn
import tqdm
import pandas as pd
import faiss
import faiss.contrib.torch_utils

from transformers import DPRContextEncoder, DPRContextEncoderTokenizerFast
from datasets import Dataset
from google.colab import auth, drive
from google.cloud import bigquery

In [22]:
# Authenticate the Colab user (presumably so the BigQuery client created later can use these credentials).
auth.authenticate_user()
print('Authenticated')

# Mount Google Drive; the index and mapping files used by this notebook live under data_path.
drive.mount('/content/drive')
# Kept as a plain string with a trailing slash: later cells build paths as `data_path + '<relative path>'`.
data_path = '/content/drive/MyDrive/nlp/data/'

Authenticated
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Data

In [4]:
# Google Cloud project the BigQuery client is created for.
project_id = 'calcium-vial-368801'
# Client used by the query cell that loads the Wikipedia passages.
client = bigquery.Client(project=project_id)

In [5]:
# Pull up to 250,000 distinct (doc_id, title, text) rows from the Wikipedia passages table
# and materialise them as a pandas DataFrame.
# NOTE(review): LIMIT without ORDER BY -- which rows come back is presumably not guaranteed
# to be the same on every run; confirm that is acceptable for this corpus.
wiki_passages_sql = '''
SELECT DISTINCT doc_id, title, text
FROM `calcium-vial-368801.prod_datasets.final_wikipedia_documents_3`
LIMIT 250000
'''
wiki_passages_job = client.query(wiki_passages_sql)
wiki_passages = wiki_passages_job.to_dataframe()

In [6]:
# Encoder input text for every passage: "<title> [SEP] <text>".
wiki_passages['passage_append'] = wiki_passages['title'].str.cat(
    wiki_passages['text'], sep=' [SEP] '
)

# Data Loader

In [7]:
# Fast tokenizer loaded from the DPR context-encoder checkpoint.
# NOTE(review): loading emits a warning that the checkpoint's tokenizer class is
# 'DPRQuestionEncoderTokenizer' rather than the class used here -- TODO confirm the
# resulting tokenization is what the encoder expects.
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.


In [8]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes every passage once, at construction time.

    The base class is ``torch.utils.data.Dataset`` because instances are consumed by
    ``torch.utils.data.DataLoader``. The only ``Dataset`` name imported in this notebook is
    the HuggingFace ``datasets.Dataset``; subclassing that without running its initialiser
    leaves its own state unset, so any inherited method would be unusable.

    Args:
        dataframe: frame with a ``passage_append`` column holding the text to encode.
        p_tokenizer: callable tokenizer; it is invoked once on the full list of passages.
    """

    def __init__(self, dataframe, p_tokenizer):
        self.dataframe = dataframe
        self.p_tokenizer = p_tokenizer

        # Every passage is truncated / padded to exactly 512 tokens so samples can later be
        # stacked into one rectangular tensor.
        self.p_embed = p_tokenizer(
            self.dataframe['passage_append'].tolist(),
            return_tensors='pt',
            truncation=True,
            max_length=512,
            padding='max_length'
        )

    def __len__(self):
        """Number of passages (rows of the dataframe)."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return whatever the tokenizer output yields for ``index``.

        NOTE(review): with a fast tokenizer an integer index presumably returns the
        per-sample encoding object (``ids`` / ``attention_mask`` / ``type_ids``) -- confirm.
        """
        return self.p_embed[index]


def collate_fn(batch):
    """Stack tokenized samples into a single integer tensor.

    Args:
        batch: sequence of samples exposing ``ids``, ``attention_mask`` and ``type_ids``,
            each an equal-length sequence of ints.

    Returns:
        ``torch.int64`` tensor of shape ``(len(batch), 3, seq_len)``; along axis 1 the
        order is ids, attention_mask, type_ids.
    """
    # torch.tensor with an explicit dtype replaces the legacy torch.LongTensor constructor.
    return torch.tensor(
        [[sample.ids, sample.attention_mask, sample.type_ids] for sample in batch],
        dtype=torch.long,
    )

# Number of passages sent through the encoder per step.
BATCH_SIZE = 20

# Tokenize the corpus once, then serve it in dataframe row order: shuffle stays off because
# the idx -> doc_id mapping written later addresses wiki_passages by position.
passage_dataset = MyDataset(wiki_passages, ctx_tokenizer)
dataloader_train = torch.utils.data.DataLoader(
    passage_dataset,
    collate_fn=collate_fn,
    shuffle=False,
    batch_size=BATCH_SIZE,
)

In [9]:
import gc

# Drop the notebook-level name for the tokenizer.
# NOTE(review): the MyDataset instance inside dataloader_train still stores the tokenizer as
# `p_tokenizer`, so this presumably does not release the tokenizer object itself.
del ctx_tokenizer

# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

# Model

In [10]:
# Load the pretrained DPR context encoder and move it to the GPU.
# https://discuss.huggingface.co/t/finetuning-dpr-on-custom-dataset/4170
ctx_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base').to("cuda")

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [11]:
# Collect garbage and release cached CUDA memory before the FAISS indexes are moved to the GPU.
gc.collect()
torch.cuda.empty_cache()

In [12]:
# GPU resources handle; it is reused for every index this notebook copies to the GPU.
res = faiss.StandardGpuResources()
# Read the existing "no_qg" passage index from Drive, then copy it to GPU device 0.
no_qg_index = faiss.read_index(data_path + 'nq_train_passage_encodings/nq_train_3_no_qg_greedy_passage_index')
gpu_index_no_qg = faiss.index_cpu_to_gpu(res, 0, no_qg_index)

In [13]:
# Read the existing "qg_50" passage index from Drive, then copy it to GPU device 0
# (sharing the `res` GPU resources handle created earlier).
qg_50_index = faiss.read_index(data_path + 'nq_train_passage_encodings/nq_train_3_qg_50_greedy_passage_index')
gpu_index_qg_50 = faiss.index_cpu_to_gpu(res, 0, qg_50_index)

In [14]:
class PassageEncoder(nn.Module):
    """Encode passage batches and append the embeddings to two indexes.

    Args:
        p_encoder: module called positionally as ``p_encoder(ids, attention_mask, type_ids)``
            whose result exposes ``pooler_output``.
        index1: object with an ``add(embeddings)`` method; receives every batch.
        index2: second object with an ``add(embeddings)`` method; receives the same batches.
    """

    def __init__(self, p_encoder, index1, index2):
        super().__init__()
        self.p_encoder = p_encoder
        self.index1 = index1
        self.index2 = index2

    def forward(self, passage):
        """Embed one batch and add it to both indexes.

        Args:
            passage: tensor indexed as ``[batch, field, token]`` where field 0 is the token
                ids, 1 the attention mask and 2 the token type ids.

        Returns:
            None. The only effect is the pair of ``add`` calls.
        """
        # Inference only: without no_grad the embeddings carry an autograd graph, which
        # wastes GPU memory and hands tensors that require grad to the indexes.
        with torch.no_grad():
            encoded = self.p_encoder(
                passage[:, 0, :],
                passage[:, 1, :],
                passage[:, 2, :],
            ).pooler_output.contiguous()

        self.index1.add(encoded)
        self.index2.add(encoded)


In [15]:
# `import tqdm` by itself does not guarantee that the `tqdm.notebook` submodule is loaded
# (it only works when another package happened to import it), so import it explicitly.
import tqdm.notebook

pEncoder = PassageEncoder(ctx_model, gpu_index_no_qg, gpu_index_qg_50)

# NOTE: this cell is not idempotent -- each run appends every passage to both GPU indexes
# again. Reload the indexes before re-running it.
# Encoding is inference only, so autograd is switched off for the whole pass.
with torch.no_grad():
    for batch in tqdm.notebook.tqdm(dataloader_train, total=len(dataloader_train)):
        pEncoder(batch.to("cuda"))
        torch.cuda.empty_cache()

  0%|          | 0/12500 [00:00<?, ?it/s]

In [16]:
# Copy the extended "no_qg" index back to host memory and persist it to Drive.
cpu_index_no_qg = faiss.index_gpu_to_cpu(gpu_index_no_qg)
no_qg_index_path = data_path + 'wiki_nq_train_passage_encodings/easy_wiki_nq_train_3_no_qg_greedy_passage_index'
faiss.write_index(cpu_index_no_qg, no_qg_index_path)
# Total number of vectors now in the index (pre-existing + newly added passages).
print(gpu_index_no_qg.ntotal)

276634


In [17]:
# Copy the extended "qg_50" index back to host memory and persist it to Drive.
cpu_index_qg_50 = faiss.index_gpu_to_cpu(gpu_index_qg_50)
qg_50_index_path = data_path + 'wiki_nq_train_passage_encodings/easy_wiki_nq_train_3_qg_50_greedy_passage_index'
faiss.write_index(cpu_index_qg_50, qg_50_index_path)
# Total number of vectors now in the index (pre-existing + newly added passages).
print(gpu_index_qg_50.ntotal)

276634


In [18]:
# Extend the existing idx -> doc_id mapping of the "no_qg" index with the wiki passages
# that were appended to it, and save the combined mapping.
import json
import numpy as np

with open(data_path + 'nq_train_passage_encodings/nq_train_3_no_qg_greedy_passage_idx2docid.json') as f:
    old_index = json.load(f)

# Freeze the offset BEFORE inserting anything. The dict grows inside the loop, so
# re-evaluating len(old_index) per iteration makes `i - len(old_index)` always 0 and
# assigns the first wiki doc_id to every new entry.
n_existing = len(old_index)
# .tolist() yields the doc_ids in row order as plain Python values, which json can serialise.
new_doc_ids = wiki_passages['doc_id'].tolist()

n_added = gpu_index_no_qg.ntotal - n_existing
if n_added != len(new_doc_ids):
    raise ValueError(
        f'index holds {n_added} vectors beyond the existing mapping, '
        f'but wiki_passages has {len(new_doc_ids)} rows'
    )

for offset, doc_id in enumerate(new_doc_ids):
    old_index[str(n_existing + offset)] = doc_id

with open(data_path + 'wiki_nq_train_passage_encodings/easy_wiki_nq_train_3_no_qg_greedy_passage_idx2docid.json', 'w') as f:
    json.dump(old_index, f)

In [24]:
# Extend the existing idx -> doc_id mapping of the "qg_50" index with the wiki passages
# that were appended to it, and save the combined mapping.
import json

with open(data_path + 'nq_train_passage_encodings/nq_train_3_qg_50_greedy_passage_idx2docid.json') as f:
    old_index = json.load(f)

# Freeze the offset BEFORE inserting anything. The dict grows inside the loop, so
# re-evaluating len(old_index) per iteration makes `i - len(old_index)` always 0 and
# assigns the first wiki doc_id to every new entry.
n_existing = len(old_index)
# .tolist() yields the doc_ids in row order as plain Python values, which json can serialise.
new_doc_ids = wiki_passages['doc_id'].tolist()

n_added = gpu_index_qg_50.ntotal - n_existing
if n_added != len(new_doc_ids):
    raise ValueError(
        f'index holds {n_added} vectors beyond the existing mapping, '
        f'but wiki_passages has {len(new_doc_ids)} rows'
    )

for offset, doc_id in enumerate(new_doc_ids):
    old_index[str(n_existing + offset)] = doc_id

with open(data_path + 'wiki_nq_train_passage_encodings/easy_wiki_nq_train_3_qg_50_greedy_passage_idx2docid.json', 'w') as f:
    json.dump(old_index, f)