# Install and Load Packages

In [None]:
!pip install transformers
!pip install datasets
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 13.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 63.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 53.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 36.7 MB

In [None]:
import torch
import torch.nn as nn
import tqdm
import pandas as pd
import faiss
import faiss.contrib.torch_utils

from transformers import DPRContextEncoder, DPRContextEncoderTokenizerFast
from datasets import Dataset
from google.colab import auth, drive
from google.cloud import bigquery

In [None]:
# Folder on the mounted Drive that later cells read from and write to.
data_path = '/content/drive/MyDrive/nlp/data/'

# Authenticate the Colab user before any Google Cloud client is created.
auth.authenticate_user()
print('Authenticated')

# Expose Google Drive under /content/drive.
drive.mount('/content/drive')

Authenticated
Mounted at /content/drive


# Load Data

In [None]:
# Google Cloud project the BigQuery client is bound to; `client` runs every
# query in the cells that follow.
project_id = 'calcium-vial-368801'
client = bigquery.Client(project=project_id)

In [None]:
# Distinct (query_id, doc_id) rows with the document title, text and the
# `questions` column, loaded into a pandas DataFrame.
train_documents_sql = '''
SELECT DISTINCT query_id, doc_id, title, text, questions
FROM `calcium-vial-368801.staging.nq_train_documents_1_qg_25_beam`
'''
dt_train_queries = client.query(train_documents_sql).to_dataframe()

In [None]:
# Number of rows returned by the training-documents query.
len(dt_train_queries)

26634

In [None]:
# Two-column selection (doc_id, title) of the training rows.
doc_titles = dt_train_queries[['doc_id', 'title']]

In [None]:
# Text that gets encoded for each row: title, passage text and questions joined
# by [SEP] markers. Note the second marker is written without surrounding spaces.
dt_train_queries['passage_append'] = (
    dt_train_queries['title']
    + ' [SEP] '
    + dt_train_queries['text']
    + '[SEP]'
    + dt_train_queries['questions']
)

In [None]:
# Lookup table from query_id to the query string (exposed as column `query`).
query_lookup_sql = '''
SELECT DISTINCT query_id, text AS query
FROM `calcium-vial-368801.beir_nq_train.train_query_lookup`
'''
queries_text = client.query(query_lookup_sql).to_dataframe()

In [None]:
# Attach the query string to every training row (left join keeps rows without a
# match), keep only the columns used downstream, and drop exact repeats.
dt_train_clean = (
    dt_train_queries
    .merge(queries_text, how='left', on='query_id')
    .loc[:, ["query_id", "query", "passage_append"]]
    .drop_duplicates()
)

# Data Loader

In [None]:
# Fast tokenizer that belongs to the DPR context-encoder checkpoint used below.
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
    "facebook/dpr-ctx_encoder-single-nq-base"
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.


In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises every passage once, at construction.

    The class derives from ``torch.utils.data.Dataset`` because it only
    implements the ``__len__``/``__getitem__`` protocol consumed by
    ``torch.utils.data.DataLoader``. It never initialises the Hugging Face
    ``datasets.Dataset`` machinery, so inheriting from that class would expose
    methods that operate on an uninitialised object.

    Args:
        dataframe: Frame with a ``passage_append`` column holding the text to
            encode.
        p_tokenizer: Callable applied to the list of passages; its result must
            support integer indexing.
        max_length: Length every passage is truncated / padded to.
    """

    def __init__(self, dataframe, p_tokenizer, max_length=512):
        self.dataframe = dataframe
        self.p_tokenizer = p_tokenizer

        # Whole-corpus tokenisation happens here, so building the dataset is
        # the expensive step and item access afterwards is a plain lookup.
        self.p_embed = p_tokenizer(
            self.dataframe['passage_append'].tolist(),
            return_tensors='pt',
            truncation=True,
            max_length=max_length,
            padding='max_length'
        )

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return the tokenised entry stored at position ``index``."""
        return self.p_embed[index]


def collate_fn(batch):
    """Pack a list of tokenised samples into a single int64 tensor.

    Args:
        batch: Iterable of samples exposing ``ids``, ``attention_mask`` and
            ``type_ids`` sequences of equal length.

    Returns:
        Tensor of shape ``(len(batch), 3, seq_len)``; along axis 1 the order is
        ids, attention mask, type ids.
    """
    return torch.tensor(
        [[sample.ids, sample.attention_mask, sample.type_ids] for sample in batch],
        dtype=torch.long,
    )

BATCH_SIZE = 20

# shuffle=False keeps batches in dataframe row order, so the position of a
# vector in the index equals the row position in `dt_train_clean`.
dataloader_train = torch.utils.data.DataLoader(
    dataset=MyDataset(dt_train_clean, ctx_tokenizer),
    collate_fn=collate_fn,
    shuffle=False,
    batch_size=BATCH_SIZE,
)

In [None]:
import gc

# Remove the notebook-level tokenizer name. Re-running this cell must not fail
# once the name is already gone, hence the NameError guard. The dataset inside
# `dataloader_train` keeps its own reference (`p_tokenizer`), so this only
# unbinds the global name.
try:
    del ctx_tokenizer
except NameError:
    pass

gc.collect()
torch.cuda.empty_cache()

# Model

In [None]:
# https://discuss.huggingface.co/t/finetuning-dpr-on-custom-dataset/4170
# https://discuss.huggingface.co/t/finetuning-dpr-on-custom-dataset/4170
# Load the pretrained context encoder, then move its weights to the GPU.
ctx_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
ctx_model = ctx_model.to("cuda")

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [None]:
# Run a garbage-collection pass first, then ask PyTorch to release the CUDA
# memory it is caching but no longer using.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# CPU-side flat L2 index; 768 has to equal the width of the vectors added later.
index = faiss.IndexFlatL2(768)

# GPU resources object plus a copy of the index on device 0.
res = faiss.StandardGpuResources()
gpu_index = faiss.index_cpu_to_gpu(res, 0, index)

In [None]:
class PassageEncoder(nn.Module):
    """Encode passage batches and append the embeddings to an index.

    Args:
        p_encoder: Model called positionally with (input ids, attention mask,
            token type ids); its result must expose ``pooler_output``.
        index: Object whose ``add`` method receives the embeddings.
    """

    def __init__(self, p_encoder, index):
        super().__init__()
        self.p_encoder = p_encoder
        self.index = index

    def forward(self, passage):
        """Embed one batch and add it to ``self.index``.

        Args:
            passage: Tensor indexed as ``passage[:, k, :]``, where k = 0, 1, 2
                selects input ids, attention mask and token type ids.

        Returns:
            None. The embeddings are only written to the index.
        """
        # Nothing is returned, so gradients can never be consumed. Disabling
        # autograd avoids building (and holding GPU memory for) a graph on
        # every batch.
        with torch.no_grad():
            embeddings = self.p_encoder(
                passage[:, 0, :],
                passage[:, 1, :],
                passage[:, 2, :],
            ).pooler_output

        self.index.add(embeddings.contiguous())


In [None]:
# `import tqdm` on its own is not guaranteed to load the `notebook` submodule,
# so import it explicitly instead of relying on another library having done so.
import tqdm.notebook

pEncoder = PassageEncoder(ctx_model, gpu_index)

# Each pass appends one batch of vectors to `gpu_index`. Re-running this cell
# without rebuilding the index would append the same vectors a second time.
for batch in tqdm.notebook.tqdm(dataloader_train, total=len(dataloader_train)):
    pEncoder(batch.to("cuda"))
    torch.cuda.empty_cache()

  0%|          | 0/1332 [00:00<?, ?it/s]

In [None]:
import os

index_path = data_path + 'nq_train_passage_encodings/nq_train_1_qg_25_beam_passage_index'

# Create the output folder up front so a missing directory cannot make the save
# fail after the long encoding run.
os.makedirs(os.path.dirname(index_path), exist_ok=True)

# The index is copied back to the CPU before it is written to disk.
faiss.write_index(faiss.index_gpu_to_cpu(gpu_index), index_path)
print(gpu_index.ntotal)

In [None]:
import json

# FAISS ids are insertion positions, and the vectors were encoded from
# `dt_train_clean` (merged with the query text, then de-duplicated), not from
# `dt_train_queries`. Recover the doc_id of every encoded row, in encoded order,
# by joining back on the two columns both frames share.
doc_id_lookup = dt_train_queries[['query_id', 'passage_append', 'doc_id']].drop_duplicates(
    subset=['query_id', 'passage_append']
)
encoded_doc_ids = dt_train_clean[['query_id', 'passage_append']].merge(
    doc_id_lookup,
    on=['query_id', 'passage_append'],
    how='left',  # keeps the row order of `dt_train_clean`
)['doc_id']

# The lookup is unique per key, so the join must not add or drop rows.
assert len(encoded_doc_ids) == len(dt_train_clean), 'doc_id join changed the row count'

idx2docid = dict(enumerate(encoded_doc_ids.tolist()))

# One mapping entry per indexed vector; a mismatch means index and mapping diverged.
assert len(idx2docid) == gpu_index.ntotal, 'idx2docid is not aligned with the FAISS index'

with open(data_path + 'nq_train_passage_encodings/nq_train_1_qg_25_beam_passage_idx2docid.json', 'w') as f:
    json.dump(idx2docid, f)