In [1]:
import pickle
import torch
import torch.nn.functional as F
import os
os.environ["CUDA_VISIBLE_DEVICES"]="7"


device = torch.device("cuda")

In [2]:
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset

In [3]:


def cos_sim(a: torch.Tensor, b: torch.Tensor):
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    """
    if not isinstance(a, torch.Tensor):
        a = torch.tensor(a)

    if not isinstance(b, torch.Tensor):
        b = torch.tensor(b)

    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
    b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
    return torch.mm(a_norm, b_norm.transpose(0, 1))


with open("/home/icml01/multi_rag/RAG/Search-in-the-Chain/data/subcor.pkl", "rb") as file:
    all_sub_corpus_embedding_ls = pickle.load(file)


with open("query.pkl", "rb") as file:
    query_embeddings = pickle.load(file)
query_embeddings = [row[-2:] for row in query_embeddings]
query_count = len(query_embeddings)

  return torch.load(io.BytesIO(b))


In [4]:
max_len = max(len(tensor) for row in query_embeddings for tensor in row) + 1
processed_tensors = []
for row in query_embeddings:
    concatenated_tensor = torch.cat(row, dim=0)
    padding_size = max_len - concatenated_tensor.size(0)  # 计算需要填充的长度
    # pad 参数: (left, right) -> (前向填充, 后向填充)，我们在第一维填充
    padded_tensor = F.pad(concatenated_tensor, (0, 0, padding_size, 0))  # 填充到 maxlen
    processed_tensors.append(
        padded_tensor
    )  # Stack to create a tensor of shape (maxlen, 768)
final_tensor = torch.stack(processed_tensors).to(device)  # Shape: [85, maxlen, 768]
final_tensor.shape

torch.Size([7405, 246, 768])

In [10]:
max(row.shape[0] for row in all_sub_corpus_embedding_ls for tensor in row)

1

In [5]:
# 将所有 sub_corpus_embeddings 放到一个大 tensor 中，减少内存传输
all_sub_corpus_embeddings_tensor = torch.stack(all_sub_corpus_embedding_ls).to(device)  # [5233329, 2, 768]

# 批处理大小
batch_size = 64  # 根据你的 GPU 内存调整批次大小
num_batches = len(all_sub_corpus_embedding_ls) // batch_size + 1

all_cos_scores = []

dataset = TensorDataset(all_sub_corpus_embeddings_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# for batch_idx, (sub_corpus_embeddings_batch,) in enumerate(tqdm(dataloader)):
#     cos_scores_batch = raw_score_text(final_tensor, sub_corpus_embeddings_batch, device)
#     all_cos_scores.append(cos_scores_batch.cpu())  # 将结果从 GPU 移动到 CPU，防止内存溢出

In [6]:
dataloader_iter = iter(dataloader)
first_batch = next(dataloader_iter)[0]
first_batch

tensor([[[ 0.0037,  0.0124, -0.0096,  ..., -0.0142, -0.0061,  0.0484]],

        [[-0.0007,  0.0007, -0.0181,  ..., -0.0014, -0.0194, -0.0014]],

        [[-0.0012, -0.0089,  0.0156,  ...,  0.0110, -0.0032,  0.0421]],

        ...,

        [[ 0.0130,  0.0052, -0.0245,  ..., -0.0145,  0.0140,  0.0474]],

        [[-0.0036,  0.0019, -0.0134,  ..., -0.0222, -0.0134,  0.0174]],

        [[ 0.0056,  0.0049, -0.0345,  ...,  0.0084, -0.0209,  0.0270]]],
       device='cuda:0')

In [65]:
sub_corpus_embeddings = all_sub_corpus_embedding_ls[0]

In [66]:
final_tensor = final_tensor.to(device)
sub_corpus_embeddings = sub_corpus_embeddings.to(device)

batch_size, num_queries, embedding_dim = final_tensor.shape
sub_corpus_embeddings_norm = torch.nn.functional.normalize(
    sub_corpus_embeddings, p=2, dim=1
)
final_tensor_norm = torch.nn.functional.normalize(final_tensor, p=2, dim=-1)

print(final_tensor_norm.shape, sub_corpus_embeddings_norm.shape)

cos_sim_matrix = torch.matmul(final_tensor_norm, sub_corpus_embeddings_norm.T)

print(cos_sim_matrix.shape)

# max_cos_sim, _ = torch.max(cos_sim_matrix, dim=-1)

# print(max_cos_sim.shape)

last_query_embeddings = final_tensor[:, -1, :]
last_query_cos_sim2 = torch.matmul(last_query_embeddings, sub_corpus_embeddings.T)  # [:, 0]

print(last_query_cos_sim.shape)

curr_scores_ls = cos_sim_matrix[:, :-1]

curr_scores_ls[curr_scores_ls == 0] = 1
curr_scores_ls[curr_scores_ls < 0] = 0

prod_scores2 = torch.prod(curr_scores_ls, dim=1)

print(prod_scores.shape)

torch.Size([7405, 246, 768]) torch.Size([1, 768])
torch.Size([7405, 246, 1])
torch.Size([7405, 64])
torch.Size([7405, 64])


In [73]:
combined_column_tensor = torch.stack((prod_scores, last_query_cos_sim), dim=-1)  # shape: [7405, 64, 2]

combined_tensor = combined_column_tensor.permute(1, 0, 2)  # shape: [64, 7405, 2]

combined_tensor.shape

torch.Size([64, 7405, 2])

In [88]:
import torch

from tqdm import tqdm


def raw_score_text(final_tensor, sub_corpus_embeddings, device):
    final_tensor = final_tensor.to(device)
    sub_corpus_embeddings = sub_corpus_embeddings.reshape(64, 768).to(device)

    sub_corpus_embeddings_norm = torch.nn.functional.normalize(
        sub_corpus_embeddings, p=2, dim=1
    )
    final_tensor_norm = torch.nn.functional.normalize(final_tensor, p=2, dim=-1)
    # print(final_tensor_norm.shape, sub_corpus_embeddings_norm.shape)
  
    cos_sim_matrix = torch.matmul(final_tensor_norm, sub_corpus_embeddings_norm.T)

    last_query_embeddings = final_tensor[:, -1, :]
    last_query_cos_sim = torch.matmul(last_query_embeddings, sub_corpus_embeddings_norm.T)  # [:, 0]

    curr_scores_ls = cos_sim_matrix[:, :-1]

    curr_scores_ls[curr_scores_ls == 0] = 1
    curr_scores_ls[curr_scores_ls < 0] = 0

    prod_scores = torch.prod(curr_scores_ls, dim=1)
    combined_column_tensor = torch.stack((prod_scores, last_query_cos_sim), dim=-1)  # shape: [7405, 64, 2]

    combined_tensor = combined_column_tensor.permute(1, 0, 2)  # shape: [64, 7405, 2]
    

    return combined_tensor


def process_all_embeddings(final_tensor, all_sub_corpus_embedding_ls, device):
    # 将所有 sub_corpus_embeddings 放到一个大 tensor 中，减少内存传输
    all_sub_corpus_embeddings_tensor = torch.stack(all_sub_corpus_embedding_ls).to(
        device
    )  # [5233329, 2, 768]

    # 批处理大小
    batch_size = 64  # 根据你的 GPU 内存调整批次大小
    num_batches = len(all_sub_corpus_embedding_ls) // batch_size + 1

    all_cos_scores = []

    # 使用 DataLoader 进行批量处理
    dataset = TensorDataset(all_sub_corpus_embeddings_tensor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    for batch_idx, (sub_corpus_embeddings_batch,) in enumerate(tqdm(dataloader)):
        cos_scores_batch = raw_score_text(
            final_tensor, sub_corpus_embeddings_batch, device
        )
        # print(cos_scores_batch.shape)
        all_cos_scores.append(cos_scores_batch)  # 将结果从 GPU 移动到 CPU，防止内存溢出

    return torch.cat(all_cos_scores, dim=0)


all_cos_scores = process_all_embeddings(
    final_tensor, all_sub_corpus_embedding_ls, device
)

100%|██████████| 4/4 [00:00<00:00, 556.18it/s]


In [85]:
all_cos_scores.shape

torch.Size([256, 7405, 2])

## 准确性测试

In [5]:
all_cos_scores = []

for sub_corpus_embeddings in tqdm(all_sub_corpus_embedding_ls):
    cos_scores = []
    for query_itr in range(query_count):
        curr_query_embedding_ls = query_embeddings[query_itr]
        if type(curr_query_embedding_ls) is list:  # [3,768][1,768]
            full_curr_scores_ls = []

            for sub_q_ls_idx in range(len(curr_query_embedding_ls)):
                curr_query_embedding = curr_query_embedding_ls[sub_q_ls_idx]
                curr_scores = 1

                if curr_query_embedding.shape[0] == 1:
                    curr_scores_ls = torch.max(
                        cos_sim(
                            curr_query_embedding.to(device),
                            sub_corpus_embeddings.to(device),
                        ),
                        dim=-1,
                    )[0]
                    curr_scores = curr_scores_ls
                    full_curr_scores_ls.append(curr_scores.item())
                    continue

                curr_scores_ls = cos_sim(
                    curr_query_embedding.to(device), sub_corpus_embeddings.to(device)
                )  # , dim=-1)

                curr_scores = torch.max(torch.prod(curr_scores_ls, dim=0))
                full_curr_scores_ls.append(curr_scores.item())

            curr_scores = torch.tensor(full_curr_scores_ls)

        cos_scores.append(curr_scores)

    cos_scores = torch.stack(cos_scores)
    all_cos_scores.append(cos_scores)
all_cos_scores_tensor = torch.stack(all_cos_scores, dim=-1)

100%|██████████| 256/256 [09:12<00:00,  2.16s/it]


In [14]:
all_cos_scores_tensor

NameError: name 'all_cos_scores_tensor' is not defined

In [47]:
def raw_score_fnc_batch(final_tensor, sub_corpus_embeddings, device):
    final_tensor = final_tensor.to(device) 
    sub_corpus_embeddings = sub_corpus_embeddings.to(device) 
   
    sub_corpus_embeddings_norm = torch.nn.functional.normalize(sub_corpus_embeddings, p=2, dim=1).to(device)
    final_tensor_norm = torch.nn.functional.normalize(final_tensor, p=2, dim=-1).to(device)

    cos_sim_matrix = torch.matmul(final_tensor_norm, sub_corpus_embeddings_norm.T)

    max_cos_sim, _ = torch.max(cos_sim_matrix, dim=-1) 

    last_query_embeddings = final_tensor[:, -1, :].to(device)  # Shape: [batch_size, embedding_dim]
    # last_corpus_embedding = sub_corpus_embeddings[-1].to(device).unsqueeze(0)
    last_query_cos_sim = torch.matmul(last_query_embeddings, sub_corpus_embeddings_norm.T)

    curr_scores_ls = max_cos_sim[:, :-1]  # Exclude the last query from max similarity scores
    curr_scores_ls[curr_scores_ls == 0] = 1
    curr_scores_ls[curr_scores_ls < 0] = 0

    # Compute product for each batch
    prod_scores = torch.prod(curr_scores_ls, dim=1)

    # Concatenate product scores with last query similarity
    final_scores = torch.cat([prod_scores.unsqueeze(1), last_query_cos_sim], dim=1)  # Shape: [batch_size, 2]

    del sub_corpus_embeddings_norm, final_tensor_norm, cos_sim_matrix
    del last_query_embeddings, curr_scores_ls, prod_scores

    return final_scores


In [48]:
all_cos_scores = []

for sub_corpus_embeddings in tqdm(all_sub_corpus_embedding_ls):
    raw_score_fnc_batch(final_tensor, sub_corpus_embeddings, device)
    all_cos_scores.append(cos_scores)
all_cos_scores_tensor2 = torch.stack(all_cos_scores, dim=-1)

  0%|          | 0/256 [00:00<?, ?it/s]


IndexError: max(): Expected reduction dim 2 to have non-zero size.

In [45]:
all_cos_scores_tensor2

tensor([[[0.3493, 0.3493, 0.3493,  ..., 0.3493, 0.3493, 0.3493],
         [0.7284, 0.7284, 0.7284,  ..., 0.7284, 0.7284, 0.7284]],

        [[0.6829, 0.6829, 0.6829,  ..., 0.6829, 0.6829, 0.6829],
         [0.7197, 0.7197, 0.7197,  ..., 0.7197, 0.7197, 0.7197]],

        [[0.0366, 0.0366, 0.0366,  ..., 0.0366, 0.0366, 0.0366],
         [0.7078, 0.7078, 0.7078,  ..., 0.7078, 0.7078, 0.7078]],

        ...,

        [[0.6837, 0.6837, 0.6837,  ..., 0.6837, 0.6837, 0.6837],
         [0.6819, 0.6819, 0.6819,  ..., 0.6819, 0.6819, 0.6819]],

        [[0.6888, 0.6888, 0.6888,  ..., 0.6888, 0.6888, 0.6888],
         [0.7156, 0.7156, 0.7156,  ..., 0.7156, 0.7156, 0.7156]],

        [[0.7623, 0.7623, 0.7623,  ..., 0.7623, 0.7623, 0.7623],
         [0.7638, 0.7638, 0.7638,  ..., 0.7638, 0.7638, 0.7638]]])

In [12]:
def raw_score_text(final_tensor, sub_corpus_embeddings, device):
    final_tensor = final_tensor.to(device)
    sub_corpus_embeddings = sub_corpus_embeddings.reshape(sub_corpus_embeddings.shape[0], sub_corpus_embeddings.shape[-1]).to(device)

    sub_corpus_embeddings_norm = torch.nn.functional.normalize(
        sub_corpus_embeddings, p=2, dim=1
    )
    final_tensor_norm = torch.nn.functional.normalize(final_tensor, p=2, dim=-1)
    # print(final_tensor_norm.shape, sub_corpus_embeddings_norm.shape)
  
    cos_sim_matrix = torch.matmul(final_tensor_norm, sub_corpus_embeddings_norm.T)
    # cos_sim_matrix = 1 - torch.cdist(final_tensor_norm, sub_corpus_embeddings_norm, p=2)

    last_query_embeddings = final_tensor[:, -1, :]
    last_query_cos_sim = torch.matmul(last_query_embeddings, sub_corpus_embeddings_norm.T)  # [:, 0]

    curr_scores_ls = cos_sim_matrix[:, :-1]

    curr_scores_ls[curr_scores_ls == 0] = 1
    curr_scores_ls[curr_scores_ls < 0] = 0

    prod_scores = torch.prod(curr_scores_ls, dim=1)
    #     combined_column_tensor = torch.stack((prod_scores, last_query_cos_sim), dim=-1)  # shape: [7405, 64, 2]

    #     combined_tensor = combined_column_tensor.permute(1, 0, 2)  # shape: [64, 7405, 2]
    combined_column_tensor = torch.stack((prod_scores, last_query_cos_sim), dim=1)
    
    return combined_column_tensor


def process_all_embeddings(final_tensor, all_sub_corpus_embedding_ls, device):
    # 将所有 sub_corpus_embeddings 放到一个大 tensor 中，减少内存传输
    all_sub_corpus_embeddings_tensor = torch.stack(all_sub_corpus_embedding_ls).to(
        device
    )  # [5233329, 2, 768]

    # 批处理大小
    batch_size = 64  # 根据你的 GPU 内存调整批次大小
    num_batches = len(all_sub_corpus_embedding_ls) // batch_size + 1

    all_cos_scores = []

    # 使用 DataLoader 进行批量处理
    dataset = TensorDataset(all_sub_corpus_embeddings_tensor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    for batch_idx, (sub_corpus_embeddings_batch,) in enumerate(tqdm(dataloader)):
        cos_scores_batch = raw_score_text(
            final_tensor, sub_corpus_embeddings_batch, device
        )
        # print(cos_scores_batch.shape)
        all_cos_scores.append(cos_scores_batch)  # 将结果从 GPU 移动到 CPU，防止内存溢出

    return torch.cat(all_cos_scores, dim=-1)

In [13]:

max_len = max(len(tensor) for row in query_embeddings for tensor in row) + 1
processed_tensors = []
for row in query_embeddings:
    concatenated_tensor = torch.cat(row, dim=0)
    padding_size = max_len - concatenated_tensor.size(0)  # 计算需要填充的长度
    # pad 参数: (left, right) -> (前向填充, 后向填充)，我们在第一维填充
    padded_tensor = F.pad(concatenated_tensor, (0, 0, padding_size, 0))  # 填充到 maxlen
    processed_tensors.append(padded_tensor)  # Stack to create a tensor of shape (maxlen, 768)
final_tensor = torch.stack(processed_tensors).to(device)  # Shape: [85, maxlen, 768]


all_cos_scores_tensor3 = process_all_embeddings(
    final_tensor, all_sub_corpus_embedding_ls, device
)


100%|██████████| 4/4 [00:00<00:00, 1112.40it/s]


In [29]:
torch.allclose(all_cos_scores_tensor2.to(device), all_cos_scores_tensor.to(device), atol=1e-4)

True

In [28]:
for i in range(len(all_cos_scores_tensor)):
    if torch.allclose(all_cos_scores_tensor[i].to(device), all_cos_scores_tensor2[i].to(device), atol=1e-8) == False:
        print(i, all_cos_scores_tensor[i], all_cos_scores_tensor2[i])

6112 tensor([[7.1458e-04, 5.6004e-04, 6.7163e-04, 5.5277e-04, 7.0052e-04, 8.1139e-04,
         6.3611e-04, 6.2774e-04, 8.9557e-04, 2.7258e-03, 3.3861e-03, 4.0402e-03,
         1.8828e-03, 3.7404e-04, 9.1746e-04, 1.5563e-03, 8.9393e-04, 5.0665e-03,
         5.3028e-04, 2.3694e-03, 1.3258e-03, 7.9352e-04, 4.7992e-04, 7.3784e-04,
         7.3341e-04, 5.4741e-04, 9.0094e-04, 6.3873e-04, 9.4538e-04, 4.8560e-04,
         3.5923e-04, 6.6698e-04, 7.1406e-04, 5.0124e-04, 5.7574e-04, 3.8341e-04,
         4.4072e-04, 7.3782e-04, 8.9007e-04, 1.1099e-03, 4.7547e-04, 5.3047e-04,
         4.4848e-04, 7.8726e-04, 9.5942e-04, 6.2513e-04, 5.9103e-04, 1.0195e-03,
         8.1925e-04, 4.5063e-04, 8.1568e-04, 5.9795e-04, 3.9928e-04, 1.5620e-03,
         2.0011e-04, 2.9032e-04, 8.9621e-04, 3.2047e-04, 3.7742e-04, 1.2984e-03,
         5.8138e-04, 9.4139e-04, 4.9820e-04, 8.0688e-04, 6.0886e-04, 3.2376e-04,
         2.8058e-04, 4.2585e-04, 8.0448e-04, 6.6469e-04, 4.9975e-04, 8.5412e-04,
         9.1000e-04, 7.

In [11]:
cos_scores = []
sub_corpus_embeddings = all_sub_corpus_embedding_ls[0]
for query_itr in range(query_count):
    curr_query_embedding_ls = query_embeddings[query_itr]
    if type(curr_query_embedding_ls) is list:  # [3,768][1,768]
        full_curr_scores_ls = []

        for sub_q_ls_idx in range(len(curr_query_embedding_ls)):
            curr_query_embedding = curr_query_embedding_ls[sub_q_ls_idx]
            curr_scores = 1

            if curr_query_embedding.shape[0] == 1:
                curr_scores_ls = torch.max(
                    cos_sim(
                        curr_query_embedding.to(device),
                        sub_corpus_embeddings.to(device),
                    ),
                    dim=-1,
                )[0]
                curr_scores = curr_scores_ls
                full_curr_scores_ls.append(curr_scores.item())
                continue

            curr_scores_ls = cos_sim(
                curr_query_embedding.to(device), sub_corpus_embeddings.to(device)
            )  # , dim=-1)

            curr_scores = torch.max(torch.prod(curr_scores_ls, dim=0))
            full_curr_scores_ls.append(curr_scores.item())

        curr_scores = torch.tensor(full_curr_scores_ls)

    cos_scores.append(curr_scores)

cos_scores = torch.stack(cos_scores)

In [12]:
cos_scores

tensor([[0.2829, 0.6970],
        [0.6639, 0.7010],
        [0.0336, 0.6900],
        ...,
        [0.6611, 0.6751],
        [0.6724, 0.6932],
        [0.6914, 0.6980]])

In [13]:
def raw_score_text(final_tensor, sub_corpus_embeddings, device):
    final_tensor = final_tensor.to(device)
    sub_corpus_embeddings = sub_corpus_embeddings.to(
        device
    )  # Move sub_corpus_embeddings to device

    batch_size, num_queries, embedding_dim = final_tensor.shape

    sub_corpus_embeddings_tensor = sub_corpus_embeddings # [:-1]

    sub_corpus_embeddings_norm = torch.nn.functional.normalize(
        sub_corpus_embeddings_tensor, p=2, dim=1
    ).to(device)

    final_tensor_norm = torch.nn.functional.normalize(final_tensor, p=2, dim=-1).to(
        device
    )

    cos_sim_matrix = torch.matmul(final_tensor_norm, sub_corpus_embeddings_norm.T)

    # Compute max cosine similarity for each query across corpus embeddings
    max_cos_sim, _ = torch.max(cos_sim_matrix, dim=-1)  # Shape: [batch_size, num_queries]

    # Handle the special case for the last embedding
    last_query_embeddings = final_tensor[:, -1, :].to(device)  # Shape: [batch_size, embedding_dim]
    last_query_cos_sim = cos_sim(last_query_embeddings, sub_corpus_embeddings)[:, 0]  # Shape: [batch_size]

    # Combine scores
    curr_scores_ls = max_cos_sim[:, :-1]  # Exclude the last query from max similarity scores
    curr_scores_ls[curr_scores_ls == 0] = 1
    curr_scores_ls[curr_scores_ls < 0] = 0

    # Compute product for each batch
    prod_scores = torch.prod(curr_scores_ls, dim=1) 

    # Concatenate product scores with last query similarity
    final_scores = torch.cat([prod_scores.unsqueeze(1), last_query_cos_sim.unsqueeze(1)], dim=1)  # Shape:
    
    return final_scores


cos_scores = raw_score_text(final_tensor, sub_corpus_embeddings, device)
cos_scores

tensor([[0.2829, 0.6970],
        [0.6639, 0.7010],
        [0.0336, 0.6900],
        ...,
        [0.6611, 0.6751],
        [0.6724, 0.6932],
        [0.6914, 0.6980]], device='cuda:0')

# qrel

In [1]:
from beir.datasets.data_loader import GenericDataLoader

  from tqdm.autonotebook import tqdm


In [2]:
data_path = "/home/icml01/multi_rag/RAG/Search-in-the-Chain/data/hotpotqa/hotpotqa"

corpus, queries, qrels = GenericDataLoader(
    data_folder=data_path).load(split="test"
)

  0%|          | 0/5233329 [00:00<?, ?it/s]

IOStream.flush timed out


In [7]:
corpus['12']

{'text': 'Anarchism is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies, although several authors have defined them more specifically as institutions based on non-hierarchical free associations. Anarchism holds the state to be undesirable, unnecessary and harmful.',
 'title': 'Anarchism'}

In [3]:
queries

{'5a8b57f25542995d1e6f1371': 'Were Scott Derrickson and Ed Wood of the same nationality?',
 '5a8c7595554299585d9e36b6': 'What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?',
 '5a85ea095542994775f606a8': 'What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?',
 '5adbf0a255429947ff17385a': 'Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?',
 '5a8e3ea95542995a26add48d': 'The director of the romantic comedy "Big Stone Gap" is based in what New York city?',
 '5abd94525542992ac4f382d2': '2014 S/S is the debut album of a South Korean boy group that was formed by who?',
 '5a85b2d95542997b5ce40028': 'Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?',
 '5a87ab905542996e4f3088c1': 'The arena where the Lewiston Maineiacs played their home games can seat how many 

In [10]:
corpus['2816539']

{'text': 'Scott Derrickson (born July 16, 1966) is an American director, screenwriter and producer. He lives in Los Angeles, California. He is best known for directing horror films such as "Sinister", "The Exorcism of Emily Rose", and "Deliver Us From Evil", as well as the 2016 Marvel Cinematic Universe installment, "Doctor Strange."',
 'title': 'Scott Derrickson'}

In [8]:
qrels

{'5a8b57f25542995d1e6f1371': {'2816539': 1, '10520': 1},
 '5a8c7595554299585d9e36b6': {'33022480': 1, '804602': 1},
 '5a85ea095542994775f606a8': {'12342237': 1, '18974107': 1},
 '5adbf0a255429947ff17385a': {'9421721': 1, '20395866': 1},
 '5a8e3ea95542995a26add48d': {'41146297': 1, '5382358': 1},
 '5abd94525542992ac4f382d2': {'43665973': 1, '40901645': 1},
 '5a85b2d95542997b5ce40028': {'507437': 1, '282635': 1},
 '5a87ab905542996e4f3088c1': {'1922186': 1, '1922204': 1},
 '5a7bbb64554299042af8f7cc': {'39354179': 1, '1316127': 1},
 '5a8db19d5542994ba4e3dd00': {'215890': 1, '10244818': 1},
 '5a7166395542994082a3e814': {'28635126': 1, '163327': 1},
 '5a877e5d5542993e715abf7d': {'11835533': 1, '101813': 1},
 '5ab3b0bf5542992ade7c6e39': {'7038717': 1, '2837025': 1},
 '5ab56e32554299637185c594': {'16605491': 1, '17626850': 1},
 '5ab6d09255429954757d337d': {'14286443': 1, '43223': 1},
 '5a75e05c55429976ec32bc5f': {'10426459': 1, '95743': 1},
 '5ab3e45655429976abd1bcd4': {'38281459': 1, '531687'