In [144]:
import torch
import torch.nn as nn
import pickle
import numpy as np
import faiss

NUM_HIDDEN_NODES = 1536
NUM_HIDDEN_LAYERS = 1
DROPOUT_RATE = 0.1
CURRENT_DEVICE = "cuda:3"
# REVERSE_RANKER_PATH = "/datadrive/ruohan/rerank/train_query_50000_morepos/reverse_alpha0.5_layer1_residual1000_100_1000_0.0001_768.model"
# 1 layer 100 nearest neightbor: 0.7488
# REVERSE_RANKER_PATH = "/datadrive/ruohan/rerank/train_query_50000_morepos/reverse_alpha0.5_layer1_residual1000_100_1000_0.0001_768.model"
# Active Learning
REVERSE_RANKER_PATH = "/datadrive/ruohan/rerank/train_query_50000_morepos/reverse_alpha0.5_layer1_residual_active_learning1000_100_1000_0.0001_768.model"
# 1 layer 1000 nearest neighbor:
# REVERSE_RANKER_PATH = "/datadrive/ruohan/rerank/n_1000/reverse_alpha0.5_layer1_residual1000_100_1000_0.0001_768.model"
PASSAGE_NP_PATH = "/home/jianx/results/passage_0__emb_p__data_obj_0.pb"
PASSAGE_MAP_PATH = "/datadrive/jianx/data/annoy/100_ance_passage_map.dict"
QUERY_TRAIN_NP_PATH = "/home/jianx/results/query_0__emb_p__data_obj_0.pb"
QUERY_TEST_NP_PATH = "/home/jianx/results/test_query_0__emb_p__data_obj_0.pb"
QUERY_MAP_PATH = "/datadrive/jianx/data/annoy/100_ance_query_train_map.dict"
RERANK_TRUE_PATH = "/datadrive/jianx/data/results/rerank_search_rankings_100_100_flat.csv"
QUERY_DEV_NP_PATH = "/home/jianx/results/dev_query_0__emb_p__data_obj_0.pb"
TRAIN_RERANK_PATH = "/datadrive/jianx/data/train_data/ance_rerank_testing_rank100_nqueries50000_20000_Sep_09_19:41:09.csv"

In [145]:
# Define the network
class ResidualNet(torch.nn.Module):

    def __init__(self, embed_size):
        super(ResidualNet, self).__init__()
        
        self.input = nn.Linear(embed_size, NUM_HIDDEN_NODES)
        self.relu = nn.ReLU()
        self.normlayer = nn.LayerNorm(NUM_HIDDEN_NODES)
        self.dropout = nn.Dropout(p=DROPOUT_RATE)
        self.output = nn.Linear(NUM_HIDDEN_NODES, embed_size)

    def forward(self, x):
        identity = x
        out = x
        for i in range(NUM_HIDDEN_LAYERS):
            out = self.input(out)
            out = self.relu(out)
            out = self.normlayer(out)
            out = self.dropout(out)
            out = self.output(out)
            out += identity
#             out = self.relu(out)
        return out

    def parameter_count(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
def obj_reader(path):
    with open(path, 'rb') as handle:
        return pickle.load(handle, encoding="bytes")
def obj_writer(obj, path):
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Load ground truth ranking
def load_true_dict(k, path):
    true_dict = {}
    with open(path, "r") as file:
        for line in file:
            qid = int(line.split(",")[0])
            pid = int(line.split(",")[1])
            rank = int(line.split(",")[2])
            if rank > k:
                continue
            if pid not in true_dict.keys():
                true_dict[pid] = {}
            true_dict[pid][qid] = rank
    return true_dict
def load_true_dict_query(k, path):
    true_dict = {}
    with open(path, "r") as file:
        for line in file:
            qid = int(line.split(",")[0])
            pid = int(line.split(",")[1])
            rank = int(line.split(",")[2])
            if rank > k:
                continue
            if qid not in true_dict.keys():
                true_dict[qid] = {}
            true_dict[qid][pid] = rank
    return true_dict
def transform_np_transformation(query_np, b=500):
    n = int(query_np.shape[0]/b) + 1

    corpus_output = []
    for i in range(n):
        start = i * b
        end = (i + 1) * b
        if i == n-1:
            end = query_np.shape[0]
        q_embed = query_np[start:end,:]
        q_embed = torch.from_numpy(q_embed).to(CURRENT_DEVICE)
        corpus_output.append(reverse_ranker(q_embed).detach().cpu().numpy())
    corpus_np = np.concatenate(corpus_output[:-1])
    corpus_np = np.concatenate((corpus_np, corpus_output[-1]))
    print(corpus_np.shape)
    return corpus_np
# 6. Find n nearest queries of a passage
# 7. Compare with the groud truth
def evaluate_reverse_ranker(pred_rank, true_rank, k = 100):
    top_true = []
    top_pred = []
    for pid, qids in pred_rank.items():
        n_top_true = len(true_rank.get(pid, {}))
        temp_pred = np.fromiter(qids.values(), dtype=int)
        n_top_pred = sum((temp_pred != 0) & (temp_pred <= k))
        top_true.append(n_top_true)
        top_pred.append(n_top_pred)
    return top_true, top_pred

def generate_pred_rank(query_index, true_dict, baseline_dict, passage_embed, 
                       qid_mapping, pid_reverse_mapping, n=100, k=100):
    pid_list = list(baseline_dict.keys())
    p_embed_list = []
    all_results = {}
    print("Begin append.")
    for i, pid in enumerate(pid_list):
        if i >= n:
            break
        pid_r = pid_reverse_mapping[pid]
        p_embed = np.array(passage_embed[pid_r])
        p_embed_list.append(p_embed)
    p_embed_all = np.stack(p_embed_list)
    print("Finish append.")
    print("Begin search.")
    _, near_qids = query_index.search(p_embed_all, k)
    print("Finish search.")
    for i, pid in enumerate(pid_list):
        temp_results = {}
        for qid in near_qids[i]:
            qid = qid_mapping[qid]
            try:
                rank = true_dict[pid][qid]
            except:
                rank = 0
            temp_results[qid] = rank
        all_results[pid] = temp_results
    return all_results

N_PASSAGE = 100
TRAIN_PASSAGE = 200000
def load_train(path, N_PASSAGE, TRAIN_PASSAGE):
    with open(path) as file:
        my_dict = {}
        count = 0
        for line in file:
            count += 1
            if count <= TRAIN_PASSAGE * 100:
                continue            
            if count > (TRAIN_PASSAGE + N_PASSAGE) * 100:
                break
            tokens = line.split(",")
            pid = int(tokens[0])
            qid = int(tokens[1])
            rank = int(tokens[2].rstrip())
            if pid not in my_dict:
                my_dict[pid] = {}
            my_dict[pid][qid] = rank
    return my_dict
def load_train_dict(path):
    with open(path, "r") as file:
        pos_dict = {}
        neg_dict = {}
        count = 0
        for line in file:
            tokens = line.split(",")
            pid = int(tokens[0])
            qid = int(tokens[1])
            rank = int(tokens[2].rstrip())
            if rank == 0:
                if pid not in neg_dict:
                    neg_dict[pid] = {}
                neg_dict[pid][qid] = 200
            else:
                if pid not in pos_dict:
                    pos_dict[pid] = {}
                pos_dict[pid][qid] = rank
    return pos_dict, neg_dict
def count_unique_queries(train_pos, train_neg):
    unique_queries = set()
    for pid, qids in train_pos.items():
        for qid, rank in qids.items():
            unique_queries.add(qid)
    for pid, qids in train_neg.items():
        for qid, rank in qids.items():
            unique_queries.add(qid)
    print(len(unique_queries))
    return unique_queries
def compare_with_baseline(query_index, true_dict_100, forward_baseline_rank, passage_embed, 
                          qid_mapping, pid_reverse_mapping,n):
    pred_rank = generate_pred_rank(query_index, true_dict_100, forward_baseline_rank, 
                                   passage_embed, qid_mapping, pid_reverse_mapping, n=n)
    top_true, top_pred = evaluate_reverse_ranker(pred_rank, true_dict_100, k = 100)
    print("New model: {}".format(np.mean(top_pred)/np.mean(top_true)))
    top_true_baseline, top_pred_baseline = evaluate_reverse_ranker(forward_baseline_rank, true_dict_100, k = 100)
    print("Baseline model: {}".format(np.mean(top_pred_baseline)/np.mean(top_true_baseline)))
    return top_true, top_pred, top_true_baseline, top_pred_baseline, pred_rank
def delete_zeros(myDict):
    out_dict = {key:val for key, val in myDict.items() if val != 0}
    return set(list(out_dict.keys()))
def compare_specific_passage(pred_rank_test1, forward_baseline_rank_test1, n):
    count = 0
    count_loss = 0
    all_count = 0
    all_count_loss = 0
    for i in range(n):
        pid = list(pred_rank_test1.keys())[i]
        pred = delete_zeros(pred_rank_test1[pid])
        baseline = delete_zeros(forward_baseline_rank_test1[pid])
        diff = pred - baseline
        if diff != set():
            count += 1
            all_count += len(diff)
#             print("Newly found {}".format(diff))
        diff_loss = baseline - pred
        if diff_loss != set():
            count_loss += 1
            all_count_loss += len(diff_loss)
#             print("Lose {}".format(diff_loss))
    print("Percentage of passages with newly found exposing queries: {}".format(count/n))
    print("Percentage of passages that lose originaly found exposing queries: {}".format(count_loss/n))
    print("Number of newly found exposing queries:{} Number of lost exposing queries:{} Net gain:{}".format(all_count, all_count_loss, all_count - all_count_loss))

In [146]:
checkpoint = torch.load(REVERSE_RANKER_PATH)
reverse_ranker = ResidualNet(embed_size=768)
reverse_ranker.load_state_dict(checkpoint['model'])
reverse_ranker.to(CURRENT_DEVICE)
reverse_ranker.eval()

ResidualNet(
  (input): Linear(in_features=768, out_features=1536, bias=True)
  (relu): ReLU()
  (normlayer): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (output): Linear(in_features=1536, out_features=768, bias=True)
)

In [6]:
print("Load passages.")
passage_np = obj_reader(PASSAGE_NP_PATH)
pid_mapping = obj_reader(PASSAGE_MAP_PATH)
print("Load queries.")
query_np = obj_reader(QUERY_TRAIN_NP_PATH)
qid_mapping = obj_reader(QUERY_MAP_PATH)
print("Load pre-processed results.")
true_rerank_dict_100 = load_true_dict(100, RERANK_TRUE_PATH)
pid_reverse_mapping = {v: k for k, v in pid_mapping.items()}

Load passages.
Load queries.
Load pre-processed results.


In [147]:
query_new_np = transform_np_transformation(query_np)
passage_new_np = transform_np_transformation(passage_np)

(502939, 768)
(8841823, 768)


In [148]:
# 5. Add the new numpy array to Flatindex
dim = query_new_np.shape[1]
query_index = faiss.IndexFlatIP(dim)
query_index.add(query_new_np)

## Train on 1000 nearest neighbor

In [112]:
forward_baseline_rank_test_rerank = load_train(TRAIN_RERANK_PATH, N_PASSAGE = 20000, TRAIN_PASSAGE = 0)
top_true_test_rerank, top_pred_test_rerank, top_true_baseline_test_rerank, top_pred_baseline_test_rerank, pred_rank_test_rerank = compare_with_baseline(query_index, true_rerank_dict_100, forward_baseline_rank_test_rerank, passage_new_np, qid_mapping, pid_reverse_mapping, n=20000)
compare_specific_passage(pred_rank_test_rerank, forward_baseline_rank_test_rerank, n=20000)

Begin append.
Finish append.
Begin search.
Finish search.
New model: 0.716904768535279
Baseline model: 0.6476649307972931
Percentage of passages with newly found exposing queries: 0.37325
Percentage of passages that lose originaly found exposing queries: 0.20745
Number of newly found exposing queries:13866 Number of lost exposing queries:6407 Net gain:7459


In [113]:
train_pos_dict, train_neg_dict = load_train_dict("/datadrive/jianx/data/train_data/ance_rerank_training_rank100_nqueries50000_200000_Sep_09_19:41:09.csv")
unique_queries = count_unique_queries(train_pos_dict, train_neg_dict)
print("{} queries have been seen by the reverse ranker during training.".format(len(unique_queries)))
expose = count_unique_queries(pred_rank_test_rerank, pred_rank_test_rerank)
expose_append = count_unique_queries(pred_rank_test_rerank, pred_rank_test_rerank)
print("{}/{} of the exposing queries have not seen by the reverse ranker while training.".format(len(expose_append
       - unique_queries), len(expose_append)))

49769
49769 queries have been seen by the reverse ranker during training.
393902
393902
354228/393902 of the exposing queries have not seen by the reverse ranker while training.


## Train on 100 nearest neighbor

In [119]:
forward_baseline_rank_test_rerank = load_train(TRAIN_RERANK_PATH, N_PASSAGE = 20000, TRAIN_PASSAGE = 0)
top_true_test_rerank, top_pred_test_rerank, top_true_baseline_test_rerank, top_pred_baseline_test_rerank, pred_rank_test_rerank = compare_with_baseline(query_index, true_rerank_dict_100, forward_baseline_rank_test_rerank, passage_new_np, qid_mapping, pid_reverse_mapping, n=20000)
compare_specific_passage(pred_rank_test_rerank, forward_baseline_rank_test_rerank, n=20000)

Begin append.
Finish append.
Begin search.
Finish search.
New model: 0.7488094906569385
Baseline model: 0.6476649307972931
Percentage of passages with newly found exposing queries: 0.4022
Percentage of passages that lose originaly found exposing queries: 0.16355
Number of newly found exposing queries:15758 Number of lost exposing queries:4862 Net gain:10896


In [120]:
train_pos_dict, train_neg_dict = load_train_dict("/datadrive/jianx/data/train_data/ance_rerank_training_rank100_nqueries50000_200000_Sep_09_19:41:09.csv")
unique_queries = count_unique_queries(train_pos_dict, train_neg_dict)
print("{} queries have been seen by the reverse ranker during training.".format(len(unique_queries)))
expose = count_unique_queries(pred_rank_test_rerank, pred_rank_test_rerank)
expose_append = count_unique_queries(pred_rank_test_rerank, pred_rank_test_rerank)
print("{}/{} of the exposing queries have not seen by the reverse ranker while training.".format(len(expose_append
       - unique_queries), len(expose_append)))

49769
49769 queries have been seen by the reverse ranker during training.
410181
410181
368930/410181 of the exposing queries have not seen by the reverse ranker while training.


# Redundant

In [149]:
forward_baseline_rank_test_rerank = load_train(TRAIN_RERANK_PATH, N_PASSAGE = 20000, TRAIN_PASSAGE = 0)
top_true_test_rerank, top_pred_test_rerank, top_true_baseline_test_rerank, top_pred_baseline_test_rerank, pred_rank_test_rerank = compare_with_baseline(query_index, true_rerank_dict_100, forward_baseline_rank_test_rerank, passage_new_np, qid_mapping, pid_reverse_mapping, n=20000)
compare_specific_passage(pred_rank_test_rerank, forward_baseline_rank_test_rerank, n=20000)

Begin append.
Finish append.
Begin search.
Finish search.
New model: 0.7592804032415272
Baseline model: 0.6476649307972931
Percentage of passages with newly found exposing queries: 0.4078
Percentage of passages that lose originaly found exposing queries: 0.1403
Number of newly found exposing queries:16101 Number of lost exposing queries:4077 Net gain:12024


In [151]:
train_pos_dict, train_neg_dict = load_train_dict("/datadrive/jianx/data/train_data/ance_rerank_training_rank100_nqueries50000_200000_Sep_09_19:41:09.csv")
unique_queries = count_unique_queries(train_pos_dict, train_neg_dict)
print("{} queries have been seen by the reverse ranker during training.".format(len(unique_queries)))
expose = count_unique_queries(pred_rank_test_rerank, pred_rank_test_rerank)
expose_append = count_unique_queries(pred_rank_test_rerank, pred_rank_test_rerank)
print("{}/{} of the exposing queries have not seen by the reverse ranker while training.".format(len(expose_append
       - unique_queries), len(expose_append)))

49769
49769 queries have been seen by the reverse ranker during training.
419111
419111
376621/419111 of the exposing queries have not seen by the reverse ranker while training.


In [46]:
train_pos_dict, train_neg_dict = load_train_dict("/datadrive/jianx/data/train_data/ance_rerank_training_rank100_nqueries50000_200000_Sep_09_19:41:09.csv")
unique_queries = count_unique_queries(train_pos_dict, train_neg_dict)
print("{} queries have been seen by the reverse ranker during training.".format(len(unique_queries)))
expose = count_unique_queries(pred_rank_test_rerank, pred_rank_test_rerank)
expose_append = count_unique_queries(pred_rank_test_rerank, pred_rank_test_rerank)
print("{}/{} of the exposing queries have not seen by the reverse ranker while training.".format(len(expose_append
       - unique_queries), len(expose_append)))

49769
49769 queries have been seen by the reverse ranker during training.
192729
192729
173545/192729 of the exposing queries have not seen by the reverse ranker while training.


In [17]:
RESULT_PATH = "/datadrive/ruohan/reverse_ranker/residual/train_query_50000_morepos_layer1_reludel/"
obj_writer(forward_baseline_rank_test_rerank, RESULT_PATH + "forward_baseline_rank_test.pickle")
obj_writer(top_true_baseline_test_rerank, RESULT_PATH + "top_true_test.pickle")
obj_writer(top_pred_test_rerank, RESULT_PATH + "top_pred_test.pickle")
obj_writer(top_true_baseline_test_rerank, RESULT_PATH + "top_true_baseline_test.pickle")
obj_writer(top_pred_baseline_test_rerank, RESULT_PATH + "top_pred_baseline_test.pickle")
obj_writer(pred_rank_test_rerank, RESULT_PATH + "pred_rank_test.pickle")

# Generate new training data

In [19]:
true_query_dict = load_true_dict_query(100, RERANK_TRUE_PATH)

In [63]:
train_pos_dict, train_neg_dict = load_train_dict("/datadrive/jianx/data/train_data/ance_rerank_training_rank100_nqueries50000_200000_Sep_09_19:41:09.csv")

In [71]:
passages_list = list(train_neg_dict.keys())

In [72]:
passages_list_wrong = [pid_reverse_mapping[i] for i in passages_list]

In [17]:
qid_mapping = obj_reader(QUERY_MAP_PATH)
queries_list = [qid_mapping[i] for i in range(50000)]

In [30]:
pids_set = set(passages_list)
for qid in queries_list:
    temp_dict = true_query_dict.get(qid, {})
    if temp_dict == {}:
        continue
    temp_list = list(temp_dict.keys())
    temp_list = list(set(temp_list) & pids_set)
    for pid in temp_list:
        if pid not in train_pos_dict:
            train_pos_dict[pid] = {}
        train_pos_dict[pid][qid] = temp_dict[pid]

In [31]:
len(train_pos_dict)

199541

In [34]:
obj_writer(train_pos_dict, "/datadrive/ruohan/data/ance_rerank_training_rank100_nqueries50000_200000_Sep_09_19:41:09.dict")

In [33]:
count = 0
for pid, qids in train_pos_dict.items():
    if len(qids) > 100:
        print(qids)
    if len(qids) < 100:
        print(qids)
    count += 1
    if count > 100:
        break

{819199: 95}
{685143: 42}
{1146692: 36}
{4882: 13, 449760: 90}
{528374: 82}
{1184718: 17}
{208476: 8}
{908613: 41}
{876069: 69}
{625050: 43}
{52094: 13, 848198: 36, 46066: 64}
{698422: 38}
{100672: 83}
{35927: 11}
{317681: 75, 49309: 62}
{1034791: 34}
{818920: 8, 598248: 84, 187194: 58}
{587725: 25}
{314854: 13, 679448: 74}
{44940: 89}
{610096: 6}
{92138: 29}
{594595: 43}
{406672: 75, 522164: 66}
{880291: 79}
{806193: 42}
{778438: 10}
{744260: 6, 721554: 36}
{101778: 47, 316768: 10, 1151400: 2, 307312: 61}
{220349: 57}
{378102: 66}
{211259: 15, 965609: 58}
{932358: 18}
{375722: 11, 255082: 99}
{1157681: 94, 697642: 93}
{917168: 14, 1022758: 20}
{1043987: 38, 367571: 50}
{1144744: 22}
{819979: 63}
{664458: 25}
{86270: 63}
{209704: 41}
{321825: 14, 436538: 24}
{903066: 21}
{941879: 60, 336724: 94}
{208833: 94}
{770138: 40, 1023556: 82}
{511782: 21, 200375: 22, 172932: 62}
{208614: 68}
{1007897: 29, 608398: 50}
{742580: 74}
{576345: 90}
{135915: 23}
{660379: 11, 99700: 43, 660587: 68}
{57

In [20]:
# Generate training data for forward ranker
# to learn the BM25 filtering
true_full_query_100 = load_true_dict_query(100, "/datadrive/jianx/data/results/all_search_rankings_100_100_flat.csv")

In [21]:
len(true_full_query_100) - len(true_query_dict)

1706

In [31]:
with open("/datadrive/ruohan/data/rerank_learnBM25.csv",'w') as f:
    for qid in queries_list:
        if qid not in true_query_dict:
            continue
        pids = list(true_full_query_100[qid].keys())
        for pid in pids:
            rank = true_query_dict[qid].get(pid, 0)
            f.write('{},{},{}\n'.format(qid, pid, rank))

In [32]:
with open("/datadrive/ruohan/data/rerank_learnBM25_morepos.csv",'w') as f:
    for qid in queries_list:
        if qid not in true_query_dict:
            continue
        pids = list(true_full_query_100[qid].keys())
        temp_dict = true_query_dict[qid]
        out_pos_pids = list(set(list(temp_dict.keys()))-set(pids))
        for pid in pids:
            rank = temp_dict.get(pid, 0)
            f.write('{},{},{}\n'.format(qid, pid, rank))
        for pidout in out_pos_pids:
            rankout = temp_dict.get(pidout)
            f.write('{},{},{}\n'.format(qid, pidout, rankout))

In [105]:
def generate_index(obj_np):
    dim = obj_np.shape[1]
    out_index = faiss.IndexFlatIP(dim)
    out_index.add(obj_np)
    return out_index
def generate_ground_truth_true_id(out_index, test_np, qid_mapping, pid_mapping, passages_list, true_dict, k=100):
    _, near_pids = out_index.search(test_np, k)
    results = {}
    for i, pid in enumerate(passages_list):
        if pid not in results:
            results[pid] = {}
        for qid in near_pids[i,:]:
            qid_true = qid_mapping[qid]
            try:
                rank = true_dict[qid_true][pid]
            except:
                rank = 0
            if rank > 0:
                print(rank)
            results[pid][qid_true] = rank
    return results

In [None]:
query_partial_index = generate_index(query_np[:50000,:])
passage_selected_np = passage_np[passages_list_wrong,:]

In [106]:
train_data_1000 = generate_ground_truth_true_id(query_partial_index, passage_selected_np, qid_mapping, pid_mapping,
                                                passages_list,true_rerank_dict_100,1000)

93
41
72
34
90
21
64
19
41
63
56
6
64
59
63
97
63
89
8
79
89
84
37
45
82
7
35
39
47
67
1
32
92
67
44
86
61
62
54
32
34
77
1
38
2
7
17
27
89
93
93
6
50
73
67
88
29
78
9
88
17
12
33
44
44
44
19
75
21
90
14
25
59
62
86
62
9
16
72
59
56
20
89
35
70
89
79
35
79
33
81
25
93
83
64
28
55
16
78
97
78
36
29
88
87
21
16
44
97
91
34
23
51
26
17
69
82
100
51
68
86
10
88
9
91
83
46
78
22
34
50
71
63
51
48


In [81]:
with open("/datadrive/ruohan/data/rerank_top1000_train.csv",'w') as f:
    for qid, results in train_data_1000.items():
        for pid, rank in results.items():
            f.write('{},{},{}\n'.format(pid, qid, rank))

## Generate training data from current model

In [124]:
forward_baseline_rank_test_rerank = load_train("/datadrive/jianx/data/train_data/ance_rerank_training_rank100_nqueries50000_200000_Sep_09_19:41:09.csv", N_PASSAGE = 200000, TRAIN_PASSAGE = 0)
top_true_test_rerank, top_pred_test_rerank, top_true_baseline_test_rerank, top_pred_baseline_test_rerank, pred_rank_test_rerank = compare_with_baseline(query_index, true_rerank_dict_100, forward_baseline_rank_test_rerank, passage_new_np, qid_mapping, pid_reverse_mapping, n=200000)
compare_specific_passage(pred_rank_test_rerank, forward_baseline_rank_test_rerank, n=200000)

Begin append.
Finish append.
Begin search.
Finish search.
New model: 0.7330567000369039
Baseline model: 0.1340413618659141
Percentage of passages with newly found exposing queries: 0.92135
Percentage of passages that lose originaly found exposing queries: 0.11285
Number of newly found exposing queries:125691 Number of lost exposing queries:2844 Net gain:122847


In [134]:
new_query_index = generate_index(query_new_np[:50000,:])

In [137]:
p_true_list = list(pred_rank_test_rerank.keys())
p_fake_list = [pid_reverse_mapping[i] for i in p_true_list]
new_train_passages = passage_new_np[p_fake_list,:]

In [138]:
_, new_near = new_query_index.search(new_train_passages, 100)

In [142]:
obj_writer(query_new_np, "/datadrive/ruohan/data/active_query_np.pb")

In [143]:
obj_writer(passage_new_np, "/datadrive/ruohan/data/active_passage_np.pb")