In [6]:
relevant = {
    0 : [225007,
345673,
125657,
40553,
386423,
108338,
18062,
210591,
377723,
376919,
216657,
247154,
40352,
40444,
282843,
336769,
234002,
83792,
139208,
96798,
73057],
    1 : [
        295121,
295097,
294635,
294662,
294736,
294635,
295166,
4762,
4788,
4801,
4804,
4807,
4816,
4875,
4885,
4894,
4653,
4603,
4617,
97858,
97893,
97967,
97971,
97974,
    ]
}

In [7]:
import numpy as np 
import requests 
import simplejson 

RERANK = 50

queries = ["I will kill you", "Freddy Krueger Nightmare on Elm Street"]

for q_id, query in enumerate(queries):
    url = "http://localhost:8983/solr/conversations/select"
    url += "?rows=50"
    url += "&q.op=OR"
    url += "&q={0}".format(query)
    url += "&wt=json"
    url += "&defType=edismax"
    url += "&qf=transcript^5.0 movie^3.0 actors^2.0 characters^1.0"
    url += "&fl=id,transcript, score, imdb_votes, imdb_rating, movie, [features]" 
    url += "&rq={{!ltr model=myModel reRankDocs=100 efi.text={0}}}".format(query)
    
    response = requests.request("GET", url)
    try:
        json = simplejson.loads(response.text)
    except simplejson.JSONDecodeError:
        print(q_id)
    
    if "error" in json:
        print(q_id)
        
    # extract the features
    results_features = []
    results_targets = []
    results_ranks = [] # holds the ranks of relevant documents 
    add_data = False
    
    for (rank, document) in enumerate(json["response"]["docs"]):
        
        features = document["[features]"].split(",")
        feature_array = []
        for feature in features:
            feature_array.append(feature.split("=")[1])
        
        feature_array = np.array(feature_array, dtype="float32")
        results_features.append(feature_array)
        
        doc_id = document["id"]
        doc_id = int(doc_id)

        # Check if the document is relevant
        if doc_id in relevant[q_id]:
            results_ranks.append(rank+1)
            results_targets.append(1)
            add_data = True
        else:
            results_targets.append(0)
    
    print(results_features)
    print(results_targets)
    print(results_ranks)
    
    if add_data:
        np.save("{0}_X.npy".format(q_id), np.array(results_features))
        np.save("{0}_y.npy".format(q_id), np.array(results_targets))
        np.save("{0}_rank.npy".format(q_id), np.array(results_ranks))
    

[array([ 0.25369245, 27.015167  ], dtype=float32), array([ 0.04132334, 25.256144  ], dtype=float32), array([8.5022049e-03, 2.4017912e+01], dtype=float32), array([ 0.4208317, 23.914581 ], dtype=float32), array([ 0.2251236, 23.802486 ], dtype=float32), array([ 0.25981688, 23.717548  ], dtype=float32), array([ 0.37222487, 23.473614  ], dtype=float32), array([ 0.2251236, 23.403242 ], dtype=float32), array([ 0.08303542, 23.247538  ], dtype=float32), array([ 0.03800593, 22.811167  ], dtype=float32), array([ 0.05607639, 22.71423   ], dtype=float32), array([1.7686494e-02, 2.2644855e+01], dtype=float32), array([ 0.07138513, 22.476604  ], dtype=float32), array([ 0.02666807, 22.359653  ], dtype=float32), array([1.9644506e-02, 2.2139509e+01], dtype=float32), array([ 0.02393734, 21.995106  ], dtype=float32), array([ 0.09833939, 21.995106  ], dtype=float32), array([ 0.19448943, 21.904963  ], dtype=float32), array([ 0.19095498, 21.681314  ], dtype=float32), array([ 0.1250623, 21.56304  ], dtype=float

In [8]:
import glob
import numpy as np

rank_files = glob.glob("*_rank.npy")
suffix_len = len("_rank.npy")

RERANK = 50

ranks = []
casenumbers = []
Xs = []
ys = []

for rank_file in rank_files:
    X = np.load(rank_file[:-suffix_len] + "_X.npy") # simply load the features . Example 0_X.npy
    casenumbers.append(rank_file[:-suffix_len]) # get the case number
    
    if X.shape[0] != RERANK:
        print(rank_file[:-suffix_len])
        continue
    
    print(rank_file)
    print(np.load(rank_file))
    rank = np.load(rank_file)[0] # load the rank index
    ranks.append(rank)
    
    y = np.load(rank_file[:-suffix_len] + "_y.npy")
    Xs.append(X)
    ys.append(y)

print("RANKS:" , ranks)
ranks = np.array(ranks)
total_queries = len(ranks)
print("Total queries:", total_queries)
print("Top 1: {0}".format((ranks == 1).sum() / total_queries))
print("Top 3: {0}".format((ranks <= 3).sum() / total_queries))
print("Top 5: {0}".format((ranks <= 5).sum() / total_queries))
print("Top 10: {0}".format((ranks <= 10).sum() / total_queries))    

0_rank.npy
[ 1  2 13 14 18 20 25 50]
1_rank.npy
[ 3  4  9 11 12 13 14 15 16 17 18 21 22 24 26 28 32 36 46]
RANKS: [1, 3]
Total queries: 2
Top 1: 0.5
Top 3: 1.0
Top 5: 1.0
Top 10: 1.0


In [9]:
from scipy.stats import rankdata
from sklearn.svm import LinearSVC

X = np.concatenate(Xs,0)

y = np.concatenate(ys)


train_per = 0.8

train_cutoff = int(train_per * len(ranks)) * RERANK

train_X = X[:train_cutoff]

train_y = y[:train_cutoff]

test_X = X[train_cutoff:]

test_y = y[train_cutoff:]

print("TEST Y:", test_y)

model = LinearSVC()
model.fit(train_X, train_y)
preds = model._predict_proba_lr(test_X)


#print(preds[0:50,1]) # print the probability of being relevant (?)

n_test = int(len(test_y) / RERANK)
new_ranks = []
for i in range(n_test):
    start = i * RERANK
    end = start + RERANK
    scores = preds[start:end,1] # get the probability of being relevant
    score_ranks = rankdata(-scores) # rank the scores [1,2,3,...]
    old_rank = np.argmax(test_y[start:end]) 
    print("OLD RANK:", old_rank)
    new_rank = score_ranks[old_rank]
    new_ranks.append(new_rank)    

print("NEW RANKs:", new_ranks)
new_ranks = np.array(new_ranks)
print("Total queries:", n_test)
print("Top 1: {0}".format((new_ranks == 1).sum() / n_test))
print("Top 3: {0}".format((new_ranks <= 3).sum() / n_test))
print("Top 5: {0}".format((new_ranks <= 5).sum() / n_test))
print("Top 10: {0}".format((new_ranks <= 10).sum() / n_test))

TEST Y: [0 0 1 1 0 0 0 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 0 0 0]
OLD RANK: 2
NEW RANKs: [3.0]
Total queries: 1
Top 1: 0.0
Top 3: 1.0
Top 5: 1.0
Top 10: 1.0




In [10]:
Xs = []
for rank_file in rank_files:
    X = np.load(rank_file[:-suffix_len] + "_X.npy")
    if X.shape[0] != RERANK:
        print(rank_file[:-suffix_len])
        continue
    
    rank = np.load(rank_file)[0]
    pos_example = X[rank-1]
    
    for (i,neg_example) in enumerate(X):
        if i == rank-1:
            continue
        Xs.append(np.concatenate([pos_example, neg_example]))
    

X = np.stack(Xs)
dim = int(X.shape[1] / 2)

train_per = 0.8
train_cutoff = int(train_per * len(ranks)) * (RERANK-1)

train_X = X[:train_cutoff]
train_y = X[train_cutoff:]

In [12]:
#!pip install keras
from keras import backend 
from keras.callbacks import ModelCheckpoint
from keras.layers import Activation, Add, Dense, Input, Lambda
from keras.models import Model

y = np.ones((train_X.shape[0],1))

INPUT_DIM = 5
h_1_dim = 64
h_2_dim = h_1_dim // 2
h_3_dim = h_2_dim // 2

# Model 
h_1 = Dense(h_1_dim, activation="relu")
h_2 = Dense(h_2_dim, activation="relu")
h_3 = Dense(h_3_dim, activation="relu")
s = Dense(1)

# Relevant document score
rel_doc = Input(shape=(INPUT_DIM,), dtype="float32")
h_1_rel = h_1(rel_doc)
h_2_rel = h_2(h_1_rel)
h_3_rel = h_3(h_2_rel)
rel_score = s(h_3_rel)

# Irrelevant document score
irrel_doc = Input(shape=(INPUT_DIM,), dtype="float32")
h_1_irrel = h_1(irrel_doc)
h_2_irrel = h_2(h_1_irrel)
h_3_irrel = h_3(h_2_irrel)
irrel_score = s(h_3_irrel)

# Subtract scores
negated_irrel_score = Lambda(lambda x: -1*x, output_shape=(1,))(irrel_score)
diff = Add()([rel_score, negated_irrel_score])

# Pass difference through sigmoid function
prob = Activation("sigmoid")(diff)

# Build model
model = Model(inputs=[rel_doc, irrel_doc], outputs=prob)
model.compile(optimizer="adagrad", loss="binary_crossentropy")

ModuleNotFoundError: No module named 'tensorflow'