In [None]:
import pickle
import pandas as pd
from pyspark.ml.linalg import SparseVector
import numpy as np

In [None]:
def contentTF(row):
    """Return the dot product of the query and document content TF vectors.

    Reads ``row.qContentTF`` and ``row.contentTF`` and returns
    ``qContentTF.dot(contentTF)``.
    """
    query_tf = row.qContentTF
    doc_tf = row.contentTF
    return query_tf.dot(doc_tf)

def contentTFIDF(row):
    """Return the content TF-IDF score as a product of two precomputed scores.

    Multiplies ``row.contentTFScore`` by ``row.contentIDFScore``; both values
    must already be present on the row.

    NOTE(review): this is the product of two aggregated scores, which is not
    the same quantity as the per-term sum ``row.contentTF.dot(row.qContentIDF)``.
    Confirm that the product is the intended definition.
    """
    return row.contentTFScore * row.contentIDFScore

def titleTFIDF(row):
    """Return the title TF-IDF score as a product of two precomputed scores.

    Multiplies ``row.titleTFScore`` by ``row.titleIDFScore``; both values must
    already be present on the row.

    NOTE(review): this is the product of two aggregated scores, which is not
    the same quantity as the per-term sum ``row.titleTF.dot(row.qTitleIDF)``.
    Confirm that the product is the intended definition.
    """
    return row.titleTFScore * row.titleIDFScore

def titleTF(row):
    """Return the dot product of the query and document title TF vectors.

    Reads ``row.qTitleTF`` and ``row.titleTF`` and returns
    ``qTitleTF.dot(titleTF)``.
    """
    query_tf = row.qTitleTF
    doc_tf = row.titleTF
    return query_tf.dot(doc_tf)

def titleIDF(row):
    """Return the dot product of the title hot vector and the query title IDF.

    ``row.titleHotVec`` is indexed with ``[0]`` to obtain the vector, which is
    then dotted with ``row.qTitleIDF``.
    """
    hot_vector = row.titleHotVec[0]
    return hot_vector.dot(row.qTitleIDF)

def contentIDF(row):
    """Return the dot product of the content hot vector and the query content IDF.

    ``row.contentHotVec`` is indexed with ``[0]`` to obtain the vector, which
    is then dotted with ``row.qContentIDF``.
    """
    hot_vector = row.contentHotVec[0]
    return hot_vector.dot(row.qContentIDF)

def to_numpy(input_vector, mul_hot_vector):
    """Collect the values of ``input_vector`` stored at indices of ``mul_hot_vector``.

    Both arguments must expose ``.indices``; ``input_vector`` must also expose
    ``.values`` aligned with its ``.indices``.

    Returns:
        A numpy array holding, in ``input_vector`` order, every value whose
        index also appears in ``mul_hot_vector.indices``.
    """
    allowed = set(mul_hot_vector.indices)
    kept = [
        value
        for index, value in zip(input_vector.indices, input_vector.values)
        if index in allowed
    ]
    return np.asarray(kept)

def contentBM25(row, b, k1):
    """Compute the BM25 score of a document's content for the row's query.

    The term frequencies and IDF weights are taken from ``row.contentTF`` and
    ``row.contentIDF``, restricted via ``to_numpy`` to the indices of
    ``row.contentHotVec[0]``. The score is::

        sum(idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * length / avg_length)))

    where ``length`` is ``row.contentLength`` and ``avg_length`` is read from
    the module-level ``mean_doc_df.avgContentSize[0]``.

    Args:
        row: record exposing contentTF, contentIDF, contentHotVec, contentLength.
        b: BM25 ``b`` parameter.
        k1: BM25 ``k1`` parameter.
    """
    query_terms = row.contentHotVec[0]
    term_freqs = to_numpy(row.contentTF, query_terms)
    term_idfs = to_numpy(row.contentIDF, query_terms)

    # Document length relative to the average length stored in mean_doc_df.
    length_ratio = row.contentLength/mean_doc_df.avgContentSize[0]
    boosted_tf = term_freqs * (k1 + 1)
    saturation = term_freqs + k1 * (1 - b + (b * length_ratio))

    return (term_idfs * (boosted_tf / saturation)).sum()

def titleBM25(row, b, k1):
    """Compute the BM25 score of a document's title for the row's query.

    The term frequencies and IDF weights are taken from ``row.titleTF`` and
    ``row.titleIDF``, restricted via ``to_numpy`` to the indices of
    ``row.titleHotVec[0]``. The score is::

        sum(idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * length / avg_length)))

    where ``length`` is ``row.titleLength`` and ``avg_length`` is read from
    the module-level ``mean_doc_df.avgTitleSize[0]``.

    Args:
        row: record exposing titleTF, titleIDF, titleHotVec, titleLength.
        b: BM25 ``b`` parameter.
        k1: BM25 ``k1`` parameter.
    """
    query_terms = row.titleHotVec[0]
    term_freqs = to_numpy(row.titleTF, query_terms)
    term_idfs = to_numpy(row.titleIDF, query_terms)

    # Title length relative to the average length stored in mean_doc_df.
    length_ratio = row.titleLength/mean_doc_df.avgTitleSize[0]
    boosted_tf = term_freqs * (k1 + 1)
    saturation = term_freqs + k1 * (1 - b + (b * length_ratio))

    return (term_idfs * (boosted_tf / saturation)).sum()


In [None]:
# Load the precomputed inputs from pickle files in the working directory.
# mean_doc_df supplies the avgContentSize / avgTitleSize values read by
# contentBM25 / titleBM25; score_df is the per-document frame that is merged
# with the query/document pairs on "docID" further down in this cell.
# NOTE: unpickling can execute arbitrary code -- only load files from a
# trusted source.
mean_doc_df = pd.read_pickle("mean_doc")
score_df = pd.read_pickle("clueweb-df-with-pagerank")

def to_sparse(row, column):
    """Build a multi-hot SparseVector mirroring the sparsity of ``row[column]``.

    The result has the same size and indices as the source vector, with every
    stored value set to 1.0.

    Returns:
        A one-element list containing the new SparseVector.
    """
    source = row[column]
    ones = [1.0] * len(source.indices)
    return [SparseVector(source.size, source.indices, ones)]

def get_doc2query(path):
    """Read a relevance-judgement file into a query/document DataFrame.

    Every non-blank line must contain exactly four whitespace-separated
    fields: ``query_id <ignored> doc_id rel_score``. Blank lines are skipped.
    A doc id that has already appeared on an earlier line is printed, so
    documents judged more than once are visible in the cell output.

    Args:
        path: location of the judgement file.

    Returns:
        DataFrame with columns ``queryID`` (int), ``docID`` (str) and
        ``relScore`` (str, exactly as read), one row per line in file order.

    Raises:
        ValueError: if a non-blank line does not have four fields or its
            query id is not an integer.
    """
    records = []
    # Track doc ids in a set: the rows are [query_id, doc_id, rel_score]
    # lists, so testing a bare doc_id against the row list can never match.
    seen_doc_ids = set()
    with open(path, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            query_id, _, doc_id, rel_score = fields
            if doc_id in seen_doc_ids:
                print(doc_id)
            seen_doc_ids.add(doc_id)
            records.append([int(query_id), doc_id, rel_score])
    return pd.DataFrame(records, columns=["queryID", "docID", "relScore"])

# Prepare query_df: keep the id and the four query vectors, renamed with a
# "q" prefix because the document frame has columns with the same unprefixed
# names. The explicit .copy() makes the subset an independent frame, so the
# column assignments below do not trigger SettingWithCopyWarning.
query_df = pd.read_pickle("query_df")
query_df = query_df[['id','contentIDF', "titleIDF", 'contentTF', "titleTF"]].copy()
query_df.columns = ["queryID", "qContentIDF", "qTitleIDF", "qContentTF", "qTitleTF"]
# queryID must be an int to match the int queryID produced by get_doc2query.
query_df['queryID'] = query_df['queryID'].apply(int)

# Add multi-hot vectors for the queries, then the IDF scores that depend on them.
query_df["titleHotVec"] = query_df.apply(lambda x: to_sparse(x, "qTitleIDF"), axis=1)
query_df["contentHotVec"] = query_df.apply(lambda x: to_sparse(x, "qContentIDF"), axis=1)
query_df["contentIDFScore"] = query_df.apply(contentIDF, axis=1)
query_df["titleIDFScore"] = query_df.apply(titleIDF, axis=1)


# Get a dataframe with the query / document pairs and attach the document data.
doc2query = get_doc2query("../../highlightGenerator/storage/TREC/rels")
with_q_id = doc2query.merge(score_df, on=["docID"])

# Create one dataframe with all queries and their judged documents.
with_query = query_df.merge(with_q_id, on="queryID")

with_query

In [None]:
# Per-pair scores, written one column at a time. The order matters: each
# *TFIDFScore function multiplies the TF and IDF score columns filled in just
# before it.
for score_column, score_fn in [
    ("contentTFScore", contentTF),
    ("contentIDFScore", contentIDF),
    ("contentTFIDFScore", contentTFIDF),
    ("titleTFScore", titleTF),
    ("titleIDFScore", titleIDF),
    ("titleTFIDFScore", titleTFIDF),
]:
    with_query[score_column] = with_query.apply(score_fn, axis=1)

# BM25 is calculated with b = 0.8 and k1 = 2.5; k2 = 0, so the k2 term is discarded.
with_query["contentBM25"] = with_query.apply(contentBM25, axis=1, args=(0.8, 2.5))
with_query["titleBM25"] = with_query.apply(titleBM25, axis=1, args=(0.8, 2.5))
# 10e5 equals 1e6. This rescales the column in place, so re-running the cell
# multiplies pagerank again.
with_query["pagerank"] = with_query["pagerank"] * 10e5

with_query

In [None]:
# Log-transform the score and length features into a separate frame.
# LOG_EPSILON is added before taking the log so zero-valued entries give a
# finite result instead of -inf.
LOG_EPSILON = 1e-7
LOGGED_COLUMNS = [
    "contentBM25",
    "titleBM25",
    "contentTFIDFScore",
    "titleTFIDFScore",
    "contentLength",
    "contentTFScore",
    "titleTFScore",
    "pagerank",
    "titleLength",
    "contentIDFScore",
    "titleIDFScore",
]

logged = with_query.copy()
for column in LOGGED_COLUMNS:
    # Always read from with_query so each column is logged exactly once.
    logged[column] = np.log(with_query[column] + LOG_EPSILON)

logged

In [None]:
def to_letor(df, path):
    """Write ``df`` to ``path`` as LETOR-style text, one line per row.

    Each line has the form::

        <relScore> qid:<queryID> 0:<v0> 1:<v1> ... #docid = <docID>

    where the numbered values are all columns other than ``queryID``,
    ``docID`` and ``relScore``, in column order, formatted with ten decimals.

    Args:
        df: DataFrame containing ``queryID``, ``docID``, ``relScore`` and the
            feature columns; every feature value must be formattable as a
            float.
        path: output file; it is opened in write mode, so existing content is
            replaced.

    NOTE(review): feature ids start at 0 here; readers of SVMlight-style files
    commonly expect ids starting at 1 -- confirm against the consumer.
    """
    id_columns = ['queryID', 'docID', 'relScore']
    with open(path, "w") as f:
        for _, row in df.iterrows():
            features = row.drop(id_columns)
            scores = " ".join(
                "{}:{:.10f}".format(i, x) for i, x in enumerate(features)
            )
            f.write("{} qid:{} {} #docid = {}\n".format(
                row["relScore"], row["queryID"], scores, row["docID"]))


In [2]:

# Inspection cell: re-imports pandas (it is also imported in the first cell)
# and re-reads the same pickle file that is loaded into score_df earlier --
# presumably so the cell can be run on its own in a fresh kernel.
# The cell's value is the full frame returned by read_pickle.
import pandas as pd
pd.read_pickle("clueweb-df-with-pagerank")

Unnamed: 0,docID,pagerank,contentTF,contentIDF,titleTF,titleIDF,contentLength,titleLength
0,clueweb12-0000tw-35-20780,1.417858e-09,"(37.0, 2.0, 20.0, 6.0, 23.0, 11.0, 14.0, 7.0, ...","(7.087885396658067, 3.1176823062542898, 4.2237...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.672...",778,11
1,clueweb12-0000wb-07-16070,1.310056e-09,"(5.0, 0.0, 5.0, 6.0, 20.0, 1.0, 2.0, 1.0, 0.0,...","(0.9578223508997388, 0.0, 1.0559393919143298, ...","(2.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ...","(2.538852167198697, 0.0, 0.0, 0.0, 2.458435823...",515,15
2,clueweb12-0000wb-33-34663,1.187196e-09,"(321.0, 1.0, 138.0, 133.0, 177.0, 118.0, 77.0,...","(61.49219492776323, 1.5588411531271449, 29.143...","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 2.4584358231725068, 0.0, ...",4934,5
3,clueweb12-0003wb-11-32584,1.318697e-09,"(3.0, 0.0, 2.0, 9.0, 0.0, 10.0, 1.0, 0.0, 0.0,...","(0.5746934105398434, 0.0, 0.42237575676573197,...","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 2.4584358231725068, 0.0, ...",326,12
4,clueweb12-0005wb-19-29156,1.207472e-09,"(17.0, 0.0, 4.0, 4.0, 10.0, 9.0, 13.0, 5.0, 0....","(3.256595993059112, 0.0, 0.8447515135314639, 0...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, ...","(1.2694260835993485, 0.0, 0.0, 0.0, 0.0, 0.0, ...",542,10
5,clueweb12-0008wb-21-02753,1.271471e-09,"(100.0, 0.0, 87.0, 74.0, 123.0, 33.0, 126.0, 4...","(19.156447017994775, 0.0, 18.37334541930934, 1...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4270,1
6,clueweb12-0008wb-39-11217,1.424112e-09,"(13.0, 0.0, 3.0, 5.0, 0.0, 7.0, 4.0, 0.0, 8.0,...","(2.490338112339321, 0.0, 0.6335636351485979, 1...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",482,4
7,clueweb12-0008wb-97-31282,1.240710e-09,"(110.0, 0.0, 19.0, 35.0, 32.0, 33.0, 9.0, 9.0,...","(21.072091719794255, 0.0, 4.012569689274454, 7...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.2694260835993485, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1210,6
8,clueweb12-0010wb-33-00503,1.328765e-09,"(150.0, 0.0, 60.0, 36.0, 63.0, 46.0, 78.0, 16....","(28.734670526992165, 0.0, 12.67127270297196, 7...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.672...",3235,6
9,clueweb12-0011wb-30-21099,1.669595e-09,"(46.0, 0.0, 48.0, 23.0, 64.0, 10.0, 4.0, 1.0, ...","(8.811965628277598, 0.0, 10.137018162377567, 4...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 2.2566070641099714, 0.0, 0.0, 0.0, ...",1266,6
