In [12]:
import pickle
import pandas as pd
from pyspark.ml.linalg import SparseVector
import numpy as np

In [13]:
def contentTF(row):
    return row.qContentTF.dot(row.contentTF)


def contentTFIDF(row):
    return row.contentTFScore * row.contentIDFScore
    return row.contentTF.dot(row.qContentIDF)

def titleTFIDF(row):
    return row.titleTFScore * row.titleIDFScore
    return  row.titleTF.dot(row.qTitleIDF)

def titleTF(row):
    return row.qTitleTF.dot(row.titleTF)

def titleIDF(row):
    return row.titleHotVec[0].dot(row.qTitleIDF)

def contentIDF(row):
    return row.contentHotVec[0].dot(row.qContentIDF)


def to_numpy(input_vector, mul_hot_vector):
    input_idx = input_vector.indices
    hot_idx = set(mul_hot_vector.indices)
    values = input_vector.values
        
    output = []
    for in_i, val in zip(input_idx, values):
        if in_i in hot_idx:
            output.append(val)
            
    return np.asarray(output)


def contentBM25(row, b, k1):
    tf_np = to_numpy(row.contentTF, row.contentHotVec[0])
    idf_np = to_numpy(row.contentIDF, row.contentHotVec[0])
    
    nom = (tf_np * (k1 + 1))
    denom = tf_np + k1 * (1 - b + (b * (row.contentLength/mean_doc_df.avgContentSize[0])))
    
    return (idf_np * (nom / denom)).sum()


def titleBM25(row, b, k1):
    tf_np = to_numpy(row.titleTF, row.titleHotVec[0])
    idf_np = to_numpy(row.titleIDF, row.titleHotVec[0])
    
    nom = (tf_np * (k1 + 1))
    denom = tf_np + k1 * (1 - b + (b * (row.titleLength/mean_doc_df.avgTitleSize[0])))
    
    return (idf_np * (nom / denom)).sum()

In [14]:
mean_doc_df = pd.read_pickle("mean_doc")
score_df = pd.read_pickle("clueweb-df-with-pagerank")

def to_sparse(row, column):
    size = row[column].size
    indices = row[column].indices
    values = [1.0 for _ in indices]
    return [SparseVector(size, indices, values)]

def get_doc2query(path):
    doc2query = []
    with open(path, "r") as f:
        for row in f:
            query_id, _, doc_id, rel_score = row.rstrip().split(" ")
            if doc_id in doc2query:
                print(doc_id)
            doc2query.append([int(query_id), doc_id, rel_score])
    return pd.DataFrame(doc2query, columns=["queryID", "docID", "relScore"])

# Prepare query_df
query_df = pd.read_pickle("query_df")
query_df = query_df[['id','contentIDF', "titleIDF", 'contentTF', "titleTF"]]
query_df.columns = ["queryID", "qContentIDF", "qTitleIDF", "qContentTF", "qTitleTF"]
query_df['queryID'] = query_df['queryID'].apply(lambda x: int(x))

# Add multihot vectors for the queries.
query_df["titleHotVec"] = query_df.apply(lambda x: to_sparse(x, "qTitleIDF"), axis=1)
query_df["contentHotVec"] = query_df.apply(lambda x: to_sparse(x, "qContentIDF"), axis=1)
query_df["contentIDFScore"] = query_df.apply(lambda x: contentIDF(x), axis=1)
query_df["titleIDFScore"] = query_df.apply(lambda x: titleIDF(x), axis=1)


# # Get a dataframe with the query / document pairs
doc2query = get_doc2query("../../highlightGenerator/storage/TREC/rels")
with_q_id = doc2query.merge(score_df, on=["docID"])

# # Create one dataframe with all query's and their scores
with_query = query_df.merge(with_q_id, on="queryID")

with_query

Unnamed: 0,queryID,qContentIDF,qTitleIDF,qContentTF,qTitleTF,titleHotVec,contentHotVec,contentIDFScore,titleIDFScore,docID,relScore,pagerank,contentTF,contentIDF,titleTF,titleIDF,contentLength,titleLength
0,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0000tw-05-12114,1,6.791157e-08,"(177.0, 0.0, 73.0, 95.0, 62.0, 83.0, 48.0, 31....","(33.906911221850756, 0.0, 15.416715121949217, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.9202287395954594, 0.0, 0.0, 0.0, 0.0, ...",3510,12
1,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0000wb-30-01951,0,6.329691e-09,"(2.0, 2.0, 6.0, 3.0, 2.0, 11.0, 3.0, 7.0, 4.0,...","(0.38312894035989553, 3.1176823062542898, 1.26...","(1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.2694260835993485, 1.9202287395954594, 0.0, ...",469,17
2,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0000wb-60-01497,1,3.490371e-09,"(140.0, 5.0, 52.0, 85.0, 48.0, 96.0, 39.0, 57....","(26.819025825192686, 7.794205765635724, 10.981...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.2694260835993485, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4405,11
3,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0005wb-86-10798,1,1.460235e-08,"(51.0, 0.0, 26.0, 28.0, 23.0, 30.0, 10.0, 12.0...","(9.769787979177336, 0.0, 5.490884837954516, 5....","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.2694260835993485, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1240,8
4,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0100tw-14-21268,1,2.986690e-09,"(27.0, 0.0, 6.0, 11.0, 12.0, 15.0, 9.0, 9.0, 3...","(5.17224069485859, 0.0, 1.2671272702971959, 2....","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 2.293516335148023, 0.0, 0.0, 0...",650,12
5,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0100tw-31-09278,0,1.212091e-09,"(60.0, 0.0, 36.0, 31.0, 26.0, 43.0, 16.0, 12.0...","(11.493868210796865, 0.0, 7.6027636217831756, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 2.2566070641099714, 0.0, 0.0, 0.0, ...",1477,8
6,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0100tw-31-09279,0,1.191583e-09,"(103.0, 0.0, 45.0, 43.0, 22.0, 47.0, 18.0, 15....","(19.73114042853462, 0.0, 9.50345452722897, 8.7...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 2.2566070641099714, 0.0, 0.0, 0.0, ...",1869,8
7,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0100tw-52-04001,0,5.509952e-09,"(6.0, 0.0, 2.0, 9.0, 3.0, 6.0, 1.0, 4.0, 1.0, ...","(1.1493868210796867, 0.0, 0.42237575676573197,...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.9202287395954594, 0.0, 0.0, 0.0, 0.0, ...",175,4
8,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0106wb-18-19516,1,1.176489e-09,"(4.0, 0.0, 53.0, 6.0, 9.0, 2.0, 24.0, 7.0, 6.0...","(0.7662578807197911, 0.0, 11.192957554291898, ...","(2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(2.538852167198697, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",5484,15
9,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0108wb-22-26598,0,4.965251e-09,"(38.0, 1.0, 2.0, 8.0, 97.0, 51.0, 0.0, 183.0, ...","(7.279449866838015, 1.5588411531271449, 0.4223...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",11762,12


In [15]:
with_query["contentTFScore"] = with_query.apply(lambda x: contentTF(x), axis=1)
# with_query["contentIDFScore"] = with_query.apply(lambda x: contentIDF(x), axis=1)
with_query["contentTFIDFScore"] = with_query.apply(lambda x: contentTFIDF(x), axis=1)
with_query["titleTFScore"] = with_query.apply(lambda x: titleTF(x), axis=1)
# with_query["titleIDFScore"] = with_query.apply(lambda x: titleIDF(x), axis=1)
with_query["titleTFIDFScore"] = with_query.apply(lambda x: titleTFIDF(x), axis=1)

# BM25 is calculated with b = 0.8, k1 = 2.5 and k2 = 0 )the k2 term is therefor discared.
# with_query["contentBM25"] = with_query.apply(lambda x: np.log(contentBM25(x, 0.75, 1.2) + 1e-7), axis=1)
# with_query["titleBM25"] = with_query.apply(lambda x: np.log(titleBM25(x, 0.75, 1.2) + 1e-7), axis=1)
with_query["contentBM25"] = with_query.apply(lambda x: contentBM25(x, 0.8, 2.5), axis=1)
with_query["titleBM25"] = with_query.apply(lambda x: titleBM25(x, 0.8, 2.5), axis=1)
with_query["pagerank"] = with_query["pagerank"] * 10e5
with_query


Unnamed: 0,queryID,qContentIDF,qTitleIDF,qContentTF,qTitleTF,titleHotVec,contentHotVec,contentIDFScore,titleIDFScore,docID,...,titleTF,titleIDF,contentLength,titleLength,contentTFScore,contentTFIDFScore,titleTFScore,titleTFIDFScore,contentBM25,titleBM25
0,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0000tw-05-12114,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.9202287395954594, 0.0, 0.0, 0.0, 0.0, ...",3510,12,36.0,455.414556,2.0,38.505437,515.135134,14.435451
1,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0000wb-30-01951,...,"(1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.2694260835993485, 1.9202287395954594, 0.0, ...",469,17,8.0,101.203235,2.0,38.505437,122.854844,11.253291
2,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0000wb-60-01497,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.2694260835993485, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4405,11,32.0,404.812939,2.0,38.505437,399.197601,15.300791
3,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0005wb-86-10798,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.2694260835993485, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1240,8,34.0,430.113748,2.0,38.505437,613.767323,18.655776
4,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0100tw-14-21268,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 2.293516335148023, 0.0, 0.0, 0...",650,12,13.0,164.455256,1.0,19.252718,215.702889,7.293713
5,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0100tw-31-09278,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 2.2566070641099714, 0.0, 0.0, 0.0, ...",1477,8,44.0,556.617791,2.0,38.505437,809.036816,18.655776
6,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0100tw-31-09279,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 2.2566070641099714, 0.0, 0.0, 0.0, ...",1869,8,44.0,556.617791,2.0,38.505437,778.878783,18.655776
7,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0100tw-52-04001,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.9202287395954594, 0.0, 0.0, 0.0, 0.0, ...",175,4,6.0,75.902426,2.0,38.505437,100.311211,26.363310
8,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0106wb-18-19516,...,"(2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(2.538852167198697, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",5484,15,0.0,0.000000,0.0,0.000000,0.000000,0.000000
9,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",12.650404,19.252718,clueweb12-0108wb-22-26598,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",11762,12,0.0,0.000000,0.0,0.000000,0.000000,0.000000


In [16]:
logged = with_query.copy()

logged["contentBM25"] = np.log(with_query["contentBM25"] + 1e-7)
logged["titleBM25"] = np.log(with_query["titleBM25"] + 1e-7)
logged["contentTFIDFScore"] = np.log(with_query["contentTFIDFScore"] + 1e-7)
logged["titleTFIDFScore"] = np.log(with_query["titleTFIDFScore"] + 1e-7)
logged["contentLength"] = np.log(with_query["contentLength"] + 1e-7)
logged["contentTFScore"] = np.log(with_query["contentTFScore"] + 1e-7)
logged["titleTFScore"] = np.log(with_query["titleTFScore"] + 1e-7)
logged["pagerank"] = np.log(with_query["pagerank"] + 1e-7)
logged["titleLength"] = np.log(with_query["titleLength"] + 1e-7)
logged["pagerank"] = np.log(with_query["pagerank"] + 1e-7)
logged["contentIDFScore"] = np.log(with_query["contentIDFScore"] + 1e-7)
logged["titleIDFScore"] = np.log(with_query["titleIDFScore"] + 1e-7)

logged

Unnamed: 0,queryID,qContentIDF,qTitleIDF,qContentTF,qTitleTF,titleHotVec,contentHotVec,contentIDFScore,titleIDFScore,docID,...,titleTF,titleIDF,contentLength,titleLength,contentTFScore,contentTFIDFScore,titleTFScore,titleTFIDFScore,contentBM25,titleBM25
0,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.537689,2.957652,clueweb12-0000tw-05-12114,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.9202287395954594, 0.0, 0.0, 0.0, 0.0, ...",8.163371,2.484907,3.583519,6.121208,6.931472e-01,3.650799,6.244429,2.669687
1,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.537689,2.957652,clueweb12-0000wb-30-01951,...,"(1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.2694260835993485, 1.9202287395954594, 0.0, ...",6.150603,2.833213,2.079442,4.617131,6.931472e-01,3.650799,4.811004,2.420661
2,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.537689,2.957652,clueweb12-0000wb-60-01497,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.2694260835993485, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8.390496,2.397895,3.465736,6.003425,6.931472e-01,3.650799,5.989457,2.727905
3,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.537689,2.957652,clueweb12-0005wb-86-10798,...,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.2694260835993485, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7.122867,2.079442,3.526361,6.064050,6.931472e-01,3.650799,6.419616,2.926156
4,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.537689,2.957652,clueweb12-0100tw-14-21268,...,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 2.293516335148023, 0.0, 0.0, 0...",6.476972,2.484907,2.564949,5.102639,1.000000e-07,2.957652,5.373902,1.987013
5,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.537689,2.957652,clueweb12-0100tw-31-09278,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 2.2566070641099714, 0.0, 0.0, 0.0, ...",7.297768,2.079442,3.784190,6.321879,6.931472e-01,3.650799,6.695844,2.926156
6,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.537689,2.957652,clueweb12-0100tw-31-09279,...,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 2.2566070641099714, 0.0, 0.0, 0.0, ...",7.533159,2.079442,3.784190,6.321879,6.931472e-01,3.650799,6.657855,2.926156
7,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.537689,2.957652,clueweb12-0100tw-52-04001,...,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.9202287395954594, 0.0, 0.0, 0.0, 0.0, ...",5.164786,1.386294,1.791759,4.329449,6.931472e-01,3.650799,4.608277,3.271973
8,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.537689,2.957652,clueweb12-0106wb-18-19516,...,"(2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(2.538852167198697, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",8.609590,2.708050,-16.118096,-16.118096,-1.611810e+01,-16.118096,-16.118096,-16.118096
9,201,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.537689,2.957652,clueweb12-0108wb-22-26598,...,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9.372629,2.484907,-16.118096,-16.118096,-1.611810e+01,-16.118096,-16.118096,-16.118096


In [17]:
def to_letor(df, path):
    with open(path, "w") as f:
        for row in df.iterrows():
            start = "{} qid:{}".format(row[1]["relScore"], row[1]["queryID"])
            score_df = row[1].drop(['queryID', 'docID', 'relScore'])
#             scores = " ".join([str(i) + ":" + str(x) for i,x in enumerate(score_df)])
            scores = " ".join([str(i) + ":" + '{0:.10f}'.format(x) for i,x in enumerate(score_df)])
            print([x for x in score_df.keys()])
            return
            end = "#docid = {}".format(row[1]["docID"])
            letor = "{} {} {}\n".format(start, scores, end)
            f.write(letor)
        


In [18]:
TREC_df = logged[['queryID', 'docID', 'relScore', 'pagerank', 'contentLength', 'contentTFScore', 'contentIDFScore', 'contentTFIDFScore', 'contentBM25', 'titleLength', 'titleTFScore', 'titleIDFScore', 'titleTFIDFScore', 'titleBM25']]
# TREC_df.to_csv("trec", sep=' ', index=False, encoding='utf-8')
to_letor(TREC_df, "LETOR_scores")

['pagerank', 'contentLength', 'contentTFScore', 'contentIDFScore', 'contentTFIDFScore', 'contentBM25', 'titleLength', 'titleTFScore', 'titleIDFScore', 'titleTFIDFScore', 'titleBM25']
