In [1]:
class Document(object):
    """Raw document record: id, title text, annotation text and list position."""

    def __init__(self, doc_id, title, annotation, list_order):
        # Values are stored exactly as given; no validation or copying is done.
        self.id, self.title = doc_id, title
        self.annotation, self.list_order = annotation, list_order

    def __repr__(self):
        """Short one-line description: list position, id and title."""
        template = "doc {}: id={}, title={}"
        return template.format(self.list_order, self.id, self.title)
    
    
class NormalizedDocument(object):
    """Document after normalization: title and annotation are stored as given
    (token sequences when produced by the normalizer in this notebook)."""

    def __init__(self, doc_id_norm, title_norm, annotation_norm, list_order):
        self.id, self.list_order = doc_id_norm, list_order
        self.title_norm, self.annotation_norm = title_norm, annotation_norm

    def __repr__(self):
        """Short one-line description: list position, id and normalized title."""
        template = "doc {}: id={}, title={}"
        return template.format(self.list_order, self.id, self.title_norm)

    def document_annotation_length(self):
        """Return len() of the normalized annotation."""
        return len(self.annotation_norm)

    def document_title_length(self):
        """Return len() of the normalized title."""
        return len(self.title_norm)
    
    
class Query(object):
    """Raw query record: id, position in the query list and query text."""

    def __init__(self, q_id, q_order, q_text):
        self.q_id, self.q_order, self.q_text = q_id, q_order, q_text

    def __repr__(self):
        """Short one-line description: list position, id and text."""
        template = "query {}: id={}, text={}"
        return template.format(self.q_order, self.q_id, self.q_text)
    
class NormalizedQuery(object):
    """Query after normalization: id, list position and normalized text
    (a token sequence when produced by the normalizer in this notebook)."""

    def __init__(self, q_id, q_order, q_text_norm):
        self.q_id, self.q_order, self.q_text_norm = q_id, q_order, q_text_norm

    def __repr__(self):
        """Short one-line description: list position, id and normalized text."""
        template = "query {}: id={}, text={}"
        return template.format(self.q_order, self.q_id, self.q_text_norm)
 

Парсер:

In [2]:
class Parser(object):
    """Reader for document and query files written in a dot-tagged format.

    A document record is expected to look like::

        .I <id>
        .T
        <title lines>
        .A
        ... (lines up to the ".W" marker are skipped)
        .W
        <annotation lines>

    A query record is ``.I <id>``, one marker line, then the query text.
    A record ends at the next line starting with ".I" or at end of file.

    Args:
        doc_file: path of the document file.
        total_docs: maximum number of documents to read.
        query_file: path of the query file.
        total_queries: maximum number of queries to read
            (``float("inf")`` means "read everything").
    """

    def __init__(self, doc_file, total_docs, query_file, total_queries):
        self.doc_file = doc_file
        self.total_docs = total_docs
        self.query_file = query_file
        self.total_queries = total_queries

    @staticmethod
    def _parse_id(id_line):
        """Return the integer id from a ".I <id>" line, or None if it is not one."""
        parts = id_line.split()  # whitespace split tolerates repeated spaces/newline
        if len(parts) > 1 and parts[0] == ".I":
            return int(parts[1])
        return None

    def _read_document_data(self, data_f, order, id_line):
        """Read one document record from ``data_f``.

        Args:
            data_f: open file-like object positioned at (or just after) the
                record's ".I" line.
            order: position of the document in the resulting list.
            id_line: the ".I" line if the previous call already consumed it,
                otherwise a falsy value (the line is then read from ``data_f``).

        Returns:
            ``(document, next_line)`` where ``next_line`` is the ".I" line of
            the following record ("" at end of file), or ``(None, None)`` when
            the file is already exhausted.
        """
        if not id_line:
            id_line = data_f.readline()
        if not id_line:
            # End of file: nothing left to read. Without this guard the marker
            # scans below would spin forever on the empty string readline()
            # keeps returning at EOF.
            return None, None

        doc_id = self._parse_id(id_line)

        doc_title = None
        line = data_f.readline()
        if line.strip() == ".T":
            title_arr = []
            while True:
                line = data_f.readline()
                # Stop on the ".A" marker or on EOF (truncated record).
                if not line or line.strip() == ".A":
                    break
                title_arr.append(line)
            doc_title = "".join(title_arr)

        # Advance to the ".W" marker. If the current line already is ".W"
        # (record without a title section) stay on it instead of skipping
        # into the next record; stop at EOF.
        while line and line.strip() != ".W":
            line = data_f.readline()

        annotation_arr = []
        while True:
            ann_line = data_f.readline()
            if ann_line.startswith(".I") or not ann_line:
                break
            annotation_arr.append(ann_line)

        doc = Document(doc_id, doc_title, "".join(annotation_arr), order)
        return doc, ann_line

    def _read_query_data(self, data_f, order, prev_line):
        """Read one query record from ``data_f``.

        Args:
            data_f: open file-like object.
            order: position of the query in the resulting list.
            prev_line: the ".I" line if the previous call already consumed it,
                otherwise a falsy value.

        Returns:
            ``(query, next_line)`` or ``(None, None)`` at end of file.
        """
        id_line = prev_line if prev_line else data_f.readline()
        if not id_line:
            return None, None

        q_id = self._parse_id(id_line)

        # Skip the single marker line that follows the id line.
        data_f.readline()

        q_arr = []
        while True:
            q_line = data_f.readline()
            if q_line.startswith(".I") or not q_line:
                break
            q_arr.append(q_line)

        return Query(q_id, order, "".join(q_arr)), q_line

    def parse_document_data(self):
        """Read up to ``total_docs`` documents from ``doc_file``.

        Reading stops early at end of file, so the result may be shorter than
        ``total_docs``.
        """
        sc_docs = []
        with open(self.doc_file) as f:
            curr_line = None
            doc_index = 0
            while doc_index < self.total_docs:
                curr_doc, curr_line = self._read_document_data(f, doc_index, curr_line)
                if curr_doc is None:
                    break
                sc_docs.append(curr_doc)
                doc_index += 1

        return sc_docs

    def parse_queries(self):
        """Read up to ``total_queries`` queries from ``query_file``."""
        queries_local = []
        with open(self.query_file) as f:
            curr_line = None
            query_ind = 0
            # Checking the limit before reading keeps a limit of 0 from
            # returning one query.
            while query_ind < self.total_queries:
                curr_query, curr_line = self._read_query_data(f, query_ind, curr_line)
                if curr_query is None:
                    break
                queries_local.append(curr_query)
                query_ind += 1

        return queries_local
    

Нормализатор. Для нормализации использовалась библиотека nltk (токенизация, стоп-слова и стемминг).

In [3]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string


class Normalizer(object):
    """Turns raw documents and queries into lists of stemmed tokens.

    Text is split into sentences and words with nltk, tokens found in the
    stop list (English stop words plus punctuation characters) are dropped,
    and the remaining tokens are stemmed with the Porter stemmer.
    """

    def __init__(self):
        self.stopwords = stopwords.words('english') + list(string.punctuation)
        # Not used by any method of this class; kept so that code reading
        # the attribute keeps working.
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

    def _normalize_tokens(self, tokens):
        """Drop stop-list tokens and stem the rest.

        The stop list is turned into a set once per call so membership tests
        are constant time, and the comparison is case-insensitive so that
        e.g. "The" is filtered the same way as "the".
        """
        stop_set = frozenset(self.stopwords)
        return [self.stemmer.stem(w) for w in tokens if w.lower() not in stop_set]

    def _tokenize_text(self, text):
        """Split ``text`` into word tokens; ``None`` or "" yields an empty list."""
        if not text:
            # A document may carry no title (None); there is nothing to tokenize.
            return []
        return [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    def normalize_document(self, doc):
        """Return a NormalizedDocument holding the stemmed title and annotation tokens of ``doc``."""
        title_tokens = self._normalize_tokens(self._tokenize_text(doc.title))
        ann_tokens = self._normalize_tokens(self._tokenize_text(doc.annotation))
        return NormalizedDocument(doc.id, title_tokens, ann_tokens, doc.list_order)

    def normalize_query(self, query):
        """Return a NormalizedQuery holding the stemmed tokens of ``query.q_text``."""
        q_text_norm = self._normalize_tokens(self._tokenize_text(query.q_text))
        return NormalizedQuery(query.q_id, query.q_order, q_text_norm)
    

Применение нормализатора для нормализации документов (как по заглавию, так и по тексту аннотации):

In [4]:
# Collection settings: how many records to read and where they live.
TOTAL_DOCS = 1400
TOTAL_QUERIES = float("inf")  # never reached by the query counter, i.e. read all queries
DOCS_FILE = "cran.all.1400"
QUERIES_FILE = "cran.qry"

# Parse the raw documents first, then the raw queries.
parser = Parser(DOCS_FILE, TOTAL_DOCS, QUERIES_FILE, TOTAL_QUERIES)
scientific_documents = parser.parse_document_data()
queries = parser.parse_queries()

# Normalize every document (title and annotation) and every query.
normalizer = Normalizer()
normalized_docs = [normalizer.normalize_document(raw_doc) for raw_doc in scientific_documents]
normalized_queries = [normalizer.normalize_query(raw_query) for raw_query in queries]


Инвертированный индекс:

In [5]:
class InvIndex(object):
    """Dense inverted index over a collection of token lists.

    Attributes:
        inv_index_dict: maps a token to a list with one counter per input
            list; entry ``i`` is the number of occurrences of the token in
            ``words_lists[i]``.
        inv_index_frq: maps a token to the number of input lists that
            contain it at least once.
    """

    def __init__(self, words_lists):
        self.inv_index_dict = dict()
        self.inv_index_frq = dict()
        self._build_inv_index(words_lists)

    def _build_inv_index(self, words_lists):
        """Fill both dictionaries from ``words_lists`` (an iterable of token lists)."""
        # Size the posting lists from the input itself instead of a
        # module-level constant, so the index is correct for any collection.
        words_lists = list(words_lists)
        total_lists = len(words_lists)

        for w_list_ind, w_list in enumerate(words_lists):
            for w in w_list:
                postings = self.inv_index_dict.get(w)
                if postings is None:
                    postings = self.inv_index_dict[w] = [0] * total_lists
                    self.inv_index_frq[w] = 0

                # First occurrence of the token in this list.
                if not postings[w_list_ind]:
                    self.inv_index_frq[w] += 1
                postings[w_list_ind] += 1

    def get_statistics(self):
        """Return ``(number of tokens, mean list count per token, max list count per token)``.

        An empty index yields ``(0, 0.0, 0)``.
        """
        keys_amount = len(self.inv_index_dict)
        if not keys_amount:
            return 0, 0.0, 0

        list_counts = list(self.inv_index_frq.values())
        avg_len = float(sum(list_counts)) / keys_amount
        max_len = max(list_counts)

        return keys_amount, avg_len, max_len



Построение инвертированного индекса для заголовков и основных текстов + статистики:

In [6]:
# One index over the normalized titles, one over the normalized annotations.
title_inv_index = InvIndex([doc.title_norm for doc in normalized_docs])
ann_inv_index = InvIndex([doc.annotation_norm for doc in normalized_docs])

# get_statistics() returns (key count, average, maximum); unpack into the template.
title_index_stats = title_inv_index.get_statistics()
ann_index_stats = ann_inv_index.get_statistics()
print("titles:\n amount of keys: {}, average word position length: {}, max word position length: {}".format(
        *title_index_stats))
print("annotations:\n amount of keys: {}, average word position length: {}, max word position length: {}".format(
        *ann_index_stats))

titles:
 amount of keys: 1538, average word position length: 7.04616384915, max word position length: 358
annotations:
 amount of keys: 7047, average word position length: 12.265928764, max word position length: 713


Класс Searcher, осуществляющий поиск релевантных документов по запросу, представлен ниже.

Класс RsvParams - параметры поиска (B, k1, k2, а также средняя длина документов в коллекции).

Метод Searcher.calculate_rsv_for_docs_and_queries осуществляет вычисление RSV для всех документов относительно всех запросов.

Метод Searcher.\_rsv осуществляет вычисление RSV для одного документа по одному запросу, с возможностью нормирования (параметр is_normed).
IDF вычисляется вызовом передаваемой при создании объекта Searcher функции idf_counter. В качестве такой функции используются функции idf_1 и idf_2, объявленные ниже, в следующей ячейке с кодом.

Для вычисления множителей TF используются функции Searcher.\_tf (вычисляет множитель с $k_1$) и Searcher.\_tf_q (вычисляет множитель с $k_2$). Заметим, что при $k_2=0$ множитель с $k_2$ равен 1.


In [7]:
import math
import time

class RsvParams(object):
    """Bundle of ranking parameters: B, K1, K2 and the average document length."""

    def __init__(self, b, k1, k2, avg_doc_len):
        self.B, self.K1, self.K2 = b, k1, k2
        self.avg_doc_len = avg_doc_len
        
        
class Searcher(object):
    """Scores documents against queries with a BM25-style RSV.

    Args:
        rsv_params: object exposing ``B``, ``K1``, ``K2`` and ``avg_doc_len``.
        inv_index: object exposing ``inv_index_dict`` (token -> per-document
            occurrence counts) and whatever ``idf_counter`` reads from it.
        idf_counter: callable ``(token, total_docs, inv_index) -> number``.

    Attributes:
        search_result: list of ``(q_order, [(doc_id, rsv), ...])`` filled by
            ``calculate_rsv_for_docs_and_queries``.
    """

    def __init__(self, rsv_params, inv_index, idf_counter):
        self.rsv_params = rsv_params
        self.inv_index = inv_index
        self.search_result = []
        self.idf_counter = idf_counter

    def _find_docs_via_query(self, documents_param, query):
        """Return the documents that contain at least one token of ``query``."""
        matched_index = self._match_query(query, self.inv_index.inv_index_dict)

        # One counter per document of the collection actually passed in
        # (no dependency on a module-level document count).
        matched_docs = [0] * len(documents_param)
        for postings in matched_index.values():
            matched_docs = [x + y for x, y in zip(matched_docs, postings)]

        return self._retrieve_docs_from_matched(matched_docs, documents_param)

    def calculate_rsv_for_docs_and_queries(self, docs, queries_param, norm_prop_name, is_normed = False):
        """Score ``docs`` against every query and store the result in ``search_result``.

        ``search_result`` is cleared in place first, then receives one
        ``(q.q_order, [(doc_id, rsv), ...])`` entry per query.
        """
        del self.search_result[:]

        for q in queries_param:
            docs_rsv_list = self.calc_rsv_BM25(docs, q, norm_prop_name, is_normed)
            self.search_result.append((q.q_order, docs_rsv_list))

    def calc_rsv_BM25(self, docs, q, norm_prop_name, is_normed = False):
        """Return ``[(doc_id, rsv), ...]`` for the documents matching query ``q``.

        ``norm_prop_name`` names the document attribute holding the token
        list whose length is used as the document length.
        """
        total_docs = len(docs)
        docs_rsv_list = []
        for d in self._find_docs_via_query(docs, q):
            rsv = self._rsv(q.q_text_norm, d.list_order, getattr(d, norm_prop_name),
                            total_docs, self.inv_index, is_normed)
            docs_rsv_list.append((d.id, rsv))

        return docs_rsv_list

    def _match_query(self, query, inv_index):
        """Return ``{token: postings}`` for the query tokens present in ``inv_index``.

        Looks up each distinct query token directly instead of scanning the
        whole vocabulary.
        """
        return {token: inv_index[token] for token in set(query.q_text_norm) if token in inv_index}

    def _retrieve_docs_from_matched(self, matched_docs, docs_collection):
        """Return ``docs_collection[i]`` for every position ``i`` with a positive counter."""
        return [docs_collection[i] for i, md in enumerate(matched_docs) if md > 0]

    def _rsv(self, query_tokens, doc_order, doc_tokens, total_docs, inv_index, is_normed=False):
        """Compute the RSV of one document for one query.

        Sums ``idf * tf * tf_q`` over the query tokens (repeated tokens are
        counted each time they occur). With ``is_normed`` the sum is divided
        by the sum of the tokens' IDF values; when that sum is zero the
        division is skipped and the plain sum is returned.
        """
        doc_len = len(doc_tokens)
        rsv_value = 0
        idf_sum = 0
        for token in query_tokens:
            # The IDF is needed for both the score and the normalizer;
            # compute it once per token.
            idf_value = self.idf_counter(token, total_docs, inv_index)
            idf_sum = idf_sum + idf_value

            tf_value = self._tf(token, doc_order, doc_len, inv_index)
            tf_q_value = self._tf_q(token, query_tokens)

            rsv_value = rsv_value + idf_value * tf_value * tf_q_value

        if is_normed and idf_sum:
            rsv_value = float(rsv_value) / idf_sum

        return rsv_value

    def _tf(self, token, doc_order, doc_len, inv_index):
        """Document-side term-frequency factor (uses K1, B and the average length)."""
        frequences = inv_index.inv_index_dict.get(token, [])
        ftd = frequences[doc_order] if doc_order < len(frequences) else 0
        if not ftd:
            # Token absent from the document: the factor is zero, and
            # returning early avoids a 0/0 for degenerate parameters.
            return 0.0

        params = self.rsv_params
        length_norm = 1 - params.B + params.B * doc_len / float(params.avg_doc_len)
        return float(ftd * (params.K1 + 1)) / (ftd + params.K1 * length_norm)

    def _tf_q(self, token, query_tokens):
        """Query-side term-frequency factor (uses K2); equals 1 when K2 is 0."""
        k2 = self.rsv_params.K2
        if k2 == 0:
            return 1
        ftd = sum(1 for qt in query_tokens if qt == token)
        return float(ftd * (k2 + 1)) / (ftd + k2)
    



Вычисление производится следующим образом: осуществляем поиск релевантных документов, записываем их в файл answer, затем выполняем скрипт eval.py, записываем вывод скрипта в файл result_stats и далее читаем статистики и выводим.

Статистики для $b=0.75$ и $k_1 = 1.2$:

In [8]:
import numpy as np
import subprocess

# Number of top-scored documents kept per query.
MAX_RELEVANT = 10

# Mean token count of normalized titles / annotations over the collection.
avg_title_len = float(sum(len(doc.title_norm) for doc in normalized_docs)) / len(normalized_docs)
avg_ann_len = float(sum(len(doc.annotation_norm) for doc in normalized_docs)) / len(normalized_docs)

# Baseline ranking parameters.
B = 0.75
K1 = 1.2

def select_relevant(queries_docs_rvs, max_relevant=None):
    """Keep the highest-scored documents for every query.

    Args:
        queries_docs_rvs: iterable of ``(q_order, [(doc_id, rsv), ...])``.
        max_relevant: how many documents to keep per query; ``None`` (the
            default) uses the module-level ``MAX_RELEVANT``.

    Returns:
        List of ``(q_order + 1, [(doc_id, rsv), ...])`` with each inner list
        sorted by descending score and cut to ``max_relevant`` entries.
        Documents with equal scores keep their input order.
    """
    limit = MAX_RELEVANT if max_relevant is None else max_relevant

    relevant_docs = []
    for entry in queries_docs_rvs:
        ranked = sorted(entry[1], key=lambda item: item[1], reverse=True)
        # q_order is zero-based; the output numbers queries from 1.
        relevant_docs.append((entry[0] + 1, ranked[:limit]))

    return relevant_docs


def write_relevant_to_file(relevant_docs_param, path="answer"):
    """Write one "<query number> <document id>" line per selected document.

    Args:
        relevant_docs_param: iterable of ``(query number, [(doc_id, rsv), ...])``.
        path: output file, overwritten on every call; defaults to "answer".
    """
    with open(path, 'w') as f:
        for q in relevant_docs_param:
            for d in q[1]:
                f.write("{0} {1}\n".format(q[0], d[0]))
                
def idf_1(token, total_docs, inv_index):
    """IDF variant 1: ``log10(1 + (N - Nt + 0.5) / (Nt + 0.5))``.

    ``N`` is ``total_docs`` and ``Nt`` is the token's entry in
    ``inv_index.inv_index_frq`` (0 when the token is unknown).
    """
    docs_with_token = inv_index.inv_index_frq.get(token, 0)
    ratio = float(total_docs - docs_with_token + 0.5) / (docs_with_token + 0.5)
    return math.log10(1 + ratio)

def idf_2(token, total_docs, inv_index):
    """IDF variant 2: ``log10((N + 0.5) / (Nt + 0.5))``.

    ``N`` is ``total_docs`` and ``Nt`` is the token's entry in
    ``inv_index.inv_index_frq`` (0 when the token is unknown).
    """
    docs_with_token = inv_index.inv_index_frq.get(token, 0)
    ratio = float(total_docs + 0.5) / (docs_with_token + 0.5)
    return math.log10(ratio)
        
    
def search_stats(b_p, k1_p, k2_p, inv_index_p, avg_doc_len, stats, idf_counter, norm_prop_name, is_normed=False):
    """Run one search configuration and return its evaluation metrics.

    Scores ``normalized_docs`` against ``normalized_queries``, writes the
    selected documents with ``write_relevant_to_file``, runs ``eval.py`` with
    its standard output redirected to the file "result_stats", and parses the
    last space-separated field of the first four lines of that file.

    Args:
        b_p, k1_p, k2_p: ranking parameters passed to ``RsvParams``.
        inv_index_p: inverted index to search in.
        avg_doc_len: average document length passed to ``RsvParams``.
        stats: list that receives ``((b_p, k1_p, k2_p), [precision, recall,
            f_measure, map10])``; it is mutated in place.
        idf_counter: IDF function handed to ``Searcher``.
        norm_prop_name: document attribute holding the token list.
        is_normed: whether scores are divided by the query's IDF sum.

    Returns:
        ``(precision, recall, f_measure, map10)`` as floats.
    """
    rsv_parameters = RsvParams(b_p, k1_p, k2_p, avg_doc_len)
    searcher = Searcher(rsv_parameters, inv_index_p, idf_counter)

    searcher.calculate_rsv_for_docs_and_queries(normalized_docs, normalized_queries, norm_prop_name, is_normed)

    relevant_docs = select_relevant(searcher.search_result)
    write_relevant_to_file(relevant_docs)

    with open("result_stats", "w") as write_file:
        # Pass the command as an argument list: without shell=True a single
        # string is taken as the program name on POSIX and cannot be found.
        subprocess.call(["python", "eval.py"], stdout=write_file)

    with open("result_stats", "r") as read_file:
        # One metric per line, in this order; the value is the last field.
        metrics = [float(read_file.readline().split(' ')[-1]) for _ in range(4)]

    precision, recall, f_measure, map10 = metrics
    stats.append(((b_p, k1_p, k2_p), [precision, recall, f_measure, map10]))

    return precision, recall, f_measure, map10


# Baseline run with k2 = 0: first over titles, then over annotations.
print("b={}, k1={}".format(B, K1))
for baseline_index, baseline_avg_len, baseline_prop in (
        (title_inv_index, avg_title_len, "title_norm"),
        (ann_inv_index, avg_ann_len, "annotation_norm")):
    print(search_stats(B, K1, 0, baseline_index, baseline_avg_len, [], idf_1, baseline_prop))
        

b=0.75, k1=1.2
(0.252888888889, 0.367804148995, 0.299708799329, 0.290090588169)
(0.293333333333, 0.424124121081, 0.346807302421, 0.36262100865)


Варьирование параметров для поиска наилучшего набора (шаги сетки ниже в коде):

In [9]:

# Parameter grid.
B_values = np.arange(0., 1.01, 0.2)
K1_values = np.arange(1.2, 2.01, 0.2)
K2_values = np.arange(0, 1001, 200)

# Collected ((b, k1, k2), [precision, recall, f_measure, map10]) entries.
title_stats = []
ann_stats = []

# One row per search variant, in the order the results are printed and
# appended: (label, index, average length, stats list, IDF function,
# token attribute, normalize flag).
grid_runs = (
    ("title (idf_1): ", title_inv_index, avg_title_len, title_stats, idf_1, "title_norm", False),
    ("ann (idf_1): ", ann_inv_index, avg_ann_len, ann_stats, idf_1, "annotation_norm", False),
    ("title (idf_2): ", title_inv_index, avg_title_len, title_stats, idf_2, "title_norm", False),
    ("ann (idf_2): ", ann_inv_index, avg_ann_len, ann_stats, idf_2, "annotation_norm", False),
    ("title (idf_1 normed): ", title_inv_index, avg_title_len, title_stats, idf_1, "title_norm", True),
    ("ann (idf_1 normed): ", ann_inv_index, avg_ann_len, ann_stats, idf_1, "annotation_norm", True),
)

for b in B_values:
    for k1 in K1_values:
        for k2 in K2_values:
            print("b={}, k1={}, k2={}".format(b, k1, k2))

            for run_label, run_index, run_avg_len, run_stats, run_idf, run_prop, run_normed in grid_runs:
                run_result = search_stats(b, k1, k2, run_index, run_avg_len, run_stats,
                                          run_idf, run_prop, run_normed)
                # Label, a separating space, then the metrics tuple.
                print("{} {}".format(run_label, run_result))

            print("\n")


   


b=0.0, k1=1.2, k2=0
title (idf_1):  (0.232888888889, 0.342544446385, 0.277268592636, 0.265435766776)
ann (idf_1):  (0.263111111111, 0.385296015272, 0.312691389592, 0.311051109991)
title (idf_2):  (0.232888888889, 0.342544446385, 0.277268592636, 0.265435766776)
ann (idf_2):  (0.263111111111, 0.385296015272, 0.312691389592, 0.311051109991)
title (idf_1 normed):  (0.232888888889, 0.342544446385, 0.277268592636, 0.265435766776)
ann (idf_1 normed):  (0.263111111111, 0.385296015272, 0.312691389592, 0.311051109991)


b=0.0, k1=1.2, k2=200
title (idf_1):  (0.225777777778, 0.330691373532, 0.268344663026, 0.258302987738)
ann (idf_1):  (0.260888888889, 0.381669752646, 0.309927814378, 0.306436782285)
title (idf_2):  (0.225777777778, 0.330691373532, 0.268344663026, 0.258302987738)
ann (idf_2):  (0.260888888889, 0.381669752646, 0.309927814378, 0.306436782285)
title (idf_1 normed):  (0.225777777778, 0.330691373532, 0.268344663026, 0.258302987738)
ann (idf_1 normed):  (0.260888888889, 0.381669752646, 

title (idf_1):  (0.226222222222, 0.33158026242, 0.268951200021, 0.257890152151)
ann (idf_1):  (0.260888888889, 0.381740681717, 0.309951196933, 0.304761365163)
title (idf_2):  (0.226222222222, 0.33158026242, 0.268951200021, 0.257890152151)
ann (idf_2):  (0.260888888889, 0.381740681717, 0.309951196933, 0.304761365163)
title (idf_1 normed):  (0.226222222222, 0.33158026242, 0.268951200021, 0.257890152151)
ann (idf_1 normed):  (0.260888888889, 0.381740681717, 0.309951196933, 0.304761365163)


b=0.0, k1=1.6, k2=1000
title (idf_1):  (0.226222222222, 0.33158026242, 0.268951200021, 0.257890152151)
ann (idf_1):  (0.260888888889, 0.381740681717, 0.309951196933, 0.304761365163)
title (idf_2):  (0.226222222222, 0.33158026242, 0.268951200021, 0.257890152151)
ann (idf_2):  (0.260888888889, 0.381740681717, 0.309951196933, 0.304761365163)
title (idf_1 normed):  (0.226222222222, 0.33158026242, 0.268951200021, 0.257890152151)
ann (idf_1 normed):  (0.260888888889, 0.381740681717, 0.309951196933, 0.3047613

title (idf_1):  (0.232444444444, 0.337067459758, 0.275145990262, 0.269157852524)
ann (idf_1):  (0.273777777778, 0.397938601081, 0.324383175176, 0.329792197867)
title (idf_2):  (0.232444444444, 0.337067459758, 0.275145990262, 0.269157852524)
ann (idf_2):  (0.273777777778, 0.397938601081, 0.324383175176, 0.329763979172)
title (idf_1 normed):  (0.232444444444, 0.337067459758, 0.275145990262, 0.269157852524)
ann (idf_1 normed):  (0.273777777778, 0.397938601081, 0.324383175176, 0.329792197867)


b=0.2, k1=1.2, k2=600
title (idf_1):  (0.232444444444, 0.337067459758, 0.275145990262, 0.26914462501)
ann (idf_1):  (0.273777777778, 0.397938601081, 0.324383175176, 0.329792197867)
title (idf_2):  (0.232444444444, 0.337067459758, 0.275145990262, 0.26914462501)
ann (idf_2):  (0.273777777778, 0.397938601081, 0.324383175176, 0.329792197867)
title (idf_1 normed):  (0.232444444444, 0.337067459758, 0.275145990262, 0.26914462501)
ann (idf_1 normed):  (0.273777777778, 0.397938601081, 0.324383175176, 0.32979

title (idf_1):  (0.243111111111, 0.355459943818, 0.288741866769, 0.280330657736)
ann (idf_1):  (0.281777777778, 0.411433229909, 0.334480381708, 0.342285527421)
title (idf_2):  (0.243111111111, 0.355459943818, 0.288741866769, 0.280330657736)
ann (idf_2):  (0.281777777778, 0.411433229909, 0.334480381708, 0.342285527421)
title (idf_1 normed):  (0.243111111111, 0.355459943818, 0.288741866769, 0.280330657736)
ann (idf_1 normed):  (0.281777777778, 0.411433229909, 0.334480381708, 0.342285527421)


b=0.2, k1=1.8, k2=200
title (idf_1):  (0.234222222222, 0.339381281739, 0.27716231662, 0.269866882226)
ann (idf_1):  (0.276888888889, 0.402755660565, 0.328167326398, 0.334202609809)
title (idf_2):  (0.234222222222, 0.339381281739, 0.27716231662, 0.269866882226)
ann (idf_2):  (0.276888888889, 0.402755660565, 0.328167326398, 0.334202609809)
title (idf_1 normed):  (0.234222222222, 0.339381281739, 0.27716231662, 0.269866882226)
ann (idf_1 normed):  (0.276888888889, 0.402755660565, 0.328167326398, 0.33420

title (idf_2):  (0.238222222222, 0.346612482637, 0.28237310535, 0.274902807172)
ann (idf_2):  (0.28, 0.406059987683, 0.331448557247, 0.342625643879)
title (idf_1 normed):  (0.238222222222, 0.346612482637, 0.28237310535, 0.274902807172)
ann (idf_1 normed):  (0.28, 0.406059987683, 0.331448557247, 0.342514532768)


b=0.4, k1=1.2, k2=1000
title (idf_1):  (0.238222222222, 0.346612482637, 0.28237310535, 0.274902807172)
ann (idf_1):  (0.28, 0.406059987683, 0.331448557247, 0.342514532768)
title (idf_2):  (0.238222222222, 0.346612482637, 0.28237310535, 0.274902807172)
ann (idf_2):  (0.28, 0.406059987683, 0.331448557247, 0.342514532768)
title (idf_1 normed):  (0.238222222222, 0.346612482637, 0.28237310535, 0.274902807172)
ann (idf_1 normed):  (0.28, 0.406059987683, 0.331448557247, 0.342514532768)


b=0.4, k1=1.4, k2=0
title (idf_1):  (0.248444444444, 0.363724814749, 0.295230144846, 0.284677968842)
ann (idf_1):  (0.286666666667, 0.416536747294, 0.339609275269, 0.351869716693)
title (idf_2):  (0.2

title (idf_2):  (0.240888888889, 0.347126591984, 0.284410672055, 0.274516157162)
ann (idf_2):  (0.284, 0.411876142747, 0.336188632875, 0.344285556815)
title (idf_1 normed):  (0.240888888889, 0.347126591984, 0.284410672055, 0.274516157162)
ann (idf_1 normed):  (0.284, 0.411876142747, 0.336188632875, 0.344285556815)


b=0.4, k1=1.8, k2=600
title (idf_1):  (0.240888888889, 0.347126591984, 0.284410672055, 0.274516157162)
ann (idf_1):  (0.284, 0.411876142747, 0.336188632875, 0.344285556815)
title (idf_2):  (0.240888888889, 0.347126591984, 0.284410672055, 0.274516157162)
ann (idf_2):  (0.284, 0.411876142747, 0.336188632875, 0.344285556815)
title (idf_1 normed):  (0.240888888889, 0.347126591984, 0.284410672055, 0.274516157162)
ann (idf_1 normed):  (0.284, 0.411876142747, 0.336188632875, 0.344285556815)


b=0.4, k1=1.8, k2=800
title (idf_1):  (0.240888888889, 0.347126591984, 0.284410672055, 0.274516157162)
ann (idf_1):  (0.284, 0.411876142747, 0.336188632875, 0.344285556815)
title (idf_2):  (0

ann (idf_2):  (0.292888888889, 0.423854733593, 0.346406547832, 0.363719118306)
title (idf_1 normed):  (0.252444444444, 0.367335895027, 0.299241198982, 0.288707781837)
ann (idf_1 normed):  (0.292888888889, 0.423854733593, 0.346406547832, 0.363719118306)


b=0.6, k1=1.4, k2=200
title (idf_1):  (0.241333333333, 0.348758225783, 0.285267544859, 0.275691296436)
ann (idf_1):  (0.289333333333, 0.417984973723, 0.341959156222, 0.354544489936)
title (idf_2):  (0.241333333333, 0.348758225783, 0.285267544859, 0.275691296436)
ann (idf_2):  (0.289333333333, 0.417984973723, 0.341959156222, 0.354544489936)
title (idf_1 normed):  (0.241333333333, 0.348758225783, 0.285267544859, 0.275691296436)
ann (idf_1 normed):  (0.289333333333, 0.417984973723, 0.341959156222, 0.354544489936)


b=0.6, k1=1.4, k2=400
title (idf_1):  (0.241333333333, 0.348758225783, 0.285267544859, 0.275691296436)
ann (idf_1):  (0.289333333333, 0.417984973723, 0.341959156222, 0.354516976708)
title (idf_2):  (0.241333333333, 0.3487582257

ann (idf_2):  (0.289777777778, 0.416433106118, 0.341747947634, 0.35311271311)
title (idf_1 normed):  (0.240888888889, 0.34763212349, 0.284580207701, 0.273813066543)
ann (idf_1 normed):  (0.289777777778, 0.416433106118, 0.341747947634, 0.35311271311)


b=0.6, k1=1.8, k2=1000
title (idf_1):  (0.240888888889, 0.34763212349, 0.284580207701, 0.273813066543)
ann (idf_1):  (0.289777777778, 0.416433106118, 0.341747947634, 0.35311271311)
title (idf_2):  (0.240888888889, 0.34763212349, 0.284580207701, 0.273813066543)
ann (idf_2):  (0.289777777778, 0.416433106118, 0.341747947634, 0.35311271311)
title (idf_1 normed):  (0.240888888889, 0.34763212349, 0.284580207701, 0.273813066543)
ann (idf_1 normed):  (0.289777777778, 0.416433106118, 0.341747947634, 0.35311271311)


b=0.6, k1=2.0, k2=0
title (idf_1):  (0.251555555556, 0.365814731006, 0.298111943162, 0.289323266426)
ann (idf_1):  (0.295555555556, 0.427023409061, 0.349329684589, 0.365271216511)
title (idf_2):  (0.251555555556, 0.365814731006, 0.2981

title (idf_2):  (0.240444444444, 0.345558538855, 0.283574088412, 0.271392536603)
ann (idf_2):  (0.293777777778, 0.422475162826, 0.346564202296, 0.35794797878)
title (idf_1 normed):  (0.240444444444, 0.345558538855, 0.283574088412, 0.271392536603)
ann (idf_1 normed):  (0.293777777778, 0.422475162826, 0.346564202296, 0.35794797878)


b=0.8, k1=1.4, k2=600
title (idf_1):  (0.240444444444, 0.345558538855, 0.283574088412, 0.271392536603)
ann (idf_1):  (0.293777777778, 0.422475162826, 0.346564202296, 0.35794797878)
title (idf_2):  (0.240444444444, 0.345558538855, 0.283574088412, 0.271392536603)
ann (idf_2):  (0.293777777778, 0.422475162826, 0.346564202296, 0.35794797878)
title (idf_1 normed):  (0.240444444444, 0.345558538855, 0.283574088412, 0.271392536603)
ann (idf_1 normed):  (0.293777777778, 0.422475162826, 0.346564202296, 0.35794797878)


b=0.8, k1=1.4, k2=800
title (idf_1):  (0.240444444444, 0.345558538855, 0.283574088412, 0.271392536603)
ann (idf_1):  (0.293777777778, 0.422475162826, 0

title (idf_1 normed):  (0.246222222222, 0.357548760135, 0.291621998558, 0.281928213796)
ann (idf_1 normed):  (0.296444444444, 0.427469931106, 0.350099654112, 0.368897594552)


b=0.8, k1=2.0, k2=200
title (idf_1):  (0.236444444444, 0.338658650244, 0.27846818128, 0.261518665491)
ann (idf_1):  (0.289777777778, 0.416501627805, 0.341771019209, 0.356750240055)
title (idf_2):  (0.236444444444, 0.338658650244, 0.27846818128, 0.261518665491)
ann (idf_2):  (0.289777777778, 0.416501627805, 0.341771019209, 0.356750240055)
title (idf_1 normed):  (0.236444444444, 0.338658650244, 0.27846818128, 0.261518665491)
ann (idf_1 normed):  (0.289777777778, 0.416501627805, 0.341771019209, 0.356750240055)


b=0.8, k1=2.0, k2=400
title (idf_1):  (0.236444444444, 0.338658650244, 0.27846818128, 0.261493974133)
ann (idf_1):  (0.290222222222, 0.417983109286, 0.342578575538, 0.357150592789)
title (idf_2):  (0.236444444444, 0.338658650244, 0.27846818128, 0.261493974133)
ann (idf_2):  (0.289777777778, 0.416501627805, 0

title (idf_1 normed):  (0.234666666667, 0.33658875328, 0.276535357081, 0.257780095042)
ann (idf_1 normed):  (0.290666666667, 0.423114594804, 0.344602234686, 0.354623449791)


b=1.0, k1=1.4, k2=1000
title (idf_1):  (0.234666666667, 0.33658875328, 0.276535357081, 0.257780095042)
ann (idf_1):  (0.290666666667, 0.423114594804, 0.344602234686, 0.354623449791)
title (idf_2):  (0.234666666667, 0.33658875328, 0.276535357081, 0.257780095042)
ann (idf_2):  (0.290666666667, 0.423114594804, 0.344602234686, 0.354623449791)
title (idf_1 normed):  (0.234666666667, 0.33658875328, 0.276535357081, 0.257780095042)
ann (idf_1 normed):  (0.290666666667, 0.423114594804, 0.344602234686, 0.354623449791)


b=1.0, k1=1.6, k2=0
title (idf_1):  (0.244444444444, 0.354129474372, 0.289237402109, 0.276577597212)
ann (idf_1):  (0.293333333333, 0.42716017885, 0.347818035862, 0.363558040788)
title (idf_2):  (0.244444444444, 0.354129474372, 0.289237402109, 0.276577597212)
ann (idf_2):  (0.293333333333, 0.42716017885, 0.3

ann (idf_2):  (0.288888888889, 0.419610260092, 0.342190225574, 0.352196680524)
title (idf_1 normed):  (0.231111111111, 0.330337480142, 0.27195601972, 0.250848598863)
ann (idf_1 normed):  (0.288888888889, 0.419610260092, 0.342190225574, 0.352196680524)


b=1.0, k1=2.0, k2=600
title (idf_1):  (0.231111111111, 0.330337480142, 0.27195601972, 0.250848598863)
ann (idf_1):  (0.288888888889, 0.419610260092, 0.342190225574, 0.352196680524)
title (idf_2):  (0.231111111111, 0.330337480142, 0.27195601972, 0.250848598863)
ann (idf_2):  (0.288888888889, 0.419610260092, 0.342190225574, 0.352196680524)
title (idf_1 normed):  (0.231111111111, 0.330337480142, 0.27195601972, 0.250848598863)
ann (idf_1 normed):  (0.288888888889, 0.419610260092, 0.342190225574, 0.352196680524)


b=1.0, k1=2.0, k2=800
title (idf_1):  (0.231111111111, 0.330337480142, 0.27195601972, 0.250848598863)
ann (idf_1):  (0.288888888889, 0.419610260092, 0.342190225574, 0.352196680524)
title (idf_2):  (0.231111111111, 0.330337480142, 0

Анализ результатов. Значения параметров, при которых достигается максимум по precision и recall (отдельно):

In [10]:
# For every grid point the stats lists received three consecutive entries:
# idf_1, idf_2 and idf_1 normed, so every third entry belongs to one variant.
title_stats_idf1 = title_stats[0::3]
title_stats_idf2 = title_stats[1::3]
title_stats_normed = title_stats[2::3]
ann_stats_idf1 = ann_stats[0::3]
ann_stats_idf2 = ann_stats[1::3]
ann_stats_normed = ann_stats[2::3]

# (caption, stats of that variant) in the order they are reported.
variant_reports = (
    ("titles idf1:", title_stats_idf1),
    ("annotations idf1:", ann_stats_idf1),
    ("titles idf2:", title_stats_idf2),
    ("annotations idf2:", ann_stats_idf2),
    ("titles idf_normed:", title_stats_normed),
    ("annotations idf_normed:", ann_stats_normed),
)

# Position 0 of the metrics list is precision, position 1 is recall.
for section_heading, metric_pos in (("max precision:", 0), ("\n\nmax recall:", 1)):
    print(section_heading)
    for caption, variant_stats in variant_reports:
        print(caption)
        print(max(variant_stats, key=lambda item: item[1][metric_pos]))

max precision:
titles idf1:
((0.60000000000000009, 1.2, 0), [0.252444444444, 0.368277965302, 0.29955331044, 0.290147838106])
annotations idf1:
((0.80000000000000004, 1.7999999999999998, 0), [0.299555555556, 0.43102566911, 0.353461406918, 0.37014233784])
titles idf2:
((0.60000000000000009, 1.2, 0), [0.252444444444, 0.368277965302, 0.29955331044, 0.290147838106])
annotations idf2:
((0.80000000000000004, 1.7999999999999998, 0), [0.299555555556, 0.43102566911, 0.353461406918, 0.37014233784])
titles idf_normed:
((0.60000000000000009, 1.2, 0), [0.252444444444, 0.368277965302, 0.29955331044, 0.290147838106])
annotations idf_normed:
((0.80000000000000004, 1.7999999999999998, 0), [0.299555555556, 0.43102566911, 0.353461406918, 0.37014233784])


max recall:
titles idf1:
((0.60000000000000009, 1.2, 0), [0.252444444444, 0.368277965302, 0.29955331044, 0.290147838106])
annotations idf1:
((0.80000000000000004, 1.7999999999999998, 0), [0.299555555556, 0.43102566911, 0.353461406918, 0.37014233784])
tit

Во-первых, видно, что при поиске по полному тексту аннотаций результаты оказались лучше, чем только по текстам заголовков. Это связано с большим количеством слов в тексте аннотации.

Во-вторых, участие множителя с $k_2$ не улучшило результат: наилучшие значения получены при $k_2 = 0$, т.е. когда этот множитель равен 1.

Далее, нормировка не повлияла на результат, поскольку сумма IDF документов считается относительно термов запроса q, а поиск релевантных документов, очевидно, происходит тоже относительно термов запроса q, т.е. сумма IDF является константой (относительно запроса q).

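Использование другой формулы для IDF также не привело к улучшению результата. Это ожидаемо: поскольку $1 + \frac{N - N_t + 0.5}{N_t + 0.5} = \frac{N + 1}{N_t + 0.5}$, значения idf_1 и idf_2 отличаются лишь на константу $\log_{10}\frac{N + 1}{N + 0.5}$, не зависящую от терма.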