In [2]:
import nltk
import string
import numpy as np
import operator
import math
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer


def normalize_line(line):
    words = [stemmer.stem(word) for word in tokenizer.tokenize(line) if word not in stopwords.words('english')]
    return words


documents = {}
N = 0
with open('cran.all.1400', 'r') as f:
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer("english")

    line = f.readline()
    while line:
        document = {}
        document_id = line.split()[1]
    
        f.readline()  # skip .T line

        line = f.readline()
        title_words = []
        while not line.startswith('.A'):
            title_words.extend(normalize_line(line))
            line = f.readline()
        document['title'] = title_words

        line = f.readline()
        while not line.startswith('.W'):
            line = f.readline()

        line = f.readline()
        annotation_words = []
        while not line.startswith('.I') and line:
            annotation_words.extend(normalize_line(line))
            line = f.readline()
        document['annotation'] = annotation_words

        documents[document_id] = document
        N += 1   
    
L_title = 0  # average length of annotation in documents
for document_index, document in documents.iteritems():
    L_title += len(document['annotation'])
    L_title /= float(N)


L_annotation = 0  # average length of annotation in documents
for document_index, document in documents.iteritems():
    L_annotation += len(document['annotation'])
    L_annotation /= float(N)


In [3]:
def create_inverted_index(document_field_name):
    invert_index = {}
    for document_index, document in documents.iteritems():
        for word in document[document_field_name]:
            if word in invert_index.keys() and document_index in invert_index[word].keys():
                pass
            else:
                if word not in invert_index:
                    invert_index[word] = {document_index: document[document_field_name].count(word)}
                else:
                    invert_index[word][document_index] = document[document_field_name].count(word)

    return invert_index

In [25]:
def search_in_index(index, query_words, field_name, b, k1, rsv_function_type, k2):
    founded_documents_indexes = set()
    for query_word in query_words:
        if query_word in index:
            founded_documents_indexes.update(index[query_word].keys())

    documents_with_rsv = {}
    for founded_document in founded_documents_indexes:
        document_rsv = rsv(query_words, documents[founded_document][field_name], index, field_name, b, k1, rsv_function_type, k2)
        documents_with_rsv[founded_document] = document_rsv

    return documents_with_rsv


def rsv(query_words, document_words, inverse_index, field_name, b, k1, rsv_function_type, k2):
    if field_name == 'title':
        L = L_title
    else:
        L = L_annotation

    rsv = 0
    for query_word in query_words:
        if query_word in inverse_index:
            Nt = len(inverse_index[query_word])
            ftd = document_words.count(query_word)
            ftq = query_words.count(query_word)
            Ld = len(document_words)
            idf_sum = 0
            if rsv_function_type == 1:
                rsv += math.log1p(1 + (N - Nt + 0.5) / (Nt + 0.5)) * ftd * (k1 + 1)/(k1 * ((1 - b) + b * Ld * L + ftd))
            elif rsv_function_type == 2:  # without -Nt
                rsv += math.log1p(1 + (N + 0.5) / (Nt + 0.5)) * ftd * (k1 + 1)/(k1 * ((1 - b) + b * Ld * L + ftd))
            elif rsv_function_type == 3:  # normalized
                rsv += math.log1p(1 + (N - Nt + 0.5) / (Nt + 0.5)) * ftd * (k1 + 1)/(k1 * ((1 - b) + b * Ld * L + ftd))
                idf_sum += math.log1p(1 + (N - Nt + 0.5) / (Nt + 0.5))
            elif rsv_function_type == 4:  # BM25 with k2
                rsv += math.log1p(1 + (N - Nt + 0.5) / (Nt + 0.5)) * ftd * (k1 + 1)/(k1 * ((1 - b) + b * Ld * L + ftd)) *\
                (k2+1) * ftq /(k2+ftq)
                
    if rsv_function_type == 3:
        rsv /= float(idf_sum)
    return rsv


In [5]:
title_index = create_inverted_index('title')
annotation_index = create_inverted_index('annotation')

In [26]:
def answer_for_field(field_name, b=0.75, k1=1.2, rsv_function_type=1, k2=0):
    with open('answer', 'w') as result_file:
        with open('cran.qry', 'r') as f:
            line = f.readline()
            query_number = 0
            while line:
                line = f.readline()  # skip .W line 

                query_words = []
                while not line.startswith('.I') and line:
                    query_words.extend(normalize_line(line))
                    line = f.readline()

                documents_with_rsv = search_in_index(title_index, query_words, field_name, b, k1, rsv_function_type, k2)
                top10 = [elem[0] for elem in
                         sorted(documents_with_rsv.items(), key=operator.itemgetter(1), reverse=True)[:10]]

                query_number += 1
                for top in top10:
                    result_file.write('%s %s\n' % (query_number, top))


In [7]:
answer_for_field('title')
%run ./eval.py


mean precision: 0.249777777778
mean recall: 0.365666349842
mean F-measure: 0.296811113057
MAP@10: 0.287847684835


In [8]:
answer_for_field('annotation')
%run ./eval.py

mean precision: 0.276888888889
mean recall: 0.409783052588
mean F-measure: 0.330476220923
MAP@10: 0.339283966014


Поиск по индексу, построенному по аннатациям получился более точным. Так как аннотации больше по размеру и там больше информации.

In [9]:
def eval():
    groundtruth_file = 'qrel_clean'
    answer_file = 'answer'
    
    q2reld = {} 
    for line in open(groundtruth_file):
        qid, did = [int(x) for x in line.split()]
        if qid not in q2reld.keys():
            q2reld[qid] = set()
        q2reld[qid].add(did)        
    
    q2retrd = {}
    for line in open(answer_file):
        qid, did = [int(x) for x in line.split()]
        if qid not in q2retrd.keys():
            q2retrd[qid] = []
        q2retrd[qid].append(did)               
    
    N = len(q2retrd.keys())
    precision = sum([len(q2reld[q].intersection(q2retrd[q]))*1.0/len(q2retrd[q]) for q in q2retrd.keys()]) / N
    recall = sum([len(q2reld[q].intersection(q2retrd[q]))*1.0/len(q2reld[q]) for q in q2retrd.keys()]) / N
    print("mean precision: {}\nmean recall: {}\nmean F-measure: {}"\
          .format(precision, recall, 2*precision*recall/(precision+recall)))

    # MAP@10
    import numpy as np
    
    MAP = 0.0
    for q in q2retrd.keys():
        n_results = min(10, len(q2retrd[q]))
        avep = np.zeros(n_results)
        for i in range(n_results):
            avep[i:] += q2retrd[q][i] in q2reld[q]
            avep[i] *= (q2retrd[q][i] in q2reld[q]) / (i+1.0)
        MAP += sum(avep) / min(n_results, len(q2reld[q]))
    print("MAP@10: {}".format(MAP/N))
    return MAP/N


In [20]:
best_result = 0
for k in np.arange(1.2, 2.1, 0.1):
    for b in np.arange(0, 1.1, 0.1):
        answer_for_field('annotation', b, k)
        eval_result = eval()
        if eval_result > best_result:
            best_result = eval_result
            best_params = (k, b)

mean precision: 0.248444444444
mean recall: 0.37180229298
mean F-measure: 0.297856348286
MAP@10: 0.283219218387
mean precision: 0.257333333333
mean recall: 0.382301518531
mean F-measure: 0.307609642643
MAP@10: 0.306401100893
mean precision: 0.265777777778
mean recall: 0.392914495219
mean F-measure: 0.317076564208
MAP@10: 0.321885651997
mean precision: 0.268
mean recall: 0.395561761533
mean F-measure: 0.319519774153
MAP@10: 0.325223753534
mean precision: 0.273333333333
mean recall: 0.402475485428
mean F-measure: 0.325565346183
MAP@10: 0.329914958288
mean precision: 0.278222222222
mean recall: 0.411236348022
mean F-measure: 0.33189837807
MAP@10: 0.3359585132
mean precision: 0.277333333333
mean recall: 0.410970611942
mean F-measure: 0.331178836019
MAP@10: 0.336652368355
mean precision: 0.277777777778
mean recall: 0.411026167498
mean F-measure: 0.331513593089
MAP@10: 0.338442805353
mean precision: 0.276444444444
mean recall: 0.408940668851
mean F-measure: 0.32988570606
MAP@10: 0.3381936241

KeyboardInterrupt: 

In [14]:
print best_result
print best_params

answer_for_field('annotation', 0.6, 2)
eval_result = eval()

0.338442805353
(1.2, 0.70000000000000007)
mean precision: 0.277333333333
mean recall: 0.410970611942
mean F-measure: 0.331178836019
MAP@10: 0.336652368355


В рузультате подбора параметров с помощью grid search, получилось, что лучший результат по MAP10 при k1 = 1.2 и b=0.7

k1 получился как и в BM25, а b чуть выше, так как в эксперементе чуть менее важным было учитывать длину документа.

Попробуйте заменить функцию вычисления IDF-составляющей в формуле BM25 выше на один из других вариантов, представленных на лекции

Для этого убрал Nt из числителя под лагорифмом (так было сказано на лекции)

In [15]:
best_result = 0
for k in np.arange(1.2, 2.1, 0.1):
    for b in np.arange(0, 1.1, 0.1):
        answer_for_field('annotation', b, k, 2)
        eval_result = eval()
        if eval_result > best_result:
            best_result = eval_result
            best_params = (k, b)
            
print best_params
print best_result

mean precision: 0.250666666667
mean recall: 0.37497590285
mean F-measure: 0.300471752491
MAP@10: 0.286383543294
mean precision: 0.261333333333
mean recall: 0.387668936408
mean F-measure: 0.312204810691
MAP@10: 0.309830772515
mean precision: 0.266222222222
mean recall: 0.392762633416
mean F-measure: 0.317343077551
MAP@10: 0.322786760589
mean precision: 0.268444444444
mean recall: 0.39487052371
mean F-measure: 0.319609245845
MAP@10: 0.329776634893
mean precision: 0.274222222222
mean recall: 0.40381845669
mean F-measure: 0.326635253642
MAP@10: 0.335506123149
mean precision: 0.28
mean recall: 0.415434698473
mean F-measure: 0.334529513203
MAP@10: 0.340987094426
mean precision: 0.279555555556
mean recall: 0.412242298281
mean F-measure: 0.333174276504
MAP@10: 0.342070290585
mean precision: 0.278666666667
mean recall: 0.410695561067
mean F-measure: 0.332037812381
MAP@10: 0.342368215475
mean precision: 0.277333333333
mean recall: 0.409443627815
mean F-measure: 0.3306819318
MAP@10: 0.34016316242

mean precision: 0.277333333333
mean recall: 0.409443627815
mean F-measure: 0.3306819318
MAP@10: 0.340163162425
mean precision: 0.278666666667
mean recall: 0.410384584195
mean F-measure: 0.331936133883
MAP@10: 0.340136085216
mean precision: 0.279111111111
mean recall: 0.409610703754
mean F-measure: 0.331997320777
MAP@10: 0.340561805381
mean precision: 0.250666666667
mean recall: 0.37497590285
mean F-measure: 0.300471752491
MAP@10: 0.286383543294
mean precision: 0.261333333333
mean recall: 0.387668936408
mean F-measure: 0.312204810691
MAP@10: 0.309830772515
mean precision: 0.266222222222
mean recall: 0.392762633416
mean F-measure: 0.317343077551
MAP@10: 0.322786760589
mean precision: 0.268444444444
mean recall: 0.39487052371
mean F-measure: 0.319609245845
MAP@10: 0.329776634893
mean precision: 0.274222222222
mean recall: 0.40381845669
mean F-measure: 0.326635253642
MAP@10: 0.335506123149
mean precision: 0.28
mean recall: 0.415434698473
mean F-measure: 0.334529513203
MAP@10: 0.34098709442

Результат получился лучше

Попробуем нормализовать RSV (q, d) на сумму IDF термов запроса

In [21]:
answer_for_field('annotation', 0.75, 1.2, 3)
eval()

mean precision: 0.276888888889
mean recall: 0.409783052588
mean F-measure: 0.330476220923
MAP@10: 0.339283966014


0.33928396601438915

С нормализацией результат таким же как и без нормализации

На лекции был предложен общий вариант BM25, включающий так-
(k +1)f
же множитель k 2 2 +f t,q t,q . Добавьте его в формулу вычисления RSV
и исследуйте вопрос оптимальности параметра k 2 (k 2 может варьи-
роваться от 0 до 1000 – тут скорее важен оптимальный порядок
константы)

In [27]:
best_result = 0
best_k2 = 0
for k2 in range(0, 1000, 50):
    answer_for_field('annotation', 0.75, 1.2, 4, k2)
    eval_result = eval()
    if eval_result > best_result:
        best_result = eval_result
        best_k2 = k2

mean precision: 0.276888888889
mean recall: 0.409783052588
mean F-measure: 0.330476220923
MAP@10: 0.339283966014
mean precision: 0.272
mean recall: 0.402624162096
mean F-measure: 0.324666023671
MAP@10: 0.327755873296
mean precision: 0.272
mean recall: 0.402624162096
mean F-measure: 0.324666023671
MAP@10: 0.327666984407
mean precision: 0.271555555556
mean recall: 0.402130334935
mean F-measure: 0.324188848395
MAP@10: 0.327384797458
mean precision: 0.271555555556
mean recall: 0.402130334935
mean F-measure: 0.324188848395
MAP@10: 0.327384797458
mean precision: 0.271555555556
mean recall: 0.402130334935
mean F-measure: 0.324188848395
MAP@10: 0.327384797458
mean precision: 0.271555555556
mean recall: 0.402130334935
mean F-measure: 0.324188848395
MAP@10: 0.327384797458
mean precision: 0.271555555556
mean recall: 0.402130334935
mean F-measure: 0.324188848395
MAP@10: 0.327384797458
mean precision: 0.271555555556
mean recall: 0.402130334935
mean F-measure: 0.324188848395
MAP@10: 0.327384797458
m

In [28]:
print best_result
print best_k2

0.339283966014
0
