In [2]:
import nltk
import string
import numpy as np
import operator
import math
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer


def normalize_line(line):
    words = [stemmer.stem(word) for word in tokenizer.tokenize(line) if word not in stopwords.words('english')]
    return words


documents = {}
N = 0
with open('cran.all.1400', 'r') as f:
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer("english")

    line = f.readline()
    while line:
        document = {}
        document_id = line.split()[1]
    
        f.readline()  # skip .T line

        line = f.readline()
        title_words = []
        while not line.startswith('.A'):
            title_words.extend(normalize_line(line))
            line = f.readline()
        document['title'] = title_words

        line = f.readline()
        while not line.startswith('.W'):
            line = f.readline()

        line = f.readline()
        annotation_words = []
        while not line.startswith('.I') and line:
            annotation_words.extend(normalize_line(line))
            line = f.readline()
        document['annotation'] = annotation_words

        documents[document_id] = document
        N += 1   
    
L_title = 0  # average length of annotation in documents
for document_index, document in documents.iteritems():
    L_title += len(document['annotation'])
    L_title /= float(N)


L_annotation = 0  # average length of annotation in documents
for document_index, document in documents.iteritems():
    L_annotation += len(document['annotation'])
    L_annotation /= float(N)


In [6]:
def create_inverted_index(document_field_name):
    invert_index = {}
    for document_index, document in documents.iteritems():
        for word in document[document_field_name]:
            if word in invert_index.keys() and document_index in invert_index[word].keys():
                pass
            else:
                if word not in invert_index:
                    invert_index[word] = {document_index: document[document_field_name].count(word)}
                else:
                    invert_index[word][document_index] = document[document_field_name].count(word)

    return invert_index

In [43]:
def search_in_index(index, query_words, field_name, b, k1, rsv_function_type, k2):
    founded_documents_indexes = set()
    for query_word in query_words:
        if query_word in index:
            founded_documents_indexes.update(index[query_word].keys())

    documents_with_rsv = {}
    for founded_document in founded_documents_indexes:
        document_rsv = rsv(query_words, documents[founded_document][field_name], index, field_name, b, k1, rsv_function_type)
        documents_with_rsv[founded_document] = document_rsv

    return documents_with_rsv


def rsv(query_words, document_words, inverse_index, field_name, b, k1, rsv_function_type):
    if field_name == 'title':
        L = L_title
    else:
        L = L_annotation

    rsv = 0
    for query_word in query_words:
        if query_word in inverse_index:
            Nt = len(inverse_index[query_word])
            ftd = document_words.count(query_word)
            Ld = len(document_words)
            if rsv_function_type == 1:
                rsv += math.log1p(1 + (N - Nt + 0.5) / (Nt + 0.5)) * ftd * (k1 + 1)/(k1 * ((1 - b) + b * Ld * L + ftd))
            elif rsv_function_type == 2:
                rsv += math.log1p(1 + (N + 0.5) / (Nt + 0.5)) * ftd * (k1 + 1)/(k1 * ((1 - b) + b * Ld * L + ftd))
            elif rsv_function_type == 3:
                 rsv += math.log1p(1 + (N + 0.5) / (Nt + 0.5)) * ftd * (k1 + 1)/(k1 * ((1 - b) + b * Ld * L + ftd))
    return rsv


In [18]:
title_index = create_inverted_index('title')
annotation_index = create_inverted_index('annotation')

In [44]:
def answer_for_field(field_name, b=0.75, k1=1.2, rsv_function_type=1):
    with open('answer', 'w') as result_file:
        with open('cran.qry', 'r') as f:
            line = f.readline()
            query_number = 0
            while line:
                line = f.readline()  # skip .W line 

                query_words = []
                while not line.startswith('.I') and line:
                    query_words.extend(normalize_line(line))
                    line = f.readline()

                documents_with_rsv = search_in_index(title_index, query_words, field_name, b, k1, rsv_function_type)
                top10 = [elem[0] for elem in
                         sorted(documents_with_rsv.items(), key=operator.itemgetter(1), reverse=True)[:10]]

                query_number += 1
                for top in top10:
                    result_file.write('%s %s\n' % (query_number, top))


In [45]:
answer_for_field('title')
%run ./eval.py


mean precision: 0.239555555556
mean recall: 0.351973187993
mean F-measure: 0.285082114808
MAP@10: 0.282763867193


In [46]:
answer_for_field('annotation')
%run ./eval.py

mean precision: 0.276888888889
mean recall: 0.409783052588
mean F-measure: 0.330476220923
MAP@10: 0.339283966014


Поиск по индексу, построенному по аннатациям получился более точным. Так как аннотации больше по размеру и там больше информации.

In [33]:
def eval():
    groundtruth_file = 'qrel_clean'
    answer_file = 'answer'
    
    q2reld = {} 
    for line in open(groundtruth_file):
        qid, did = [int(x) for x in line.split()]
        if qid not in q2reld.keys():
            q2reld[qid] = set()
        q2reld[qid].add(did)        
    
    q2retrd = {}
    for line in open(answer_file):
        qid, did = [int(x) for x in line.split()]
        if qid not in q2retrd.keys():
            q2retrd[qid] = []
        q2retrd[qid].append(did)               
    
    N = len(q2retrd.keys())
    precision = sum([len(q2reld[q].intersection(q2retrd[q]))*1.0/len(q2retrd[q]) for q in q2retrd.keys()]) / N
    recall = sum([len(q2reld[q].intersection(q2retrd[q]))*1.0/len(q2reld[q]) for q in q2retrd.keys()]) / N
    print("mean precision: {}\nmean recall: {}\nmean F-measure: {}"\
          .format(precision, recall, 2*precision*recall/(precision+recall)))

    # MAP@10
    import numpy as np
    
    MAP = 0.0
    for q in q2retrd.keys():
        n_results = min(10, len(q2retrd[q]))
        avep = np.zeros(n_results)
        for i in range(n_results):
            avep[i:] += q2retrd[q][i] in q2reld[q]
            avep[i] *= (q2retrd[q][i] in q2reld[q]) / (i+1.0)
        MAP += sum(avep) / min(n_results, len(q2reld[q]))
    print("MAP@10: {}".format(MAP/N))
    return MAP/N


best_result = 0
for k in np.arange(1.2, 2.1, 0.1):
    for b in np.arange(0, 1.1, 0.1):
        answer_for_field('annotation', b, k)
        eval_result = eval()
        if eval_result > best_result:
            best_result = eval_result
            best_params = (k, b)
            
print best_params

mean precision: 0.250222222222
mean recall: 0.373345427043
mean F-measure: 0.299628508699
MAP@10: 0.285003286582


mean precision: 0.257333333333
mean recall: 0.382614154697
mean F-measure: 0.307710797058
MAP@10: 0.303300482209


mean precision: 0.266222222222
mean recall: 0.392916869389
mean F-measure: 0.317393410431
MAP@10: 0.318745466952


mean precision: 0.268
mean recall: 0.395306029611
mean F-measure: 0.319436312068
MAP@10: 0.325556248425


mean precision: 0.274666666667
mean recall: 0.405992939663
mean F-measure: 0.327660776078
MAP@10: 0.331720813387


mean precision: 0.28
mean recall: 0.412755761446
mean F-measure: 0.333657602395
MAP@10: 0.336177164693


mean precision: 0.279111111111
mean recall: 0.412673433812
mean F-measure: 0.332998883778
MAP@10: 0.337086412754


mean precision: 0.278666666667
mean recall: 0.411393010531
mean F-measure: 0.332265520571
MAP@10: 0.338781418493


mean precision: 0.276888888889
mean recall: 0.410977705888
mean F-measure: 0.330864040224
MAP@10: 0.33803313877


mean precision: 0.276444444444
mean recall: 0.408075589486
mean F-measure: 0.329603880193
MAP@10: 0.338182065172


mean precision: 0.275111111111
mean recall: 0.405723497134
mean F-measure: 0.327888860962
MAP@10: 0.335661776686


mean precision: 0.252444444444
mean recall: 0.376109327535
mean F-measure: 0.302111655272
MAP@10: 0.286929388875


mean precision: 0.256888888889
mean recall: 0.382483875295
mean F-measure: 0.307350776406
MAP@10: 0.304352735086


mean precision: 0.267111111111
mean recall: 0.394546499018
mean F-measure: 0.318556764479
MAP@10: 0.318410683492


mean precision: 0.268
mean recall: 0.395768992574
mean F-measure: 0.31958736005
MAP@10: 0.325470133115


mean precision: 0.272888888889
mean recall: 0.404310256334
mean F-measure: 0.325847359364
MAP@10: 0.330573444192


mean precision: 0.279111111111
mean recall: 0.411644650335
mean F-measure: 0.332663445318
MAP@10: 0.336317209765


mean precision: 0.279111111111
mean recall: 0.411501904192
mean F-measure: 0.332616823484
MAP@10: 0.337495113519


mean precision: 0.28
mean recall: 0.412689306828
mean F-measure: 0.333635887757
MAP@10: 0.339244172895


mean precision: 0.276888888889
mean recall: 0.411348076258
mean F-measure: 0.330983999842
MAP@10: 0.338891457406


mean precision: 0.276444444444
mean recall: 0.408075589486
mean F-measure: 0.329603880193
MAP@10: 0.338399760645


mean precision: 0.275111111111
mean recall: 0.405723497134
mean F-measure: 0.327888860962
MAP@10: 0.335661776686


mean precision: 0.252444444444
mean recall: 0.376257475683
mean F-measure: 0.302159437966
MAP@10: 0.286434684079


mean precision: 0.257333333333
mean recall: 0.382547367358
mean F-measure: 0.307689196107
MAP@10: 0.303537438762


mean precision: 0.266222222222
mean recall: 0.393519995892
mean F-measure: 0.317590006881
MAP@10: 0.317755232916


mean precision: 0.266666666667
mean recall: 0.394097034902
mean F-measure: 0.318094176151
MAP@10: 0.325232197447


mean precision: 0.272444444444
mean recall: 0.4039434133
mean F-measure: 0.32541133778
MAP@10: 0.330332397609


mean precision: 0.277333333333
mean recall: 0.409713433403
mean F-measure: 0.330769891362
MAP@10: 0.335167454719


mean precision: 0.278666666667
mean recall: 0.410203429246
mean F-measure: 0.331876860272
MAP@10: 0.337627760981


mean precision: 0.28
mean recall: 0.412596714235
mean F-measure: 0.333605625356
MAP@10: 0.339213367487


mean precision: 0.277333333333
mean recall: 0.411665536576
mean F-measure: 0.331404245967
MAP@10: 0.339813251309


mean precision: 0.276888888889
mean recall: 0.409186700597
mean F-measure: 0.330282122299
MAP@10: 0.338668543714


mean precision: 0.275111111111
mean recall: 0.405723497134
mean F-measure: 0.327888860962
MAP@10: 0.335661776686


mean precision: 0.252
mean recall: 0.375779681872
mean F-measure: 0.301686985311
MAP@10: 0.285388020352


mean precision: 0.257777777778
mean recall: 0.382852321663
mean F-measure: 0.308105475473
MAP@10: 0.301819186893


mean precision: 0.264888888889
mean recall: 0.392358557983
mean F-measure: 0.316262689083
MAP@10: 0.31660600977


mean precision: 0.267111111111
mean recall: 0.394985923791
mean F-measure: 0.318699898702
MAP@10: 0.325390178047


mean precision: 0.272
mean recall: 0.403409021766
mean F-measure: 0.324920900918
MAP@10: 0.330659646286


mean precision: 0.277777777778
mean recall: 0.40991360977
mean F-measure: 0.331151134548
MAP@10: 0.335328964755


mean precision: 0.279555555556
mean recall: 0.411103942314
mean F-measure: 0.33280188382
MAP@10: 0.338806177039


mean precision: 0.279555555556
mean recall: 0.411527748718
mean F-measure: 0.332940667812
MAP@10: 0.339719981244


mean precision: 0.277777777778
mean recall: 0.412007416918
mean F-measure: 0.331832302518
MAP@10: 0.340407196047


mean precision: 0.276888888889
mean recall: 0.409186700597
mean F-measure: 0.330282122299
MAP@10: 0.338780713026


mean precision: 0.275111111111
mean recall: 0.405723497134
mean F-measure: 0.327888860962
MAP@10: 0.335661776686


mean precision: 0.252444444444
mean recall: 0.377175896768
mean F-measure: 0.30245515745
MAP@10: 0.285864566362


mean precision: 0.259555555556
mean recall: 0.384827630305
mean F-measure: 0.310014760064
MAP@10: 0.302317524005


mean precision: 0.264444444444
mean recall: 0.391284483909
mean F-measure: 0.315596898331
MAP@10: 0.315579773103


mean precision: 0.269333333333
mean recall: 0.397562372034
mean F-measure: 0.321120073217
MAP@10: 0.325871285098


mean precision: 0.272444444444
mean recall: 0.403242355099
mean F-measure: 0.325183619054
MAP@10: 0.330719375857


mean precision: 0.279111111111
mean recall: 0.412252234109
mean F-measure: 0.33286167083
MAP@10: 0.335692739565


mean precision: 0.280444444444
mean recall: 0.413474312684
mean F-measure: 0.334207924837
MAP@10: 0.339776400437


mean precision: 0.279111111111
mean recall: 0.411607113797
mean F-measure: 0.332651187504
MAP@10: 0.339384296352


mean precision: 0.277777777778
mean recall: 0.412007416918
mean F-measure: 0.331832302518
MAP@10: 0.339956284818


mean precision: 0.276888888889
mean recall: 0.409186700597
mean F-measure: 0.330282122299
MAP@10: 0.339059666583


mean precision: 0.275111111111
mean recall: 0.405723497134
mean F-measure: 0.327888860962
MAP@10: 0.335661776686


mean precision: 0.252888888889
mean recall: 0.377200588126
mean F-measure: 0.302781878127
MAP@10: 0.2857170229


mean precision: 0.259555555556
mean recall: 0.385420222898
mean F-measure: 0.310206874176
MAP@10: 0.302125006999


mean precision: 0.265333333333
mean recall: 0.392667199958
mean F-measure: 0.3166796736
MAP@10: 0.31530505445


mean precision: 0.269777777778
mean recall: 0.397255253227
mean F-measure: 0.32133532957
MAP@10: 0.32576318902


mean precision: 0.271111111111
mean recall: 0.400510161367
mean F-measure: 0.323345192623
MAP@10: 0.329835702388


mean precision: 0.278666666667
mean recall: 0.41180032001
mean F-measure: 0.332398289053
MAP@10: 0.335276249265


mean precision: 0.279555555556
mean recall: 0.412585423795
mean F-measure: 0.333286283587
MAP@10: 0.339778273285


mean precision: 0.279555555556
mean recall: 0.411977484168
mean F-measure: 0.333087756758
MAP@10: 0.339429471459


mean precision: 0.278222222222
mean recall: 0.411902710708
mean F-measure: 0.33211519261
MAP@10: 0.340239236024


mean precision: 0.276888888889
mean recall: 0.409186700597
mean F-measure: 0.330282122299
MAP@10: 0.338962135718


mean precision: 0.275111111111
mean recall: 0.405723497134
mean F-measure: 0.327888860962
MAP@10: 0.335661776686


mean precision: 0.253777777778
mean recall: 0.377245321171
mean F-measure: 0.303432566711
MAP@10: 0.285825387727


mean precision: 0.261333333333
mean recall: 0.387286825431
mean F-measure: 0.312080824743
MAP@10: 0.302374246942


mean precision: 0.266666666667
mean recall: 0.394169845461
mean F-measure: 0.318117891069
MAP@10: 0.315489400493


mean precision: 0.269777777778
mean recall: 0.397006575978
mean F-measure: 0.321253944329
MAP@10: 0.324488023152


mean precision: 0.270222222222
mean recall: 0.39847312433
mean F-measure: 0.322048878334
MAP@10: 0.329017360236


mean precision: 0.278666666667
mean recall: 0.411164322021
mean F-measure: 0.33219090748
MAP@10: 0.334877941547


mean precision: 0.280444444444
mean recall: 0.413775899986
mean F-measure: 0.33430640092
MAP@10: 0.340430231516


mean precision: 0.279555555556
mean recall: 0.411977484168
mean F-measure: 0.333087756758
MAP@10: 0.339259890541


mean precision: 0.277777777778
mean recall: 0.410055967413
mean F-measure: 0.331197578452
MAP@10: 0.339871746312


mean precision: 0.276888888889
mean recall: 0.409186700597
mean F-measure: 0.330282122299
MAP@10: 0.339091059881


mean precision: 0.275111111111
mean recall: 0.405723497134
mean F-measure: 0.327888860962
MAP@10: 0.335661776686


mean precision: 0.253333333333
mean recall: 0.375867896127
mean F-measure: 0.302669043099
MAP@10: 0.285042990118


mean precision: 0.260888888889
mean recall: 0.38678610393
mean F-measure: 0.311601337124
MAP@10: 0.301805421881


mean precision: 0.266666666667
mean recall: 0.394209528001
mean F-measure: 0.318130813754
MAP@10: 0.314572459478


mean precision: 0.268888888889
mean recall: 0.395708101033
mean F-measure: 0.320198596216
MAP@10: 0.32337829778


mean precision: 0.272
mean recall: 0.400332672189
mean F-measure: 0.323918474707
MAP@10: 0.33004425618


mean precision: 0.277777777778
mean recall: 0.410956065499
mean F-measure: 0.33149078923
MAP@10: 0.335697579855


mean precision: 0.281333333333
mean recall: 0.415148034024
mean F-measure: 0.335385799858
MAP@10: 0.341433053386


mean precision: 0.280444444444
mean recall: 0.412763808954
mean F-measure: 0.333975588206
MAP@10: 0.339606398197


mean precision: 0.277777777778
mean recall: 0.40996337482
mean F-measure: 0.331167372485
MAP@10: 0.340276655189


mean precision: 0.276888888889
mean recall: 0.409186700597
mean F-measure: 0.330282122299
MAP@10: 0.338817250357


mean precision: 0.275111111111
mean recall: 0.405723497134
mean F-measure: 0.327888860962
MAP@10: 0.335661776686


mean precision: 0.254666666667
mean recall: 0.377326988971
mean F-measure: 0.304093579635
MAP@10: 0.286574990902


mean precision: 0.260888888889
mean recall: 0.385600918745
mean F-measure: 0.311216028646
MAP@10: 0.301814427088


mean precision: 0.265777777778
mean recall: 0.39328889308
mean F-measure: 0.317198403893
MAP@10: 0.313513993869


mean precision: 0.269333333333
mean recall: 0.396011452003
mean F-measure: 0.320612971671
MAP@10: 0.323453858375


mean precision: 0.272
mean recall: 0.400074709274
mean F-measure: 0.323834000658
MAP@10: 0.329720461353


mean precision: 0.277777777778
mean recall: 0.409588987674
mean F-measure: 0.33104515527
MAP@10: 0.335962243498


mean precision: 0.281333333333
mean recall: 0.414585189003
mean F-measure: 0.335201979626
MAP@10: 0.341968552112


mean precision: 0.279555555556
mean recall: 0.411635061158
mean F-measure: 0.332975782442
MAP@10: 0.339820703508


mean precision: 0.277777777778
mean recall: 0.410280835138
mean F-measure: 0.331270901955
MAP@10: 0.340199386915


mean precision: 0.276888888889
mean recall: 0.409186700597
mean F-measure: 0.330282122299
MAP@10: 0.339124598975


mean precision: 0.275111111111
mean recall: 0.405723497134
mean F-measure: 0.327888860962
MAP@10: 0.335661776686
(2.0000000000000009, 0.60000000000000009)


In [37]:
print best_result

answer_for_field('annotation', 0.6, 2)
eval_result = eval()

0.341968552112


mean precision: 0.281333333333
mean recall: 0.414585189003
mean F-measure: 0.335201979626
MAP@10: 0.341968552112


7. Попробуйте заменить функцию вычисления IDF-составляющей в формуле BM25 выше на один из других вариантов, представленных на лекции

Для этого убрал Nt из числителя под лагорифмом (так было сказано на лекции)

In [None]:
на сколько важно учитывать длину жокумента