# BM25
Данный ноутбук реализует решение с использованием алгоритма BM25. Как и в случае TF-IDF, метрика рассчитывалась двумя способами: с IDF по всей базе документов и с IDF по подмножеству документов каждого запроса.

In [1]:
import pickle
import math
import os
from parser import parse, text_handler
from tqdm.notebook import tqdm
from collections import defaultdict
# Root directory of the competition data. The cells below read the `texts/`
# folder, `inv_index`, the query list and the sample submission from here and
# write the submission files back into it. The value must end with a path
# separator because several cells build file names as `PATH + name`.
PATH = "/Users/michelle/data/text-relevance-competition-ir-1-ts-fall-2020/"

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michelle/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# IDF для всех документов

In [19]:
# Corpus statistics used by BM25: the number of documents (`corpus_len`) and
# their average length in whitespace-separated tokens (`average_doc_len`).
#
# Only the pickled document files are counted. Counting every directory entry
# (as a plain `len(os.listdir(...))` does) would let stray non-document files
# inflate the corpus size and drag the average length down, because such
# entries add to the denominator but never to the token total.
texts_dir = os.path.join(PATH, "texts")
files = [name for name in os.listdir(texts_dir) if name.endswith(".pkl")]
corpus_len = len(files)

sum_doc_len = 0
for file in tqdm(files, total=corpus_len):
    with open(os.path.join(texts_dir, file), "rb") as f:
        # The unpickled object is split on whitespace to count its tokens.
        sum_doc_len += len(pickle.load(f).split())

# An empty corpus yields 0.0 instead of a ZeroDivisionError.
average_doc_len = sum_doc_len / corpus_len if corpus_len else 0.0

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38106.0), HTML(value='')))




In [20]:
# Load the pickled `inv_index` object; `bm25_score` later indexes it by term
# to obtain the count used in the IDF formula.
inv_index_path = os.path.join(PATH, "inv_index")
with open(inv_index_path, "rb") as index_file:
    terms_count = pickle.load(index_file)

In [21]:
def bm25_score(query, doc, terms_count, k1=1.2, k2=1000, b=0.75):
    """Score one document against one query with the BM25 formula.

    Args:
        query: Raw query text. It is passed through ``text_handler`` and then
            split on whitespace to obtain the query terms.
        doc: Document text; it is split on whitespace to obtain its terms
            (presumably already normalised like the query - confirm).
        terms_count: Mapping from a term to the count ``n`` used as its
            document frequency in the IDF part. A term missing from the
            mapping is treated as ``n = 0``; the mapping is never modified.
        k1: Saturation parameter for the in-document term frequency.
        k2: Saturation parameter for the in-query term frequency.
        b: Weight of the document-length normalisation inside ``K``.

    Returns:
        The sum, over distinct query terms that occur in the document, of
        ``log((N - n + 0.5) / (n + 0.5)) * (k1 + 1) * tf / (K + tf)
        * (k2 + 1) * qf / (k2 + qf)``. The result is ``0.0`` when no query
        term occurs in the document.

    Notes:
        ``N`` and the average document length are read from the module-level
        names ``corpus_len`` and ``average_doc_len``. The IDF term is not
        clamped, so a term with ``n > N / 2`` contributes a negative value.
    """
    lemmed_query_list = text_handler(query).split()
    text_terms = doc.split()

    # Length normalisation depends only on the document, not on the term.
    dl = len(text_terms)
    K = k1 * ((1 - b) + b * dl / average_doc_len)

    # One pass over the document counts only the terms the query asks about,
    # instead of rescanning the whole document once per query term.
    query_terms = set(lemmed_query_list)
    doc_tf = dict.fromkeys(query_terms, 0)
    for token in text_terms:
        if token in doc_tf:
            doc_tf[token] += 1

    score = 0.0
    for term in query_terms:
        tf = doc_tf[term]
        if not tf:
            # A term absent from the document contributes exactly zero;
            # skipping it also avoids 0 / 0 when K happens to be 0.
            continue
        # .get() avoids a KeyError on plain dicts and avoids inserting new
        # keys into defaultdict-based statistics.
        df = terms_count.get(term, 0)
        idf = (corpus_len - df + 0.5) / (df + 0.5)
        qf = lemmed_query_list.count(term)
        score += math.log(idf) * (k1 + 1) * tf / (K + tf) * (k2 + 1) * qf / (k2 + qf)
    return score

Создаем словарь запрос-документы

In [22]:
# Read the query list and the sample submission, then build:
#   queries     - list whose entry 0 is a None placeholder, followed by the
#                 second tab-separated field of every non-empty query line;
#   query_docid - first comma-separated field (query id, str) mapped to the
#                 list of second fields (document ids, str) in file order.
queries_path = PATH + "queries.numerate.txt"
with open(queries_path, "r") as f:
    queries = f.read().split("\n")

sample_path = PATH + "sample.technosphere.ir1.textrelevance.submission.txt"
with open(sample_path, "r") as f:
    sample = f.read().split("\n")

query_docid = defaultdict(list)
for row in sample[1:]:  # the first line is skipped (presumably a header row)
    if not row:
        continue
    fields = row.split(",")
    qid, docid = fields[0], fields[1]
    query_docid[qid].append(docid)

queries = [None, *(line.split("\t")[1] for line in queries if line)]

In [25]:
# Names of all entries in the texts directory, kept as a set so the scoring
# loops can test quickly whether a candidate document has a text file.
listdir = set(os.listdir(os.path.join(PATH, "texts")))

Считаем bm25 для каждой пары запрос-документ.

In [11]:
# Rank each query's candidate documents by BM25 using the statistics in
# `terms_count`. A candidate whose text file is not in `listdir` gets score 0.
new_query_docid = {}
for qid, candidates in tqdm(query_docid.items(), total=len(query_docid)):
    query_text = queries[int(qid)]
    scored = []
    for docid in candidates:
        doc_file = f"{docid}.pkl_text.pkl"
        score = 0
        if doc_file in listdir:
            with open(os.path.join(PATH, "texts", doc_file), "rb") as fh:
                doc_text = pickle.load(fh)
            score = bm25_score(query_text, doc_text, terms_count)
        scored.append((score, docid))
    # Highest score first; equal scores are ordered by descending docid.
    ranked = sorted(scored, reverse=True)
    new_query_docid[qid] = [ranked_docid for _, ranked_docid in ranked]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=399.0), HTML(value='')))




In [12]:
# Serialise the ranking: a header line followed by one "query,document" row
# per ranked document, for query numbers 1 .. len(queries) - 1.
submit = ['QueryId,DocumentId']
for query_number in range(1, len(queries)):
    ranked_docids = new_query_docid[str(query_number)]
    submit.extend(f"{query_number},{docid}" for docid in ranked_docids)

with open(PATH + "submit_19_07.txt", "w") as out:
    out.write("\n".join(submit))

# IDF for subset

Выполняем аналогичные действия с IDF для подмножеств документов.

In [37]:
# Per-query term statistics: for every query, count in how many of its
# candidate documents each term occurs.
#
# The result is a positional list with one entry per `query_docid` item, in
# iteration order. The scoring cell looks entries up as
# `terms_count_dicts[int(qid) - 1]`, which presumes query ids run 1..N in that
# same order - TODO confirm against the sample file.
terms_count_dicts = []
lost_files = []
# The query id is not needed here, so only the candidate lists are iterated.
# The progress total is len(query_docid); len(queries) is one larger because
# of the None placeholder at index 0.
for docids in tqdm(query_docid.values(), total=len(query_docid)):
    # A dedicated name keeps the global `terms_count` loaded from `inv_index`
    # intact, so the corpus-wide scoring cell stays correct if re-run later.
    subset_terms_count = defaultdict(int)
    for docid in docids:
        filename = f"{docid}.pkl_text.pkl"
        try:
            with open(os.path.join(PATH, "texts", filename), "rb") as f:
                doc = pickle.load(f)
        except (OSError, EOFError, pickle.UnpicklingError):
            # Missing or unreadable document: remember it and move on, but let
            # KeyboardInterrupt and genuine programming errors propagate.
            lost_files.append(docid)
            continue
        # set() makes each document count at most once per term.
        for term in set(doc.split()):
            subset_terms_count[term] += 1
    terms_count_dicts.append(subset_terms_count)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=400.0), HTML(value='')))




In [38]:
# Persist the list of per-query term statistics to disk.
subsets_path = os.path.join(PATH, "term_count_subsets")
with open(subsets_path, "wb") as out_file:
    pickle.dump(terms_count_dicts, out_file)

In [14]:
# Rank each query's candidates by BM25 again, this time passing the term
# statistics collected from that query's own candidate documents.
# NOTE(review): bm25_score still reads the global `corpus_len` as N, so the
# IDF mixes a corpus-wide N with subset document frequencies - confirm that
# this is intended.
new_query_docid = {}
for qid, candidates in tqdm(query_docid.items(), total=len(query_docid)):
    query_number = int(qid)
    query_text = queries[query_number]
    scored = []
    for docid in candidates:
        doc_file = f"{docid}.pkl_text.pkl"
        if doc_file not in listdir:
            # No text file for this candidate: it keeps a score of 0.
            scored.append((0, docid))
            continue
        with open(os.path.join(PATH, "texts", doc_file), "rb") as fh:
            doc_text = pickle.load(fh)
        subset_stats = terms_count_dicts[query_number - 1]
        scored.append((bm25_score(query_text, doc_text, subset_stats), docid))
    # Highest score first; equal scores are ordered by descending docid.
    scored.sort(reverse=True)
    new_query_docid[qid] = [pair[1] for pair in scored]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=399.0), HTML(value='')))




In [15]:
# Serialise the subset-IDF ranking: a header line followed by one
# "query,document" row per ranked document, queries taken in numeric order.
submit = ['QueryId,DocumentId'] + [
    f"{query_number},{docid}"
    for query_number in range(1, len(queries))
    for docid in new_query_docid[str(query_number)]
]

with open(PATH + "submit_19_08.txt", "w") as out:
    out.write("\n".join(submit))