# BM25f

In [23]:
import pickle
import math
import os
from smart_parser import clean_text, lemmatize_text
from tqdm.notebook import tqdm
from collections import defaultdict
from pymystem3 import Mystem

# Root directory of the competition data. Set the TEXT_RELEVANCE_DATA_DIR
# environment variable to point the notebook at another location; the original
# local path is kept as the default. os.path.join(..., "") guarantees a trailing
# separator, because several cells build file names with `PATH + "name"`.
PATH = os.path.join(
    os.environ.get(
        "TEXT_RELEVANCE_DATA_DIR",
        "/Users/michelle/data/text-relevance-competition-ir-1-ts-fall-2020/",
    ),
    "",
)

In [118]:
# Lemmatizer instance passed to lemmatize_text by the scoring functions.
stemmer = Mystem()

# The stop-word collection is stored as a pickle; it is converted to a set
# because the scoring functions only ever test membership in it.
stop_words_path = os.path.join(PATH, "stop_words.pkl")
with open(stop_words_path, "rb") as stop_words_file:
    stop_words = set(pickle.load(stop_words_file))

Считываем и преобразуем данные запросов; для каждого запроса считаем документную частоту (DF) слов по его документам-кандидатам — из неё далее вычисляется IDF.

In [7]:
# Read the raw query lines ("<id>\t<text>") and the sample submission
# ("<query id>,<doc id>" per line, preceded by a header row).
with open(os.path.join(PATH, "queries.numerate.txt"), "r") as f:
    query_lines = f.read().split("\n")

with open(os.path.join(PATH, "sample.technosphere.ir1.textrelevance.submission.txt"), "r") as f:
    sample = f.read().split("\n")

# Map every query id (kept as a string) to the list of its candidate document
# ids, skipping the header row and blank lines.
query_docid = defaultdict(list)
for item in sample[1:]:
    if item:
        fields = item.split(",")
        query_docid[fields[0]].append(fields[1])

# Index 0 is a placeholder so that queries[int(qid)] is the text of query qid.
# NOTE(review): assumes the ids in queries.numerate.txt are 1..N in file order — confirm.
queries = [None] + [line.split("\t")[1] for line in query_lines if line]

# Names of all parsed documents present on disk, as a set for membership checks.
listdir = set(os.listdir(os.path.join(PATH, "parsed_content")))

# For each query, count in how many of its candidate documents every term of the
# document text (element 0 of the pickled document) occurs. The i-th dict
# corresponds to the i-th query in query_docid iteration order.
terms_count_dicts = []
lost_files = []
# total is the number of queries actually iterated; len(queries) would be one
# too many because of the None placeholder at index 0.
for qid, docids in tqdm(query_docid.items(), total=len(query_docid)):
    terms_count = defaultdict(int)
    for docid in docids:
        filename = f"parsed_{docid}.pkl"
        try:
            with open(os.path.join(PATH, "parsed_content", filename), "rb") as f:
                doc_terms = set(pickle.load(f)[0].split())
        except Exception:
            # Best effort: a missing or unreadable document is recorded and
            # skipped. Unlike a bare `except:`, this does not swallow
            # KeyboardInterrupt / SystemExit, so the cell can still be interrupted.
            lost_files.append(docid)
            continue
        for term in doc_terms:
            terms_count[term] += 1
    terms_count_dicts.append(terms_count)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=400.0), HTML(value='')))




In [9]:
# Persist the per-query document-frequency dictionaries so they can be reused
# without re-reading every parsed document.
subset_counts_path = os.path.join(PATH, "term_count_subsets")
with open(subset_counts_path, "wb") as out_file:
    pickle.dump(terms_count_dicts, out_file)

In [12]:
# Corpus statistics consumed by the scoring functions.
# NOTE(review): both values are hard-coded; presumably they were precomputed
# over the full parsed corpus — confirm the source and recompute if the data changes.
average_doc_len = 8682.4  # divisor for document-length normalisation in bm25f_score
corpus_len = 38115  # document count used in the IDF formulas

Следующая функция считает tf для каждого поля, складывает их с весами, а потом подставляет в BM25.

In [153]:
def bm25f_score(query, docid, terms_count, k1=1.2, k2=1000, b=0.75):
    """Score one document against one query with a field-weighted BM25.

    Per-field term frequencies are first combined into a single weighted tf
    (text 0.1, title 10, meta 10, i-th header 6 / (i + 1)), and that combined
    tf is then put through the BM25 saturation formula.

    Args:
        query: Raw query text; it is cleaned and lemmatized here.
        docid: Document id; the document is read from
            ``parsed_content/parsed_<docid>.pkl`` under ``PATH``.
        terms_count: Mapping ``term -> number of documents containing it`` used
            for IDF. Terms missing from the mapping count as 0; the mapping is
            never modified (a defaultdict does not get new keys inserted).
        k1: Term-frequency saturation parameter.
        k2: Query-term-frequency saturation parameter.
        b: Strength of document-length normalisation.

    Returns:
        The accumulated score over the distinct non-stop-word query terms, or 0
        when the parsed document file is not listed in ``listdir``.
    """
    filename = f"parsed_{docid}.pkl"
    if filename not in listdir:
        return 0

    with open(os.path.join(PATH, "parsed_content", filename), "rb") as f:
        doc = pickle.load(f)

    # Lemmatize only after the document is known to exist: no work is wasted
    # on missing documents.
    lemmed_query_list = lemmatize_text(clean_text(query), stemmer).split()

    # Pickled document layout as used here: text, title, meta are
    # whitespace-separated strings; element 3 is an iterable of header strings.
    text = doc[0].split()
    title = doc[1].split()
    meta = doc[2].split()
    headers = [header.split() for header in doc[3]]

    # Length normalisation depends only on the document, not on the term,
    # so it is computed once outside the loop.
    dl = len(text)
    K = k1 * ((1 - b) + b * dl / average_doc_len)

    score = 0
    for term in set(lemmed_query_list):
        if term in stop_words:
            continue

        # .get avoids inserting keys into a caller-owned defaultdict and also
        # works with a plain dict.
        df = terms_count.get(term, 0)
        idf = math.log((corpus_len - df + 0.5) / (df + 0.5))

        tf = 0.1 * text.count(term) + 10 * title.count(term) + 10 * meta.count(term)
        # Earlier headers weigh more: 6, 3, 2, 1.5, ...
        for position, header in enumerate(headers):
            tf += 6 / (position + 1) * header.count(term)

        qf = lemmed_query_list.count(term)

        score += idf * (k1 + 1) * tf / (K + tf) * ((k2 + 1) * qf / (k2 + qf))

    return score

In [123]:
# Re-rank the candidate documents of every query by descending bm25f_score.
# Ties on score fall back to comparing the document id strings, because the
# sort operates on (score, docid) tuples.
new_query_docid = {}
for qid, candidate_ids in tqdm(query_docid.items(), total=len(query_docid)):
    query_number = int(qid)
    query_text = queries[query_number]
    scored = []
    for candidate in candidate_ids:
        # terms_count_dicts is 0-based while query ids start at 1.
        relevance = bm25f_score(query_text, candidate, terms_count_dicts[query_number - 1])
        scored.append((relevance, candidate))
    ranked = sorted(scored, reverse=True)
    new_query_docid[qid] = [pair[1] for pair in ranked]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=399.0), HTML(value='')))




In [124]:
# Build the submission: a header row followed by "<query id>,<doc id>" lines,
# queries in ascending numeric order, documents in ranked order.
submit = ['QueryId,DocumentId']
for query_number in range(1, len(queries)):
    ranked_ids = new_query_docid[str(query_number)]
    submit.extend(f"{query_number},{ranked_id}" for ranked_id in ranked_ids)

# Lines are joined with "\n"; no trailing newline is written.
submission_path = PATH + "submit_20_2.txt"
with open(submission_path, "w") as out_file:
    out_file.write("\n".join(submit))

# Сначала BM25, потом сложить

Судя по лидерборду, предыдущий подход не дал серьёзного прироста качества, поэтому я попробовал сначала пересчитать tf через BM25 для каждого поля, а потом сложить результаты с весами (получилось лучше).

In [174]:
def bm25f_score_base(query, docid, terms_count, k1=1, k2=0.001):
    """Score a document by saturating tf per field, then summing with weights.

    For every distinct non-stop-word query term, the raw term frequency of each
    field (text, title, meta, all headers together) is transformed to
    ``tf / (tf + k1 + k2 * dl)`` and the transformed values are combined with
    fixed field weights (text 1, headers 1.5, meta 4, title 5) and multiplied
    by the term's IDF.

    Args:
        query: Raw query text; it is cleaned and lemmatized here.
        docid: Document id; the document is read from
            ``parsed_content/parsed_<docid>.pkl`` under ``PATH``.
        terms_count: Mapping ``term -> number of documents containing it`` used
            for IDF. Terms missing from the mapping count as 0; the mapping is
            never modified.
        k1: Constant added to the denominator of the tf saturation.
        k2: Multiplier of the document length in the same denominator.

    Returns:
        The accumulated score, or 0 when the parsed document file is not
        listed in ``listdir``.
    """
    text_weight = 1
    headers_weight = 1.5
    meta_weight = 4
    title_weight = 5

    filename = f"parsed_{docid}.pkl"
    if filename not in listdir:
        return 0

    with open(os.path.join(PATH, "parsed_content", filename), "rb") as f:
        doc = pickle.load(f)

    # Lemmatize only after the document is known to exist.
    lemmed_query_list = lemmatize_text(clean_text(query), stemmer).split()

    # Pickled document layout as used here: text, title, meta are
    # whitespace-separated strings; element 3 is an iterable of header strings.
    text = doc[0].split()
    title = doc[1].split()
    meta = doc[2].split()
    headers = [header.split() for header in doc[3]]

    # The length of the main text is used as the length penalty for every
    # field; it does not depend on the term, so compute it once.
    dl = len(text)
    length_penalty = k1 + k2 * dl

    score = 0
    for term in set(lemmed_query_list):
        if term in stop_words:
            continue

        # .get avoids inserting keys into a caller-owned defaultdict and also
        # works with a plain dict.
        idf = -math.log((terms_count.get(term, 0) + 1) / corpus_len)

        text_tf = text.count(term)
        title_tf = title.count(term)
        meta_tf = meta.count(term)
        headers_tf = sum(header.count(term) for header in headers)

        text_tf = text_tf / (text_tf + length_penalty)
        title_tf = title_tf / (title_tf + length_penalty)
        meta_tf = meta_tf / (meta_tf + length_penalty)
        headers_tf = headers_tf / (headers_tf + length_penalty)

        # Fix: the original referenced an undefined `header_weight` (NameError
        # on a fresh kernel) and never applied `text_weight`.
        score += idf * (
            text_weight * text_tf
            + title_weight * title_tf
            + meta_weight * meta_tf
            + headers_weight * headers_tf
        )

    return score

In [178]:
# Re-rank the candidate documents of every query by descending
# bm25f_score_base. The sort is over (score, docid) tuples, so equal scores
# are ordered by the document id string.
new_query_docid = {}
for qid, candidate_ids in tqdm(query_docid.items(), total=len(query_docid)):
    query_number = int(qid)
    query_text = queries[query_number]
    scored = []
    for candidate in candidate_ids:
        # terms_count_dicts is 0-based while query ids start at 1.
        relevance = bm25f_score_base(query_text, candidate, terms_count_dicts[query_number - 1])
        scored.append((relevance, candidate))
    ranked = sorted(scored, reverse=True)
    new_query_docid[qid] = [pair[1] for pair in ranked]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=399.0), HTML(value='')))




In [180]:
# Build the submission: a header row followed by "<query id>,<doc id>" lines,
# queries in ascending numeric order, documents in ranked order.
submit = ['QueryId,DocumentId']
for query_number in range(1, len(queries)):
    ranked_ids = new_query_docid[str(query_number)]
    submit.extend(f"{query_number},{ranked_id}" for ranked_id in ranked_ids)

# Lines are joined with "\n"; no trailing newline is written.
submission_path = PATH + "submit_20_3.txt"
with open(submission_path, "w") as out_file:
    out_file.write("\n".join(submit))