В этом ноутбуке реализованы дополнения к BM25f: добавлены биграммы слов из запроса, проверка орфографии и расширения запросов.

In [1]:
import pickle
import math
import os
from smart_parser import clean_text, lemmatize_text
from tqdm.notebook import tqdm
from collections import defaultdict
from pymystem3 import Mystem
from joblib import Parallel, delayed
from ndcg_standart import mean_ndcg
import requests

# Root directory of the competition data; every input and output file in this
# notebook is resolved against it. The trailing slash matters because some
# cells build paths with plain string concatenation (PATH + "name").
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
PATH = "/Users/michelle/data/text-relevance-competition-ir-1-ts-fall-2020/"


In [2]:
# Single shared Mystem instance; it is passed to lemmatize_text whenever a
# query is normalised in the ranking cells below.
stemmer = Mystem()

In [3]:
def get_parsed_doc(docid):
    """Load and return the pickled parsed document for ``docid``.

    The document is read from ``parsed_content/parsed_<docid>.pkl`` under
    ``PATH``.
    """
    doc_path = os.path.join(PATH, "parsed_content", f"parsed_{docid}.pkl")
    with open(doc_path, "rb") as handle:
        parsed = pickle.load(handle)
    return parsed

In [4]:
# Names of all entries in the parsed_content directory; used both to iterate
# over the corpus and to check whether a candidate document was parsed.
listdir = os.listdir(os.path.join(PATH, "parsed_content"))

In [5]:
# Load the query list and the sample submission; the latter defines which
# candidate documents have to be re-ranked for every query.
# The encoding is passed explicitly so that the Cyrillic queries are decoded
# the same way on every platform instead of depending on the locale default.
# NOTE(review): assumes both files are UTF-8 — confirm.
with open(PATH + "queries.numerate.txt", "r", encoding="utf-8") as f:
    queries = f.read().split("\n")

with open(PATH + "sample.technosphere.ir1.textrelevance.submission.txt", "r", encoding="utf-8") as f:
    sample = f.read().split("\n")

# query id (str) -> candidate document ids (str), in file order.
query_docid = defaultdict(list)
for item in sample[1:]:  # the first line is skipped (presumably the CSV header)
    if item:
        # Split every row once; only the first two comma-separated fields are used.
        fields = item.split(",")
        qid, docid = fields[0], fields[1]
        query_docid[qid].append(docid)

# Keep only the second tab-separated field of every non-empty line and prepend
# None so that queries[i] is the i-th query (1-based).
# NOTE(review): this relies on the ids in the file being 1..N in order — confirm.
queries = [None] + [item.split("\t")[1] for item in queries if item]

In [6]:
# Hard-coded corpus statistics.
# NOTE(review): average_doc_len is not referenced in the visible cells; its
# unit (presumably tokens per document) is an assumption — confirm.
average_doc_len = 8682.4
# Number of documents; used as the denominator when the IDF table is computed.
corpus_len = 38115

Здесь и далее считаем общий IDF по всей базе.

In [7]:
# Document frequency: for every word, the number of documents whose text
# field (doc[0]) contains it at least once.
words_count = defaultdict(int)
for filename in tqdm(listdir, total=len(listdir)):
    if not filename.endswith(".pkl"):
        continue
    doc_path = os.path.join(PATH, "parsed_content", filename)
    with open(doc_path, "rb") as f:
        doc = pickle.load(f)
    # A set, so that a word is counted once per document.
    distinct_words = set(doc[0].split())
    for word in distinct_words:
        words_count[word] += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38115.0), HTML(value='')))




In [8]:
# IDF of every word seen in the corpus: -log(document frequency / corpus size).
words_idf = {
    word: -math.log(count / corpus_len)
    for word, count in words_count.items()
}

Как в последнем примере из ноутбука BM25f, сначала считаем tf для разных полей, а потом их складываем с весами.

In [37]:
def tf_score(tf, field_len, k=0.001):
    """Return the saturated, length-normalised term frequency.

    Computes ``tf / (tf + 1 + k * field_len)``. The result is 0 when ``tf``
    is 0, stays below 1, and decreases as the field gets longer.

    Args:
        tf: number of occurrences of the term in the field.
        field_len: length of the field in tokens.
        k: length-normalisation coefficient; the default keeps the value that
            was previously hard-coded.

    Returns:
        The normalised term frequency as a float.
    """
    return tf / (tf + 1 + k * field_len)

def word_score(word, doc, idf):
    """Return the weighted multi-field score of a single query word.

    A normalised term frequency (``tf_score``) is computed separately for the
    text (``doc[0]``), title (``doc[1]``), meta (``doc[2]``) and every header
    string (``doc[3][i]``); the per-field values are combined with fixed
    weights and the sum is multiplied by ``idf``.

    Args:
        word: query word (expected in the same normalised form as the
            document fields).
        doc: parsed document; indices 0-2 are space-separated strings and
            index 3 is a sequence of header strings.
        idf: inverse document frequency of ``word``.

    Returns:
        The score contribution of ``word`` for this document.
    """
    # Original author's note (translated): a condition is needed when a source
    # weight is added.
    text_weight = 1
    headers_weight = 1.5
    meta_weight = 4
    title_weight = 5
    headers_coef = 1.2

    # Tokenise every field once and reuse the tokens for both count and length.
    text_tokens = doc[0].split(' ')
    title_tokens = doc[1].split(' ')
    meta_tokens = doc[2].split(' ')
    header_tokens = [header.split(' ') for header in doc[3]]

    text_score = tf_score(text_tokens.count(word), len(text_tokens)) * text_weight
    title_score = tf_score(title_tokens.count(word), len(title_tokens)) * title_weight
    # NOTE(review): the meta length is measured with a whitespace split() while
    # its term count uses split(' '); kept as is so scores do not change —
    # confirm whether the difference is intended.
    meta_score = tf_score(meta_tokens.count(word), len(doc[2].split())) * meta_weight
    # The i-th header is damped by headers_coef ** i, so earlier headers weigh more.
    headers_score = sum(
        tf_score(tokens.count(word), len(tokens))
        * headers_weight ** 6 / headers_coef ** i
        for i, tokens in enumerate(header_tokens)
    )

    # The debug print that used to run here for every (word, document) pair
    # was removed: it flooded the output during ranking.
    return idf * (text_score + title_score + headers_score + meta_score)

In [16]:
# Persist the IDF table to the "words_idf" file under PATH.
idf_path = os.path.join(PATH, "words_idf")
with open(idf_path, "wb") as out:
    pickle.dump(words_idf, out)

Аналогично учитываем пары слов, отдельно рассматривая то, в каком порядке слова встретились в документе.

In [22]:
def tf_pair(word1, word2, text):
    """Return the weighted co-occurrence count of a word pair in ``text``.

    For every position holding ``word1`` four placements of ``word2`` are
    checked, each with its own weight:

    * directly after  (``word1 word2``)   -> 1
    * directly before (``word2 word1``)   -> 0.75
    * two tokens after  (``word1 _ word2``) -> 0.5
    * two tokens before (``word2 _ word1``) -> 0.25

    Args:
        word1: first word of the pair.
        word2: second word of the pair.
        text: list of tokens.

    Returns:
        The weighted sum of the four counts.
    """
    bigram_weight = 1
    inverse_weight = 0.75
    skip_weight = 0.5
    skip_inverse_weight = 0.25

    last = len(text) - 1
    forward = backward = forward_skip = backward_skip = 0
    for pos, token in enumerate(text):
        # Only positions that hold word1 can contribute to any of the counts.
        if token != word1:
            continue
        if pos < last and text[pos + 1] == word2:
            forward += 1
        if pos >= 1 and text[pos - 1] == word2:
            backward += 1
        if pos < last - 1 and text[pos + 2] == word2:
            forward_skip += 1
        if pos >= 2 and text[pos - 2] == word2:
            backward_skip += 1

    return forward * bigram_weight + backward * inverse_weight +\
            forward_skip * skip_weight + backward_skip * skip_inverse_weight

def pair_score(word1, word2, doc, idf1, idf2):
    """Return the weighted multi-field score of a query word pair.

    The pair frequency (``tf_pair``) is computed for the text (``doc[0]``),
    title (``doc[1]``), meta (``doc[2]``) and every header (``doc[3][i]``),
    saturated as ``tf / (1 + tf)``, combined with fixed field weights and
    multiplied by the sum of both IDFs.

    Args:
        word1: first word of the pair.
        word2: second word of the pair.
        doc: parsed document; indices 0-2 are space-separated strings and
            index 3 is a sequence of header strings.
        idf1: inverse document frequency of ``word1``.
        idf2: inverse document frequency of ``word2``.

    Returns:
        The score contribution of the pair for this document.
    """
    # TODO (original author's note, translated): check the weights for special tags.
    text_weight = 1
    headers_weight = 1.5
    meta_weight = 4
    title_weight = 5
    headers_coef = 1.2

    def field_tf(field):
        # Pair frequency inside one space-separated field.
        return tf_pair(word1, word2, field.split(" "))

    def saturated(weight, pair_tf):
        # Weighted saturation: approaches `weight` as pair_tf grows.
        return weight * pair_tf / (1 + pair_tf)

    text_score = saturated(text_weight, field_tf(doc[0]))
    title_score = saturated(title_weight, field_tf(doc[1]))
    meta_score = saturated(meta_weight, field_tf(doc[2]))
    # The i-th header is damped by headers_coef ** i.
    headers_score = sum(
        saturated(headers_weight ** 6 / headers_coef ** rank, field_tf(header))
        for rank, header in enumerate(doc[3])
    )

    return (idf1 + idf2) * (text_score + title_score + headers_score + meta_score)
    

In [163]:
# Load the docid -> URL mapping; every non-empty line is "<docid>\t<url>".
# NOTE(review): assumes the file is UTF-8 — confirm.
with open(os.path.join(PATH, "urls.numerate.txt"), "r", encoding="utf-8") as f:
    urls = f.read().split("\n")

# Skip empty lines explicitly instead of dropping the last element: the last
# record is kept when the file has no trailing newline, and stray blank lines
# no longer raise IndexError.
urls_dict = {}
for line in urls:
    if not line:
        continue
    fields = line.split("\t")
    urls_dict[fields[0]] = fields[1]

In [168]:
# Append every document's URL as a fifth element and rewrite its pickle in place.
for filename in tqdm(listdir, total=len(listdir)):
    if not filename.endswith(".pkl"):
        continue
    # "parsed_<docid>.pkl" -> "<docid>"
    stem = filename.split(".")[0]
    docid = stem.split("_")[1]
    if docid == "00":
        # Document id "00" is skipped (the reason is not visible in this notebook).
        continue
    doc_path = os.path.join(PATH, "parsed_content", filename)
    with open(doc_path, "rb") as f:
        doc = pickle.load(f)
    text, title, meta, headers = doc[0], doc[1], doc[2], doc[3]
    new_doc = (text, title, meta, headers, urls_dict[docid])
    with open(doc_path, "wb") as f:
        pickle.dump(new_doc, f)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38115.0), HTML(value='')))




In [256]:
def total_score(query, doc, words_idf):
    """Return the relevance score of ``doc`` for a normalised ``query``.

    The score is the sum of the single-word scores, the scores of adjacent
    query word pairs and the popularity of the document URL (``doc[4]``).
    Words missing from ``words_idf`` are ignored.

    Args:
        query: space-separated query words.
        doc: parsed document; ``doc[4]`` is passed to ``get_popularity``.
        words_idf: mapping word -> IDF.

    Returns:
        The total score.
    """
    query_words = query.split(" ")
    score = 0

    # single words
    for word in query_words:
        if word not in words_idf:
            continue
        score += word_score(word, doc, words_idf[word])

    # bigrams of neighbouring query words
    for word1, word2 in zip(query_words, query_words[1:]):
        if word1 not in words_idf or word2 not in words_idf:
            continue
        score += pair_score(word1, word2, doc,
                            words_idf[word1], words_idf[word2])

    # NOTE(review): get_popularity is not defined in the visible cells.
    score += get_popularity(doc[4])
    return score

In [257]:
# Re-rank the candidate documents of every query with total_score.
# A set gives O(1) membership tests instead of scanning the list per document.
available_files = set(listdir)
new_query_docid = {}
for qid, docids in tqdm(query_docid.items(), total=len(query_docid)):
    # The normalised query does not depend on the document, so it is computed
    # once per query instead of once per candidate.
    # [:-1] drops the last character of the lemmatised string (presumably a
    # trailing newline — confirm against lemmatize_text).
    query = lemmatize_text(clean_text(queries[int(qid)]), stemmer)[:-1]
    result = []
    for docid in docids:
        filename = f"parsed_{docid}.pkl"
        if filename in available_files:
            with open(os.path.join(PATH, "parsed_content", filename), "rb") as f:
                doc = pickle.load(f)
            score = total_score(query, doc, words_idf)
        else:
            # Missing documents are reported and ranked with a zero score.
            score = 0
            print("ERROR", filename)
        result.append((score, docid))
    # Highest score first; ties are ordered by the docid string, descending.
    result.sort(reverse=True)
    new_query_docid[qid] = [ranked_docid for _, ranked_docid in result]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=399.0), HTML(value='')))

ERROR parsed_31733.pkl



In [258]:
# Write the ranking as "QueryId,DocumentId" rows, one per ranked document.
rows = [
    f"{qid},{docid}"
    for qid in range(1, len(queries))
    for docid in new_query_docid[str(qid)]
]
submit = ['QueryId,DocumentId', *rows]

with open(PATH + "submit_22_3.txt", "w") as out:
    out.write("\n".join(submit))

Прогоним запросы через спеллчекер для улучшения ранжирования.

In [300]:
def spellcheck(text, timeout=10):
    """Return ``text`` with the corrections suggested by Yandex.Speller applied.

    Args:
        text: query text; empty or ``None`` input is returned unchanged
            without contacting the service.
        timeout: seconds to wait for the service before giving up.

    Returns:
        The corrected text. For every reported error the first suggestion is
        used; errors without suggestions are left as they are.

    Raises:
        requests.RequestException: on network failure, timeout or a non-2xx
            response.
    """
    # Nothing to check; this also avoids a pointless request for None.
    if not text:
        return text

    response = requests.get(
        "https://speller.yandex.net/services/spellservice.json/checkText",
        params={"text": text},
        timeout=timeout,
    )
    # Fail loudly on an HTTP error instead of parsing an error body as corrections.
    response.raise_for_status()
    corrections = response.json()

    fixed_text = text
    # Apply the corrections right to left so that the offsets of the earlier
    # ones stay valid after a replacement changes the length of the text.
    for item in sorted(corrections, key=lambda c: c.get("pos", -1), reverse=True):
        suggestions = item.get("s")
        if not suggestions:
            # The service reported an error but offered no replacement.
            continue
        word, best = item["word"], suggestions[0]
        start = item.get("pos", -1)
        end = start + len(word)
        if start >= 0 and fixed_text[start:end] == word:
            # Replace exactly the reported occurrence, so that the same letters
            # inside another word are not touched.
            fixed_text = fixed_text[:start] + best + fixed_text[end:]
        else:
            # Offset missing or not matching the text: fall back to the
            # previous whole-string replacement.
            fixed_text = fixed_text.replace(word, best)
    return fixed_text

In [302]:
# Spell-correct every query, one sequential spellcheck call (HTTP request) per
# entry. Index 0 of `queries` is the None placeholder, so fix_queries keeps
# the same 1-based indexing as `queries`.
fix_queries = [spellcheck(query) for query in tqdm(queries, total=len(queries))]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=400.0), HTML(value='')))




In [315]:
# Re-rank the candidate documents of every query, using the spell-checked queries.
# A set gives O(1) membership tests instead of scanning the list per document.
available_files = set(listdir)
new_query_docid = {}
for qid, docids in tqdm(query_docid.items(), total=len(query_docid)):
    # The normalised query does not depend on the document, so it is computed
    # once per query instead of once per candidate.
    # [:-1] drops the last character of the lemmatised string (presumably a
    # trailing newline — confirm against lemmatize_text).
    query = lemmatize_text(clean_text(fix_queries[int(qid)]), stemmer)[:-1]
    result = []
    for docid in docids:
        filename = f"parsed_{docid}.pkl"
        if filename in available_files:
            with open(os.path.join(PATH, "parsed_content", filename), "rb") as f:
                doc = pickle.load(f)
            score = total_score(query, doc, words_idf)
        else:
            # Missing documents are reported and ranked with a zero score.
            score = 0
            print("ERROR", filename)
        result.append((score, docid))
    # Highest score first; ties are ordered by the docid string, descending.
    result.sort(reverse=True)
    new_query_docid[qid] = [ranked_docid for _, ranked_docid in result]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=399.0), HTML(value='')))

ERROR parsed_31733.pkl



In [316]:
# Write the ranking as "QueryId,DocumentId" rows, one per ranked document.
rows = [
    f"{qid},{docid}"
    for qid in range(1, len(queries))
    for docid in new_query_docid[str(qid)]
]
submit = ['QueryId,DocumentId', *rows]

with open(PATH + "submit_22_4.txt", "w") as out:
    out.write("\n".join(submit))

Перепишем функцию скора для учета расширений запросов (скор по расширенному запросу будет добавляться с некоторым весом).

In [354]:
def total_score(query_all, doc, words_idf):
    """Return the relevance score of ``doc`` for a query and its expansion.

    ``query_all`` is ``"<query>\\t<expanded query>"``, both parts being
    space-separated normalised words. The original words and their adjacent
    pairs are scored with full weight. Expansion words and pairs are scored
    with weight 0.5, and only when they differ from the original word(s) at
    the same position, so that unchanged words are not counted twice.
    Finally the popularity of the document URL (``doc[4]``) is added.

    Args:
        query_all: original and expanded query separated by a tab. Input
            without a tab is treated as a query with no expansion.
        doc: parsed document; ``doc[4]`` is passed to ``get_popularity``.
        words_idf: mapping word -> IDF; words missing from it are ignored.

    Returns:
        The total score.
    """
    ext_words_weight = 0.5

    parts = query_all.split("\t")
    query_words = parts[0].split(" ")
    # A missing expansion is treated as empty instead of raising IndexError.
    query_ext_words = parts[1].split(" ") if len(parts) > 1 else []

    def original_at(index):
        # Original word aligned with `index`, or None when the expansion is
        # longer than the original query. This replaces the former bare
        # `except: pass`, which hid every error and silently aborted the rest
        # of the expansion scoring on the first out-of-range position.
        return query_words[index] if index < len(query_words) else None

    score = 0

    # single words
    for word in query_words:
        if word in words_idf:
            score += word_score(word, doc, words_idf[word])

    # bigrams
    for word1, word2 in zip(query_words, query_words[1:]):
        if word1 in words_idf and word2 in words_idf:
            score += pair_score(word1, word2, doc,
                                words_idf[word1], words_idf[word2])

    # expansion words that differ from the original word at the same position
    for index, word in enumerate(query_ext_words):
        if word in words_idf and word != original_at(index):
            score += ext_words_weight * word_score(word, doc, words_idf[word])

    # expansion bigrams in which at least one word differs from the original
    for index, (word1, word2) in enumerate(zip(query_ext_words, query_ext_words[1:])):
        if word1 not in words_idf or word2 not in words_idf:
            continue
        if word1 != original_at(index) or word2 != original_at(index + 1):
            score += ext_words_weight * pair_score(word1, word2, doc,
                                                   words_idf[word1], words_idf[word2])

    # NOTE(review): get_popularity is not defined in the visible cells.
    score += get_popularity(doc[4])
    return score

In [321]:
# Rebind `queries` to the list stored in queries_all.pkl; the cells below
# treat every item as "<query>\t<expanded query>".
queries_all_path = PATH + "queries_all.pkl"
with open(queries_all_path, "rb") as f:
    queries = pickle.load(f)

In [323]:
# Sanity check: print every entry whose original and expanded query differ in
# the number of space-separated words.
for item in queries:
    fields = item.split("\t")
    base_len = len(fields[0].split(" "))
    ext_len = len(fields[1].split(" "))
    if base_len != ext_len:
        print(item)

In [355]:
# Re-rank the candidate documents of every query, using the original query
# together with its expansion.
# A set gives O(1) membership tests instead of scanning the list per document.
available_files = set(listdir)
new_query_docid = {}
for qid, docids in tqdm(query_docid.items(), total=len(query_docid)):
    # Both parts are normalised once per query: the result does not depend on
    # the document, so repeating it for every candidate was wasted work.
    # [:-1] drops the last character of the lemmatised string (presumably a
    # trailing newline — confirm against lemmatize_text).
    fields = queries[int(qid)].split("\t")
    base_query = lemmatize_text(clean_text(fields[0]), stemmer)[:-1]
    ext_query = lemmatize_text(clean_text(fields[1]), stemmer)[:-1]
    full_query = base_query + "\t" + ext_query
    result = []
    for docid in docids:
        filename = f"parsed_{docid}.pkl"
        if filename in available_files:
            with open(os.path.join(PATH, "parsed_content", filename), "rb") as f:
                doc = pickle.load(f)
            score = total_score(full_query, doc, words_idf)
        else:
            # Missing documents are reported and ranked with a zero score.
            score = 0
            print("ERROR", filename)
        result.append((score, docid))
    # Highest score first; ties are ordered by the docid string, descending.
    result.sort(reverse=True)
    new_query_docid[qid] = [ranked_docid for _, ranked_docid in result]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=399.0), HTML(value='')))

ERROR parsed_31733.pkl



In [356]:
# Write the ranking as "QueryId,DocumentId" rows, one per ranked document.
rows = [
    f"{qid},{docid}"
    for qid in range(1, len(queries))
    for docid in new_query_docid[str(qid)]
]
submit = ['QueryId,DocumentId', *rows]

with open(PATH + "submit_23_1.txt", "w") as out:
    out.write("\n".join(submit))

$$score = \sum{WordScore(orig)} + \sum{PairScore(orig)} + 0.5 \cdot \left(\sum{WordScore(ext)} + \sum{PairScore(ext)}\right) + Popularity(url)$$

$$WordScore(w) = idf_w \cdot \sum_{f\in{Fields}}{
weight_f \cdot \frac{tf_f(w)}{1 + tf_f(w) + k \cdot len_f}
}$$

$$PairScore(w_1, w_2) = (idf_{w_1} + idf_{w_2}) \cdot \sum_{f\in{Fields}}{
weight_f \cdot \frac{tf_f(w_1, w_2)}{1 + tf_f(w_1, w_2)}
}$$