Данный ноутбук использует проиндексированные документы, что позволило значительно ускорить ранжирование.

In [2]:
import pickle
import math
import os
from smart_parser import clean_text, lemmatize_text
from tqdm.notebook import tqdm
from collections import defaultdict
from pymystem3 import Mystem
from joblib import Parallel, delayed
from ndcg_standart import mean_ndcg
import requests

PATH = "/Users/michelle/data/text-relevance-competition-ir-1-ts-fall-2020/"

In [24]:
# Shared Mystem instance; it is passed to lemmatize_text when queries are
# preprocessed in the ranking cells.
stemmer = Mystem()

In [3]:
# Read the sample submission: it lists which documents must be ranked for
# each query, one "query_id,doc_id" pair per line.
with open(PATH + "sample.technosphere.ir1.textrelevance.submission.txt", "r") as f:
    sample = f.read().split("\n")

# Group candidate document ids by query id. The first line is skipped
# (presumably the CSV header), as are empty lines.
query_docid = defaultdict(list)
for item in sample[1:]:
    if not item:
        continue
    fields = item.split(",")
    qid = fields[0]
    docid = fields[1]
    query_docid[qid].append(docid)

# Hard-coded corpus statistics (presumably precomputed over the collection).
average_doc_len = 8682.4
corpus_len = 38115

# Names of the per-document index files stored under PATH/index.
listdir = os.listdir(os.path.join(PATH, "index"))

# Queries, looked up later by integer query id.
with open(PATH + "queries_all.pkl", "rb") as f:
    queries = pickle.load(f)

In [110]:
# Document frequency: for every word, the number of indexed documents whose
# text field (doc[0][0]) contains it.
words_count = defaultdict(int)
for filename in tqdm(listdir, total=len(listdir)):
    index_path = os.path.join(PATH, "index", filename)
    with open(index_path, "rb") as f:
        doc = pickle.load(f)
    # Each word is a key of the text index, so it is counted once per document.
    for word in doc[0][0]:
        words_count[word] += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38113.0), HTML(value='')))




In [111]:
# Inverse document frequency: idf(word) = -log(df(word) / corpus_len).
words_idf = {
    word: -math.log(count / corpus_len)
    for word, count in words_count.items()
}

In [112]:
# Persist the idf table with pickle so it can be reloaded without recomputing.
words_idf_path = os.path.join(PATH, "words_idf")
with open(words_idf_path, "wb") as idf_file:
    pickle.dump(words_idf, idf_file)

In [18]:
# Reload the pickled idf table stored under PATH/words_idf.
words_idf_path = os.path.join(PATH, "words_idf")
with open(words_idf_path, "rb") as idf_file:
    words_idf = pickle.load(idf_file)

In [104]:
def tf_score(tf, field_len):
    """Return a saturating term-frequency score.

    The score is ``tf / (tf + 1 + 0.001 * field_len)``: it grows with the
    number of occurrences ``tf`` and is damped for longer fields.
    """
    denominator = tf + 1 + 0.001 * field_len
    return tf / denominator

def word_score(word, doc, idf):  # original note: condition when adding the source weight
    """Return the idf-weighted score of a single word over all document fields.

    ``doc`` is indexed as: ``doc[0]`` text, ``doc[1]`` title, ``doc[2]`` meta,
    ``doc[3]`` a sequence of header fields. Every field is a pair whose first
    item maps a word to its occurrences (only their count is used here) and
    whose second item is the field length handed to ``tf_score``.

    Each field's ``tf_score`` is multiplied by a field weight; the weight of
    the i-th header field is ``headers_weight ** 6 / headers_coef ** i``.
    """
    text_weight = 1
    headers_weight = 1.5
    meta_weight = 4
    title_weight = 5
    headers_coef = 1.2

    text_score = tf_score(len(doc[0][0][word]), doc[0][1]) * text_weight
    title_score = tf_score(len(doc[1][0][word]), doc[1][1]) * title_weight
    meta_score = tf_score(len(doc[2][0][word]), doc[2][1]) * meta_weight

    # Headers: the boost is constant, the divisor grows with the header's
    # position in doc[3].
    header_boost = headers_weight ** 6
    headers_score = 0
    for position, header in enumerate(doc[3]):
        header_tf = len(header[0][word])
        headers_score += (tf_score(header_tf, header[1])
                          * header_boost / headers_coef ** position)

    return idf * (text_score + title_score + headers_score + meta_score)

In [97]:
def tf_pair(word1, word2, text_index):
    """Return the weighted count of close co-occurrences of two words.

    Args:
        word1: first word of the query bigram.
        word2: second word of the query bigram.
        text_index: mapping from a word to the integer positions at which it
            occurs in one field.

    Returns:
        The sum over all positions ``p`` of ``word1`` of:
        1    if ``word2`` occurs at ``p + 1`` (exact bigram),
        0.75 if ``word2`` occurs at ``p - 1`` (reversed bigram),
        0.5  if ``word2`` occurs at ``p + 2`` (bigram with one word skipped),
        0.25 if ``word2`` occurs at ``p - 2`` (reversed, one word skipped).
    """
    bigram_weight = 1
    inverse_weight = 0.75
    skip_weight = 0.5
    skip_inverse_weight = 0.25

    first_positions = text_index[word1]
    if len(first_positions) == 0:
        # Nothing to pair with; word2 is deliberately not looked up.
        return 0.0

    # Build the lookup set once instead of scanning a list per position.
    second_positions = set(text_index[word2])

    bigram_tf = 0
    inverse_tf = 0
    skip_tf = 0
    skip_inverse_tf = 0

    for index in first_positions:
        if index + 1 in second_positions:
            bigram_tf += 1
        if index - 1 in second_positions:
            inverse_tf += 1
        if index + 2 in second_positions:
            skip_tf += 1
        if index - 2 in second_positions:
            skip_inverse_tf += 1

    return (bigram_tf * bigram_weight + inverse_tf * inverse_weight
            + skip_tf * skip_weight + skip_inverse_tf * skip_inverse_weight)


def pair_score(word1, word2, doc, idf1, idf2):  # original note: check weights for special tags
    """Return the idf-weighted proximity score of a word pair over all fields.

    For every field (``doc[0]`` text, ``doc[1]`` title, ``doc[2]`` meta and
    each header in ``doc[3]``) the pair frequency from ``tf_pair`` is squashed
    as ``weight * tf / (1 + tf)``. The i-th header uses the weight
    ``headers_weight ** 6 / headers_coef ** i``. The sum over fields is
    multiplied by ``idf1 + idf2``.
    """
    text_weight = 1
    headers_weight = 1.5
    meta_weight = 4
    title_weight = 5
    headers_coef = 1.2

    def weighted(weight, pair_tf):
        # Saturating transform of a pair frequency, scaled by the field weight.
        return weight * pair_tf / (1 + pair_tf)

    text_tf = tf_pair(word1, word2, doc[0][0])
    headers_tf = [tf_pair(word1, word2, header[0]) for header in doc[3]]
    title_tf = tf_pair(word1, word2, doc[1][0])
    meta_tf = tf_pair(word1, word2, doc[2][0])

    text_score = weighted(text_weight, text_tf)
    title_score = weighted(title_weight, title_tf)
    headers_score = 0
    for position, header_tf in enumerate(headers_tf):
        header_weight = headers_weight ** 6 / headers_coef ** position
        headers_score += weighted(header_weight, header_tf)
    meta_score = weighted(meta_weight, meta_tf)

    idf_sum = idf1 + idf2
    return idf_sum * (text_score + title_score + headers_score + meta_score)
    

In [126]:
def total_score(query_all, doc, words_idf):
    """Score one document against a query and its extended variant.

    Args:
        query_all: the base query and the extended query joined by a tab
            character, each a space-separated string of words. A missing
            extended part is treated as empty.
        doc: the indexed document passed through to ``word_score`` and
            ``pair_score``.
        words_idf: mapping from word to idf; words absent from it are ignored.

    Returns:
        The sum of
        * ``word_score`` for every base-query word,
        * ``pair_score`` for every pair of adjacent base-query words,
        * ``0.25 * word_score`` for every extended-query word that differs
          from the base-query word at the same position.

    Extended-query words at positions beyond the end of the base query never
    contribute; the positional comparison is done with ``zip`` so that no
    exception has to be swallowed to get that behaviour.
    """
    fields = query_all.split("\t")
    query = fields[0]
    query_ext = fields[1] if len(fields) > 1 else ""
    ext_words_weight = 0.25
    score = 0
    query_words = query.split(" ")
    query_ext_words = query_ext.split(" ")

    # single words
    for word in query_words:
        if word in words_idf:
            score += word_score(word, doc, words_idf[word])

    # bigrams of adjacent query words
    for word1, word2 in zip(query_words, query_words[1:]):
        if word1 in words_idf and word2 in words_idf:
            score += pair_score(word1, word2, doc,
                                words_idf[word1], words_idf[word2])

    # extended-query words that replace the base word at the same position
    for word, base_word in zip(query_ext_words, query_words):
        if word in words_idf and word != base_word:
            score += ext_words_weight * word_score(word, doc, words_idf[word])

    return score

In [128]:
# Rank every query's candidate documents by total_score, best first.
index_files = set(listdir)  # set membership instead of scanning the list per document
new_query_docid = {}
for item in tqdm(query_docid.items(), total=len(query_docid)):
    query = queries[int(item[0])]
    docids = item[1]

    # Query preprocessing does not depend on the document, so it is done once
    # per query rather than once per candidate document.
    query_fields = query.split("\t")
    # [:-1] drops the last character of the lemmatized string
    # (presumably a trailing newline - TODO confirm against lemmatize_text).
    base_query = lemmatize_text(clean_text(query_fields[0]), stemmer)[:-1]
    ext_query = lemmatize_text(clean_text(query_fields[1]), stemmer)[:-1]
    prepared_query = base_query + "\t" + ext_query

    result = []
    for docid in docids:
        filename = f"parsed_{docid}.pkl"
        if filename in index_files:
            with open(os.path.join(PATH, "index", filename), "rb") as f:
                doc = pickle.load(f)
            score = total_score(prepared_query, doc, words_idf)
        else:
            # No index file for this document: report it and rank it with score 0.
            score = 0
            print("ERROR", filename)
        result.append((score, docid))
    # Descending by score; equal scores are ordered by docid, also descending.
    result.sort(reverse=True)
    new_query_docid[item[0]] = [doc_item[1] for doc_item in result]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=399.0), HTML(value='')))

ERROR parsed_31733.pkl



In [129]:
# Build the submission: a header row, then one "query_id,doc_id" line per
# ranked document, for query ids 1 .. len(queries) - 1.
submit = ['QueryId,DocumentId']
for i in range(1, len(queries)):
    submit.extend(f"{i},{docid}" for docid in new_query_docid[str(i)])

with open(PATH + "submit_24_4.txt", "w") as f:
    f.write("\n".join(submit))

In [131]:
def total_score(query_all, doc, words_idf):
    """Score one document against a query and its extended variant.

    Args:
        query_all: the base query and the extended query joined by a tab
            character, each a space-separated string of words. A missing
            extended part is treated as empty.
        doc: the indexed document passed through to ``word_score`` and
            ``pair_score``.
        words_idf: mapping from word to idf; words absent from it are ignored.

    Returns:
        The sum of
        * ``word_score`` for every base-query word,
        * ``pair_score`` for every pair of adjacent base-query words,
        * ``0.25 * word_score`` for every extended-query word that does not
          occur anywhere in the base query.
    """
    fields = query_all.split("\t")
    query = fields[0]
    query_ext = fields[1] if len(fields) > 1 else ""
    ext_words_weight = 0.25
    score = 0
    query_words = query.split(" ")
    query_ext_words = query_ext.split(" ")
    base_words = set(query_words)

    # single words
    for word in query_words:
        if word in words_idf:
            score += word_score(word, doc, words_idf[word])

    # bigrams of adjacent query words
    for word1, word2 in zip(query_words, query_words[1:]):
        if word1 in words_idf and word2 in words_idf:
            score += pair_score(word1, word2, doc,
                                words_idf[word1], words_idf[word2])

    # extended-query words that are new relative to the base query; errors
    # are left to propagate instead of being silently swallowed
    for word in query_ext_words:
        if word in words_idf and word not in base_words:
            score += ext_words_weight * word_score(word, doc, words_idf[word])

    return score

In [132]:
# Rank every query's candidate documents by total_score, best first.
index_files = set(listdir)  # set membership instead of scanning the list per document
new_query_docid = {}
for item in tqdm(query_docid.items(), total=len(query_docid)):
    query = queries[int(item[0])]
    docids = item[1]

    # Query preprocessing does not depend on the document, so it is done once
    # per query rather than once per candidate document.
    query_fields = query.split("\t")
    # [:-1] drops the last character of the lemmatized string
    # (presumably a trailing newline - TODO confirm against lemmatize_text).
    base_query = lemmatize_text(clean_text(query_fields[0]), stemmer)[:-1]
    ext_query = lemmatize_text(clean_text(query_fields[1]), stemmer)[:-1]
    prepared_query = base_query + "\t" + ext_query

    result = []
    for docid in docids:
        filename = f"parsed_{docid}.pkl"
        if filename in index_files:
            with open(os.path.join(PATH, "index", filename), "rb") as f:
                doc = pickle.load(f)
            score = total_score(prepared_query, doc, words_idf)
        else:
            # No index file for this document: report it and rank it with score 0.
            score = 0
            print("ERROR", filename)
        result.append((score, docid))
    # Descending by score; equal scores are ordered by docid, also descending.
    result.sort(reverse=True)
    new_query_docid[item[0]] = [doc_item[1] for doc_item in result]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=399.0), HTML(value='')))

ERROR parsed_31733.pkl



In [143]:
# Build the submission: a header row, then one "query_id,doc_id" line per
# ranked document, for query ids 1 .. len(queries) - 1.
submit = ['QueryId,DocumentId']
for i in range(1, len(queries)):
    submit.extend(f"{i},{docid}" for docid in new_query_docid[str(i)])

with open(PATH + "submit_25_1.txt", "w") as f:
    f.write("\n".join(submit))