In [1]:
import numpy as np
import pandas as pd
import math
from nltk.corpus import PlaintextCorpusReader

# Preprocess

In [2]:
# Use nltk's PlaintextCorpusReader to access every file under the document
# and query directories.
doc_corpus_root = './ntust-ir-2020/docs'
query_corpus_root = './ntust-ir-2020/queries'
doc_corpus = PlaintextCorpusReader(doc_corpus_root, '.*')
query_corpus = PlaintextCorpusReader(query_corpus_root, '.*')

# Documents: names and raw text, both in the reader's file-id order.
# Names are the file ids with their last four characters (the ".txt"
# extension) sliced off.
doc_ids = doc_corpus.fileids()
doc_names = [file_id[:len(file_id) - 4] for file_id in doc_ids]
doc_terms = [doc_corpus.raw(file_id) for file_id in doc_ids]

# Queries: same treatment as the documents.
query_ids = query_corpus.fileids()
query_names = [file_id[:len(file_id) - 4] for file_id in query_ids]
query_terms = [query_corpus.raw(file_id) for file_id in query_ids]

# Sanity check: the name list and the text list must have the same length.
print(len(doc_names), len(doc_terms))
print(len(query_names), len(query_terms))

4191 4191
50 50


# Functions

In [7]:
def get_tf(term,document):
    """Return the raw term frequency of ``term`` in ``document``.

    The document is lower-cased and split on whitespace; the result is the
    number of tokens that are exactly equal to the lower-cased term. No
    length normalisation is applied here.

    Args:
        term: The word to count (str).
        document: The document text (str).

    Returns:
        int: Number of occurrences of ``term`` among the document's tokens.
    """
    return document.lower().split().count(term.lower())

def get_idf(term,documents):
    """Return the inverse document frequency of ``term`` over ``documents``.

    A document counts towards the document frequency (df) when the
    lower-cased term equals one of its lower-cased, whitespace-separated
    tokens. With N = len(documents) the value is

        log10((N - df + 0.5) / (df + 0.5))

    which is negative when the term occurs in more than half of the
    documents.

    Args:
        term: The word to look up (str).
        documents: Sequence of document texts (str).

    Returns:
        0 when no document contains the term, otherwise the float above.
    """
    needle = term.lower()
    df = sum(1 for text in documents if needle in text.lower().split())
    if df == 0:
        return 0
    return math.log10((float(len(documents)) - df + 0.5) / (df + 0.5))

def get_bm25_matrix(query, documents, k1 = 3, k3 = 1000, b = 0.85):
    """Score every query token against every document with BM25.

    The query is lower-cased and split on whitespace. Entry ``[i][j]`` of
    the result is ``idf * w_d * w_q`` for the i-th query token and the j-th
    document, where

        _f  = tf / (1 - b + b * doc_length / avg_doc_length)
        w_d = (k1 + 1) * (_f + 0.5) / (k1 + _f + 0.5)    # document weight
        w_q = (k3 + 1) * tf_q / (k3 + tf_q)              # query weight

    ``tf`` is the token's count in the document, ``tf_q`` its count in the
    query, and ``idf`` comes from ``get_idf``. A token that appears several
    times in the query gets one row per occurrence.

    Args:
        query: Query text (str).
        documents: Sequence of document texts (str).
        k1: Saturation parameter used in the document weight.
        k3: Saturation parameter used in the query weight.
        b: Strength of the document-length normalisation.

    Returns:
        numpy.ndarray of shape ``(number of query tokens, len(documents))``.
        It is all zeros when ``documents`` is empty or every document is
        empty.
    """
    query_split = query.lower().split()
    tfidf_matrix = np.zeros((len(query_split), len(documents)))
    if len(documents) == 0:
        return tfidf_matrix

    avg_doc_length = get_average_document_length(documents)
    if avg_doc_length == 0:
        # Every document is empty, so no token can match and the length
        # normalisation below would divide by zero.
        return tfidf_matrix

    # The length normaliser depends only on the document, so compute it once
    # per document instead of once per (token, document) pair. Lower-casing
    # is not needed just to count whitespace-separated tokens.
    length_norms = [
        1 - b + b * len(doc.split()) / avg_doc_length for doc in documents
    ]

    for i, term in enumerate(query_split):
        idf = get_idf(term, documents)
        # The query-side weight does not depend on the document.
        tf_q = get_tf(term, query)
        w_q = (k3 + 1) * tf_q / (k3 + tf_q)
        for j, doc in enumerate(documents):
            tf = get_tf(term, doc)
            # Skip the division when tf is 0: the quotient is 0 anyway, and
            # the normaliser can itself be 0 (b == 1 with an empty document).
            _f = tf / length_norms[j] if tf else 0.0
            # Because of the +0.5 terms, w_d is non-zero even when tf is 0;
            # such entries end up 0 only because get_idf returns 0 for
            # tokens that occur in no document.
            w_d = (k1 + 1) * (_f + 0.5) / (k1 + _f + 0.5)
            tfidf_matrix[i][j] = idf * w_d * w_q
    return tfidf_matrix

def get_average_document_length(documents):
    """Return the mean number of whitespace-separated tokens per document.

    Args:
        documents: Sequence of document texts (str).

    Returns:
        float: Total token count divided by the number of documents, or
        ``0.0`` when ``documents`` is empty (instead of raising
        ZeroDivisionError).
    """
    if len(documents) == 0:
        return 0.0
    # Lower-casing does not change how many tokens split() yields, so it is
    # skipped here.
    return sum(len(doc.split()) for doc in documents) / len(documents)

# BM25 ranking and sort

In [8]:
# Rank all documents for every query and write the result file.
ans = "Query,RetrievedDocuments"

# Open with "w" rather than "a+": in append mode every re-run of this cell
# added another header line and another full set of rankings to the file.
# The `with` block guarantees the file is flushed and closed.
with open("vsm_result.txt", "w") as f:
    f.write(ans + '\n')

    # One output line per query: "<query name>,<doc names by score, desc>".
    for query_name, query_text in zip(query_names, query_terms):
        # Matrix of BM25 scores, one row per query token, one column per
        # document.
        bm25_matrix = get_bm25_matrix(query_text, doc_terms)

        # A document's score is the sum of its column over all query tokens.
        doc_scores = bm25_matrix.sum(axis=0)

        # Document indices from highest to lowest score. sorted() is stable,
        # so documents with equal scores keep their corpus order.
        ranking = sorted(
            range(len(doc_scores)),
            key=lambda doc_index: doc_scores[doc_index],
            reverse=True,
        )

        ranked_names = ' '.join(doc_names[doc_index] for doc_index in ranking)
        f.write(query_name + ',' + ranked_names + '\n')