In [1]:
import numpy as np
import pandas as pd
from pylab import random
import math
from nltk.corpus import PlaintextCorpusReader
from collections import Counter
from tqdm import tqdm 
from numba import jit,njit
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Preprocess

In [3]:
# 使用 nltk PlaintextCorpusReader 存取指定目錄下的所有檔案
doc_corpus_root = './ntust-ir-2020/docs'
query_corpus_root = './ntust-ir-2020/queries'
doc_corpus = PlaintextCorpusReader(doc_corpus_root, '.*')
query_corpus = PlaintextCorpusReader(query_corpus_root, '.*')

# 將 document 依序 存取內容與檔名
all_words = {}
word2id = {}
id2word = {}
doc_names = []
doc_terms = []
for docs in doc_corpus.fileids():
    words = doc_corpus.words(docs)
    word_dict = dict(Counter(words))
    for w in range(len(word_dict)):
        k = list(word_dict.keys())[w]
        v = list(word_dict.values())[w]
        if k in all_words:
            all_words[k] += v
        else:
            all_words[k] = v
    # 檔名篩掉.txt
    doc_names.append(docs[:len(docs) - 4])
    doc_terms.append(word_dict)

query_words = {}
# 將 query 依序 存取內容與檔名
query_names = []
query_terms = []
for queries in query_corpus.fileids():
    words = query_corpus.words(queries)
    word_dict = dict(Counter(words))
    for w in range(len(word_dict)):
        k = list(word_dict.keys())[w]
        v = list(word_dict.values())[w]
        if k in query_words:
            query_words[k] += v
        else:
            query_words[k] = v
    # 檔名篩掉.txt
    query_names.append(queries[:len(queries) - 4])
    query_terms.append(word_dict)

filtered_word = {}
i = 0
for w in all_words.keys():
    if w in query_words:
        filtered_word[w] = all_words[w]
        word2id[w] = i
        id2word[i] = w
        i += 1

filtered_doc_terms = []
for doc in doc_terms:
    word_dict = {}
    for i in range(len(doc.keys())):
        word = list(doc.keys())[i]
        if word in filtered_word:
            word_dict[word] = doc[word]
    filtered_doc_terms.append(word_dict)


# 確認長度相符
print(len(doc_names),len(doc_terms),len(filtered_doc_terms))
print(len(query_names),len(query_terms))
print(len(word2id),len(id2word),len(filtered_word))
print(len(all_words),len(query_words))

30000 30000 30000
150 150
324 324 324
154240 324


# Function

In [4]:
def get_tfidf(terms):
    word_lens = len(filtered_word)
    doc_lens = len(terms)
    tf_words = np.zeros((doc_lens,word_lens))
    idf_words = np.zeros(word_lens)
    for j in range(doc_lens):
        for i in range(word_lens):
            if id2word[i] in terms[j]:
                tf_words[j][i] = terms[j][id2word[i]]
                idf_words[i] += 1
                if j == (doc_lens - 1):
                    idf_words[i] = math.log10((float(doc_lens) - idf_words[i] + 0.5) / (idf_words[i] + 0.5))
    tfidf = np.zeros((doc_lens,word_lens))
    for j in range(doc_lens):
        words = list(terms[j].keys())
        for w in words:
            i = word2id[w]
            tfidf[j][i] = tf_words[j][i] * idf_words[i]
    
    tfidf = normalize(tfidf,norm='l2')
    return tf_words, idf_words, tfidf

In [25]:
def get_bm25_matrix(doc_tf,doc_idf,query_tf,k1 = 3, k3 = 1000, b = 0.85):
    
    doc_len = len(doc_tf)
    query_len = len(query_tf)
    avg_doc_words = sum(all_words.values()) / doc_len
    tfidf = np.zeros((query_len,doc_len))
    for i in tqdm(range(query_len)):
        for j in range(doc_len):
            doc_words = sum(doc_terms[j].values())
            for q in range(len(query_tf[i])):
                q_tf = query_tf[i][q]
                if q_tf != 0:
                    _f = doc_tf[j][q] / (1 - b + b * doc_words / avg_doc_words)
                    w_d = (k1 + 1) * (_f + 0.5) / (k1 + _f + 0.5)
                    w_q = (k3 + 1) * q_tf / (k3 + q_tf)
                    tfidf[i][j] += doc_idf[q] * w_d * w_q
    return tfidf

In [14]:
def rocchio(a=1,b=0.8,r=0.1,it=1,rel_n=5,nrel_n=1):
#     _,_,doc_tfidf = get_tfidf(filtered_doc_terms)
#     _,_,query_tfidf = get_tfidf(query_terms)
#     sim = cosine_similarity(query_tfidf,doc_tfidf)
    doc_tf,doc_idf,_ = get_tfidf(filtered_doc_terms)
    query_tf,_,_ = get_tfidf(query_terms)
    sim = get_bm25_matrix(doc_tf,doc_idf,query_tf)
    rankings = np.flip(sim.argsort(), axis=1)
    
#     for _ in tqdm(range(it)):
    
#         rel_vecs = doc_tfidf[rankings[:, :rel_n]].mean(axis=1)
#         nrel_vecs = doc_tfidf[rankings[:, -nrel_n:]].mean(axis=1)
#         query_tfidf = a * query_tfidf + b * rel_vecs - r * nrel_vecs

#         cos_sim = cosine_similarity(query_tfidf, doc_tfidf)
#         rankings = np.flip(cos_sim.argsort(axis=1), axis=1)
    return rankings

In [26]:
ranking = rocchio()

100%|██████████| 150/150 [12:03<00:00,  4.83s/it]


In [22]:
# 讀檔、寫入答案
ans = "Query,RetrievedDocuments"0
f = open("vsm_result.txt","w+")
f.write(ans+'\n')

buf = ""
for i in range(len(query_names)):
    buf = query_names[i] + ','
    first = True
    for s in range(5000):
        if first == True:
            buf += doc_names[ranking[i][s]]
        else:
            buf += (' ' + doc_names[ranking[i][s]])
        first = False
    f.write(buf+'\n')

In [23]:
ranking

array([[ 6652,  7572,  6168, ..., 13910, 13911, 29999],
       [19892,  6908, 14402, ..., 19598, 19599,     0],
       [ 4214,  9223,  6911, ..., 19760, 19761,     0],
       ...,
       [11500, 21186, 17425, ..., 19403, 19404,     0],
       [ 9342, 25393,  9348, ..., 19959, 19960,     0],
       [18726, 18709, 18343, ..., 19483, 19484,     0]])

In [24]:
sim[-1]

NameError: name 'sim' is not defined

In [118]:
query_terms[0]

{'intern': 1, 'organ': 1, 'crime': 1}

In [120]:
filtered_doc_terms[ranking[0][6]]

{'growth': 1,
 'crime': 33,
 'loss': 2,
 'control': 5,
 'increas': 9,
 'drug': 2,
 'relat': 2,
 'new': 1,
 'organ': 13,
 'intern': 5,
 'secur': 1,
 'smuggl': 2,
 'cooper': 1,
 'endang': 1,
 'health': 1,
 'law': 6,
 'enforc': 4,
 'medic': 1,
 'declin': 1,
 'bank': 1,
 'abus': 3,
 'fund': 1,
 'legal': 3,
 'popul': 2,
 'behavior': 2,
 'foreign': 2,
 'treatment': 1,
 'danger': 2,
 'use': 1,
 'world': 1,
 'damag': 1}

In [124]:
doc_tf,_,_ = get_tfidf(filtered_doc_terms)
query_tf,_,_ = get_tfidf(query_terms)

In [127]:
for i in query_tf:
    print(i)

[0.         0.         0.         0.         0.         0.
 0.33333333 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.33333333 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.33333333 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         

 0.         0.         0.         0.         0.         0.        ]
[0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [126]:
doc_tf

array([[0.01960784, 0.00980392, 0.00980392, ..., 0.        , 0.        ,
        0.        ],
       [0.00149477, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])