In [1]:
import numpy as np
import pandas as pd
import math
from nltk.corpus import PlaintextCorpusReader
from nltk.text import Text
import nltk

# Preprocess

In [2]:
# Use nltk's PlaintextCorpusReader to access every file under the given directories.
doc_corpus_root = './ntust-ir-2020/docs'
query_corpus_root = './ntust-ir-2020/queries'
doc_corpus = PlaintextCorpusReader(doc_corpus_root, '.*')
query_corpus = PlaintextCorpusReader(query_corpus_root, '.*')


def _read_corpus(corpus):
    """Return ``(names, texts)`` for every file id of ``corpus``, in fileids() order.

    Each name is the file id with its last four characters removed
    (intended to drop a ".txt" suffix); each text is the file's raw content.
    """
    names, texts = [], []
    for file_id in corpus.fileids():
        raw_text = corpus.raw(file_id)
        # Strip the trailing 4 characters (the ".txt" extension).
        names.append(file_id[:len(file_id) - 4])
        texts.append(raw_text)
    return names, texts


# Documents: names and raw contents, index-aligned.
doc_names, doc_terms = _read_corpus(doc_corpus)

# Queries: names and raw contents, index-aligned.
query_names, query_terms = _read_corpus(query_corpus)

# Sanity check: the name and content lists must have matching lengths.
print(len(doc_names),len(doc_terms))
print(len(query_names),len(query_terms))

4191 4191
50 50


# TF-IDF helper functions

In [3]:
# Term frequency of a single term within a single document.
def get_tf(term, document):
    """Return the log-scaled term frequency of ``term`` in ``document``.

    Both arguments are lower-cased and the document is split on whitespace.
    The raw frequency is (occurrences of term) / (number of tokens).

    Returns:
        0 when the document has no tokens or the term does not occur,
        otherwise ``4 + log10(raw frequency)``.
    """
    tokens = document.lower().split()
    hits = tokens.count(term.lower())
    token_total = len(tokens)

    # Guard: an empty document has no meaningful frequency.
    if token_total == 0:
        return 0

    ratio = hits / float(token_total)
    # Guard: log10(0) is undefined, so an absent term scores 0.
    if ratio == 0:
        return 0
    return 4 + math.log10(ratio)

# Inverse document frequency of a single term over a collection of documents.
def get_idf(term, documents):
    """Return the inverse document frequency of ``term`` over ``documents``.

    A document counts towards the document frequency (df) when the
    lower-cased term equals one of its lower-cased, whitespace-split tokens.

    Returns:
        0 when no document contains the term, otherwise
        ``log10(1 + len(documents) / df)``.
    """
    containing = sum(
        1 for document in documents
        if term.lower() in document.lower().split()
    )
    if containing == 0:
        return 0
    return math.log10(1 + len(documents) / float(containing))

# Build the document-side tf-idf matrix for one query over a collection of documents.
def get_doc_matrix(query, documents):
    """Return a ``(query tokens, documents)`` array of tf-idf weights.

    The query is lower-cased and split on whitespace; row ``i`` belongs to the
    i-th query token (repeated tokens get repeated rows) and column ``j`` to
    ``documents[j]``. Each entry is ``get_idf(token, documents) * get_tf(token, doc)``.
    """
    tokens = query.lower().split()
    weights = np.zeros((len(tokens), len(documents)))
    for row, token in enumerate(tokens):
        # The idf depends only on the token, so compute it once per row.
        token_idf = get_idf(token, documents)
        weights[row] = [token_idf * get_tf(token, doc) for doc in documents]
    return weights

# Build the query-side tf-idf vector for one query over a collection of documents.
def get_query_vector(query, documents):
    """Return the tf-idf weight of every token of ``query``, in query order.

    The query is lower-cased and split on whitespace. For each token the
    weight is ``(1 + log10(tf)) * get_idf(token, documents)`` where
    ``tf = occurrences of the token in the query / number of distinct tokens``.

    Returns:
        A list with one weight per query token (repeated tokens yield
        repeated entries); an empty list for an empty query.
    """
    tokens = query.lower().split()

    occurrences = {}
    for token in tokens:
        occurrences[token] = occurrences.get(token, 0) + 1

    # NOTE(review): the denominator is the number of DISTINCT tokens, not the
    # query length — confirm this is the intended normalisation.
    distinct_total = float(len(occurrences))

    # Every token occurs at least once, so tf > 0 and log10 is always defined.
    return [
        (1 + math.log10(occurrences[token] / distinct_total)) * get_idf(token, documents)
        for token in tokens
    ]

# VSM ranking and sorting

In [27]:
# Rank every document against every query and write the result file.
ans = "Query,RetrievedDocuments"

# Document lengths (whitespace-token counts) do not depend on the query,
# so compute them once instead of once per query.
doc_lengths = [len(doc.lower().split()) for doc in doc_terms]

# Mode "w" (not "a+"): re-running the cell must regenerate the file rather than
# append a second header and a second copy of the results. The with-block
# guarantees the file is flushed and closed even if ranking raises.
with open("vsm_result.txt", "w") as f:
    f.write(ans + '\n')

    # For each query, score all documents and emit them from best to worst.
    for i in range(len(query_terms)):

        # Document tf-idf matrix (query tokens x documents) and query tf-idf vector.
        doc_tfidf_matrix = get_doc_matrix(query_terms[i], doc_terms)
        q = get_query_vector(query_terms[i], doc_terms)

        # One row per document so each row can be dotted with the query weights.
        # Transposing (instead of indexing row 0) also copes with an empty query.
        d = np.ascontiguousarray(doc_tfidf_matrix.T)

        # NOTE(review): the query weights are squared element-wise before the
        # dot product (q*q rather than q) — confirm this is intended.
        q_weights = np.multiply(q, q)

        # Score = dot(document vector, squared query vector) / document length.
        v = []
        for it in range(len(d)):
            n = doc_lengths[it]
            if n == 0:
                # Empty documents cannot be length-normalised; give them score 0.
                v.append(0)
            else:
                v.append(np.dot(d[it], q_weights) / n)

        # Document indices ordered by descending score; the sort is stable, so
        # equal scores keep their original document order.
        sort = sorted(range(len(v)), key=lambda index: v[index], reverse=True)

        # One line per query: "<query name>,<doc> <doc> ...".
        buf = query_names[i] + ',' + ' '.join(doc_names[s] for s in sort)
        f.write(buf + '\n')