In [40]:
# Download the dataset: Microsoft Machine Reading Comprehension (MS MARCO)
!pip install datasets==2.13.1




In [41]:
from datasets import load_dataset
# Load the 'v1.1' configuration of the 'ms_marco' dataset through `datasets.load_dataset`.
dataset = load_dataset ('ms_marco' , 'v1.1')



  0%|          | 0/3 [00:00<?, ?it/s]

In [42]:
# Work on the "test" split of the loaded dataset.
subset = dataset["test"]
q_infos = [] # one record per kept query: query_id, query text, and the corpus indices of its relevant docs
queries = [] # text of the kept queries (only queries whose type is "entity")
corpus = [] # passage texts collected from the dataset
import json
# Dump the first sample as JSON to see the layout of a record.
print(json.dumps(subset[0]))

{"answers": ["Yes"], "passages": {"is_selected": [0, 0, 1, 0, 0, 0, 0], "passage_text": ["We have been feeding our back yard squirrels for the fall and winter and we noticed that a few of them have missing fur. One has a patch missing down his back and under both arms. Also another has some missing on his whole chest. They are all eating and seem to have a good appetite.", "Critters cannot stand the smell of human hair, so sprinkling a barrier of hair clippings around your garden, or lightly working it into the soil when you plant bulbs, apparently does have some merit. The whole thing kind of makes me laugh. It never occurred to me that we are the ones that stink.", "Spread some human hair around your vegetable and flower gardens. This will scare the squirrels away because humans are predators of squirrels. It is better if the hair hasn't been washed so the squirrels will easily pick up the human scent.", "1 You can sprinkle blood meal around your garden as well. 2  Don\u2019t trap an

In [43]:
# Build the retrieval corpus and the matching query records from the "entity" queries.

for record in subset:
    # Only "entity"-type queries are kept.
    if record["query_type"] != "entity":
        continue

    passages = record["passages"]
    # Passages of this sample will start at this position inside `corpus`,
    # so a selected passage at local position p gets the global index offset + p.
    offset = len(corpus)
    relevant = [offset + pos
                for pos, flag in enumerate(passages["is_selected"])
                if flag]

    # Queries without any selected passage are dropped, together with their passages.
    if not relevant:
        continue

    query_text = record["query"]
    queries.append(query_text)
    q_infos.append({
        "query_id": record["query_id"],
        "query": query_text,
        "relevant_docs": relevant,
    })
    corpus += passages["passage_text"]

In [44]:
# Inspect the first few query records and cross-check one of them against the raw split.
print(q_infos[:5])
print(subset[7]["query"])

[{'query_id': 7, 'query': 'what class are spiders in', 'relevant_docs': [4]}, {'query_id': 13, 'query': 'what types of bacteria are found in the mouth', 'relevant_docs': [10]}, {'query_id': 18, 'query': 'Starch and _____ are common polysaccharide carbohydrates found in plants.', 'relevant_docs': [13, 15]}, {'query_id': 47, 'query': 'what are herbaceous perennials', 'relevant_docs': [23]}, {'query_id': 50, 'query': 'what is the element family for plutonium', 'relevant_docs': [30, 31]}]
what class are spiders in


In [45]:
import string
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from tqdm import tqdm
import numpy as np

# Fetch the NLTK resources needed by the text-normalization helpers
# (word_tokenize and stopwords.words are used by those helpers).
nltk.download('punkt')
nltk.download("stopwords")

def remove_punctuation(text: str) -> str:
    """Return `text` with every character of `string.punctuation` deleted.

    Args:
        text: input string.

    Returns:
        A new string without the ASCII punctuation characters; all other
        characters (including whitespace) are kept unchanged.
    """
    # A single translate() pass replaces one str.replace() scan per punctuation character.
    return text.translate(str.maketrans("", "", string.punctuation))

def remove_stopwords(text) -> str:
    """Drop English stopwords from `text`.

    The text is split with `word_tokenize`; tokens found in NLTK's English
    stopword list are removed and the remaining tokens are joined with single
    spaces. The comparison is case-sensitive, exactly as the tokens appear.

    Args:
        text: input string.

    Returns:
        The remaining tokens joined by " " (empty string if none remain).
    """
    # A set gives O(1) membership tests instead of scanning the list for every token.
    english_stopwords = set(stopwords.words("english"))
    return " ".join(w for w in word_tokenize(text) if w not in english_stopwords)

def stemming(text) -> str:
    """Stem every token of `text` with NLTK's `PorterStemmer`.

    The text is split with `word_tokenize`, each token is stemmed
    (e.g. "flying" -> "fli"/"fly"-style reductions decided by the stemmer),
    and the stems are joined with single spaces.

    Args:
        text: input string.

    Returns:
        The stemmed tokens joined by " ".
    """
    # Build the stemmer once per call instead of once per token.
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(w) for w in word_tokenize(text))

def normalize_text(text : str) -> str:
    """Normalize `text` for indexing and querying.

    Steps, in order: lowercase, remove punctuation, remove stopwords, stem.
    """
    lowered = text.lower()
    return stemming(remove_stopwords(remove_punctuation(lowered)))

# Sanity check of the whole pipeline: lowercasing, punctuation removal,
# stopword removal ("a" is dropped) and stemming ("apple" -> "appl").
assert normalize_text("Hi i'm a apple.,") == "hi im appl"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
def create_dictionary(docs):
    """Collect the vocabulary of `docs`.

    Args:
        docs: iterable of strings; each one is split with `word_tokenize`.

    Returns:
        A list of the distinct tokens, in order of first appearance.
    """
    # A dict keeps insertion order and deduplicates in O(1) per token; the
    # previous `w not in list` test made building the vocabulary quadratic.
    seen = {}
    for doc in docs:
        for w in word_tokenize(doc):
            seen.setdefault(w, None)
    return list(seen)


In [47]:
def vectorize(text, dictionary, typeV="count"):
    """Turn `text` into a bag-of-words vector over `dictionary`.

    Args:
        text: string to vectorize; it is split with `word_tokenize`.
        dictionary: sequence of terms. Position i of the result corresponds to
            the i-th distinct term (repeated terms share their first position).
        typeV: "count" stores the number of occurrences of each term,
            "binary" stores 1 for every term that occurs. Any other value
            leaves the vector all zeros.

    Returns:
        A 1-D numpy array with one entry per distinct dictionary term.
        Tokens that are not in the dictionary are ignored.
    """
    # Map each term to its position once, so every token lookup is O(1)
    # instead of a linear scan of the dictionary list.
    position = {}
    for term in dictionary:
        position.setdefault(term, len(position))

    counts = [0] * len(position)
    for w in word_tokenize(text):
        pos = position.get(w)
        if pos is None:
            continue  # token is not part of the dictionary
        if typeV == "binary":
            counts[pos] = 1
        elif typeV == "count":
            counts[pos] += 1
    return np.array(counts)
# Demo: "a" is counted twice, and the tokens outside the dictionary ("d", "e") are ignored.
print(vectorize("a b d c e a", ["a", "b", "c"], "count"))


[2 1 1]


In [48]:

def create_doc_term(docs, dictionary, verbose=True):
    """Stack document vectors into a 2-D document-term matrix.

    Args:
        docs: sequence of document vectors (anything `np.array` accepts).
        dictionary: the vocabulary; only its length is used, as the number
            of columns of the result.
        verbose: when True (default, the historical behaviour) print the first
            document vector and the shape of the stacked array.

    Returns:
        A numpy array reshaped to (-1, len(dictionary)).

    Raises:
        ValueError: propagated from numpy when the data cannot be reshaped
            to that number of columns.
    """
    if verbose and len(docs) > 0:
        # Guarded: indexing docs[0] on an empty corpus used to raise IndexError.
        print(docs[0])
    t = np.array(docs)
    if verbose:
        print(t.shape)
    return t.reshape((-1, len(dictionary)))

# Demo: a nested input of shape (1, 2, 3) is reshaped to 2 rows of 3 terms.
print(create_doc_term([[[1,3,7], [1,24,6]]], ["a", "b", "c"]))

[[1, 3, 7], [1, 24, 6]]
(1, 2, 3)
[[ 1  3  7]
 [ 1 24  6]]


In [49]:
def cosine_similarity(v1, v2):
    """Cosine of the angle between two vectors.

    Args:
        v1, v2: 1-D array-likes of the same length.

    Returns:
        dot(v1, v2) / (||v1|| * ||v2||), or 0.0 when either vector has zero
        norm.
    """
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        # 0/0 would yield NaN (plus a RuntimeWarning); np.argsort places NaN
        # last, so a reversed (descending) ranking would put such vectors first.
        return 0.0
    return v1.dot(v2) / denom
# Demo: similarity between two small example vectors.
print(cosine_similarity([1,3,7], [1,24,6]))

0.6047022677170876


In [50]:
def ranking(matrix, topk, decend = True):
    """Return the indices of the `topk` best entries of `matrix`.

    With `decend` true the indices of the largest values come first,
    otherwise the indices of the smallest values come first.
    """
    order = np.argsort(np.array(matrix))
    # argsort is ascending; walk it backwards for a descending ranking
    ordered = order[::-1] if decend else order
    return ordered[:topk]


In [90]:
# retrieval text
class TextRetrievalModel():
    """Bag-of-words text retrieval ranked by cosine similarity.

    On construction every passage of `corpus` is normalized with
    `normalize_text`, a dictionary is built from the normalized passages and
    each passage is vectorized into one row of `docs_term`.

    Attributes:
        corpus: the original passages, as given.
        normal_corpus: the normalized passages.
        dictionary: vocabulary built by `create_dictionary`.
        docs_term: document-term matrix built by `create_doc_term`.
        cosin_vector: similarity of every document to the last query
            (empty until `retrieve_text` has been called).
        idx: document indices selected by the last `retrieve_text`/`get_top`.
    """
    def __init__(self, corpus):
        self.corpus = corpus
        self.normal_corpus = self.__normalize_corpus()
        self.dictionary = create_dictionary(self.normal_corpus)
        self.docs_term = create_doc_term(self.__vectorize_docs(self.normal_corpus),
                                         self.dictionary)
        # No query scored yet; keeps get_top() usable before retrieve_text().
        self.cosin_vector = []
        self.idx = []

    def retrieve_text(self, query, topk = 5):
        """Score every document against `query` and return the best `topk`.

        Returns:
            A tuple (passages, scores) of two parallel lists, best match first.
        """
        q_vector = vectorize(normalize_text(query), self.dictionary)
        self.cosin_vector = [
            self.__finite_or_zero(cosine_similarity(q_vector, doc))
            for doc in self.docs_term
        ]
        return self.get_top(topk)

    def get_top(self, k = 10):
        """Return the best `k` passages for the most recently scored query.

        Returns:
            A tuple (passages, scores) of two parallel lists, best match
            first; both are empty when no query has been scored yet.
        """
        # Re-rank with the requested k (the argument used to be ignored).
        self.idx = ranking(self.cosin_vector, k)
        top = [self.corpus[i] for i in self.idx]
        score = [self.cosin_vector[i] for i in self.idx]
        return (top,score)

    @staticmethod
    def __finite_or_zero(score):
        # A zero-norm query or document makes the cosine NaN; NaN would be
        # ranked first in descending order, so count it as "no similarity".
        return score if np.isfinite(score) else 0.0

    def __normalize_corpus(self):
        # Normalized copy of every passage, in corpus order.
        return [normalize_text(text) for text in self.corpus]

    def __vectorize_docs(self, docs):
        # One bag-of-words vector per (already normalized) document.
        return [vectorize(text, self.dictionary) for text in docs]

In [91]:
# Build the retrieval model over the collected passages: this normalizes them,
# builds the dictionary and the document-term matrix.
model = TextRetrievalModel(corpus)


[1 1 1 ... 0 0 0]
(7303, 21559)


In [92]:
# Run an example query and keep the 5 best-matching passages with their scores.
retrievaltexts = model.retrieve_text("what is the official language in Fiji", 5)

In [108]:
def print_result(retrievaltexts):
    """Pretty-print a (passages, scores) pair as returned by the retrieval model.

    Each entry shows its 1-based rank, the passage text (at most the first
    1000 characters) and the score with 4 decimals, followed by a blank line.
    """
    texts, scores = retrievaltexts
    template = "top%2d \n___text: %.1000s  \n___score: %.4f"
    for pos, passage in enumerate(texts):
        print(template % (pos + 1, passage, scores[pos]))
        print()

# Show the passages and scores retrieved for the example query.
print_result(retrievaltexts)

top 1 
___text: The official languages in Fiji are Fijian and English. A dialect of Hindustani is also widely spoken among Indo-Fijians.  _________________________________________   T … he official and everyday language of Fiji is English. Fijian and Fiji-Hindi are second languages in the island nation.  
___score: 0.6556

top 2 
___text: The official languages in Fiji are Fijian and English. A dialect of Hindustani is also widely spoken among Indo-Fijians.  _________________________________________   T … he official and everyday language of Fiji is English. Fijian and Fiji-Hindi are second languages in the island nation.  
___score: 0.6556

top 3 
___text: The official languages. Fiji’s 1997 Constitution established Fijian as one of the official languages of the country. Fijian is an Austronesian language, a grouping that includes thousands of other languages spanning the globe. The language is of the Malayo-Polynesian family, not too different from Hawaiian and Maori.  
___score: 0.6