<a href="https://colab.research.google.com/github/B34R-e/My-Projects/blob/main/Text_Retrieval_Basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
!pip install datasets==2.13.1



In [23]:
from datasets import load_dataset

# Load the 'v1.1' configuration of the 'ms_marco' dataset; individual splits
# are then accessed by name (e.g. dataset['test']).
dataset = load_dataset('ms_marco', 'v1.1')



  0%|          | 0/3 [00:00<?, ?it/s]

In [24]:
# Select the test split
subset = dataset['test']

# Containers filled by the loop below:
#   queries_infos - one dict per kept query (id, text, indices of relevant docs)
#   queries       - the query strings, aligned index-for-index with queries_infos
#   corpus        - flat list of the passage texts of every kept sample
queries_infos = []
queries = []
corpus = []

# Walk the test split and keep only the samples whose query type is 'entity'
for sample in subset:
  query_type = sample['query_type']
  if query_type != 'entity':
    continue

  # Query text and its id
  query_id = sample['query_id']
  query_str = sample['query']

  # Candidate passages of this query and their relevance labels
  passages_dict = sample['passages']
  is_selected_lst = passages_dict['is_selected']
  passage_text_lst = passages_dict['passage_text']

  # Relevant passages are stored as indices into `corpus`. This sample's
  # passages are appended to `corpus` as one contiguous run, so passage `idx`
  # ends up at position `current_len_corpus + idx`.
  current_len_corpus = len(corpus)
  relevant_docs = [
      current_len_corpus + idx
      for idx, is_selected in enumerate(is_selected_lst)
      if is_selected == 1
  ]

  # Skip samples that have no relevant passage, to keep evaluation simple.
  # This check must run once per sample (outer loop level): placed inside the
  # per-passage loop, `continue` would only advance that inner loop and the
  # sample would still be added.
  if not relevant_docs:
    continue

  query_info = {
      'query_id': query_id,
      'query': query_str,
      'relevant_docs': relevant_docs
  }

  # Record the query and add its passages to the corpus
  queries.append(query_str)
  queries_infos.append(query_info)
  corpus += passage_text_lst

In [25]:
# Build a text-normalization function for documents and queries.
# It applies four steps:
#   1. Lowercasing
#   2. Punctuation removal
#   3. Stopword removal
#   4. Stemming (English)

# import module string trong Python
import string
# import thu vien nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the NLTK 'stopwords' resource and load the English stopword list
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
# Characters to strip from text: the punctuation characters from the string module
remove_chars = string.punctuation
# Single PorterStemmer instance reused for every stemming call
stemmer = PorterStemmer()

def tokenize(text):
  """Split `text` on runs of whitespace and return the list of tokens.

  Leading/trailing whitespace is ignored; an empty or all-whitespace string
  yields an empty list.
  """
  tokens = text.split()
  return tokens

# Normalize a piece of text (string) for indexing / querying
def text_normalize(text):
  """Return `text` lowercased, stripped of punctuation and stopwords, and stemmed.

  Steps, in order:
    1. lowercase the text;
    2. delete every character listed in `remove_chars`;
    3. tokenize and drop tokens found in `english_stopwords`;
    4. stem each remaining token with `stemmer`.

  The surviving stems are returned joined by single spaces.
  """
  # One translation pass removes all unwanted characters at once.
  cleaned = text.lower().translate(str.maketrans('', '', remove_chars))
  kept_words = [tok for tok in tokenize(cleaned) if tok not in english_stopwords]
  return ' '.join(map(stemmer.stem, kept_words))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
# Build the dictionary (vocabulary) of a corpus
def create_dictionary(corpus):
  """Collect the distinct normalized tokens of `corpus`.

  Args:
    corpus: iterable of document strings.

  Returns:
    A list of unique tokens, in order of first appearance across the
    documents. Each document is passed through `text_normalize` and
    `tokenize` before its tokens are collected.
  """
  dictionary = []
  # Set mirror of `dictionary`: gives constant-time membership checks while
  # the list keeps the first-seen order callers rely on.
  seen = set()
  for doc in corpus:
    for token in tokenize(text_normalize(doc)):
      if token not in seen:
        seen.add(token)
        dictionary.append(token)

  return dictionary

In [27]:
# Vectorize a piece of text against a vocabulary.
# Inputs: the string to vectorize and the vocabulary as a list of words.
def vectorize(text, dictionary):
  """Count how often each dictionary word occurs in `text`.

  Args:
    text: string to vectorize; it is split with `tokenize` and is expected to
      be normalized already.
    dictionary: list of vocabulary words.

  Returns:
    A list of integer counts, one per distinct dictionary word, in dictionary
    order. Tokens that are not in the dictionary are ignored.
  """
  word_count_dict = {word: 0 for word in dictionary}
  for token in tokenize(text):
    # Explicit membership test instead of a bare `except: pass`, so only
    # out-of-vocabulary tokens are skipped and real errors still surface.
    if token in word_count_dict:
      word_count_dict[token] += 1

  return list(word_count_dict.values())

In [28]:
# Build the document-term matrix.
# Inputs: the list of documents and the vocabulary list.
def create_doc_term_matrix(corpus, dictionary):
  """Map every document of `corpus` to its term-count vector.

  Args:
    corpus: sequence of raw document strings.
    dictionary: vocabulary list passed through to `vectorize`.

  Returns:
    A dict keyed by `(document_text, position_in_corpus)` whose values are the
    vectors produced by `vectorize` on the normalized document.
  """
  return {
      (document, position): vectorize(text_normalize(document), dictionary)
      for position, document in enumerate(corpus)
  }

In [29]:
# Construct a similarity calculation function (use Cosine Similarity)
from scipy import spatial
def similarity(a, b):
  """Cosine similarity between two equal-length numeric vectors.

  Args:
    a, b: 1-D sequences of numbers (e.g. term-count lists).

  Returns:
    `1 - cosine_distance(a, b)`, or 0.0 when either vector is all zeros.
  """
  # An all-zero vector has zero norm, so the cosine is undefined (0/0) and
  # would come back as NaN; NaN scores cannot be ordered reliably when the
  # results are sorted. Treat "no shared vocabulary at all" as similarity 0.
  if not any(a) or not any(b):
    return 0.0
  return 1 - spatial.distance.cosine(a, b)

In [31]:
# Rank every document against a query.
# Inputs: the query string, the vocabulary, and the document-term matrix.
def ranking(query, dictionary, doc_term_matrix):
  """Score all documents against `query` and return them best-first.

  Args:
    query: raw query string; normalized with `text_normalize` before use.
    dictionary: vocabulary list used to vectorize the query.
    doc_term_matrix: dict mapping a document key to its vector.

  Returns:
    A list of `(score, document_key)` tuples sorted in descending order
    (ties on score fall back to comparing the document keys).
  """
  query_vec = vectorize(text_normalize(query), dictionary)
  scored = (
      (similarity(query_vec, doc_vec), doc_info)
      for doc_info, doc_vec in doc_term_matrix.items()
  )
  return sorted(scored, reverse=True)

In [32]:
# Try the retrieval pipeline on a sample query and print the top matches
query_lst = ['what is the official language in Fiji ?']
top_k = 10
# Build the vocabulary and the document-term matrix once; they are reused for
# every query in query_lst.
dictionary = create_dictionary(corpus)
doc_term_matrix = create_doc_term_matrix(corpus, dictionary)
for query in query_lst:
  scores = ranking(query, dictionary, doc_term_matrix)
  print(f'Query: {query}')
  print('=== Relevant Docs ===')
  # Slice rather than index over range(top_k), so a corpus with fewer than
  # top_k documents prints what is available instead of raising IndexError.
  # Each entry is (score, (document_text, corpus_index)).
  for rank, (doc_score, doc_info) in enumerate(scores[:top_k], start=1):
    doc_content = doc_info[0]

    print(f'Top {rank}; Score: {doc_score:.4f}')
    print(doc_content)
    print('\n')

Query: what is the official language in Fiji ?
=== Relevant Docs ===
Top 1; Score: 0.6556
The official languages in Fiji are Fijian and English. A dialect of Hindustani is also widely spoken among Indo-Fijians.  _________________________________________   T … he official and everyday language of Fiji is English. Fijian and Fiji-Hindi are second languages in the island nation.


Top 2; Score: 0.6556
The official languages in Fiji are Fijian and English. A dialect of Hindustani is also widely spoken among Indo-Fijians.  _________________________________________   T … he official and everyday language of Fiji is English. Fijian and Fiji-Hindi are second languages in the island nation.


Top 3; Score: 0.5715
The official languages. Fiji’s 1997 Constitution established Fijian as one of the official languages of the country. Fijian is an Austronesian language, a grouping that includes thousands of other languages spanning the globe. The language is of the Malayo-Polynesian family, not too di