<a href="https://colab.research.google.com/github/Felix-Think/Capstone_SamSung/blob/master/Text_Retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1. Install and load the dataset
## We use MS MARCO

In [None]:
!pip install datasets
from datasets import load_dataset
#Load MSMACRO
ds = load_dataset("microsoft/ms_marco", "v2.1")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Step 2. Extract the dataset

In [None]:

# Work on the 'test' split of the loaded dataset.
subset = ds['test']

In [None]:
# Display the split summary (feature names and row count).
subset

Dataset({
    features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
    num_rows: 101092
})

In [None]:
# Extract text: gather the passage texts of every ENTITY-type sample into one
# flat list. Field names follow the dataset card:
# https://huggingface.co/datasets/microsoft/ms_marco
corpus = []
for sample in subset:
    # Only samples whose query type is ENTITY are used.
    if sample['query_type'] != 'ENTITY':
        continue
    # Only the passage texts feed the corpus; the other fields are not needed.
    corpus.extend(sample['passages']['passage_text'])

In [None]:
# Number of passages collected into the corpus.
print(len(corpus))

86133


# Text Representation

In [None]:
def tokenize(text):
    """Split ``text`` into tokens on runs of whitespace.

    Args:
        text: String to tokenize.

    Returns:
        List of whitespace-separated tokens; empty for blank input.
    """
    tokens = text.split()
    return tokens

def create_dictionary(corpus):
    """Build the vocabulary of unique normalized tokens found in ``corpus``.

    Each document is passed through ``text_normalize`` and ``tokenize``.

    Args:
        corpus: Iterable of raw document strings.

    Returns:
        List of unique tokens, in order of first appearance.
    """
    # A dict keeps insertion order and gives constant-time membership checks,
    # replacing the linear ``token not in list`` scan that made the build
    # quadratic in the vocabulary size.
    vocabulary = {}
    for doc in corpus:
        for token in tokenize(text_normalize(doc)):
            vocabulary.setdefault(token, None)
    return list(vocabulary)

In [None]:
def vectorize(text, dictionary):
    """Turn ``text`` into a bag-of-words count vector over ``dictionary``.

    Args:
        text: Raw string; it is normalized and tokenized before counting.
        dictionary: Sequence of vocabulary tokens defining the vector layout.

    Returns:
        List of occurrence counts, one per distinct dictionary token, in
        dictionary order. Tokens that are not in ``dictionary`` are ignored.
    """
    counts = dict.fromkeys(dictionary, 0)
    for token in tokenize(text_normalize(text)):
        # Explicit membership test instead of a bare ``except`` so unrelated
        # errors (and KeyboardInterrupt) are no longer silently swallowed.
        if token in counts:
            counts[token] += 1
    return list(counts.values())

# Indexing

In [None]:
def create_doc_term_matrix(corpus, dictionary, max_docs=10000):
    """Vectorize the leading documents of ``corpus`` into a doc-term mapping.

    Args:
        corpus: Sequence of raw document strings.
        dictionary: Vocabulary passed on to ``vectorize``.
        max_docs: How many documents from the start of ``corpus`` to index.
            Defaults to 10000, the limit that used to be hard-coded; pass
            ``None`` to index every document.

    Returns:
        Dict mapping ``(doc, idx)`` -- the document text and its position in
        ``corpus`` -- to the vector returned by ``vectorize``.
    """
    return {
        (doc, idx): vectorize(doc, dictionary)
        for idx, doc in enumerate(corpus[:max_docs])
    }

# Text Normalization

In [None]:
# Lowercase
def text_lowercase(text):
    """Return ``text`` with its cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

In [None]:
# Remove punctuation
import string
remove_charts = string.punctuation
def remove_punctuation(text):
    """Replace every character listed in ``remove_charts`` with a space.

    Args:
        text: String to clean.

    Returns:
        ``text`` with each punctuation character swapped for one space, so
        the length of the string is unchanged.
    """
    # One translation pass: every punctuation character maps to a space.
    to_space = str.maketrans(remove_charts, ' ' * len(remove_charts))
    return text.translate(to_space)

In [None]:
# Stopword removal
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
# Frozen snapshot of stopwords_list: constant-time membership tests instead of
# scanning the whole list for every token. Rebuild it if stopwords_list is
# modified afterwards.
_stopwords_set = frozenset(stopwords_list)
def remove_stopwords(text):
    """Drop the English stopwords from ``text``.

    Args:
        text: String to filter; it is split with ``tokenize``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = [token for token in tokenize(text) if token not in _stopwords_set]
    return ' '.join(kept_tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stemming(text):
    """Stem every token of ``text`` with the module-level Porter stemmer.

    Args:
        text: String to stem; it is split with ``tokenize``.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    return ' '.join(stemmer.stem(token) for token in tokenize(text))

In [None]:
# Normalization
def text_normalize(text):
    """Run ``text`` through the full normalization pipeline.

    The steps are applied in this order: lowercasing, punctuation removal,
    stopword removal, stemming.

    Args:
        text: Raw string to normalize.

    Returns:
        The normalized string.
    """
    pipeline = (text_lowercase, remove_punctuation, remove_stopwords, stemming)
    for step in pipeline:
        text = step(text)
    return text

# Ranking

In [None]:
import numpy as np
def cosine_similarity(a, b):
    """Compute the cosine similarity between two equal-length vectors.

    Args:
        a: First vector (any sequence accepted by ``np.array``).
        b: Second vector, same length as ``a``.

    Returns:
        ``a . b / (|a| * |b|)``, or ``0.0`` when either vector has zero norm.
    """
    a = np.array(a)
    b = np.array(b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero-norm vector (e.g. a text with no vocabulary tokens) would yield
    # 0/0 = nan, which breaks any later sort on the scores; treat it as
    # "no similarity" instead.
    if denominator == 0:
        return 0.0
    return a.dot(b) / denominator

In [None]:
def ranking(query, dictionary, doc_term_matrix):
    """Score every indexed document against ``query`` and sort best-first.

    Args:
        query: Raw query string; vectorized with ``vectorize``.
        dictionary: Vocabulary used to vectorize the query.
        doc_term_matrix: Mapping from document key to document vector.

    Returns:
        List of ``(similarity, document_key)`` tuples in descending order.
        Tuples are compared as a whole, so equal similarities are ordered by
        the document key.
    """
    query_vector = vectorize(query, dictionary)
    return sorted(
        (
            (cosine_similarity(query_vector, doc_vector), doc_infor)
            for doc_infor, doc_vector in doc_term_matrix.items()
        ),
        reverse=True,
    )

In [None]:
# Build the vocabulary from every passage in the corpus.
dictionary = create_dictionary(corpus)


In [None]:
# Index the passages as count vectors over the vocabulary
# (create_doc_term_matrix caps this at the first 10000 passages).
doc_term_matrix = create_doc_term_matrix(corpus, dictionary)

# Ranking code and results

In [None]:
# Queries to run and how many of the best-scoring passages to print for each.
query_list = ['What is the official languages in Fiji']
top_k = 10

for query in query_list:
    # Rank all indexed passages, then keep only the top_k best matches.
    scores = ranking(query, dictionary, doc_term_matrix)
    best_matches = scores[:top_k]
    print(f'Query: {query}')
    print('==Relevent docs==')
    for idx, match in enumerate(best_matches):
        doc_score, doc_content = match
        print(f'Top {idx+1}: {doc_score}')
        print(doc_content)
        print('\n')

Query: What is the official languages in Fiji
==Relevent docs==
Top 1: 0.5319951765989316
('New Zealand Language Official Languages. While English is the predominant language spoken in New Zealand, there are two actual official languages in New Zealand. Maori became an official language in 1987 while in April 2006, New Zealand became the first country to declare sign language as an official language, alongside Maori. New Zealand Sign Language, or NZSL, is the main language of the deaf community in New Zealand. Maori is only used in New Zealand and nowhere else in the world. Despite its official status, the language continues to struggle against being lost.', 7925)


Top 2: 0.4479546293064525
('While English is the predominant language spoken in New Zealand, there are two actual official languages in New Zealand. Maori became an official language in 1987 while in April 2006, New Zealand became the first country to declare sign language as an official language, alongside Maori. New Zeala