<a href="https://colab.research.google.com/github/Aananda-giri/scripts/blob/main/semantic_search_(working).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %pip installs into the running kernel's environment; the version is pinned to
# the one this notebook was last run with so a re-run resolves the same package.
%pip install rank-bm25==0.2.2

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [None]:
import os
import click
import glob
import pandas as pd

from rank_bm25 import BM25Okapi as BM25
import gensim
from gensim import corpora
import gensim.downloader as api
import numpy as np
import logging
logging.basicConfig(level=logging.DEBUG)


In [None]:
class Retriever(object):
    """First-stage retriever backed by a BM25 index.

    Args:
        documents: The corpus handed directly to the BM25 implementation
            (``main`` passes a list of token lists).
    """

    def __init__(self, documents):
        self.corpus = documents
        self.bm25 = BM25(self.corpus)

    def query(self, tokenized_query, n=100):
        """Return the indexes and scores of the ``n`` best-scoring documents.

        Args:
            tokenized_query: Query tokens forwarded to ``bm25.get_scores``.
            n: Maximum number of results to keep.

        Returns:
            A ``(indexes, scores)`` pair of lists, ordered from highest to
            lowest score; ties keep their original corpus order.
        """
        scores = self.bm25.get_scores(tokenized_query)

        def negated_score(position):
            # Ascending sort on the negated score == descending sort on score.
            return -scores[position]

        ranking = list(range(len(scores)))
        ranking.sort(key=negated_score)
        del ranking[n:]
        top_scores = [scores[position] for position in ranking]
        return ranking, top_scores


class Ranker(object):
    """Re-rank tokenized documents by embedding similarity to a query.

    A text is represented by the concatenation of the element-wise mean and
    the element-wise max of its word vectors, scaled to unit length.
    Documents are scored by the dot product of their representation with the
    query's representation.

    Args:
        query_embedding: Object supporting ``token in embedding`` and
            ``embedding[token]``; used to look up query tokens.
        document_embedding: Same kind of object; used for document tokens.
    """

    def __init__(self, query_embedding, document_embedding):
        self.query_embedding = query_embedding
        self.document_embedding = document_embedding

    def _create_mean_embedding(self, word_embeddings):
        """Element-wise mean over the word axis (axis 0)."""
        return np.mean(
            word_embeddings,
            axis=0,
        )

    def _create_max_embedding(self, word_embeddings):
        """Element-wise max over the word axis (axis 0)."""
        return np.amax(
            word_embeddings,
            axis=0,
        )

    def _embed(self, tokens, embedding):
        """Build the unit-length [mean ; max] vector for ``tokens``.

        Tokens missing from ``embedding`` are skipped.

        Returns:
            The pooled vector, or ``None`` when none of the tokens has a word
            vector (pooling an empty set is undefined; ``np.amax`` would raise).
        """
        known_vectors = [embedding[token] for token in tokens if token in embedding]
        if not known_vectors:
            return None
        word_embeddings = np.array(known_vectors)
        pooled = np.concatenate([
            self._create_mean_embedding(word_embeddings),
            self._create_max_embedding(word_embeddings),
        ])
        norm = np.sqrt((pooled ** 2).sum())
        # An all-zero vector cannot be normalised; return it as-is instead of
        # dividing by zero and propagating NaNs into the scores.
        return pooled / norm if norm > 0 else pooled

    def rank(self, tokenized_query, tokenized_documents):
        """
        Re-ranks a set of documents according to embedding distance

        Args:
            tokenized_query: List of query tokens.
            tokenized_documents: List of token lists, one per document.

        Returns:
            ``(index_rankings, scores)``: positions into
            ``tokenized_documents`` ordered from best to worst match, and the
            matching scores in the same order. Documents without any known
            token score 0; if the query has no known token every document
            scores 0.
        """
        n_documents = len(tokenized_documents)
        if n_documents == 0:
            return np.array([], dtype=int), np.array([], dtype=float)

        query_embedding = self._embed(tokenized_query, self.query_embedding)  # (E,)
        if query_embedding is None:
            logging.getLogger(__name__).warning(
                "No query token has a word vector; all documents score 0."
            )
            scores = np.zeros(n_documents)
        else:
            # Stand-in for documents with no known token: orthogonal to everything.
            no_match = np.zeros_like(query_embedding)
            document_rows = []
            for document in tokenized_documents:
                document_vector = self._embed(document, self.document_embedding)
                document_rows.append(no_match if document_vector is None else document_vector)
            document_embeddings = np.array(document_rows)  # (N, E)
            scores = document_embeddings.dot(query_embedding)

        index_rankings = np.argsort(scores)[::-1]
        return index_rankings, scores[index_rankings]


class TSVDocumentReader(object):
    """Reads documents from a header-less, tab-separated file.

    Args:
        path: Location of the TSV file.
    """

    def __init__(self, path):
        self.path = path

    @property
    def corpus(self):
        """List of the values in column 3 (the fourth column), one per row.

        The file is re-read on every access.
        """
        frame = pd.read_csv(self.path, sep="\t", header=None)
        content_column = frame[3]
        return content_column.to_numpy().tolist()

class DocumentReader(object):
    """Reads every file found under a directory tree as one document.

    Args:
        path: Root directory to search.
    """

    def __init__(self, path):
        self.path = path

    @property
    def corpus(self):
        """List with the full text of each regular file under ``self.path``.

        Files are found with a recursive ``**`` glob and decoded as
        ISO-8859-1; the order is whatever ``glob`` yields.
        """
        def read_text(file_path):
            with open(file_path, 'r', encoding='ISO-8859-1') as handle:
                return handle.read()

        pattern = os.path.join(self.path, "**")
        return [
            read_text(candidate)
            for candidate in glob.glob(pattern, recursive=True)
            if os.path.isfile(candidate)
        ]

In [None]:
# Build a small sample corpus and write it out as a header-less TSV file
import pandas as pd

# One tuple per article, in column order: Title, Description, Category, Content
_sample_columns = ("Title", "Description", "Category", "Content")
_sample_rows = [
    (
        "Article 1",
        "An investment bonanza is coming",
        "Science",
        "An investment bonanza is coming. This is the content of the first article, discussing recent scientific discoveries. ",
    ),
    (
        "Article 2",
        "Who governs a country’s airspace?",
        "Technology",
        "Who governs a country’s airspace? The second article covers the latest advancements in technology and its applications.",
    ),
    (
        "Article 3",
        "What is a supermoon, and how noticeable is it to the naked eye?",
        "Health",
        "What is a supermoon, and how noticeable is it to the naked eye? Healthcare improvements and medical research are the main topics of the third article.",
    ),
    (
        "Article 4",
        "What the evidence says about police body-cameras?",
        "Education",
        "What the evidence says about police body-cameras? This article focuses on educational reforms and learning methodologies.",
    ),
    (
        "Article 5",
        "Who controls Syria?",
        "Environment",
        "Who controls Syria? Environmental issues and sustainability practices are explored in this article.",
    ),
]

# Column-oriented view of the rows above (column name -> list of values)
data = {
    column: [row[position] for row in _sample_rows]
    for position, column in enumerate(_sample_columns)
}

df = pd.DataFrame(data)

# No index and no header row: the readers below address columns by position
tsv_path = '/content/sample_documents.tsv'
df.to_csv(tsv_path, sep='\t', index=False, header=False)

tsv_path


'/content/sample_documents.tsv'

In [None]:

def tokenize(document):
    """Lower-case ``document`` and split it into a list of tokens.

    Tokenization is delegated to ``gensim.utils.tokenize``.
    """
    lowered = document.lower()
    return [token for token in gensim.utils.tokenize(lowered)]


def show_scores(documents, scores, n=10):
    """Print the first ``n`` documents with their rank and score.

    Args:
        documents: Sequence of documents, already ordered best-first.
        scores: Sequence of scores aligned with ``documents``.
        n: Maximum number of entries to print. Fewer are printed when
            ``documents`` or ``scores`` is shorter than ``n``.
    """
    # Clamp to what is actually available so a short result list does not
    # raise IndexError.
    shown = min(n, len(documents), len(scores))
    for i in range(shown):
        print("======== RANK: {} | SCORE: {} =======".format(i + 1, scores[i]))
        print(documents[i])
        print("")
    print("\n")

# @click.command()
# @click.option("--path", prompt="Path to document TSV", help="Document TSV")
# @click.option("--query", prompt="Search query", help="Search query")
def main(path="/content/sample_documents.tsv", query="astrology"):
    """Run the two-stage search demo and print the results.

    Steps:
      1. Read the documents from the TSV at ``path`` and tokenize them.
      2. Retrieve candidates for ``query`` with BM25 and print the top ones.
      3. Re-rank the retrieved candidates with GloVe embeddings and print
         the top ones.
      4. Rank a fixed set of sample headlines against a few sample queries.

    Args:
        path: Path to the header-less document TSV.
        query: Free-text search query.
    """
    top_n = 5  # how many results to print per stage

    print('Query: "{}"'.format(query))

    print("Reading documents...", end="")
    documents = list(TSVDocumentReader(path).corpus)
    print(" [DONE]")
    print("Tokening documents...", end="")
    corpus = [tokenize(doc) for doc in documents]
    tokenized_query = tokenize(query)
    print(" [DONE]")

    retriever = Retriever(corpus)
    retrieval_indexes, retrieval_scores = retriever.query(tokenized_query)

    retrieved_documents = [documents[idx] for idx in retrieval_indexes]
    print("======== BM25 ========")
    # Never ask for more rows than were retrieved (small corpora have < top_n).
    show_scores(retrieved_documents, retrieval_scores, min(top_n, len(retrieved_documents)))

    tokenized_retrieved_documents = [corpus[idx] for idx in retrieval_indexes]

    print("Loading glove embeddings...", end="")
    glove_embedding = api.load('glove-wiki-gigaword-50')
    print(" [DONE]")
    # The same embedding table is used for both queries and documents.
    ranker = Ranker(query_embedding=glove_embedding, document_embedding=glove_embedding)
    ranker_indexes, ranker_scores = ranker.rank(tokenized_query, tokenized_retrieved_documents)
    # Ranker indexes are positions within the retrieved subset, not the full corpus.
    reranked_documents = [retrieved_documents[idx] for idx in ranker_indexes]

    print("======== Embedding ========")
    show_scores(reranked_documents, ranker_scores, min(top_n, len(reranked_documents)))

    print("======== Samples ========")
    sample_documents = [
        "An investment bonanza is coming",
        "Who governs a country's airspace?",
        "What is a supermoon, and how noticeable is it to the naked eye?",
        "What the evidence says about police body-cameras",
        "Who controls Syria?",
    ]
    sample_corpus = [tokenize(doc) for doc in sample_documents]
    sample_queries = [
        "banking",
        "astrology",
        "middle east",
    ]
    for sample_query in sample_queries:
        indexes, _ = ranker.rank(tokenize(sample_query), sample_corpus)
        print(sample_query)
        # Ranks are 1-based, matching the numbering printed by show_scores.
        for rank, index in enumerate(indexes, start=1):
            print("Rank: {} | Top Article: {}".format(rank, sample_documents[index]))

# Entry point: when this cell is executed (or the code is run as a script),
# call main() with its default path and query.
if __name__ == "__main__":
    main()

Query: "astrology"
Reading documents... [DONE]
Tokening documents... [DONE]
An investment bonanza is coming. This is the content of the first article, discussing recent scientific discoveries. 

Who governs a country’s airspace? The second article covers the latest advancements in technology and its applications.

What is a supermoon, and how noticeable is it to the naked eye? Healthcare improvements and medical research are the main topics of the third article.

What the evidence says about police body-cameras? This article focuses on educational reforms and learning methodologies.

Who controls Syria? Environmental issues and sustainability practices are explored in this article.



Loading glove embeddings... [DONE]
Who controls Syria? Environmental issues and sustainability practices are explored in this article.

An investment bonanza is coming. This is the content of the first article, discussing recent scientific discoveries. 

What is a supermoon, and how noticeable is it to th

In [None]:
import os
import pandas as pd
import numpy as np
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Download NLTK resources
import nltk
nltk.download('punkt')
# Recent NLTK releases load the sentence tokenizer used by word_tokenize from
# the 'punkt_tab' resource instead of the pickled 'punkt' models; fetch both so
# the cell works regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Function to preprocess and tokenize text
def preprocess(text):
    """Lower-case and tokenize ``text``, dropping stop words and punctuation.

    Args:
        text: Raw input string.

    Returns:
        List of tokens from ``word_tokenize`` that are neither English stop
        words nor made up solely of ASCII punctuation characters.
    """
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    tokens = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        # Character-wise check: `token in string.punctuation` would be a
        # substring test and let multi-character tokens such as "``", "''",
        # "..." or "--" through.
        if all(char in punctuation_chars for char in token):
            continue
        tokens.append(token)
    return tokens

# Function to create an averaged word vector for a document
def document_vector(doc, model):
    """Average the word vectors of the tokens in ``doc``.

    Args:
        doc: Iterable of tokens.
        model: Word-vector model exposing ``key_to_index``, ``vector_size``
            and list indexing (``model[list_of_words]``).

    Returns:
        1-D array with the mean vector of the in-vocabulary tokens, or an
        all-zero vector of length ``model.vector_size`` when no token is in
        the vocabulary.
    """
    known_words = [word for word in doc if word in model.key_to_index]
    if not known_words:
        # Nothing to average; indexing the model with an empty list would fail.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(model[known_words], axis=0)

# Load word embeddings
# Loaded once at module level; the semantic_search call further down in this
# cell receives this object as its `model` argument.
model = api.load('glove-wiki-gigaword-50')

# Load the document texts from a header-less TSV file
def load_documents(path):
    """Return the values of column 3 (the fourth column) of the TSV at ``path``.

    The file is read without a header row, so columns are addressed by
    position.
    """
    frame = pd.read_csv(path, sep="\t", header=None)
    content_column = frame[3]  # the content is assumed to be in the fourth column
    return content_column.tolist()

# Function to compute cosine similarity
def cosine_similarity(vec1, vec2):
    """Cosine of the angle between ``vec1`` and ``vec2``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector has zero length (the cosine is undefined there; 0.0 avoids a
        NaN that would corrupt the later sort by score).
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

# Rank documents against a query by cosine similarity of averaged word vectors
def semantic_search(query, documents, model):
    """Score every document against ``query`` and return them best-first.

    Args:
        query: Raw query string.
        documents: List of raw document strings.
        model: Word-vector model passed through to ``document_vector``.

    Returns:
        List of ``(document, score)`` tuples sorted by descending score.
    """
    query_vector = document_vector(preprocess(query), model)

    scored_documents = [
        (doc, cosine_similarity(query_vector, document_vector(preprocess(doc), model)))
        for doc in documents
    ]
    scored_documents.sort(key=lambda pair: pair[1], reverse=True)
    return scored_documents

# Example usage: search the sample TSV written earlier in the notebook
path_to_tsv = '/content/sample_documents.tsv'  # Replace with your TSV file path
documents = load_documents(path_to_tsv)

# Replace with your own search query
query = "Banking"
top_docs = semantic_search(query, documents, model)

# Show the five best matches, highest score first
best_matches = top_docs[:5]
for doc, score in best_matches:
    print(f"Score: {score}\nDocument: {doc}\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Score: 0.6276920437812805
Document: Healthcare improvements and medical research are the main topics of the third article.

Score: 0.5873264074325562
Document: Environmental issues and sustainability practices are explored in this article.

Score: 0.5339373350143433
Document: The second article covers the latest advancements in technology and its applications.

Score: 0.4885958731174469
Document: This article focuses on educational reforms and learning methodologies.

Score: 0.4656783938407898
Document: This is the content of the first article, discussing recent scientific discoveries. 



In [None]:
# URLs are quoted because they contain '?', which the shell treats as a glob character.
!wget "https://huggingface.co/fse/word2vec-google-news-300/resolve/main/word2vec-google-news-300.model?download=true" -O /content/drive/MyDrive/Research/word2vec/word2vec-google-news-300.model

# A single -O: with "-O -O <path>" wget writes to a file literally named "-O" and treats <path> as a second URL.
!wget "https://huggingface.co/fse/word2vec-google-news-300/resolve/main/word2vec-google-news-300.model.vectors.npy?download=true" -O /content/drive/MyDrive/Research/word2vec/word2vec-google-news-300.model.vectors.npy


--2023-11-25 10:00:59--  https://huggingface.co/fse/word2vec-google-news-300/resolve/main/word2vec-google-news-300.model.vectors.npy?download=true
Resolving huggingface.co (huggingface.co)... 13.33.33.102, 13.33.33.20, 13.33.33.110, ...
Connecting to huggingface.co (huggingface.co)|13.33.33.102|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/fse/word2vec-google-news-300/f22370268ca0b4fb12567df754f079b4708a189d5f063de19fa19535e91d41de?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27word2vec-google-news-300.model.vectors.npy%3B+filename%3D%22word2vec-google-news-300.model.vectors.npy%22%3B&Expires=1701165659&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwMTE2NTY1OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9mc2Uvd29yZDJ2ZWMtZ29vZ2xlLW5ld3MtMzAwL2YyMjM3MDI2OGNhMGI0ZmIxMjU2N2RmNzU0ZjA3OWI0NzA4YTE4OWQ1ZjA2M2RlMTlmYTE5NTM1ZTkxZDQxZGU%7EcmVzcG9uc2UtY29ud

# References:

* https://dev.to/mage_ai/how-to-build-a-search-engine-with-word-embeddings-56jd

