In [6]:
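# One-time setup (kept commented out): uncomment and run this cell once to download
# the model and tokenizer and save them under ./my_local_model/, which is the path
# the Search_engine class loads from.
# from transformers import AutoModel, AutoTokenizer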
# import torch

# # Define the model name
# model_name = "sentence-transformers/all-MiniLM-L6-v2"

# # Load the model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)

# # Save the model and tokenizer locally
# model.save_pretrained("./my_local_model/all-MiniLM-L6-v2")
# tokenizer.save_pretrained("./my_local_model/all-MiniLM-L6-v2")

In [10]:
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
class Search_engine:
    """Small in-memory semantic search index over sentences.

    Each stored sentence is embedded with a locally saved transformer model and
    compared against queries by cosine similarity.

    Attributes:
        model: Transformer model loaded from ``./my_local_model/all-MiniLM-L6-v2``.
        tokenizer: Tokenizer loaded from the same directory.
        vector_data: List of ``(index, vector)`` tuples; ``index`` is the position
            of the matching text in ``base_data``.
        base_data: List of the original sentences, parallel to ``vector_data``.
        threshold: Default minimum similarity (exclusive) for a search hit.
    """

    def __init__(self):
        """Load the locally saved model/tokenizer and start with an empty index."""
        model_path = "./my_local_model/all-MiniLM-L6-v2"
        self.model = AutoModel.from_pretrained(model_path)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.vector_data = []
        self.base_data = []
        self.threshold = 0.7

    def get_cls_vector(self, sentence):
        """Embed ``sentence`` as the hidden state of its first ([CLS]) token.

        Args:
            sentence: Text to embed (passed straight to the tokenizer).

        Returns:
            NumPy array holding the first-token embedding; ``squeeze()`` drops
            size-1 dimensions, so a single sentence yields a 1-D vector.
        """
        # NOTE(review): the all-MiniLM-L6-v2 model card describes mean pooling;
        # first-token pooling is kept here so existing scores do not change --
        # confirm which pooling is intended.
        inputs = self.tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = self.model(**inputs)

        cls_embedding = outputs.last_hidden_state[:, 0, :]  # first token of each sequence
        return cls_embedding.squeeze().cpu().numpy()

    def similarity(self, X, Y):
        """Return the cosine similarity between vectors ``X`` and ``Y``."""
        return cosine_similarity([X], [Y])[0][0]

    @staticmethod
    def _is_exact_match(score):
        """Return True when ``score`` rounds (2 decimals) to 1.0, i.e. a duplicate."""
        return round(score, 2) == 1.0

    def _scored_matches(self, vector, threshold):
        """Return ``(score, position)`` pairs scoring above ``threshold``, best first.

        ``position`` is the entry's current position in ``vector_data`` /
        ``base_data``; the index stored inside the tuple is deliberately not
        trusted so that lookups stay valid after deletions.
        """
        matches = []
        for position, (_, stored_vector) in enumerate(self.vector_data):
            score = self.similarity(vector, stored_vector)
            if score > threshold:
                matches.append((score, position))
        # Stable sort: ties keep insertion order.
        matches.sort(key=lambda match: match[0], reverse=True)
        return matches

    def add_one(self, X):
        """Add sentence ``X`` to the index unless an exact duplicate is stored.

        A stored entry counts as a duplicate when it is the best hit above
        ``self.threshold`` and its score rounds to 1.0. In that case a message is
        printed and nothing is added.
        """
        # Embed once and reuse the vector for both the duplicate check and storage.
        vector = self.get_cls_vector(X)
        matches = self._scored_matches(vector, self.threshold)
        if matches and self._is_exact_match(matches[0][0]):
            print(f"Already present similar sentence/word = {self.base_data[matches[0][1]]}")
        else:
            self.vector_data.append((len(self.vector_data), vector))
            self.base_data.append(X)

    def add_more(self, sentences):
        """Add every sentence in the iterable ``sentences`` via ``add_one``."""
        for X in sentences:
            self.add_one(X)

    def search_one(self, X, metadata=False, threshold=None):
        """Search the index for entries similar to ``X``.

        Args:
            X: Query text.
            metadata: When True, each hit also carries the entry's index.
            threshold: Minimum similarity (exclusive); defaults to ``self.threshold``.

        Returns:
            List of ``(score, text)`` tuples, or ``(score, text, index)`` when
            ``metadata`` is True, sorted by descending score.
        """
        if threshold is None:
            threshold = self.threshold
        vector = self.get_cls_vector(X)
        matches = self._scored_matches(vector, threshold)
        if metadata:
            return [(score, self.base_data[pos], pos) for score, pos in matches]
        return [(score, self.base_data[pos]) for score, pos in matches]

    def remove_one(self, X, confirm=True):
        """Remove the stored entry that exactly matches ``X``, if any.

        Args:
            X: Text whose exact match (score rounding to 1.0) should be removed.
            confirm: When True (default) the user is asked via ``input()`` before
                deleting; pass False to delete without prompting.
        """
        check = self.search_one(X, metadata=True)

        if check and self._is_exact_match(check[0][0]):
            _, text, index = check[0]
            print(f"Similar sentence/word found = {text}")
            if confirm:
                option = input("Would you like to delete the record(y/n) : ")
            else:
                option = "y"
            if option.strip().lower() == "y":
                self.vector_data.pop(index)
                self.base_data.pop(index)
                # Renumber stored indices so they keep pointing at the right
                # base_data entry after the lists shrink.
                self.vector_data = [
                    (position, stored_vector)
                    for position, (_, stored_vector) in enumerate(self.vector_data)
                ]
                print(f"'{text}' at index {index} deleted")
        else:
            print(f" '{X}' not found")

    def remove_all(self):
        """Empty the index (both vectors and texts)."""
        self.vector_data = []
        self.base_data = []
        print('All vectors removed')
        

In [12]:
# Build the search engine; this loads the locally saved model and tokenizer.
obj = Search_engine()


In [13]:
# Sample corpus: one "Title – short description" string per book.
books = [
    "To Kill a Mockingbird – A classic novel by Harper Lee that explores racial injustice in the Deep South through the eyes of a young girl.",
    "1984 – George Orwell’s dystopian vision of a totalitarian future where Big Brother watches everyone.",
    "Pride and Prejudice – Jane Austen’s romantic novel exploring manners, marriage, and morality in 19th-century England.",
    "The Great Gatsby – F. Scott Fitzgerald’s portrayal of the American Dream and excess during the Jazz Age.",
    "Moby-Dick – Herman Melville’s epic tale of obsession and revenge between a whaling captain and a legendary whale.",
    "War and Peace – Leo Tolstoy’s monumental work on history, love, and conflict during the Napoleonic Wars in Russia.",
    "Crime and Punishment – Fyodor Dostoevsky’s deep psychological exploration of guilt and redemption after a murder.",
    "The Catcher in the Rye – J.D. Salinger’s coming-of-age novel about teenage rebellion and alienation.",
    "Brave New World – Aldous Huxley’s futuristic society ruled by technology, consumerism, and engineered happiness.",
    "The Lord of the Rings – J.R.R. Tolkien’s legendary fantasy epic about good vs. evil in Middle-earth.",
    "Jane Eyre – Charlotte Brontë’s Gothic romance about a strong-willed orphan and her complex relationship with Mr. Rochester.",
    "The Hobbit – A prelude to Lord of the Rings, following Bilbo Baggins on a magical adventure with dwarves and dragons.",
    "Wuthering Heights – Emily Brontë’s dark, intense love story set on the Yorkshire moors.",
    "One Hundred Years of Solitude – Gabriel García Márquez’s magical realist chronicle of the Buendía family in the fictional town of Macondo.",
    "The Brothers Karamazov – Dostoevsky’s exploration of faith, doubt, free will, and family conflict.",
    "The Alchemist – Paulo Coelho’s philosophical fable about following your dreams and listening to your heart.",
    "The Book Thief – Markus Zusak’s WWII novel narrated by Death, centered on a girl who finds solace in stolen books.",
    "The Kite Runner – Khaled Hosseini’s story of friendship, betrayal, and redemption set against Afghanistan’s turbulent history.",
    "Anna Karenina – Tolstoy’s tragic love story that delves into themes of passion, family, and Russian society.",
    "Fahrenheit 451 – Ray Bradbury’s dystopian novel where books are banned and 'firemen' burn them to suppress ideas."
]

# Clear the index before loading so that re-running this cell starts from an empty index.
obj.remove_all()
# Embed and store every description.
obj.add_more(books)

All vectors removed


In [29]:

# Query the index; only entries scoring above the 0.7 similarity threshold are returned.
result = obj.search_one("Fyodor Dostoevsky’s", threshold=0.7)

In [30]:
# Number of hits versus size of the corpus.
len(result), len(books)

(5, 20)

In [31]:
# Each hit is a (similarity score, text) pair; show the score and the first 100 characters.
for score, text in result:
    print(score, text[:100])

0.8869642 Crime and Punishment – Fyodor Dostoevsky’s deep psychological exploration of guilt and redemption af
0.85153997 The Brothers Karamazov – Dostoevsky’s exploration of faith, doubt, free will, and family conflict.
0.7733222 Anna Karenina – Tolstoy’s tragic love story that delves into themes of passion, family, and Russian 
0.76929617 War and Peace – Leo Tolstoy’s monumental work on history, love, and conflict during the Napoleonic W
0.723937 One Hundred Years of Solitude – Gabriel García Márquez’s magical realist chronicle of the Buendía fa
