#Vector DataBase search Tool -  Adaya.ai

#Objective: Create a basic search tool that uses a vector database to store and retrieve high-dimensional data, such as image embeddings or word vectors.

##Task:
Create the 512-dimensional vectors for the column - “Product Name” in the data file given below using TF-IDF technique.

Implement a simple vector database using libraries like Faiss / Annoy / ChromaDB / Weaviate / Pinecone.

Develop a method to add the documents and index their vectors in the database. Each document should contain the “Product Name”, “MRP” and “Short description” corresponding to that product.

Create a user interface where users can input a product name as a query and retrieve the 10 most similar product names and their details from the database.

Document the procedure along with the results. Feel free to add reasoning wherever it is required.


In [None]:
!pip install pinecone-client faiss-cpu pandas numpy sklearn Sentence-Transformers

#library

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pinecone
import faiss

#Load the Dataset

In [None]:
# Read the product catalogue CSV into a DataFrame and show it as the cell output.
csv_path = '/content/product_vdb.csv'
data = pd.read_csv(csv_path)
data

In [None]:
# Number of missing values in each column (isna is the canonical spelling of
# the isnull alias).
data.isna().sum()

In [None]:
# Show the rows that duplicate an earlier row. duplicated() already returns a
# boolean mask; comparing that mask with the string 'True' is always False and
# therefore always selected zero rows.
data[data.duplicated()]

In [None]:
# Print pandas' concise summary of the DataFrame.
data.info()

#PineCone Vector Database

In [None]:

import os

from pinecone import Pinecone, PodSpec

# Prefer the PINECONE_API_KEY environment variable so the real key never has to
# be pasted into (and saved with) the notebook; the original placeholder is
# kept as the fallback value.
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY', 'API KEY'))


#Encoder

In [None]:
# Later cells call `model.encode(...)`, but the only definition of `model` in
# this notebook is the commented-out SentenceTransformer alternative below, and
# the TF-IDF vectorizer was never fitted. Fit it on the product names and expose
# it through the same `.encode()` interface so those cells get TF-IDF vectors.
EMBEDDING_DIM = 512  # must equal the `dimension` of the Pinecone index

tfidf_vectorizer = TfidfVectorizer(max_features=EMBEDDING_DIM)
# astype(str) keeps fitting from failing on non-string cells such as NaN.
tfidf_vectorizer.fit(data['Product Name'].astype(str))


class TfidfEncoder:
    """Adapter giving a fitted TfidfVectorizer a ``model.encode()`` interface.

    ``encode`` accepts one string or an iterable of strings and returns a
    float32 NumPy array: 1-D for a single string, 2-D (one row per text)
    otherwise. Rows are zero-padded up to ``dim`` columns because
    ``max_features`` is only an upper bound on the vocabulary size.
    """

    def __init__(self, vectorizer, dim):
        self.vectorizer = vectorizer
        self.dim = dim

    def encode(self, texts):
        """Return the TF-IDF embedding(s) of ``texts`` with ``dim`` columns."""
        single = isinstance(texts, str)
        documents = [texts] if single else [str(text) for text in texts]
        dense = self.vectorizer.transform(documents).toarray().astype('float32')
        missing = self.dim - dense.shape[1]
        if missing > 0:
            # Vocabulary smaller than the index dimension: pad with zeros.
            dense = np.pad(dense, ((0, 0), (0, missing)))
        return dense[0] if single else dense


model = TfidfEncoder(tfidf_vectorizer, EMBEDDING_DIM)

#or
# NOTE(review): BERT-base sentence models normally return 768-d vectors, which
# would not fit a 512-d index - confirm before switching to this alternative.
#from sentence_transformers import SentenceTransformer
#model = SentenceTransformer('bert-base-nli-mean-tokens')
#embeding = model.encode("This is sentence")

#Create a vector Database in Pinecone

In [None]:
# ServerlessSpec is used below but is not imported anywhere else in the
# visible notebook, which made this cell fail with a NameError.
from pinecone import ServerlessSpec

index_name = "testing3"

# create_index raises when the index already exists, so re-running this cell
# would fail; only create the index when it is not there yet.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=512,  # must match the length of the vectors that are upserted
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        ),
    )

In [None]:
# Open a handle to the index. Reuse index_name instead of repeating the
# literal so that renaming the index cannot leave this cell pointing at a
# different one.
index = pc.Index(index_name)

#Insert the data into the DataBase

In [None]:
# Upsert every product as (id, vector, metadata) in batches.
BATCH_SIZE = 50  # number of vectors sent per upsert request

question_list = []
for i, row in data.iterrows():
    encoded_vector = model.encode(row['Product Name']).tolist()
    question_list.append(
        (
            str(i),  # the DataFrame index label doubles as the vector id
            encoded_vector,
            {
                # The task asks for the product name to be stored with each
                # document, alongside the price and the description.
                'Product Name': row['Product Name'],
                'MRP': int(row['MRP']),
                'Short description': row['Short description'],
            },
        )
    )
    if len(question_list) == BATCH_SIZE:
        index.upsert(vectors=question_list)
        question_list = []

# Flush the last, partially filled batch; without this the final
# len(data) % BATCH_SIZE products were never written to the index.
if question_list:
    index.upsert(vectors=question_list)
    question_list = []


#Testing

In [None]:
query = "Men Grey Pure Cotton Printed Spread Collar Roll Up Sleeves Casual Shirt"
# Encode the bare string (not a one-element list) so that the query vector is
# a flat list of floats rather than a nested [[...]] list.
xq = model.encode(query).tolist()
# include_metadata is the keyword of the Python client (includeMetadata is the
# REST/JSON field name).
# NOTE(review): top_k=5 matches the recorded output below; the task statement
# asks for the 10 most similar products.
result = index.query(vector=xq, top_k=5, include_metadata=True)
result

{'matches': [{'id': '0',
              'metadata': {'MRP': 2199.0,
                           'Short description': 'Grey abstract printed opaque '
                                                'Casual shirt ,has a spread '
                                                'collar, button placket, 1 '
                                                'patch pocket, long roll-up '
                                                'sleeves, curved hem'},
              'score': 1.00023592,
              'values': []},
             {'id': '836',
              'metadata': {'MRP': 1899.0,
                           'Short description': 'Grey striped opaque Casual '
                                                'shirt ,has a spread collar, '
                                                'button placket, short regular '
                                                'sleeves, curved hem'},
              'score': 0.962794363,
              'values': []},
             {'id': '1787',
              'metadata': {'MRP': 1499.0,
                           'Short description': 'Grey solid opaque Casual '
                                                'shirt ,has a spread collar, '
                                                'button placket, 1 patch '
                                                'pocket, short regular '
                                                'sleeves, curved hem'},
              'score': 0.958695352,
              'values': []},
             {'id': '3518',
              'metadata': {'MRP': 1799.0,
                           'Short description': 'Grey geometric printed opaque '
                                                'Casual shirt ,has a mandarin '
                                                'collar, button placket, short '
                                                'regular sleeves, curved hem'},
              'score': 0.952561378,
              'values': []},
             {'id': '1527',
              'metadata': {'MRP': 1399.0,
                           'Short description': 'Grey Micro disty printed '
                                                'opaque Casual shirt ,has a '
                                                'spread collar, button '
                                                'placket, 1 patch pocket, long '
                                                'regular sleeves, curved hem'},
              'score': 0.952561378,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}

In [None]:
# Collect the ids of the returned matches, in ranking order.
ids = []
for match in result['matches']:
    ids.append(match['id'])

print(ids)

# Each id is the stringified row number, so it maps straight back to a row
# of the DataFrame.
for matched_id in ids:
    row_position = int(matched_id)
    print(data.iloc[row_position])

#Faiss - Vector DataBase

In [None]:
# Column of product names that the FAISS cell below vectorises.
product = data['Product Name']
# Encode every product name with `model`, which must already exist from an
# earlier cell. NOTE(review): `vectors` is not read by any later visible cell.
vectors = model.encode(product)

In [None]:
import faiss

# Cap the vocabulary at 512 terms: this is the dimensionality the task asks
# for and keeps the dense matrix built below small.
tfidf_vectorizer = TfidfVectorizer(max_features=512)


tfidf_vectors = tfidf_vectorizer.fit_transform(product)


# fit_transform returns a SciPy sparse matrix, which neither np.linalg.norm nor
# FAISS can consume; FAISS needs a dense, C-contiguous float32 array.
normalized_tfidf_vectors = np.ascontiguousarray(tfidf_vectors.toarray(), dtype='float32')
# In-place L2 normalisation. Unlike dividing by the row norm, rows that are all
# zeros stay zero instead of turning into NaN.
faiss.normalize_L2(normalized_tfidf_vectors)


vector_dimension = normalized_tfidf_vectors.shape[1]


# On unit-length vectors, L2 distance ranks neighbours in the same order as
# cosine similarity.
index = faiss.IndexFlatL2(vector_dimension)

index.add(normalized_tfidf_vectors)

In [None]:
search_text = 'Men Grey Pure Cotton Printed Spread Collar Roll Up Sleeves Casual Shirt'
# The FAISS index holds TF-IDF vectors, so the query has to be projected with
# the same fitted vectorizer; a vector from a different encoder would live in
# another space and would not match the index dimension.
search_vector = tfidf_vectorizer.transform([search_text]).toarray()[0]
# FAISS expects a 2-D, C-contiguous float32 array with one row per query.
_vector = np.ascontiguousarray([search_vector], dtype='float32')
faiss.normalize_L2(_vector)

k = 5
# distances[0] and ann[0] hold the distances and row positions of the k
# nearest products.
distances, ann = index.search(_vector, k=k)