In [1]:
import numpy as np

from numpy.linalg import norm
from sentence_transformers import SentenceTransformer

In [2]:
# Small corpus of example sentences that will be embedded and searched.
sentences = [
    "That is a very happy person",
    "That is a happy dog",
    "Today is a sunny day",
    "You are a happy person",
]

# Load the sentence-embedding model by name (fetched on first use).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encode the corpus: one embedding vector per entry in `sentences`, same order.
embeddings = model.encode(sentences)

In [3]:
# Embed the search query with the same model used for the corpus,
# so the query vector can be compared against `embeddings`.
query_embedding = model.encode(
    "That is a happy person"
)

In [4]:
# define our similarity metric (higher score = more similar)
def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (||a|| * ||b||)``.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.

    Returns
    -------
    float
        The cosine of the angle between the two vectors. If either vector
        has zero norm the angle is undefined; ``0.0`` is returned instead
        of dividing by zero (which would yield ``nan`` and a RuntimeWarning).
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # At least one all-zero vector: avoid the 0/0 division.
        return 0.0
    return np.dot(a, b) / denominator

# Brute-force semantic search: score every corpus sentence against the query
# embedding and print the result next to the sentence.
print("Query: That is a happy person")
for sentence, vector in zip(sentences, embeddings):
    score = cosine_similarity(vector, query_embedding)
    print(sentence, " -> similarity score = ", score)

Query: That is a happy person
That is a very happy person  -> similarity score =  0.9429152
That is a happy dog  -> similarity score =  0.69457746
Today is a sunny day  -> similarity score =  0.2568761
You are a happy person  -> similarity score =  0.7832136


---

In [5]:
import os
from dotenv import load_dotenv

load_dotenv()

import redis
from redis.commands.search.field import TagField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query

In [6]:
# Redis client used by every cell below.
# The password is read from the REDIS_PASSWORD environment variable;
# os.getenv returns None when the variable is not set.
r = redis.Redis(
    password=os.getenv("REDIS_PASSWORD"),
    host="redis-13273.c266.us-east-1-3.ec2.cloud.redislabs.com",
    port=13273,
)

In [7]:
# Name of the vector index, passed to r.ft(...) wherever the index is used.
INDEX_NAME = 'index'
# Key prefix handed to the index definition; keys starting with it belong to the index.
DOC_PREFIX = 'doc:'

In [8]:
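# Optional reset: uncomment and run the line below to drop the existing index
# together with its documents before re-creating it.
# r.ft(INDEX_NAME).dropindex(delete_documents=True)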

b'OK'

In [9]:
def create_index(vector_dimensions: int):
    """Create the vector search index ``INDEX_NAME`` unless it already exists.

    Uses the module-level client ``r`` and the constants ``INDEX_NAME`` and
    ``DOC_PREFIX``. The index covers hashes whose keys start with
    ``DOC_PREFIX`` and has two fields: a tag field ``tag`` and a FLAT,
    FLOAT32, cosine-distance vector field ``vector``.

    Parameters
    ----------
    vector_dimensions : int
        Number of dimensions of the vectors stored in the ``vector`` field.

    Returns
    -------
    None
        Prints "Index already exists!" and does nothing else when the index
        is already present.

    Raises
    ------
    redis.exceptions.RedisError
        Connection, authentication and other client errors are no longer
        swallowed; only the server's error reply to the existence check is
        treated as "index missing".
    """
    try:
        # check to see if index exists
        r.ft(INDEX_NAME).info()
    except redis.exceptions.ResponseError:
        # The server rejected the info request for this index name:
        # treat it as "not created yet" and fall through to create it.
        pass
    else:
        print("Index already exists!")
        return

    # schema
    schema = (
        TagField("tag"),                       # Tag Field Name
        VectorField("vector",                  # Vector Field Name
            "FLAT", {                          # Vector Index Type: FLAT or HNSW
                "TYPE": "FLOAT32",             # FLOAT32 or FLOAT64
                "DIM": vector_dimensions,      # Number of Vector Dimensions
                "DISTANCE_METRIC": "COSINE",   # Vector Search Distance Metric
            }
        ),
    )

    # index Definition
    definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.HASH)

    # create Index
    r.ft(INDEX_NAME).create_index(fields=schema, definition=definition)

In [10]:
# The index dimensionality is taken from the length of the query embedding.
VECTOR_DIMENSIONS = len(query_embedding)

# Make sure the index exists (create_index prints a notice if it already does).
create_index(
    vector_dimensions=VECTOR_DIMENSIONS,
)

In [11]:
# Write to Redis
# Queue one HSET per sentence on a pipeline; the queued commands run on pipe.execute().
pipe = r.pipeline()
for i, (embedding, sentence) in enumerate(zip(embeddings, sentences)):
    # Build the key from DOC_PREFIX so it always matches the prefix the index
    # definition was created with (previously hard-coded as "doc:").
    pipe.hset(f"{DOC_PREFIX}{i}", mapping={
        # The index schema declares TYPE FLOAT32, so force that dtype before
        # serialising the vector to raw bytes.
        "vector": np.asarray(embedding, dtype=np.float32).tobytes(),
        "content": sentence,
        "tag": "mytag",
    })
res = pipe.execute()

In [12]:
# The query vector is passed as raw bytes and bound to $vec in the query string.
query_params = {"vec": query_embedding.tobytes()}

# Filter on the "mytag" tag, then take the 3 nearest neighbours on @vector.
# "score" is the distance reported by the KNN clause (the index was created with
# DISTANCE_METRIC COSINE), so sorting ascending lists the closest match first.
query = Query("(@tag:{ mytag })=>[KNN 3 @vector $vec as score]")
query = query.sort_by("score")
query = query.return_fields("content", "tag", "score")
query = query.paging(0, 3)
query = query.dialect(2)

# Last expression of the cell: the matching documents.
r.ft(INDEX_NAME).search(query, query_params).docs

[Document {'id': 'doc:0', 'payload': None, 'score': '0.0570849180222', 'content': 'That is a very happy person', 'tag': 'mytag'},
 Document {'id': 'doc:3', 'payload': None, 'score': '0.216786384583', 'content': 'You are a happy person', 'tag': 'mytag'},
 Document {'id': 'doc:1', 'payload': None, 'score': '0.305422663689', 'content': 'That is a happy dog', 'tag': 'mytag'}]